InternLM · jayhenry · Mar 30, 2026 · Mar 30, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/tests/profiler/qwen35_text_compile_determ.py b/tests/profiler/qwen35_text_compile_determ.py
diff --git a/tests/profiler/qwen35_vl_determ.py b/tests/profiler/qwen35_vl_determ.py
diff --git a/tests/profiler/test_prober.py b/tests/profiler/test_prober.py
diff --git a/xtuner/_testing/utils.py b/xtuner/_testing/utils.py
@@ -2,6 +2,7 @@
 import torch
 from contextlib import contextmanager
 import io
+from xtuner.v1.utils.misc import set_deterministic
 
 
 
@@ -10,14 +11,18 @@ def enable_full_determinism():
     Helper function for reproducible behavior during distributed training. See
     - https://pytorch.org/docs/stable/notes/randomness.html for pytorch
     """
+    set_deterministic(True)
+
+    # already set in set_deterministic
     #  Enable PyTorch deterministic mode. This potentially requires either the environment
     #  variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set,
     # depending on the CUDA version, so we set them both here
-    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
-    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
+    # os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # should be replaced by CUBLAS_WORKSPACE_CONFIG
+    # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
 
+    # already set in set_deterministic
     # torch.use_deterministic_algorithms(True, warn_only=True)
-    torch.set_deterministic_debug_mode(0)
+    # torch.set_deterministic_debug_mode(0)  # should be replaced by torch.use_deterministic_algorithms
 
 
 class _CaptureIO(io.TextIOWrapper):

diff --git a/xtuner/v1/__init__.py b/xtuner/v1/__init__.py
@@ -1 +1,32 @@
-from . import patch  # noqa: F401
+import os
+from typing import Any, cast
+
+
+def _patch_triton_autotune_for_determinism() -> None:
+    # Must run at the very start of xtuner.v1 initialization: FLA kernels read the triton.autotune decorator at import time.
+    # If the patch were applied only after the GatedDeltaNet module is imported, it could already be too late when the xtuner package is used on its own.
+    import triton
+
+    original_autotune = triton.autotune
+    if getattr(original_autotune, "_xtuner_deterministic_patched", False):
+        return  # already patched (marker is set below); avoid wrapping the wrapper again
+
+    def deterministic_autotune(configs, *args, **kwargs):
+        # Triton autotune picks one implementation among several kernel configs based on benchmarking/cache;
+        # different cache directories or timing jitter may select a different tiling/num_warps/reduction path,
+        # which changes the floating-point accumulation order. Deterministic mode pins the first config and disables result caching.
+        if configs:
+            configs = configs[:1]
+        kwargs["cache_results"] = False  # NOTE(review): assumes the installed triton.autotune accepts `cache_results` -- confirm
+        return original_autotune(configs, *args, **kwargs)
+
+    patched = cast(Any, deterministic_autotune)  # cast so attributes can be attached to the function object
+    patched._xtuner_deterministic_patched = True  # marker read by the guard at the top of this function
+    patched._xtuner_original_autotune = original_autotune  # keep a handle to the unpatched decorator
+    triton.autotune = deterministic_autotune
+
+
+if os.getenv("XTUNER_DETERMINISTIC") == "true":  # opt-in: only the exact string "true" enables the patch
+    _patch_triton_autotune_for_determinism()
+
+from . import patch  # noqa: E402,F401