Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
458 changes: 458 additions & 0 deletions tests/profiler/qwen35_text_compile_determ.py

Large diffs are not rendered by default.

476 changes: 476 additions & 0 deletions tests/profiler/qwen35_vl_determ.py

Large diffs are not rendered by default.

510 changes: 510 additions & 0 deletions tests/profiler/test_prober.py

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions xtuner/_testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import torch
from contextlib import contextmanager
import io
from xtuner.v1.utils.misc import set_deterministic



Expand All @@ -10,14 +11,18 @@ def enable_full_determinism():
Helper function for reproducible behavior during distributed training. See
- https://pytorch.org/docs/stable/notes/randomness.html for pytorch
"""
set_deterministic(True)

# already set in set_deterministic
# Enable PyTorch deterministic mode. This potentially requires either the environment
# variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set,
# depending on the CUDA version, so we set them both here
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1" # should be replaced by CUBLAS_WORKSPACE_CONFIG
# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

# already set in set_deterministic
# torch.use_deterministic_algorithms(True, warn_only=True)
torch.set_deterministic_debug_mode(0)
# torch.set_deterministic_debug_mode(0) # should be replaced by torch.use_deterministic_algorithms


class _CaptureIO(io.TextIOWrapper):
Expand Down
33 changes: 32 additions & 1 deletion xtuner/v1/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,32 @@
from . import patch # noqa: F401
import os
from typing import Any, cast


def _patch_triton_autotune_for_determinism() -> None:
# 必须放在 xtuner.v1 初始化最前面:FLA kernel 在导入时就会读取 triton.autotune 装饰器。
# 如果等到 GatedDeltaNet 模块导入后再 patch,单独使用 xtuner 包的场景可能已经错过时机。
import triton

original_autotune = triton.autotune
if getattr(original_autotune, "_xtuner_deterministic_patched", False):
return

def deterministic_autotune(configs, *args, **kwargs):
# Triton autotune 会按 benchmark/cache 在多个 kernel config 中选一个实现;
# 不同 cache 目录或计时抖动可能选到不同 tiling/num_warps/reduction 路径,
# 从而改变浮点累加顺序。确定性模式固定第一个 config,并禁用 cache 结果。
if configs:
configs = configs[:1]
kwargs["cache_results"] = False
return original_autotune(configs, *args, **kwargs)

patched = cast(Any, deterministic_autotune)
patched._xtuner_deterministic_patched = True
patched._xtuner_original_autotune = original_autotune
triton.autotune = deterministic_autotune


# Opt-in switch: the Triton autotune patch is applied only when the environment
# variable XTUNER_DETERMINISTIC is exactly the string "true". It runs here, at
# package import, ahead of the remaining package-level imports.
if os.environ.get("XTUNER_DETERMINISTIC") == "true":
    _patch_triton_autotune_for_determinism()

from . import patch # noqa: E402,F401
Loading
Loading