From 3e9d168228ad56f5e1198b9cccfcf922b70b7cb1 Mon Sep 17 00:00:00 2001 From: big-yellow-duck Date: Wed, 11 Mar 2026 03:28:50 +0000 Subject: [PATCH 1/4] add aiter gemm_a8w8_blockscale support for gfx1201 --- vllm/_aiter_ops.py | 4 ++-- vllm/platforms/rocm.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index c8366ecce543..df90e59f0582 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -50,9 +50,9 @@ def is_aiter_found_and_supported() -> bool: VLLM_ROCM_USE_AITER=0, while preventing unwanted JIT warnings for auto-discovery. """ if current_platform.is_rocm() and IS_AITER_FOUND: - from vllm.platforms.rocm import on_gfx9 + from vllm.platforms.rocm import on_gfx9, on_gfx12x - return on_gfx9() + return on_gfx9() or on_gfx12x() return False diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index f1fd3331802b..5606a85e6092 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -145,6 +145,7 @@ def _get_gcn_arch() -> str: _GCN_ARCH = _get_gcn_arch() _ON_GFX1X = any(arch in _GCN_ARCH for arch in ["gfx11", "gfx12"]) +_ON_GFX12X = any(arch in _GCN_ARCH for arch in ["gfx12"]) _ON_MI3XX = any(arch in _GCN_ARCH for arch in ["gfx942", "gfx950"]) _ON_GFX9 = any(arch in _GCN_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) _ON_GFX942 = "gfx942" in _GCN_ARCH @@ -225,6 +226,8 @@ def _capability_from_gcn_arch(gcn_arch: str) -> tuple[int, int] | None: def on_gfx1x() -> bool: return _ON_GFX1X +def on_gfx12x()-> bool: + return _ON_GFX12X def on_mi3xx() -> bool: return _ON_MI3XX From ca408968df52e26d4b2f8b60b871081121b10f4b Mon Sep 17 00:00:00 2001 From: big-yellow-duck Date: Wed, 11 Mar 2026 03:42:03 +0000 Subject: [PATCH 2/4] add tuned moe configs for gfx1201 --- ...on_AI_PRO_R9700,block_shape=[128,128].json | 69 +++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 57 +++++++++++++++ .../layers/fused_moe/fused_moe.py | 6 +- 3 files changed, 130 insertions(+), 2 deletions(-) 
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=AMD_Radeon_AI_PRO_R9700,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=AMD_Radeon_AI_PRO_R9700,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=AMD_Radeon_AI_PRO_R9700,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=AMD_Radeon_AI_PRO_R9700,block_shape=[128,128].json new file mode 100644 index 000000000000..678c64cc7c54 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=AMD_Radeon_AI_PRO_R9700,block_shape=[128,128].json @@ -0,0 +1,69 @@ +{ + "triton_version": "3.5.1+rocm7.2.0.gita272dfa8", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 4, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 2, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 4, + 
"matrix_instr_nonkdim": 16, + "kpack": 1 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=AMD_Radeon_AI_PRO_R9700,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=AMD_Radeon_AI_PRO_R9700,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..b03511a2cc72 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=768,device_name=AMD_Radeon_AI_PRO_R9700,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,57 @@ +{ + "triton_version": "3.5.1+rocm7.2.0.gita272dfa8", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 1 + }, + "8": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index ee321f241aad..90a90d819b95 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1953,13 +1953,15 @@ def _supports_quant_scheme( ) -> bool: p = current_platform if p.is_rocm(): - 
from vllm.platforms.rocm import on_gfx9 + from vllm.platforms.rocm import on_gfx9, on_gfx12x is_rocm_on_gfx9 = on_gfx9() + is_rocm_on_gfx12x = on_gfx12x() else: is_rocm_on_gfx9 = False + is_rocm_on_gfx12x = False - device_supports_fp8 = is_rocm_on_gfx9 or ( + device_supports_fp8 = is_rocm_on_gfx9 or is_rocm_on_gfx12x or ( p.is_cuda() and p.has_device_capability((8, 9)) ) From 6fb9a3ef36d97ff379f87c05f9cdb0d4235b9b8c Mon Sep 17 00:00:00 2001 From: big-yellow-duck Date: Mon, 16 Mar 2026 04:47:56 +0000 Subject: [PATCH 3/4] remove unnecessary gfx12 in aiter_ops --- vllm/_aiter_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index df90e59f0582..c8366ecce543 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -50,9 +50,9 @@ def is_aiter_found_and_supported() -> bool: VLLM_ROCM_USE_AITER=0, while preventing unwanted JIT warnings for auto-discovery. """ if current_platform.is_rocm() and IS_AITER_FOUND: - from vllm.platforms.rocm import on_gfx9, on_gfx12x + from vllm.platforms.rocm import on_gfx9 - return on_gfx9() or on_gfx12x() + return on_gfx9() return False From 826b9f1d15a49682fa1046a425c53e289c5796b0 Mon Sep 17 00:00:00 2001 From: big-yellow-duck Date: Mon, 16 Mar 2026 06:32:27 +0000 Subject: [PATCH 4/4] fix formatting Signed-off-by: big-yellow-duck --- vllm/model_executor/layers/fused_moe/fused_moe.py | 3 ++- vllm/platforms/rocm.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f10f2909490b..6a1a0eb1089c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1935,7 +1935,8 @@ def _supports_quant_scheme( is_rocm_on_gfx12x = False device_supports_fp8 = ( - is_rocm_on_gfx9 or is_rocm_on_gfx12 + is_rocm_on_gfx9 + or is_rocm_on_gfx12x or (p.is_cuda() and p.has_device_capability((8, 9))) or p.is_xpu() 
) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 8f749ac4054c..0551586f1ef3 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -226,9 +226,11 @@ def _capability_from_gcn_arch(gcn_arch: str) -> tuple[int, int] | None: def on_gfx1x() -> bool: return _ON_GFX1X -def on_gfx12x()-> bool: + +def on_gfx12x() -> bool: return _ON_GFX12X + def on_mi3xx() -> bool: return _ON_MI3XX