diff --git a/3rdparty/aiter b/3rdparty/aiter
index a64fa18e6..4a8ecd743 160000
--- a/3rdparty/aiter
+++ b/3rdparty/aiter
@@ -1 +1 @@
-Subproject commit a64fa18e60235994e4cbfd7059cc2f60d06e743f
+Subproject commit 4a8ecd7432a0e695833b8b94cceec007e2e3af97
diff --git a/transformer_engine/common/ck_fused_attn/CMakeLists.txt b/transformer_engine/common/ck_fused_attn/CMakeLists.txt
index beae0e9ce..2f59fce39 100644
--- a/transformer_engine/common/ck_fused_attn/CMakeLists.txt
+++ b/transformer_engine/common/ck_fused_attn/CMakeLists.txt
@@ -56,15 +56,50 @@ else()
 
   # If not downloaded, Fallback: Build from source
   if(NOT AITER_PREBUILT_DOWNLOAD_SUCCESS)
+    # Generate the embedded HSA header and pass it into AITER compile.py via env vars.
+    # This avoids needing to set AITER_ASM_DIR (thread-unsafe).
+    set(AITER_HSA_DIR "${__AITER_SOURCE_DIR}/hsa")
+    set(AITER_EMBEDDED_HSA_HEADER_PATH "${CMAKE_CURRENT_BINARY_DIR}/aiter_embedded_hsa/aiter_embedded_hsa.h")
+
+    set(AITER_EMBEDDED_HSA_SUBDIRS)
+    foreach(ARCH IN LISTS V3_ASM_ARCHS)
+      list(APPEND AITER_EMBEDDED_HSA_SUBDIRS
+        "${ARCH}/fmha_v3_bwd"
+        "${ARCH}/fmha_v3_fwd")
+    endforeach()
+
+    set(AITER_BUILD_ENV_ARGS)
+    if(AITER_EMBEDDED_HSA_SUBDIRS)
+      execute_process(
+        COMMAND python3 "${CMAKE_CURRENT_LIST_DIR}/generate_aiter_embedded_hsa.py"
+          --hsa-dir ${AITER_HSA_DIR}
+          --output ${AITER_EMBEDDED_HSA_HEADER_PATH}
+          --subdirs ${AITER_EMBEDDED_HSA_SUBDIRS}
+        RESULT_VARIABLE AITER_MAKE_HSA_RET
+      )
+      if(AITER_MAKE_HSA_RET AND NOT AITER_MAKE_HSA_RET EQUAL 0)
+        message(FATAL_ERROR "Failed to generate aiter_embedded_hsa.h")
+      endif()
+      list(APPEND AITER_BUILD_ENV_ARGS AITER_EMBEDDED_HSA_HEADER_PATH=${AITER_EMBEDDED_HSA_HEADER_PATH})
+    else()
+      message(STATUS "[AITER-BUILD] No supported V3 ASM arch selected; skipping embedded HSA generation.")
+    endif()
+
     message(STATUS " [AITER-BUILD] Building aiter from source.")
     execute_process(
-      COMMAND bash ${CMAKE_CURRENT_LIST_DIR}/aiter_build.sh
+      COMMAND ${CMAKE_COMMAND} -E env
+        ${AITER_BUILD_ENV_ARGS}
+        bash ${CMAKE_CURRENT_LIST_DIR}/aiter_build.sh
         --aiter-dir ${__AITER_SOURCE_DIR}
         --aiter-test-dir ${__AITER_TEST_DIR}
         --gpu-archs "${V3_ASM_ARCHS_STR}"
         --ck-tile-bf16 ${CK_FUSED_ATTN_FLOAT_TO_BFLOAT16_DEFAULT}
+      RESULT_VARIABLE AITER_BUILD_RET
     )
-    # libmha_fwd.so and libmha_bwd.so will be under 3rdparty/aiter/op_tests/cpp/mha
+    if(AITER_BUILD_RET AND NOT AITER_BUILD_RET EQUAL 0)
+      message(FATAL_ERROR "[AITER-BUILD] aiter_build.sh failed")
+    endif()
+    # libmha_fwd.a and libmha_bwd.a will be under 3rdparty/aiter/op_tests/cpp/mha
     cache_local_aiter_build(${__AITER_TEST_DIR})
   endif()
 endif()
diff --git a/transformer_engine/common/ck_fused_attn/generate_aiter_embedded_hsa.py b/transformer_engine/common/ck_fused_attn/generate_aiter_embedded_hsa.py
new file mode 100644
index 000000000..3c73d2c1e
--- /dev/null
+++ b/transformer_engine/common/ck_fused_attn/generate_aiter_embedded_hsa.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+# Generate aiter_embedded_hsa.h with embedded binary .co files for AITER HSA kernels.
+
+import argparse
+import re
+import sys
+from pathlib import Path
+
+
+def sanitize_identifier(name: str) -> str:
+    """Convert a file path to a valid C++ identifier."""
+    return re.sub(r"[^a-zA-Z0-9]", "_", name)
+
+
+def bytes_to_hex_array(data: bytes, bytes_per_line: int = 16) -> str:
+    """Convert bytes to a formatted C hex array string."""
+    hex_bytes = []
+    for i, byte in enumerate(data):
+        if i > 0 and i % bytes_per_line == 0:
+            hex_bytes.append("\n ")
+        hex_bytes.append(f"0x{byte:02x}")
+        if i < len(data) - 1:
+            hex_bytes.append(",")
+    return "".join(hex_bytes)
+
+
+def generate_embedded_hsa_header(
+    hsa_dir: Path, output_file: Path, subdirs: list[str]
+) -> int:
+    """
+    Generate a C++ header file embedding all .co files from specified subdirectories.
+
+    Args:
+        hsa_dir: Base directory containing hsa files (e.g., third_party/aiter/hsa)
+        output_file: Path to the output header file
+        subdirs: List of subdirectories to scan for .co files (e.g., ["gfx942/fmha_v3_bwd", "gfx950/fmha_v3_bwd"])
+
+    Returns:
+        Number of .co files embedded
+    """
+    # Collect all .co files
+    co_files: list[tuple[str, Path]] = []
+    for subdir in subdirs:
+        pattern_dir = hsa_dir / subdir
+        if pattern_dir.exists():
+            for co_file in sorted(pattern_dir.glob("**/*.co")):
+                # Key format: hsa/gfx942/fmha_v3_bwd/xxx.co
+                # Use as_posix() to ensure forward slashes on all platforms
+                rel_path = co_file.relative_to(hsa_dir).as_posix()
+                map_key = f"hsa/{rel_path}"
+                co_files.append((map_key, co_file))
+
+    if not co_files:
+        print(f"Warning: No .co files found in {hsa_dir} under {subdirs}")
+        return 0
+
+    # Generate header content
+    # Using std::string_view instead of std::span for C++17 compatibility
+    # std::string_view provides .data() method which is what hipModuleLoadData needs
+    lines = [
+        "// Auto-generated file. Do not edit.",
+        "// Embedded AITER HSA binary files for fmha_v3_bwd",
+        "#pragma once",
+        "",
+        "#include <cstddef>",
+        "#include <string>",
+        "#include <string_view>",
+        "#include <unordered_map>",
+        "",
+        "// Define AITER_EMBEDDED_HSA_MAP macro so that aiter_hip_common.h",
+        "// can detect the embedded map is available via #if defined(AITER_EMBEDDED_HSA_MAP)",
+        "#define AITER_EMBEDDED_HSA_MAP ::aiter_hsa::embedded_hsa_map",
+        "",
+        "namespace aiter_hsa {",
+        "",
+    ]
+
+    # Generate array declarations and map entries
+    array_entries = []
+    for map_key, co_file in co_files:
+        with open(co_file, "rb") as f:
+            data = f.read()
+
+        # Only generate array and map entry if file has content
+        if len(data) > 0:
+            safe_name = sanitize_identifier(co_file.relative_to(hsa_dir).as_posix())
+            array_name = f"data_{safe_name}"
+            file_size = len(data)
+            array_entries.append((map_key, array_name, file_size))
+
+            hex_array = bytes_to_hex_array(data)
+            lines.append(
+                f"alignas(4096) inline const unsigned char {array_name}[] = {{\n {hex_array}\n}};"
+            )
+            lines.append("")
+
+    # Generate the map
+    lines.append(
+        "inline const std::unordered_map<std::string_view, std::string_view> embedded_hsa_map = {"
+    )
+    for map_key, array_name, file_size in array_entries:
+        lines.append(
+            f' {{"{map_key}", std::string_view(reinterpret_cast<const char*>({array_name}), {file_size})}},'
+        )
+    lines.append("};")
+    lines.append("")
+    lines.append("} // namespace aiter_hsa")
+    lines.append("")
+
+    # Write output
+    output_file.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_file, "w") as f:
+        f.write("\n".join(lines))
+
+    return len(array_entries)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Generate aiter_embedded_hsa.h with embedded binary .co files"
+    )
+    parser.add_argument(
+        "--hsa-dir", required=True, type=Path, help="Path to the aiter hsa directory"
+    )
+    parser.add_argument(
+        "--output", required=True, type=Path, help="Path to the output header file"
+    )
+    parser.add_argument(
+        "--subdirs",
+        nargs="+",
+        default=["gfx942/fmha_v3_bwd", "gfx950/fmha_v3_bwd"],
+        help="Subdirectories to scan for .co files",
+    )
+
+    args = parser.parse_args()
+
+    if not args.hsa_dir.exists():
+        print(f"Error: HSA directory does not exist: {args.hsa_dir}", file=sys.stderr)
+        return 1
+
+    count = generate_embedded_hsa_header(args.hsa_dir, args.output, args.subdirs)
+    print(f"Generated {args.output} with {count} embedded .co files")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())