From 43011caccd742217d1ee110270afbe142f37de56 Mon Sep 17 00:00:00 2001 From: Ziminli <70735843+Ziminli@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:47:44 +0800 Subject: [PATCH] =?UTF-8?q?Revert=20"feat(ascend):=20op-norm-rope=20group?= =?UTF-8?q?=20=E2=80=94=20Swiglu,=20SiluAndMul,=20CausalSoftmax,=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 38a23cfd0adb803107672a040462349d0295af34. --- CMakeLists.txt | 21 +- pyproject.toml | 9 - scripts/generate_wrappers.py | 27 +- src/CMakeLists.txt | 70 +- src/ascend/add_rms_norm/kernel.h | 144 ---- src/ascend/add_rms_norm/kernel_custom.h | 171 ----- src/ascend/add_rms_norm/kernel_fused.h | 132 ---- src/ascend/causal_softmax/kernel.h | 173 ----- src/ascend/custom/CMakeLists.txt | 18 +- .../add_rms_norm/op_host/add_rms_norm.cpp | 19 +- .../custom/add_rms_norm/op_kernel/.clang-tidy | 9 - .../add_rms_norm/op_kernel/add_rms_norm.cpp | 350 ++++----- src/ascend/custom/build.sh | 33 +- src/ascend/custom/cmake/config_ascend.cmake | 14 +- src/ascend/custom/cmake/detect_soc.cmake | 24 - .../custom/rms_norm/op_host/rms_norm.cpp | 18 +- .../custom/rms_norm/op_kernel/rms_norm.cpp | 281 ++++--- src/ascend/linear/kernel.h | 6 - src/ascend/rms_norm/kernel.h | 100 --- src/ascend/rms_norm/kernel_custom.h | 155 ---- src/ascend/rotary_embedding/kernel.h | 373 --------- src/ascend/rotary_embedding/kernel_atb.h | 449 ----------- .../rotary_embedding/kernel_sincos_cache.h | 177 ----- src/ascend/silu_and_mul/kernel.h | 127 --- src/ascend/swiglu/kernel.h | 109 --- src/ascend/swiglu/kernel_fused.h | 202 ----- src/base/add_rms_norm.h | 41 +- src/base/linear.h | 33 +- src/base/rotary_embedding.h | 107 ++- src/base/silu_and_mul.h | 62 -- src/cpu/linear/linear.h | 4 +- src/data_type.h | 31 +- tests/test_add_rms_norm.py | 113 --- tests/test_rotary_embedding.py | 723 ------------------ tests/test_silu_and_mul.py | 76 -- 35 files changed, 438 insertions(+), 3963 deletions(-) delete mode 100644 src/ascend/add_rms_norm/kernel.h delete mode 100644 src/ascend/add_rms_norm/kernel_custom.h delete mode 100644 src/ascend/add_rms_norm/kernel_fused.h delete mode 100644 src/ascend/causal_softmax/kernel.h delete mode 100644 src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy delete mode 100644 src/ascend/custom/cmake/detect_soc.cmake delete mode 100644 src/ascend/rms_norm/kernel.h delete mode 100644 src/ascend/rms_norm/kernel_custom.h delete mode 100644 src/ascend/rotary_embedding/kernel.h delete mode 100644 src/ascend/rotary_embedding/kernel_atb.h delete mode 100644 src/ascend/rotary_embedding/kernel_sincos_cache.h delete mode 100644 src/ascend/silu_and_mul/kernel.h delete mode 100644 src/ascend/swiglu/kernel.h delete mode 100644 src/ascend/swiglu/kernel_fused.h delete mode 100644 src/base/silu_and_mul.h delete mode 100644 tests/test_add_rms_norm.py delete mode 100644 tests/test_rotary_embedding.py delete mode 100644 tests/test_silu_and_mul.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e10db2e..91c2b015 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,21 +18,12 @@ option(WITH_ASCEND "Enable Ascend backend" OFF) option(WITH_TORCH "Enable PyTorch C++ backend" OFF) -# Custom `AscendC` kernels under `src/ascend/custom/`. `ON` by default -# so CI and routine dev builds always exercise `implementation_index=1/2` -# for `RmsNorm` / `AddRmsNorm`. Gated by `WITH_ASCEND` in -# `src/CMakeLists.txt` — non-Ascend builds ignore it. 
Pass -# `-DBUILD_ASCEND_CUSTOM=OFF` to skip the `ccec` build on Ascend -# machines where the custom kernels aren't needed. -# -# When `ON`, `src/CMakeLists.txt` drives the standalone -# `src/ascend/custom/build.sh` via `execute_process` at configure time -# (sidesteps a `CANN` `extract_host_stub.py` path bug that breaks -# in-tree `ascendc_library()` under `scikit-build-core` temp-dir builds) -# and links the produced `libno_workspace_kernel.a` into the `ops` -# module with `--whole-archive`. Requires `torch_npu` and the -# `AscendC` toolchain (`ccec`). -option(BUILD_ASCEND_CUSTOM "Build custom AscendC kernels" ON) +# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for +# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed +# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the +# toolchain is compatible or when building via the standalone +# `src/ascend/custom/build.sh` script. +option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF) option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF) diff --git a/pyproject.toml b/pyproject.toml index 6b517026..959699f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,15 +7,6 @@ name = "InfiniOps" version = "0.1.0" [project.optional-dependencies] -# TODO: `torch` here is unconstrained. On Ascend hosts, the working -# torch is the Ascend-matched `torch 2.9.0+cpu` paired with -# `torch_npu 2.9.0.post1+…`. A `pip install -e .[dev] --force-reinstall` -# will re-resolve `torch` to the latest PyPI version (currently -# `torch 2.11.0`), which now declares `cuda-toolkit` / `nvidia-cublas` / -# `nvidia-cudnn` / … as hard deps — downloads GBs of CUDA wheels and -# kills the `torch_npu` / `vllm-ascend` pairing. Needs a platform-aware -# split (e.g. `torch; platform_machine != 'aarch64'`, or move `torch` -# out of `dev` and require it pre-installed in the container image). dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] [tool.scikit-build.wheel] diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py index 9810404d..49b6c199 100644 --- a/scripts/generate_wrappers.py +++ b/scripts/generate_wrappers.py @@ -112,29 +112,9 @@ def _find_vector_tensor_params(op_name): return set(re.findall(r"std::vector\s+(\w+)", source)) -def _find_params_with_defaults(op_name): - """Return ``{param_name: default_literal}`` for base-header params that - carry a `= ` default value. `libclang`'s cursor API does not - expose defaults reliably, so we regex-scan the source. Only used for - plain scalar defaults such as ``bool pre_gathered = false``. 
- """ - source = (_BASE_DIR / f"{op_name}.h").read_text() - - mapping = {} - - for name, default in re.findall( - r"\b(?:bool|int(?:64_t|32_t|8_t|16_t)?|std::size_t|std::uint\w+_t|float|double)\s+(\w+)\s*=\s*([^,\)]+?)\s*(?:,|\))", - source, - ): - mapping[name] = default.strip() - - return mapping - - def _generate_pybind11(operator): optional_tensor_params = _find_optional_tensor_params(operator.name) vector_tensor_params = _find_vector_tensor_params(operator.name) - params_with_defaults = _find_params_with_defaults(operator.name) def _is_optional_tensor(arg): if arg.spelling in optional_tensor_params: @@ -206,10 +186,6 @@ def _generate_py_args(node): if _is_optional(arg): parts.append(f'py::arg("{arg.spelling}") = py::none()') - elif arg.spelling in params_with_defaults: - parts.append( - f'py::arg("{arg.spelling}") = {params_with_defaults[arg.spelling]}' - ) else: parts.append(f'py::arg("{arg.spelling}")') @@ -281,7 +257,8 @@ def _generate_call(op_name, call, method=True): }}) .def_static("clear_cache", &Self::clear_cache); -{callers}}} +{callers} +}} }} // namespace infini::ops diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 443ac0e2..32c92949 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -241,66 +241,8 @@ if(WITH_ASCEND) list(APPEND DEVICE_LIST "ascend") # Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`). - if(BUILD_ASCEND_CUSTOM) - # In-tree `ascendc_library()` trips the `CANN` `extract_host_stub.py` - # path-handling bug under `scikit-build-core`'s temp-dir builds - # (`KeyError` on `/./workspace/...` paths in `$`). - # Work around it by driving the standalone `src/ascend/custom/build.sh` - # — that script invokes a separate `cmake` with - # `src/ascend/custom/` as its `SOURCE_DIR`, avoiding the buggy - # path shape. The produced `.a` is imported and linked into - # `ops` with `--whole-archive`. - set(_custom_build_dir "${CMAKE_SOURCE_DIR}/build/build_ascend_custom") - set(_custom_lib "${_custom_build_dir}/lib/libno_workspace_kernel.a") - - if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "") - include(${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/cmake/detect_soc.cmake) - infiniops_detect_soc(SOC_VERSION) - endif() - - # Drive `build.sh` as a build-phase target with explicit source - # dependencies so that editing any `op_host/` or `op_kernel/` - # source re-triggers the build (plain `execute_process` at - # configure time would only gate on file existence and leave - # stale `.a` files in place). - file(GLOB_RECURSE _custom_srcs CONFIGURE_DEPENDS - "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.h" - "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh") - - # Scrub env inherited from the outer `scikit-build-core` invocation - # before handing control to `build.sh`: - # * `CMAKE_GENERATOR` / `CMAKE_EXPORT_COMPILE_COMMANDS` leaking - # into the inner `cmake` change the path format passed to - # `ninja`'s `_host_cpp` rule and re-trigger the `CANN` - # `extract_host_stub.py` `KeyError` (`/./workspace/...`) that - # standalone `build.sh` avoids. - # * `PYTHONPATH` from `pip`'s build-isolation overlay makes the - # child `python3` skip the system `site-packages` — child - # `cmake` modules that `import torch` (`config_envs.cmake`) - # then fail with `ModuleNotFoundError` even though `torch` is - # installed. 
- add_custom_command( - OUTPUT ${_custom_lib} - COMMAND ${CMAKE_COMMAND} -E env - --unset=CMAKE_GENERATOR - --unset=CMAKE_EXPORT_COMPILE_COMMANDS - --unset=CMAKE_BUILD_PARALLEL_LEVEL - --unset=PYTHONPATH - "BUILD_DIR=${_custom_build_dir}" - "CMAKE_EXE=${CMAKE_COMMAND}" - bash ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh ${SOC_VERSION} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom - DEPENDS ${_custom_srcs} - COMMENT "Building custom AscendC kernels (SOC_VERSION=${SOC_VERSION})" - VERBATIM) - - add_custom_target(no_workspace_kernel_build ALL DEPENDS ${_custom_lib}) - - add_library(no_workspace_kernel STATIC IMPORTED GLOBAL) - set_target_properties(no_workspace_kernel PROPERTIES - IMPORTED_LOCATION "${_custom_lib}") - add_dependencies(no_workspace_kernel no_workspace_kernel_build) + if(BUILD_CUSTOM_KERNEL) + add_subdirectory(ascend/custom) # Link the compiled `AscendC` kernel objects into `infiniops` so that # custom kernel implementations (e.g. `RmsNorm` index 1) can call @@ -437,13 +379,9 @@ if(GENERATE_PYTHON_BINDINGS) # The `Operator<..., 1>` template instantiations that call # `aclrtlaunch_*` live in `ops.cc`, so link here with # `--whole-archive` to ensure all launch functions are available. - # `$` works for both real `ascendc_library()` targets and - # `IMPORTED` targets pointing at a pre-built `.a`. - if(BUILD_ASCEND_CUSTOM) + if(BUILD_CUSTOM_KERNEL) target_link_libraries(ops PRIVATE - -Wl,--whole-archive $ -Wl,--no-whole-archive) - # `ops` link step must wait for `build.sh` to produce the `.a`. - add_dependencies(ops no_workspace_kernel_build) + -Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive) endif() set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN") diff --git a/src/ascend/add_rms_norm/kernel.h b/src/ascend/add_rms_norm/kernel.h deleted file mode 100644 index 38b0a5ab..00000000 --- a/src/ascend/add_rms_norm/kernel.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_ -#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_ - -#include - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnn_add.h" -#include "aclnn_rms_norm.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/add_rms_norm.h" -#include "operator.h" - -namespace infini::ops { - -// Decomposed implementation: `aclnnAdd` + `aclnnRmsNorm`. -// -// The fused `aclnnAddRmsNorm` API has ~200 us host-side launch overhead that -// dominates small-tensor dispatch. Decomposing into two fast ACLNN calls -// reduces host dispatch from ~224 us to ~56 us (4x faster) with negligible -// NPU-side impact for inference tensor sizes. -template <> -class Operator : public AddRmsNorm { - public: - Operator(const Tensor input, const Tensor residual, const Tensor weight, - float eps, Tensor out, Tensor residual_out) - : AddRmsNorm(input, residual, weight, eps, out, residual_out), - input_cache_(input), - residual_cache_(residual), - weight_cache_(weight), - out_cache_(out), - residual_out_cache_(residual_out) { - // Alpha scalar for `aclnnAdd` (`residual_out = input + 1.0 * residual`). - alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT); - - // `aclnnRmsNorm` writes `rstd` as a required side output. Size is - // computed here; the buffer is obtained from the pool in `operator()`. 
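The workspace-pool idiom referenced here recurs throughout these kernels. As a minimal host-side sketch (interface as used in this file, where `Ensure` returns a per-stream arena whose `buf` member is the device pointer; this is not a spec of the pool API):

    // Sketch only: acquire `bytes` of stream-local scratch from the shared pool.
    // The "temp" tag keeps long-lived temporaries separate from per-call
    // workspaces, mirroring the calls in `operator()` below.
    void* AcquireScratch(aclrtStream stream, uint64_t bytes) {
      auto& arena = ascend::GetWorkspacePool().Ensure(stream, bytes, "temp");
      return arena.buf;  // owned by the pool; do not free
    }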
- rstd_shape_ = {static_cast(batch_size_), - static_cast(nhead_)}; - rstd_size_ = batch_size_ * nhead_ * sizeof(float); - } - - ~Operator() { - if (!ascend::IsAclRuntimeAlive()) return; - - // Null cached descriptors — see `AclTensorCache::release()`. - input_cache_.release(); - residual_cache_.release(); - weight_cache_.release(); - out_cache_.release(); - residual_out_cache_.release(); - - // `rstd_tensor_` leaks with `norm_exec_` at shutdown (see `64c367c`). - if (alpha_) aclDestroyScalar(alpha_); - } - - void operator()(const Tensor input, const Tensor residual, - const Tensor weight, float eps, Tensor out, - Tensor residual_out) const override { - auto t_input = input_cache_.get(const_cast(input.data())); - auto t_residual = residual_cache_.get(const_cast(residual.data())); - auto t_weight = weight_cache_.get(const_cast(weight.data())); - auto t_out = out_cache_.get(out.data()); - auto t_residual_out = residual_out_cache_.get(residual_out.data()); - auto stream = static_cast(stream_); - - // Step 1: `residual_out = input + residual`. - if (!add_exec_) { - aclnnAddGetWorkspaceSize(t_input, t_residual, alpha_, t_residual_out, - &add_ws_, &add_exec_); - aclSetAclOpExecutorRepeatable(add_exec_); - } else { - aclSetInputTensorAddr(add_exec_, 0, t_input, - const_cast(input.data())); - aclSetInputTensorAddr(add_exec_, 1, t_residual, - const_cast(residual.data())); - aclSetOutputTensorAddr(add_exec_, 0, t_residual_out, residual_out.data()); - } - auto& add_arena = ascend::GetWorkspacePool().Ensure(stream, add_ws_); - aclnnAdd(add_arena.buf, add_ws_, add_exec_, stream); - - // Obtain shared `rstd` buffer from pool. - auto& rstd_arena = - ascend::GetWorkspacePool().Ensure(stream, rstd_size_, "temp"); - - // Lazily create the `rstd` tensor descriptor on first call. - if (!rstd_tensor_) { - rstd_tensor_ = aclCreateTensor(rstd_shape_.data(), 2, ACL_FLOAT, - /*strides=*/nullptr, 0, ACL_FORMAT_ND, - rstd_shape_.data(), 2, rstd_arena.buf); - } else { - aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf); - } - - // Step 2: `out = rms_norm(residual_out, weight, eps)`. 
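Step 2 below repeats the cached-executor idiom already used for Step 1. Schematically (a sketch, with `aclnnFoo` standing in for `aclnnAdd`/`aclnnRmsNorm` and error handling omitted):

    // First call: plan the op once, then mark the executor reusable.
    if (!exec_) {
      aclnnFooGetWorkspaceSize(t_in, t_out, &ws_, &exec_);
      aclSetAclOpExecutorRepeatable(exec_);
    } else {
      // Later calls: shapes/strides are unchanged, so only re-bind the
      // device addresses at their argument indices.
      aclSetInputTensorAddr(exec_, 0, t_in, in_ptr);
      aclSetOutputTensorAddr(exec_, 0, t_out, out_ptr);
    }
    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_);
    aclnnFoo(arena.buf, ws_, exec_, stream);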
- if (!norm_exec_) { - aclnnRmsNormGetWorkspaceSize(t_residual_out, t_weight, eps, t_out, - rstd_tensor_, &norm_ws_, &norm_exec_); - aclSetAclOpExecutorRepeatable(norm_exec_); - } else { - aclSetInputTensorAddr(norm_exec_, 0, t_residual_out, residual_out.data()); - aclSetInputTensorAddr(norm_exec_, 1, t_weight, - const_cast(weight.data())); - aclSetOutputTensorAddr(norm_exec_, 0, t_out, out.data()); - aclSetOutputTensorAddr(norm_exec_, 1, rstd_tensor_, rstd_arena.buf); - } - auto& norm_arena = ascend::GetWorkspacePool().Ensure(stream, norm_ws_); - aclnnRmsNorm(norm_arena.buf, norm_ws_, norm_exec_, stream); - } - - private: - mutable ascend::AclTensorCache input_cache_; - - mutable ascend::AclTensorCache residual_cache_; - - mutable ascend::AclTensorCache weight_cache_; - - mutable ascend::AclTensorCache out_cache_; - - mutable ascend::AclTensorCache residual_out_cache_; - - float alpha_storage_ = 1.0f; - - aclScalar* alpha_ = nullptr; - - std::vector rstd_shape_; - - uint64_t rstd_size_ = 0; - - mutable aclTensor* rstd_tensor_ = nullptr; - - mutable aclOpExecutor* add_exec_ = nullptr; - - mutable uint64_t add_ws_ = 0; - - mutable aclOpExecutor* norm_exec_ = nullptr; - - mutable uint64_t norm_ws_ = 0; -}; - -} // namespace infini::ops - -#endif diff --git a/src/ascend/add_rms_norm/kernel_custom.h b/src/ascend/add_rms_norm/kernel_custom.h deleted file mode 100644 index daaa8c39..00000000 --- a/src/ascend/add_rms_norm/kernel_custom.h +++ /dev/null @@ -1,171 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ -#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ - -#ifdef INFINI_HAS_CUSTOM_KERNELS - -#include -#include - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnnop/aclnn_cast.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/add_rms_norm.h" -#include "operator.h" - -// Forward-declare the `aclrtlaunch_AddRmsNorm` launch symbol defined -// by the AscendC toolchain from `custom/add_rms_norm/op_kernel/`. -extern "C" uint32_t aclrtlaunch_AddRmsNorm( - uint32_t block_dim, void* stream, void* input, void* residual, void* weight, - int64_t total_rows, int64_t dim_length, int64_t dim_length_align, - int64_t former_num, int64_t former_length, int64_t tail_length, float eps, - int64_t dtype_code, void* out, void* residual_out); - -namespace infini::ops { - -// Custom AscendC fused `AddRmsNorm` kernel (implementation index 2). -// -// A single-kernel implementation that computes `residual_out = input + -// residual` followed by `out = rms_norm(residual_out, weight, eps)` in one -// launch, avoiding the decomposed `aclnnAdd` + `aclnnRmsNorm` calls (index 0) -// or the fused `aclnnAddRmsNorm` call (index 1). Migrated from the custom -// `RmsNorm` kernel (index 1 of `RmsNorm`). -// -// Select via `implementation_index=2` in Python: -// `infini.ops.add_rms_norm(input, residual, weight, eps, out, residual_out, -// implementation_index=2, stream=s)`. -// -// Requirements: -// - Input last dimension must be 32-byte aligned (divisible by 16 for -// `float16` or 8 for `float32`). All standard LLM hidden dimensions -// satisfy this. -// - `weight` must have the same dtype as `input`. -// - The custom kernel binary must be linked (`BUILD_ASCEND_CUSTOM=ON`). 
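To make the 32-byte alignment requirement concrete, the check reduces to the following (standalone sketch; same arithmetic as the constructor below, with a hypothetical helper name):

    #include <cstdint>

    // One 32-byte block holds 16 fp16/bf16 elements or 8 fp32 elements.
    bool LastDimIsAligned(int64_t dim, int64_t elem_size) {
      int64_t align_elems = 32 / elem_size;
      return dim % align_elems == 0;
    }
    // LastDimIsAligned(4096, 2) == true   (fp16: 4096 % 16 == 0)
    // LastDimIsAligned(100, 4)  == false  (fp32: 100 % 8 != 0)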
-template <> -class Operator : public AddRmsNorm { - public: - Operator(const Tensor input, const Tensor residual, const Tensor weight, - float eps, Tensor out, Tensor residual_out) - : AddRmsNorm(input, residual, weight, eps, out, residual_out), - dtype_{input.dtype()} { - assert((dtype_ == DataType::kFloat16 || dtype_ == DataType::kBFloat16 || - dtype_ == DataType::kFloat32) && - "`AddRmsNorm` custom kernel: `input` must be `fp16`, `bf16`, or " - "`fp32`"); - - // 32-byte alignment on the last dimension — kernel relies on aligned - // `DataCopyPad` loads/stores. - int64_t align_elems = 32 / static_cast(kDataTypeToSize.at(dtype_)); - dim_length_align_ = - ((static_cast(dim_) + align_elems - 1) / align_elems) * - align_elems; - assert(static_cast(dim_) == dim_length_align_ && - "`AddRmsNorm` custom kernel: last dimension must be 32-byte " - "aligned"); - - total_rows_ = - static_cast(batch_size_) * static_cast(nhead_); - - // The custom kernel always reads `weight` as fp32. fp16 / bf16 inputs - // trigger a lazy cast in `operator()` (guarded by `last_weight_ptr_` - // so that the cast runs only when the weight pointer changes — model - // weights are typically fixed after loading). - if (dtype_ != DataType::kFloat32) { - size_t fp32_bytes = static_cast(dim_) * sizeof(float); - aclrtMalloc(&weight_fp32_data_, fp32_bytes, ACL_MEM_MALLOC_NORMAL_ONLY); - - weight_src_cache_ = ascend::AclTensorCache( - {static_cast(dim_)}, ascend::ToAclDtype(dtype_), nullptr); - weight_dst_cache_ = ascend::AclTensorCache({static_cast(dim_)}, - ACL_FLOAT, weight_fp32_data_); - } - } - - ~Operator() { - if (!ascend::IsAclRuntimeAlive()) return; - - // Null cached descriptors — see `AclTensorCache::release()`. - weight_src_cache_.release(); - weight_dst_cache_.release(); - - if (weight_fp32_data_) aclrtFree(weight_fp32_data_); - } - - void operator()(const Tensor input, const Tensor residual, - const Tensor weight, float eps, Tensor out, - Tensor residual_out) const override { - auto stream = static_cast(stream_); - - void* weight_fp32; - - if (dtype_ != DataType::kFloat32) { - const void* cur_weight = weight.data(); - - // Model weights are fixed after loading, so the cast typically runs - // once on the first call and is skipped on all subsequent calls. - if (cur_weight != last_weight_ptr_) { - auto t_src = weight_src_cache_.get(const_cast(cur_weight)); - auto t_dst = weight_dst_cache_.get(weight_fp32_data_); - - if (!cast_exec_) { - aclnnCastGetWorkspaceSize(t_src, ACL_FLOAT, t_dst, &cast_ws_, - &cast_exec_); - aclSetAclOpExecutorRepeatable(cast_exec_); - } else { - aclSetInputTensorAddr(cast_exec_, 0, t_src, - const_cast(cur_weight)); - aclSetOutputTensorAddr(cast_exec_, 0, t_dst, weight_fp32_data_); - } - - auto& arena = ascend::GetWorkspacePool().Ensure(stream, cast_ws_); - aclnnCast(arena.buf, cast_ws_, cast_exec_, stream); - last_weight_ptr_ = cur_weight; - } - - weight_fp32 = weight_fp32_data_; - } else { - weight_fp32 = const_cast(weight.data()); - } - - // Block-level tiling. Ascend 910B has 20–40 AIV cores; over-subscribing - // is safe (runtime multiplexes) but wastes one weight load per block. 
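A worked instance of the former/tail split computed below, with illustrative numbers (100 rows over at most 40 cores):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t total_rows = 100, max_block_dim = 40;
      int64_t used_cores = std::min(total_rows, max_block_dim);            // 40
      int64_t former_length = (total_rows + used_cores - 1) / used_cores;  // 3
      int64_t tail_length = former_length - 1;                             // 2
      int64_t former_num = total_rows - tail_length * used_cores;          // 20
      // 20 "former" cores take 3 rows each; the other 20 take 2 rows each.
      assert(former_num * former_length +
             (used_cores - former_num) * tail_length == total_rows);
    }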
- static constexpr int64_t kMaxBlockDim = 40; - int64_t used_cores = std::min(total_rows_, kMaxBlockDim); - int64_t former_length = (total_rows_ + used_cores - 1) / used_cores; - int64_t tail_length = former_length - 1; - int64_t former_num = total_rows_ - tail_length * used_cores; - uint32_t block_dim = static_cast(used_cores); - - aclrtlaunch_AddRmsNorm(block_dim, stream, const_cast(input.data()), - const_cast(residual.data()), weight_fp32, - total_rows_, static_cast(dim_), - dim_length_align_, former_num, former_length, - tail_length, eps, static_cast(dtype_), - out.data(), residual_out.data()); - } - - private: - DataType dtype_; - - int64_t dim_length_align_; - - int64_t total_rows_; - - void* weight_fp32_data_ = nullptr; - - mutable ascend::AclTensorCache weight_src_cache_; - - mutable ascend::AclTensorCache weight_dst_cache_; - - mutable const void* last_weight_ptr_ = nullptr; - - mutable aclOpExecutor* cast_exec_ = nullptr; - - mutable uint64_t cast_ws_ = 0; -}; - -} // namespace infini::ops - -#endif // INFINI_HAS_CUSTOM_KERNELS -#endif // INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ diff --git a/src/ascend/add_rms_norm/kernel_fused.h b/src/ascend/add_rms_norm/kernel_fused.h deleted file mode 100644 index e28d7c28..00000000 --- a/src/ascend/add_rms_norm/kernel_fused.h +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_FUSED_H_ -#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_FUSED_H_ - -#include - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnnop/aclnn_add_rms_norm.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/add_rms_norm.h" -#include "operator.h" - -namespace infini::ops { - -// Fused implementation via `aclnnAddRmsNorm` (implementation index 1). -// -// Computes `residual_out = input + residual` and `out = rms_norm(residual_out, -// weight, eps)` in a single CANN launch. The fused API has higher host-side -// launch overhead (~200 us) compared to the decomposed `aclnnAdd` + -// `aclnnRmsNorm` path (~39 us), but may offer better NPU-side efficiency for -// large tensors where kernel fusion reduces memory traffic. -// -// Select via `implementation_index=1` in Python: -// `infini.ops.add_rms_norm(..., implementation_index=1, stream=s)`. -template <> -class Operator : public AddRmsNorm { - public: - Operator(const Tensor input, const Tensor residual, const Tensor weight, - float eps, Tensor out, Tensor residual_out) - : AddRmsNorm(input, residual, weight, eps, out, residual_out), - input_cache_(input), - residual_cache_(residual), - weight_cache_(weight), - out_cache_(out), - residual_out_cache_(residual_out) { - // `aclnnAddRmsNorm` requires `rstdOut` to have the same ndim as `input`, - // with the last `weight.ndim()` dimensions set to 1. For example: - // `input` (2, 32, 128), `weight` (128) -> `rstdOut` (2, 32, 1). - // `input` (64, 128), `weight` (128) -> `rstdOut` (64, 1). 
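The shape rule can be stated as a small helper (sketch only, with a hypothetical name; the constructor below builds the same shape inline):

    #include <cstdint>
    #include <vector>

    // rstd keeps input's rank; the trailing `weight_ndim` axes collapse to 1.
    std::vector<int64_t> FusedRstdShape(std::vector<int64_t> input_shape,
                                        size_t weight_ndim) {
      for (size_t i = input_shape.size() - weight_ndim; i < input_shape.size();
           ++i) {
        input_shape[i] = 1;
      }
      return input_shape;
    }
    // FusedRstdShape({2, 32, 128}, 1) -> {2, 32, 1}
    // FusedRstdShape({64, 128}, 1)    -> {64, 1}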
- fused_rstd_shape_.reserve(ndim_); - for (size_t i = 0; i < ndim_ - weight.ndim(); ++i) { - fused_rstd_shape_.push_back(static_cast(input.size(i))); - } - for (size_t i = 0; i < weight.ndim(); ++i) { - fused_rstd_shape_.push_back(1); - } - - size_t rstd_elems = 1; - for (auto d : fused_rstd_shape_) { - rstd_elems *= static_cast(d); - } - size_t rstd_bytes = rstd_elems * sizeof(float); - aclrtMalloc(&rstd_data_, rstd_bytes, ACL_MEM_MALLOC_NORMAL_ONLY); - - rstd_tensor_ = aclCreateTensor( - fused_rstd_shape_.data(), - static_cast(fused_rstd_shape_.size()), ACL_FLOAT, - /*strides=*/nullptr, 0, ACL_FORMAT_ND, fused_rstd_shape_.data(), - static_cast(fused_rstd_shape_.size()), rstd_data_); - } - - ~Operator() { - if (!ascend::IsAclRuntimeAlive()) return; - - // Null cached descriptors — see `AclTensorCache::release()`. - input_cache_.release(); - residual_cache_.release(); - weight_cache_.release(); - out_cache_.release(); - residual_out_cache_.release(); - - // `rstd_tensor_` leaks with the executor at shutdown (see `64c367c`). - if (rstd_data_) aclrtFree(rstd_data_); - } - - void operator()(const Tensor input, const Tensor residual, - const Tensor weight, float eps, Tensor out, - Tensor residual_out) const override { - auto t_input = input_cache_.get(const_cast(input.data())); - auto t_residual = residual_cache_.get(const_cast(residual.data())); - auto t_weight = weight_cache_.get(const_cast(weight.data())); - auto t_out = out_cache_.get(out.data()); - auto t_residual_out = residual_out_cache_.get(residual_out.data()); - auto stream = static_cast(stream_); - - if (!executor_) { - aclnnAddRmsNormGetWorkspaceSize( - t_input, t_residual, t_weight, static_cast(eps), t_out, - rstd_tensor_, t_residual_out, &ws_size_, &executor_); - aclSetAclOpExecutorRepeatable(executor_); - } else { - aclSetInputTensorAddr(executor_, 0, t_input, - const_cast(input.data())); - aclSetInputTensorAddr(executor_, 1, t_residual, - const_cast(residual.data())); - aclSetInputTensorAddr(executor_, 2, t_weight, - const_cast(weight.data())); - aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); - // `rstd` at output index 1 has a stable address — no update needed. 
- aclSetOutputTensorAddr(executor_, 2, t_residual_out, residual_out.data()); - } - - auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); - aclnnAddRmsNorm(arena.buf, ws_size_, executor_, stream); - } - - private: - mutable ascend::AclTensorCache input_cache_; - - mutable ascend::AclTensorCache residual_cache_; - - mutable ascend::AclTensorCache weight_cache_; - - mutable ascend::AclTensorCache out_cache_; - - mutable ascend::AclTensorCache residual_out_cache_; - - std::vector fused_rstd_shape_; - - void* rstd_data_ = nullptr; - - aclTensor* rstd_tensor_ = nullptr; - - mutable aclOpExecutor* executor_ = nullptr; - - mutable uint64_t ws_size_ = 0; -}; - -} // namespace infini::ops - -#endif diff --git a/src/ascend/causal_softmax/kernel.h b/src/ascend/causal_softmax/kernel.h deleted file mode 100644 index 975a0346..00000000 --- a/src/ascend/causal_softmax/kernel.h +++ /dev/null @@ -1,173 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_CAUSAL_SOFTMAX_KERNEL_H_ -#define INFINI_OPS_ASCEND_CAUSAL_SOFTMAX_KERNEL_H_ - -#include -#include - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnn_copy.h" -#include "aclnn_masked_fill_scalar.h" -#include "aclnn_softmax.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/causal_softmax.h" -#include "data_type.h" -#include "operator.h" - -namespace infini::ops { - -// CANN 8.5 has no single API covering causal-mask-then-softmax: the nearest -// candidates (`aclnnSoftmaxV2`, `aclnnScaledSoftmaxGrad`) do not accept a -// boolean mask argument, and `aclnnScaledMaskedSoftmax` requires a -// pre-scaled attention-score tensor produced inside flash-attention, not a -// standalone softmax input. Decomposing into three ACLNN calls is therefore -// unavoidable until a `aclnnCausalSoftmax` ships: -// 1. `aclnnInplaceCopy(temp, input)` — stride-aware copy to a contiguous -// `temp` buffer. -// 2. `aclnnInplaceMaskedFillScalar(temp, mask, -inf)` — apply the -// upper-triangle mask. -// 3. `aclnnSoftmax(temp, dim=-1, out)` — softmax over the last dimension. -// -// The boolean causal mask is pre-computed and uploaded to device once in the -// constructor. Its shape `(seq_len, total_seq_len)` broadcasts over the -// batch dimension. -template <> -class Operator : public CausalSoftmax { - public: - Operator(const Tensor input, Tensor out) - : CausalSoftmax(input, out), in_cache_(input), out_cache_(out) { - // Compute `temp` buffer size — allocated lazily from the pool in - // `operator()`. - size_t n_elems = input.numel(); - size_t elem_bytes = kDataTypeToSize.at(dtype_); - temp_size_ = n_elems * elem_bytes; - - // Build a contiguous `Tensor` descriptor — data pointer set on first use. - Tensor temp_t{nullptr, input.shape(), input.dtype(), input.device()}; - temp_cache_ = ascend::AclTensorCache(temp_t); - - // Causal mask: `mask[i][j] = 1` when position `j` must be masked for - // query `i`. Shape `(seq_len, total_seq_len)` broadcasts over the batch - // dimension. 
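For example (illustrative sizes: 2 new queries appended to a context of total length 4), the rule `mask[i][j] = 1` iff `j > total_seq_len - seq_len + i` yields:

    #include <cstdio>

    int main() {
      int seq_len = 2, total_seq_len = 4;
      for (int i = 0; i < seq_len; ++i) {
        for (int j = 0; j < total_seq_len; ++j) {
          printf("%d ", j > total_seq_len - seq_len + i ? 1 : 0);
        }
        printf("\n");
      }
      // Prints:  0 0 0 1   (query 0 attends positions 0..2)
      //          0 0 0 0   (query 1 attends positions 0..3)
    }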
- size_t mask_elems = seq_len_ * total_seq_len_; - std::vector mask_host(mask_elems, 0); - - for (size_t i = 0; i < seq_len_; ++i) { - auto vis_end = static_cast(total_seq_len_ - seq_len_ + i); - - for (auto j = vis_end + 1; j < static_cast(total_seq_len_); - ++j) { - mask_host[i * total_seq_len_ + j] = 1; - } - } - - aclrtMalloc(&mask_buf_, mask_elems, ACL_MEM_MALLOC_NORMAL_ONLY); - aclrtMemcpy(mask_buf_, mask_elems, mask_host.data(), mask_elems, - ACL_MEMCPY_HOST_TO_DEVICE); - - std::vector mshape = {static_cast(seq_len_), - static_cast(total_seq_len_)}; - std::vector mstrides = {static_cast(total_seq_len_), 1}; - mask_tensor_ = aclCreateTensor(mshape.data(), mshape.size(), ACL_BOOL, - mstrides.data(), 0, ACL_FORMAT_ND, - mshape.data(), mshape.size(), mask_buf_); - - // Scalar `-inf` for the masked-fill step. `aclCreateScalar` stores the - // pointer rather than copying, so `neg_inf_storage_` must stay alive - // with the object. - neg_inf_ = aclCreateScalar(&neg_inf_storage_, ACL_FLOAT); - // Workspaces are allocated lazily on the first `operator()` call. - } - - ~Operator() { - if (!ascend::IsAclRuntimeAlive()) return; - - // Null cached descriptors — see `AclTensorCache::release()`. - in_cache_.release(); - out_cache_.release(); - temp_cache_.release(); - - // `mask_tensor_` leaks with `fill_exec_` at shutdown (see `64c367c`). - if (mask_buf_) aclrtFree(mask_buf_); - if (neg_inf_) aclDestroyScalar(neg_inf_); - } - - void operator()(const Tensor input, Tensor out) const override { - auto t_in = in_cache_.get(const_cast(input.data())); - auto t_out = out_cache_.get(out.data()); - auto stream = static_cast(stream_); - - // Obtain shared `temp` buffer from the pool. - auto& temp = ascend::GetWorkspacePool().Ensure(stream, temp_size_, "temp"); - auto t_temp = temp_cache_.get(temp.buf); - - // Step 1: copy `input` (possibly non-contiguous) into a contiguous `temp`. - if (!copy_exec_) { - aclnnInplaceCopyGetWorkspaceSize(t_temp, t_in, ©_ws_, ©_exec_); - aclSetAclOpExecutorRepeatable(copy_exec_); - } else { - aclSetInputTensorAddr(copy_exec_, 0, t_temp, temp.buf); - aclSetInputTensorAddr(copy_exec_, 1, t_in, - const_cast(input.data())); - } - auto& copy_arena = ascend::GetWorkspacePool().Ensure(stream, copy_ws_); - aclnnInplaceCopy(copy_arena.buf, copy_ws_, copy_exec_, stream); - - // Step 2: mask upper-triangle positions with `-inf` in-place. - // `mask_tensor_` and `neg_inf_` have stable addresses — first-call only. - if (!fill_exec_) { - aclnnInplaceMaskedFillScalarGetWorkspaceSize( - t_temp, mask_tensor_, neg_inf_, &fill_ws_, &fill_exec_); - aclSetAclOpExecutorRepeatable(fill_exec_); - } - auto& fill_arena = ascend::GetWorkspacePool().Ensure(stream, fill_ws_); - aclnnInplaceMaskedFillScalar(fill_arena.buf, fill_ws_, fill_exec_, stream); - - // Step 3: softmax over the last dimension -> `out`. 
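Taken together, the three launches compute the following per-row reference semantics (host-side fp32 sketch, not the device path; the contiguous copy of Step 1 is implicit in taking a plain array):

    #include <cmath>
    #include <cstddef>

    void CausalSoftmaxRow(float* row, const unsigned char* mask, size_t n) {
      float mx = -INFINITY;
      for (size_t j = 0; j < n; ++j) {
        if (mask[j]) row[j] = -INFINITY;  // Step 2: masked fill
        if (row[j] > mx) mx = row[j];
      }
      float sum = 0.0f;
      for (size_t j = 0; j < n; ++j) {    // Step 3: numerically stable softmax
        row[j] = std::exp(row[j] - mx);
        sum += row[j];
      }
      for (size_t j = 0; j < n; ++j) row[j] /= sum;
    }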
- if (!softmax_exec_) { - constexpr int64_t kLastDim = -1; - aclnnSoftmaxGetWorkspaceSize(t_temp, kLastDim, t_out, &softmax_ws_, - &softmax_exec_); - aclSetAclOpExecutorRepeatable(softmax_exec_); - } else { - aclSetOutputTensorAddr(softmax_exec_, 0, t_out, out.data()); - } - auto& softmax_arena = - ascend::GetWorkspacePool().Ensure(stream, softmax_ws_); - aclnnSoftmax(softmax_arena.buf, softmax_ws_, softmax_exec_, stream); - } - - private: - mutable ascend::AclTensorCache in_cache_; - - mutable ascend::AclTensorCache out_cache_; - - mutable ascend::AclTensorCache temp_cache_; - - float neg_inf_storage_ = -std::numeric_limits::infinity(); - - uint64_t temp_size_ = 0; - - void* mask_buf_ = nullptr; - - aclTensor* mask_tensor_ = nullptr; - - aclScalar* neg_inf_ = nullptr; - - mutable aclOpExecutor* copy_exec_ = nullptr; - - mutable uint64_t copy_ws_ = 0; - - mutable aclOpExecutor* fill_exec_ = nullptr; - - mutable uint64_t fill_ws_ = 0; - - mutable aclOpExecutor* softmax_exec_ = nullptr; - - mutable uint64_t softmax_ws_ = 0; -}; - -} // namespace infini::ops - -#endif diff --git a/src/ascend/custom/CMakeLists.txt b/src/ascend/custom/CMakeLists.txt index fb900419..ca6e6883 100644 --- a/src/ascend/custom/CMakeLists.txt +++ b/src/ascend/custom/CMakeLists.txt @@ -30,6 +30,8 @@ else() endif() set(PROJECT_OP_SRC_BASE ${PROJECT_SOURCE_DIR}) +set(PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/build) +set(PROJECT_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/output) include(cmake/config_envs.cmake) include(cmake/config_ascend.cmake) @@ -41,15 +43,13 @@ if(CCACHE_PROGRAM) set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") endif() -# `CMAKE_LIBRARY_OUTPUT_DIRECTORY` is set by `build.sh` so that the -# standalone `libascend_kernel.so` lands next to `libno_workspace_kernel.a` -# under `/build/build_ascend_custom/output/`. +# Shared library output location. +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_PATH}) # Host-side files. file(GLOB OP_SRCS ${PROJECT_OP_SRC_BASE}/torch_binding.cpp ${PROJECT_OP_SRC_BASE}/rms_norm/op_host/rms_norm.cpp - ${PROJECT_OP_SRC_BASE}/add_rms_norm/op_host/add_rms_norm.cpp ) # Shared library name — consumed by `kernel_custom.h` variants and by the @@ -59,18 +59,8 @@ set(OP_PLUGIN_NAME ascend_kernel) # Kernel-side files (device code compiled by the `AscendC` toolchain). ascendc_library(no_workspace_kernel STATIC ${PROJECT_OP_SRC_BASE}/rms_norm/op_kernel/rms_norm.cpp - ${PROJECT_OP_SRC_BASE}/add_rms_norm/op_kernel/add_rms_norm.cpp ) -# The kernel translation units include `"data_type_enum.h"` from the main -# project's `src/` so that launcher and device code share one `DataType` -# enum. `ascendc_library` forwards the interface target's `INCLUDES` -# property to the nested `ExternalProject_Add` (see -# `${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake/legacy_modules/function.cmake`), -# so append the main `src/` dir here. -set_property(TARGET no_workspace_kernel_interface APPEND PROPERTY - INCLUDES ${PROJECT_OP_SRC_BASE}/../..) - # Create the shared library `libascend_kernel.so`. 
add_library(${OP_PLUGIN_NAME} SHARED ${OP_SRCS})
diff --git a/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp b/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp
index b561eaaa..b8e0d504 100644
--- a/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp
+++ b/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp
@@ -1,4 +1,4 @@
-#include "aclrtlaunch_AddRmsNorm.h"
+#include "aclrtlaunch_add_rms_norm.h"
 #include "tiling/platform/platform_ascendc.h"
 #include "torch_kernel_helper.h"
 
@@ -105,13 +105,16 @@ std::vector<at::Tensor> AddRmsNorm(const at::Tensor& x1, const at::Tensor& x2,
   float eps_float = static_cast<float>(eps);
   int64_t dtype_size_val = dtype_size;
 
-  // The first arg `AddRmsNorm` is the AscendC kernel entry-point name — it
-  // must match the `__global__ __aicore__ void AddRmsNorm(...)` definition
-  // in `op_kernel/` and the generated `aclrtlaunch_AddRmsNorm.h` header.
-  EXEC_KERNEL_CMD(AddRmsNorm, block_dim, kernel_input1, kernel_input2,
-                  weight_float, total_rows, dim_length, dim_length_align,
-                  former_num, former_length, tail_length, eps_float,
-                  dtype_size_val, kernel_output_y, kernel_output_x_out);
+  // The first arg `add_rms_norm` is the AscendC kernel entry-point name — it
+  // must match `ascendc_add_operator(OP_NAME add_rms_norm)` in `CMakeLists.txt`,
+  // the `__global__ __aicore__ void add_rms_norm(...)` definition in
+  // `op_kernel/`, and the generated `aclrtlaunch_add_rms_norm.h` header.
+  // Google C++ Style's PascalCase rule does NOT apply: this identifier is
+  // dictated by the AscendC toolchain's symbol convention.
+  EXEC_KERNEL_CMD(add_rms_norm, block_dim, kernel_input1, kernel_input2,
+                  weight_float, kernel_output_y, kernel_output_x_out,
+                  total_rows, dim_length, dim_length_align, former_num,
+                  former_length, tail_length, eps_float, dtype_size_val);
 
   // Remove padding and reshape back to original shape.
   at::Tensor output_y = kernel_output_y;
diff --git a/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy b/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy
deleted file mode 100644
index ccf13972..00000000
--- a/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy
+++ /dev/null
@@ -1,9 +0,0 @@
----
-# `op_kernel/*.cpp` is `AscendC` device code compiled by `ccec`, not by
-# the host toolchain, so it has no entry in `compile_commands.json` and
-# `clang-tidy` cannot parse it correctly (the `__aicore__` macro expands
-# unexpectedly when `kernel_operator.h` is absent). Disable all checks
-# here — the `op_host/` side and the `kernel_custom.h` launcher still
-# enforce the full ruleset.
-
- -Checks: '-*' diff --git a/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp b/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp index 4b677d35..e2a08e55 100644 --- a/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp +++ b/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp @@ -1,102 +1,98 @@ -#include "data_type.h" #include "kernel_operator.h" -constexpr int32_t kBufferNum = 2; +constexpr int32_t BUFFER_NUM = 2; template class KernelAddRmsNorm { public: __aicore__ inline KernelAddRmsNorm() {} - __aicore__ inline void Init(GM_ADDR input, GM_ADDR residual, GM_ADDR weight, - int64_t total_rows, int64_t dim_length, - int64_t dim_length_align, int64_t former_num, - int64_t former_length, int64_t tail_length, - float eps, GM_ADDR out, GM_ADDR residual_out) { - dim_length_ = dim_length; - dim_length_align_ = dim_length_align; - eps_ = eps; + __aicore__ inline void Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, + GM_ADDR x_out, int64_t totalRows, + int64_t dimLength, int64_t dimLengthAlign, + int64_t formerNum, int64_t formerLength, + int64_t tailLength, float eps) { + this->dimLength = dimLength; + this->dimLengthAlign = dimLengthAlign; + this->eps = eps; // Block-level tiling: determine row range for this core. - int64_t block_idx = AscendC::GetBlockIdx(); - int64_t row_offset; + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t rowOffset; - if (block_idx < former_num) { - block_rows_ = former_length; - row_offset = former_length * block_idx; + if (blockIdx < formerNum) { + this->blockRows = formerLength; + rowOffset = formerLength * blockIdx; } else { - block_rows_ = tail_length; - int64_t tail_idx = block_idx - former_num; - row_offset = former_length * former_num + tail_length * tail_idx; + this->blockRows = tailLength; + int64_t tailIdx = blockIdx - formerNum; + rowOffset = formerLength * formerNum + tailLength * tailIdx; } // Global memory pointers. - input_gm_.SetGlobalBuffer((__gm__ T*)input + row_offset * dim_length_align, - block_rows_ * dim_length_align); - residual_gm_.SetGlobalBuffer( - (__gm__ T*)residual + row_offset * dim_length_align, - block_rows_ * dim_length_align); - out_gm_.SetGlobalBuffer((__gm__ T*)out + row_offset * dim_length_align, - block_rows_ * dim_length_align); - residual_out_gm_.SetGlobalBuffer( - (__gm__ T*)residual_out + row_offset * dim_length_align, - block_rows_ * dim_length_align); - weight_gm_.SetGlobalBuffer((__gm__ float*)weight, dim_length_align); - - int32_t dim_len_align = static_cast(dim_length_align_); + x1Gm.SetGlobalBuffer((__gm__ T*)x1 + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + x2Gm.SetGlobalBuffer((__gm__ T*)x2 + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + yGm.SetGlobalBuffer((__gm__ T*)y + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + xOutGm.SetGlobalBuffer((__gm__ T*)x_out + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + weightGm.SetGlobalBuffer((__gm__ float*)weight, dimLengthAlign); + + int32_t dimLenAlign = static_cast(this->dimLengthAlign); // I/O queues (double-buffered). 
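The queues initialized below follow the standard AscendC double-buffered pipeline: one CopyIn/Compute/CopyOut round per row, with `BUFFER_NUM = 2` letting the DMA of row r+1 overlap the vector math of row r. Schematically (comment-only sketch; names as in the surrounding code):

    // CopyIn : buf = inQueue.AllocTensor<T>(); DataCopyPad(buf, gm, ...);
    //          inQueue.EnQue(buf);
    // Compute: buf = inQueue.DeQue<T>();  ...vector ops...
    //          inQueue.FreeTensor(buf);   // recycle the input slot
    // CopyOut: out = outQueue.DeQue<T>(); DataCopyPad(gm, out, ...);
    //          outQueue.FreeTensor(out);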
- pipe_.InitBuffer(in_queue_input_, kBufferNum, - dim_len_align * static_cast(sizeof(T))); - pipe_.InitBuffer(in_queue_residual_, kBufferNum, - dim_len_align * static_cast(sizeof(T))); - pipe_.InitBuffer(out_queue_out_, kBufferNum, - dim_len_align * static_cast(sizeof(T))); - pipe_.InitBuffer(out_queue_residual_out_, kBufferNum, - dim_len_align * static_cast(sizeof(T))); + pipe.InitBuffer(inQueueX1, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(inQueueX2, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(outQueueY, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(outQueueXOut, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); // Weight buffer (fp32, loaded once, reused for all rows). - pipe_.InitBuffer(weight_buf_, - dim_len_align * static_cast(sizeof(float))); + pipe.InitBuffer(weightBuf, + dimLenAlign * static_cast(sizeof(float))); - // FP16/BF16 path needs extra fp32 compute buffers. - // `fp32_buf1_`: holds `x_out` in fp32 (reused from `x1_fp32` after Add). - // `fp32_buf2_`: holds `x2_fp32` initially, then `x_out^2`, then final - // result. + // FP16 path needs extra fp32 compute buffers. + // buf1: holds x_out in fp32 (reused from x1_fp32 after Add). + // buf2: holds x2_fp32 initially, then x_out^2, then final result. if constexpr (sizeof(T) == 2) { - pipe_.InitBuffer(fp32_buf1_, - dim_len_align * static_cast(sizeof(float))); - pipe_.InitBuffer(fp32_buf2_, - dim_len_align * static_cast(sizeof(float))); + pipe.InitBuffer(fp32Buf1, + dimLenAlign * static_cast(sizeof(float))); + pipe.InitBuffer(fp32Buf2, + dimLenAlign * static_cast(sizeof(float))); } - // `ReduceSum` temporary buffer (size per API formula). - constexpr int32_t kElemsPerRepeat = 256 / sizeof(float); - constexpr int32_t kElemsPerBlock = 32 / sizeof(float); - int32_t first_max_repeat = - (dim_len_align + kElemsPerRepeat - 1) / kElemsPerRepeat; - int32_t reduce_tmp_size = - ((first_max_repeat + kElemsPerBlock - 1) / kElemsPerBlock) * - kElemsPerBlock; - pipe_.InitBuffer(reduce_tmp_buf_, - reduce_tmp_size * static_cast(sizeof(float))); + // ReduceSum temporary buffer (size per API formula). + constexpr int32_t ELEMS_PER_REPEAT = 256 / sizeof(float); + constexpr int32_t ELEMS_PER_BLOCK = 32 / sizeof(float); + int32_t firstMaxRepeat = + (dimLenAlign + ELEMS_PER_REPEAT - 1) / ELEMS_PER_REPEAT; + int32_t reduceTmpSize = + ((firstMaxRepeat + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK) * + ELEMS_PER_BLOCK; + pipe.InitBuffer(reduceTmpBuf, + reduceTmpSize * static_cast(sizeof(float))); // Scalar buffer for reduction result (8 floats = 32 bytes). - pipe_.InitBuffer(sum_buf_, 32); + pipe.InitBuffer(sumBuf, 32); - // Load weight (fp32) from GM into `weight_buf_`. - AscendC::LocalTensor w_local = weight_buf_.Get(); - AscendC::DataCopyExtParams w_params{ - 1, static_cast(dim_len_align * sizeof(float)), 0, 0, 0}; - AscendC::DataCopyPadExtParams w_pad{false, 0, 0, 0.0f}; - AscendC::DataCopyPad(w_local, weight_gm_, w_params, w_pad); + // Load weight (fp32) from GM into `weightBuf`. + AscendC::LocalTensor wLocal = weightBuf.Get(); + AscendC::DataCopyExtParams wParams{ + 1, static_cast(dimLenAlign * sizeof(float)), 0, 0, 0}; + AscendC::DataCopyPadExtParams wPad{false, 0, 0, 0.0f}; + AscendC::DataCopyPad(wLocal, weightGm, wParams, wPad); // Ensure weight DMA completes before compute. 
AscendC::PipeBarrier(); } __aicore__ inline void Process() { - for (int64_t row = 0; row < block_rows_; ++row) { + for (int64_t row = 0; row < this->blockRows; ++row) { CopyIn(row); Compute(row); CopyOut(row); @@ -105,175 +101,149 @@ class KernelAddRmsNorm { private: __aicore__ inline void CopyIn(int64_t row) { - AscendC::LocalTensor input_local = in_queue_input_.AllocTensor(); - AscendC::LocalTensor residual_local = - in_queue_residual_.AllocTensor(); + AscendC::LocalTensor x1Local = inQueueX1.AllocTensor(); + AscendC::LocalTensor x2Local = inQueueX2.AllocTensor(); AscendC::DataCopyExtParams params{ - 1, static_cast(dim_length_align_ * sizeof(T)), 0, 0, 0}; + 1, static_cast(this->dimLengthAlign * sizeof(T)), 0, 0, 0}; AscendC::DataCopyPadExtParams pad{false, 0, 0, static_cast(0)}; - AscendC::DataCopyPad(input_local, input_gm_[row * dim_length_align_], - params, pad); - AscendC::DataCopyPad(residual_local, residual_gm_[row * dim_length_align_], - params, pad); - in_queue_input_.EnQue(input_local); - in_queue_residual_.EnQue(residual_local); + AscendC::DataCopyPad(x1Local, x1Gm[row * this->dimLengthAlign], params, + pad); + AscendC::DataCopyPad(x2Local, x2Gm[row * this->dimLengthAlign], params, + pad); + inQueueX1.EnQue(x1Local); + inQueueX2.EnQue(x2Local); } __aicore__ inline void Compute(int64_t row) { - AscendC::LocalTensor input_local = in_queue_input_.DeQue(); - AscendC::LocalTensor residual_local = in_queue_residual_.DeQue(); - AscendC::LocalTensor out_local = out_queue_out_.AllocTensor(); - AscendC::LocalTensor residual_out_local = - out_queue_residual_out_.AllocTensor(); + AscendC::LocalTensor x1Local = inQueueX1.DeQue(); + AscendC::LocalTensor x2Local = inQueueX2.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::LocalTensor xOutLocal = outQueueXOut.AllocTensor(); - AscendC::LocalTensor w_local = weight_buf_.Get(); - AscendC::LocalTensor r_tmp = reduce_tmp_buf_.Get(); - AscendC::LocalTensor s_local = sum_buf_.Get(); + AscendC::LocalTensor wLocal = weightBuf.Get(); + AscendC::LocalTensor rTmp = reduceTmpBuf.Get(); + AscendC::LocalTensor sLocal = sumBuf.Get(); - int32_t dim_len = static_cast(dim_length_); - int32_t dim_len_align = static_cast(dim_length_align_); + int32_t dimLen = static_cast(this->dimLength); + int32_t dimLenAlign = static_cast(this->dimLengthAlign); if constexpr (sizeof(T) == 4) { // ---- FP32 path: compute directly. ---- // Step 1: x_out = x1 + x2. - AscendC::Add(residual_out_local, input_local, residual_local, - dim_len_align); + AscendC::Add(xOutLocal, x1Local, x2Local, dimLenAlign); - // Step 2: x_out^2 into out_local (reuse output buffer temporarily). - AscendC::Mul(out_local, residual_out_local, residual_out_local, - dim_len_align); + // Step 2: x_out^2 into yLocal (reuse output buffer temporarily). + AscendC::Mul(yLocal, xOutLocal, xOutLocal, dimLenAlign); - // Step 3: ReduceSum(x_out^2) -> s_local[0]. - // `ReduceSum` may modify `out_local`, but we overwrite it below. - AscendC::ReduceSum(s_local, out_local, r_tmp, dim_len_align); + // Step 3: ReduceSum(x_out^2) -> sLocal[0]. + // ReduceSum may modify yLocal, but we overwrite it below. + AscendC::ReduceSum(sLocal, yLocal, rTmp, dimLenAlign); // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps). 
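A numeric instance of the normalization arithmetic in steps 2-7 (host-side check, with `eps = 0` and unit weight for round numbers):

    #include <cassert>
    #include <cmath>

    int main() {
      float x[4] = {3.0f, 4.0f, 0.0f, 0.0f};   // x_out, dim = 4
      float sum = 0.0f;
      for (float v : x) sum += v * v;          // 25
      float mean = sum / 4.0f;                 // 6.25
      float scale = 1.0f / std::sqrt(mean);    // 1 / 2.5 = 0.4
      assert(scale == 0.4f);
      // y[i] = x[i] * scale * weight[i]  ->  {1.2, 1.6, 0, 0} with unit weight.
    }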
- float sum_val = s_local.GetValue(0); - float mean_val = sum_val / static_cast(dim_len) + eps_; - s_local.SetValue(0, mean_val); - AscendC::Sqrt(s_local, s_local, 8); - float scale = 1.0f / s_local.GetValue(0); + float sumVal = sLocal.GetValue(0); + float meanVal = sumVal / static_cast(dimLen) + this->eps; + sLocal.SetValue(0, meanVal); + AscendC::Sqrt(sLocal, sLocal, 8); + float scale = 1.0f / sLocal.GetValue(0); // Step 6: y = x_out * scale. - AscendC::Muls(out_local, residual_out_local, scale, dim_len_align); + AscendC::Muls(yLocal, xOutLocal, scale, dimLenAlign); // Step 7: y = y * weight. - AscendC::Mul(out_local, out_local, w_local, dim_len_align); + AscendC::Mul(yLocal, yLocal, wLocal, dimLenAlign); } else { - // ---- FP16/BF16 path: cast → fp32 compute → cast back. ---- - AscendC::LocalTensor b1 = fp32_buf1_.Get(); - AscendC::LocalTensor b2 = fp32_buf2_.Get(); + // ---- FP16 path: cast → fp32 compute → cast back. ---- + AscendC::LocalTensor b1 = fp32Buf1.Get(); + AscendC::LocalTensor b2 = fp32Buf2.Get(); - // Cast inputs fp16/bf16 → fp32. - AscendC::Cast(b1, input_local, AscendC::RoundMode::CAST_NONE, - dim_len_align); - AscendC::Cast(b2, residual_local, AscendC::RoundMode::CAST_NONE, - dim_len_align); + // Cast inputs fp16 → fp32. + AscendC::Cast(b1, x1Local, AscendC::RoundMode::CAST_NONE, dimLenAlign); + AscendC::Cast(b2, x2Local, AscendC::RoundMode::CAST_NONE, dimLenAlign); // Step 1: x_out = x1 + x2 (fp32), stored in b1. - AscendC::Add(b1, b1, b2, dim_len_align); + AscendC::Add(b1, b1, b2, dimLenAlign); - // Cast `x_out` fp32 → fp16/bf16 for the residual output. - AscendC::Cast(residual_out_local, b1, AscendC::RoundMode::CAST_RINT, - dim_len_align); + // Cast x_out fp32 → fp16 for the x_out output. + AscendC::Cast(xOutLocal, b1, AscendC::RoundMode::CAST_ROUND, dimLenAlign); // Step 2: x_out^2 in fp32, stored in b2. - AscendC::Mul(b2, b1, b1, dim_len_align); + AscendC::Mul(b2, b1, b1, dimLenAlign); - // Step 3: ReduceSum(x_out^2) -> s_local[0]. - AscendC::ReduceSum(s_local, b2, r_tmp, dim_len_align); + // Step 3: ReduceSum(x_out^2) -> sLocal[0]. + AscendC::ReduceSum(sLocal, b2, rTmp, dimLenAlign); // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps). - float sum_val = s_local.GetValue(0); - float mean_val = sum_val / static_cast(dim_len) + eps_; - s_local.SetValue(0, mean_val); - AscendC::Sqrt(s_local, s_local, 8); - float scale = 1.0f / s_local.GetValue(0); + float sumVal = sLocal.GetValue(0); + float meanVal = sumVal / static_cast(dimLen) + this->eps; + sLocal.SetValue(0, meanVal); + AscendC::Sqrt(sLocal, sLocal, 8); + float scale = 1.0f / sLocal.GetValue(0); // Step 6: y = x_out * scale (fp32), reuse b2. - AscendC::Muls(b2, b1, scale, dim_len_align); + AscendC::Muls(b2, b1, scale, dimLenAlign); // Step 7: y = y * weight (fp32). - AscendC::Mul(b2, b2, w_local, dim_len_align); + AscendC::Mul(b2, b2, wLocal, dimLenAlign); - AscendC::Cast(out_local, b2, AscendC::RoundMode::CAST_RINT, - dim_len_align); + // Cast result fp32 → fp16. 
+      AscendC::Cast(yLocal, b2, AscendC::RoundMode::CAST_ROUND, dimLenAlign);
     }
 
-    in_queue_input_.FreeTensor(input_local);
-    in_queue_residual_.FreeTensor(residual_local);
-    out_queue_out_.EnQue(out_local);
-    out_queue_residual_out_.EnQue(residual_out_local);
+    inQueueX1.FreeTensor(x1Local);
+    inQueueX2.FreeTensor(x2Local);
+    outQueueY.EnQue(yLocal);
+    outQueueXOut.EnQue(xOutLocal);
   }
 
   __aicore__ inline void CopyOut(int64_t row) {
-    AscendC::LocalTensor<T> out_local = out_queue_out_.DeQue<T>();
-    AscendC::LocalTensor<T> residual_out_local =
-        out_queue_residual_out_.DeQue<T>();
+    AscendC::LocalTensor<T> yLocal = outQueueY.DeQue<T>();
+    AscendC::LocalTensor<T> xOutLocal = outQueueXOut.DeQue<T>();
     AscendC::DataCopyExtParams params{
-        1, static_cast<uint32_t>(dim_length_align_ * sizeof(T)), 0, 0, 0};
-    AscendC::DataCopyPad(out_gm_[row * dim_length_align_], out_local, params);
-    AscendC::DataCopyPad(residual_out_gm_[row * dim_length_align_],
-                         residual_out_local, params);
-    out_queue_out_.FreeTensor(out_local);
-    out_queue_residual_out_.FreeTensor(residual_out_local);
+        1, static_cast<uint32_t>(this->dimLengthAlign * sizeof(T)), 0, 0, 0};
+    AscendC::DataCopyPad(yGm[row * this->dimLengthAlign], yLocal, params);
+    AscendC::DataCopyPad(xOutGm[row * this->dimLengthAlign], xOutLocal, params);
+    outQueueY.FreeTensor(yLocal);
+    outQueueXOut.FreeTensor(xOutLocal);
   }
 
  private:
-  AscendC::TPipe pipe_;
-  AscendC::TQue<AscendC::TPosition::VECIN, kBufferNum> in_queue_input_;
-  AscendC::TQue<AscendC::TPosition::VECIN, kBufferNum> in_queue_residual_;
-  AscendC::TQue<AscendC::TPosition::VECOUT, kBufferNum> out_queue_out_;
-  AscendC::TQue<AscendC::TPosition::VECOUT, kBufferNum> out_queue_residual_out_;
-
-  AscendC::TBuf<AscendC::TPosition::VECCALC> weight_buf_;
-  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32_buf1_;
-  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32_buf2_;
-  AscendC::TBuf<AscendC::TPosition::VECCALC> reduce_tmp_buf_;
-  AscendC::TBuf<AscendC::TPosition::VECCALC> sum_buf_;
-
-  AscendC::GlobalTensor<T> input_gm_, residual_gm_, out_gm_, residual_out_gm_;
-  AscendC::GlobalTensor<float> weight_gm_;
-
-  int64_t block_rows_;
-  int64_t dim_length_;
-  int64_t dim_length_align_;
-  float eps_;
+  AscendC::TPipe pipe;
+  AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX1;
+  AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX2;
+  AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueY;
+  AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueXOut;
+
+  AscendC::TBuf<AscendC::TPosition::VECCALC> weightBuf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32Buf1;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32Buf2;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> reduceTmpBuf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> sumBuf;
+
+  AscendC::GlobalTensor<T> x1Gm, x2Gm, yGm, xOutGm;
+  AscendC::GlobalTensor<float> weightGm;
+
+  int64_t blockRows;
+  int64_t dimLength;
+  int64_t dimLengthAlign;
+  float eps;
 };
 
-// `dtype_code` is `static_cast<int64_t>(infini::ops::DataType)` forwarded
-// by the host launcher. fp16 and bf16 both have `sizeof == 2` but need
-// distinct numeric paths, so dispatch is on the `DataType` tag rather
-// than the byte size.
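A host-side illustration of why the byte size cannot disambiguate the two 16-bit types: bf16 keeps fp32's 8-bit exponent with a 7-bit mantissa (a crude conversion is just the top 16 bits of the fp32 pattern, truncating instead of rounding), while fp16 has a 5-bit exponent and 10 mantissa bits.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      float f = 3.14159f;
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof bits);
      uint16_t bf16 = static_cast<uint16_t>(bits >> 16);  // truncating bf16
      std::printf("fp32=0x%08x  bf16=0x%04x\n", bits, bf16);
    }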
-extern "C" __global__ __aicore__ void AddRmsNorm( - GM_ADDR input, GM_ADDR residual, GM_ADDR weight, int64_t total_rows, - int64_t dim_length, int64_t dim_length_align, int64_t former_num, - int64_t former_length, int64_t tail_length, float eps, int64_t dtype_code, - GM_ADDR out, GM_ADDR residual_out) { - switch (static_cast(dtype_code)) { - case infini::ops::DataType::kFloat16: { - KernelAddRmsNorm op; - op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, - former_num, former_length, tail_length, eps, out, residual_out); - op.Process(); - break; - } - case infini::ops::DataType::kBFloat16: { - KernelAddRmsNorm op; - op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, - former_num, former_length, tail_length, eps, out, residual_out); - op.Process(); - break; - } - case infini::ops::DataType::kFloat32: - default: { - KernelAddRmsNorm op; - op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, - former_num, former_length, tail_length, eps, out, residual_out); - op.Process(); - break; - } +extern "C" __global__ __aicore__ void add_rms_norm( + GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, GM_ADDR x_out, + int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign, + int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps, + int64_t dtypeSize) { + if (dtypeSize == 2) { + KernelAddRmsNorm op; + op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign, + formerNum, formerLength, tailLength, eps); + op.Process(); + } else { + KernelAddRmsNorm op; + op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign, + formerNum, formerLength, tailLength, eps); + op.Process(); } } diff --git a/src/ascend/custom/build.sh b/src/ascend/custom/build.sh index 83740881..258a88e4 100755 --- a/src/ascend/custom/build.sh +++ b/src/ascend/custom/build.sh @@ -1,45 +1,30 @@ #!/bin/bash -# Build custom `AscendC` kernels into `libno_workspace_kernel.a` (+ the -# standalone `libascend_kernel.so`). -# -# Intermediate artefacts default to `/build/build_ascend_custom/` -# so the source tree under `src/` stays free of build output. Override -# via `BUILD_DIR= bash build.sh …` if needed. +# Build custom `AscendC` kernels into `libascend_kernel.so`. set -e SOC_VERSION="${1:-Ascend910_9382}" -# Use the same `cmake` the caller resolved (default: first `cmake` on -# PATH). The outer `src/CMakeLists.txt` forwards `${CMAKE_COMMAND}` -# via `CMAKE_EXE` so the child build doesn't accidentally pick up the -# PyPI `cmake` shim whose Python package only exists in `pip`'s -# build-isolation overlay. -CMAKE_EXE="${CMAKE_EXE:-cmake}" - # Detect CANN toolkit path. _CANN_TOOLKIT_INSTALL_PATH=$(grep "Toolkit_InstallPath" /etc/Ascend/ascend_cann_install.info | awk -F'=' '{print $2}') source "${_CANN_TOOLKIT_INSTALL_PATH}/set_env.sh" echo "CANN: ${ASCEND_TOOLKIT_HOME}" ASCEND_INCLUDE_DIR=${ASCEND_TOOLKIT_HOME}/$(arch)-linux/include +CURRENT_DIR=$(pwd) +OUTPUT_DIR=${CURRENT_DIR}/output +mkdir -p "${OUTPUT_DIR}" -# Resolve build directory. `