Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
e38d08b
feat(ascend): op-norm-rope group — Swiglu, SiluAndMul, CausalSoftmax,…
Apr 17, 2026
9f1d5cb
fix(ascend): norm/swiglu destructors + missing add_rms_norm custom ke…
Apr 22, 2026
10e4c76
style(ascend): rename `AddRmsNorm` parameters to PyTorch-aligned names
Apr 22, 2026
65b4ef7
style(ascend): comment + assert message audit for norm/swiglu/softmax…
Apr 22, 2026
74c9730
test(silu_and_mul): add `implementation_index` parametrize and stride…
Apr 22, 2026
36a4800
refactor(ascend/rotary_embedding): unify RotaryEmbedding and ApplyRot…
Apr 22, 2026
e207d5b
feat(scripts/generate_wrappers): emit `apply_rotary_pos_emb` Python shim
Apr 22, 2026
ee1e765
test(rotary_embedding): merge apply_rotary_pos_emb cases + cover MLA/…
Apr 22, 2026
4934199
fix(generate_wrappers): propagate scalar param defaults to pybind sig…
Apr 22, 2026
4342d55
fix(ascend/rotary_embedding): correct pre-gathered layout + revert si…
Apr 22, 2026
e76d145
test(rotary_embedding): fix GPT-J reference for partial rotary
Apr 22, 2026
f1ccd5d
merge(pr66): reviewer fixes — swiglu/silu_and_mul/causal_softmax/add_…
Apr 22, 2026
3fc0e8d
merge(pr66): rope unification — fold ApplyRotaryPosEmb into RotaryEmb…
Apr 22, 2026
8c159ad
refactor(pr66-simplify): correct `rstd_out` semantic name + clarity f…
Apr 22, 2026
b6821a5
style(tests): ruff format `test_add_rms_norm.py` after `residual_out`…
Apr 22, 2026
77300e4
build(ascend-custom): drive `build.sh` from `pip install` with proper…
Apr 22, 2026
735c128
refactor(data_type): pin `DataType` enum values explicitly
Apr 22, 2026
33e99af
feat(ascend-custom): add bf16 support + Google-style identifier renames
Apr 22, 2026
c23901a
refactor(base): align Linear/SiluAndMul/AddRmsNorm/RotaryEmbedding wi…
Apr 22, 2026
7210408
refactor(base): trim narrative comments and collapse CPU Linear ctors
Apr 23, 2026
ca52518
fix(pr66-review): address review findings 1-3
Apr 23, 2026
fdeb779
refactor(pr66): drop `apply_rotary_pos_emb` wrapper + tests
Apr 23, 2026
694506b
test(rotary_embedding): add `pre_gathered=True` coverage
Apr 23, 2026
ca6eb18
chore(pr66): drop unused headers
Apr 23, 2026
659ae35
style(pr66): sweep assert-message periods + comment backticks
Apr 23, 2026
d60c180
refactor(pr66): rename AscendC custom kernels to PascalCase + C2 para…
Apr 23, 2026
93d4f3b
refactor(pr66): trim commit-narration comments
Apr 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,21 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)

option(WITH_TORCH "Enable PyTorch C++ backend" OFF)

# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for
# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed
# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the
# toolchain is compatible or when building via the standalone
# `src/ascend/custom/build.sh` script.
option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF)
# Custom `AscendC` kernels under `src/ascend/custom/`. `ON` by default
# so CI and routine dev builds always exercise `implementation_index=1/2`
# for `RmsNorm` / `AddRmsNorm`. Gated by `WITH_ASCEND` in
# `src/CMakeLists.txt` — non-Ascend builds ignore it. Pass
# `-DBUILD_ASCEND_CUSTOM=OFF` to skip the `ccec` build on Ascend
# machines where the custom kernels aren't needed.
#
# When `ON`, `src/CMakeLists.txt` drives the standalone
# `src/ascend/custom/build.sh` via `execute_process` at configure time
# (sidesteps a `CANN` `extract_host_stub.py` path bug that breaks
# in-tree `ascendc_library()` under `scikit-build-core` temp-dir builds)
# and links the produced `libno_workspace_kernel.a` into the `ops`
# module with `--whole-archive`. Requires `torch_npu` and the
# `AscendC` toolchain (`ccec`).
option(BUILD_ASCEND_CUSTOM "Build custom AscendC kernels" ON)

option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
Expand Down
9 changes: 9 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ name = "InfiniOps"
version = "0.1.0"

[project.optional-dependencies]
# TODO: `torch` here is unconstrained. On Ascend hosts, the working
# torch is the Ascend-matched `torch 2.9.0+cpu` paired with
# `torch_npu 2.9.0.post1+…`. A `pip install -e .[dev] --force-reinstall`
# will re-resolve `torch` to the latest PyPI version (currently
# `torch 2.11.0`), which now declares `cuda-toolkit` / `nvidia-cublas` /
# `nvidia-cudnn` / … as hard deps — downloads GBs of CUDA wheels and
# kills the `torch_npu` / `vllm-ascend` pairing. Needs a platform-aware
# split (e.g. `torch; platform_machine != 'aarch64'`, or move `torch`
# out of `dev` and require it pre-installed in the container image).
dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"]

[tool.scikit-build.wheel]
Expand Down
27 changes: 25 additions & 2 deletions scripts/generate_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,29 @@ def _find_vector_tensor_params(op_name):
return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))


def _find_params_with_defaults(op_name):
    """Map base-header parameter names to their ``= <literal>`` default values.

    ``libclang``'s cursor API does not expose default arguments reliably, so
    the header source is regex-scanned instead. Only plain scalar defaults
    such as ``bool pre_gathered = false`` are handled.
    """
    header_text = (_BASE_DIR / f"{op_name}.h").read_text()

    # Scalar-typed parameter followed by `= <literal>` and a `,` or `)`.
    scalar_default_re = (
        r"\b(?:bool|int(?:64_t|32_t|8_t|16_t)?|std::size_t|std::uint\w+_t"
        r"|float|double)\s+(\w+)\s*=\s*([^,\)]+?)\s*(?:,|\))"
    )

    return {
        param: literal.strip()
        for param, literal in re.findall(scalar_default_re, header_text)
    }


def _generate_pybind11(operator):
optional_tensor_params = _find_optional_tensor_params(operator.name)
vector_tensor_params = _find_vector_tensor_params(operator.name)
params_with_defaults = _find_params_with_defaults(operator.name)

def _is_optional_tensor(arg):
if arg.spelling in optional_tensor_params:
Expand Down Expand Up @@ -186,6 +206,10 @@ def _generate_py_args(node):

if _is_optional(arg):
parts.append(f'py::arg("{arg.spelling}") = py::none()')
elif arg.spelling in params_with_defaults:
parts.append(
f'py::arg("{arg.spelling}") = {params_with_defaults[arg.spelling]}'
)
else:
parts.append(f'py::arg("{arg.spelling}")')

Expand Down Expand Up @@ -257,8 +281,7 @@ def _generate_call(op_name, call, method=True):
}})
.def_static("clear_cache", &Self::clear_cache);

{callers}
}}
{callers}}}

}} // namespace infini::ops

Expand Down
70 changes: 66 additions & 4 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,66 @@ if(WITH_ASCEND)
list(APPEND DEVICE_LIST "ascend")

# Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`).
if(BUILD_CUSTOM_KERNEL)
add_subdirectory(ascend/custom)
if(BUILD_ASCEND_CUSTOM)
# In-tree `ascendc_library()` trips the `CANN` `extract_host_stub.py`
# path-handling bug under `scikit-build-core`'s temp-dir builds
# (`KeyError` on `/./workspace/...` paths in `$<TARGET_OBJECTS>`).
# Work around it by driving the standalone `src/ascend/custom/build.sh`
# — that script invokes a separate `cmake` with
# `src/ascend/custom/` as its `SOURCE_DIR`, avoiding the buggy
# path shape. The produced `.a` is imported and linked into
# `ops` with `--whole-archive`.
set(_custom_build_dir "${CMAKE_SOURCE_DIR}/build/build_ascend_custom")
set(_custom_lib "${_custom_build_dir}/lib/libno_workspace_kernel.a")

if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "")
include(${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/cmake/detect_soc.cmake)
infiniops_detect_soc(SOC_VERSION)
endif()

# Drive `build.sh` as a build-phase target with explicit source
# dependencies so that editing any `op_host/` or `op_kernel/`
# source re-triggers the build (plain `execute_process` at
# configure time would only gate on file existence and leave
# stale `.a` files in place).
file(GLOB_RECURSE _custom_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.h"
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh")

# Scrub env inherited from the outer `scikit-build-core` invocation
# before handing control to `build.sh`:
# * `CMAKE_GENERATOR` / `CMAKE_EXPORT_COMPILE_COMMANDS` leaking
# into the inner `cmake` change the path format passed to
# `ninja`'s `_host_cpp` rule and re-trigger the `CANN`
# `extract_host_stub.py` `KeyError` (`/./workspace/...`) that
# standalone `build.sh` avoids.
# * `PYTHONPATH` from `pip`'s build-isolation overlay makes the
# child `python3` skip the system `site-packages` — child
# `cmake` modules that `import torch` (`config_envs.cmake`)
# then fail with `ModuleNotFoundError` even though `torch` is
# installed.
add_custom_command(
OUTPUT ${_custom_lib}
COMMAND ${CMAKE_COMMAND} -E env
--unset=CMAKE_GENERATOR
--unset=CMAKE_EXPORT_COMPILE_COMMANDS
--unset=CMAKE_BUILD_PARALLEL_LEVEL
--unset=PYTHONPATH
"BUILD_DIR=${_custom_build_dir}"
"CMAKE_EXE=${CMAKE_COMMAND}"
bash ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh ${SOC_VERSION}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom
DEPENDS ${_custom_srcs}
COMMENT "Building custom AscendC kernels (SOC_VERSION=${SOC_VERSION})"
VERBATIM)

add_custom_target(no_workspace_kernel_build ALL DEPENDS ${_custom_lib})

add_library(no_workspace_kernel STATIC IMPORTED GLOBAL)
set_target_properties(no_workspace_kernel PROPERTIES
IMPORTED_LOCATION "${_custom_lib}")
add_dependencies(no_workspace_kernel no_workspace_kernel_build)

# Link the compiled `AscendC` kernel objects into `infiniops` so that
# custom kernel implementations (e.g. `RmsNorm` index 1) can call
Expand Down Expand Up @@ -379,9 +437,13 @@ if(GENERATE_PYTHON_BINDINGS)
# The `Operator<..., 1>` template instantiations that call
# `aclrtlaunch_*` live in `ops.cc`, so link here with
# `--whole-archive` to ensure all launch functions are available.
if(BUILD_CUSTOM_KERNEL)
# `$<TARGET_FILE>` works for both real `ascendc_library()` targets and
# `IMPORTED` targets pointing at a pre-built `.a`.
if(BUILD_ASCEND_CUSTOM)
target_link_libraries(ops PRIVATE
-Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
-Wl,--whole-archive $<TARGET_FILE:no_workspace_kernel> -Wl,--no-whole-archive)
# `ops` link step must wait for `build.sh` to produce the `.a`.
add_dependencies(ops no_workspace_kernel_build)
endif()

set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
Expand Down
144 changes: 144 additions & 0 deletions src/ascend/add_rms_norm/kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_
#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_

#include <vector>

#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnn_add.h"
#include "aclnn_rms_norm.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add_rms_norm.h"
#include "operator.h"

namespace infini::ops {

// Decomposed implementation: `aclnnAdd` + `aclnnRmsNorm`.
//
// The fused `aclnnAddRmsNorm` API has ~200 us host-side launch overhead that
// dominates small-tensor dispatch. Decomposing into two fast ACLNN calls
// reduces host dispatch from ~224 us to ~56 us (4x faster) with negligible
// NPU-side impact for inference tensor sizes.
//
// Executor reuse: both ACLNN executors are built on the first `operator()`
// call, marked repeatable, and afterwards only the tensor device addresses
// are patched in. NOTE(review): the cached executors bake in whatever
// shapes/strides/dtypes (and, for the norm step, whatever `eps`) the first
// call supplied — this assumes every later call uses the same configuration;
// confirm callers construct a fresh Operator per distinct configuration.
template <>
class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
 public:
  // Builds per-tensor ACL descriptor caches so that later calls only swap
  // device addresses instead of re-creating descriptors.
  Operator(const Tensor input, const Tensor residual, const Tensor weight,
           float eps, Tensor out, Tensor residual_out)
      : AddRmsNorm(input, residual, weight, eps, out, residual_out),
        input_cache_(input),
        residual_cache_(residual),
        weight_cache_(weight),
        out_cache_(out),
        residual_out_cache_(residual_out) {
    // Alpha scalar for `aclnnAdd` (`residual_out = input + 1.0 * residual`).
    alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT);

    // `aclnnRmsNorm` writes `rstd` as a required side output. Size is
    // computed here; the buffer is obtained from the pool in `operator()`.
    // `batch_size_` / `nhead_` presumably come from the `AddRmsNorm` base —
    // TODO confirm against `base/add_rms_norm.h`.
    rstd_shape_ = {static_cast<int64_t>(batch_size_),
                   static_cast<int64_t>(nhead_)};
    rstd_size_ = batch_size_ * nhead_ * sizeof(float);
  }

  ~Operator() {
    // Skip all ACL teardown once the runtime is gone — touching ACL handles
    // during late static destruction would be unsafe.
    if (!ascend::IsAclRuntimeAlive()) return;

    // Null cached descriptors — see `AclTensorCache::release()`.
    input_cache_.release();
    residual_cache_.release();
    weight_cache_.release();
    out_cache_.release();
    residual_out_cache_.release();

    // `rstd_tensor_` leaks with `norm_exec_` at shutdown (see `64c367c`).
    if (alpha_) aclDestroyScalar(alpha_);
  }

  // Runs `residual_out = input + residual`, then
  // `out = rms_norm(residual_out, weight, eps)` on `stream_`.
  // NOTE(review): `eps` is only consumed on the first call, when
  // `norm_exec_` is created; subsequent calls reuse the cached executor and
  // ignore the argument.
  void operator()(const Tensor input, const Tensor residual,
                  const Tensor weight, float eps, Tensor out,
                  Tensor residual_out) const override {
    // Re-bind cached ACL descriptors to the current device pointers.
    auto t_input = input_cache_.get(const_cast<void*>(input.data()));
    auto t_residual = residual_cache_.get(const_cast<void*>(residual.data()));
    auto t_weight = weight_cache_.get(const_cast<void*>(weight.data()));
    auto t_out = out_cache_.get(out.data());
    auto t_residual_out = residual_out_cache_.get(residual_out.data());
    auto stream = static_cast<aclrtStream>(stream_);

    // Step 1: `residual_out = input + residual`.
    if (!add_exec_) {
      // First call: create the executor and mark it reusable across calls.
      aclnnAddGetWorkspaceSize(t_input, t_residual, alpha_, t_residual_out,
                               &add_ws_, &add_exec_);
      aclSetAclOpExecutorRepeatable(add_exec_);
    } else {
      // Subsequent calls: only patch the device addresses into the cached
      // executor (input slots 0/1, output slot 0).
      aclSetInputTensorAddr(add_exec_, 0, t_input,
                            const_cast<void*>(input.data()));
      aclSetInputTensorAddr(add_exec_, 1, t_residual,
                            const_cast<void*>(residual.data()));
      aclSetOutputTensorAddr(add_exec_, 0, t_residual_out, residual_out.data());
    }
    auto& add_arena = ascend::GetWorkspacePool().Ensure(stream, add_ws_);
    aclnnAdd(add_arena.buf, add_ws_, add_exec_, stream);

    // Obtain shared `rstd` buffer from pool.
    auto& rstd_arena =
        ascend::GetWorkspacePool().Ensure(stream, rstd_size_, "temp");

    // Lazily create the `rstd` tensor descriptor on first call; later calls
    // only repoint it at the (possibly re-allocated) pool buffer.
    if (!rstd_tensor_) {
      rstd_tensor_ = aclCreateTensor(rstd_shape_.data(), 2, ACL_FLOAT,
                                     /*strides=*/nullptr, 0, ACL_FORMAT_ND,
                                     rstd_shape_.data(), 2, rstd_arena.buf);
    } else {
      aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf);
    }

    // Step 2: `out = rms_norm(residual_out, weight, eps)`.
    if (!norm_exec_) {
      aclnnRmsNormGetWorkspaceSize(t_residual_out, t_weight, eps, t_out,
                                   rstd_tensor_, &norm_ws_, &norm_exec_);
      aclSetAclOpExecutorRepeatable(norm_exec_);
    } else {
      // Patch addresses: inputs 0/1 = normed input + weight, outputs 0/1 =
      // result + the pool-backed `rstd` side output.
      aclSetInputTensorAddr(norm_exec_, 0, t_residual_out, residual_out.data());
      aclSetInputTensorAddr(norm_exec_, 1, t_weight,
                            const_cast<void*>(weight.data()));
      aclSetOutputTensorAddr(norm_exec_, 0, t_out, out.data());
      aclSetOutputTensorAddr(norm_exec_, 1, rstd_tensor_, rstd_arena.buf);
    }
    auto& norm_arena = ascend::GetWorkspacePool().Ensure(stream, norm_ws_);
    aclnnRmsNorm(norm_arena.buf, norm_ws_, norm_exec_, stream);
  }

 private:
  // ACL descriptor caches for the five user-visible tensors. `mutable`
  // because `operator()` is const yet re-binds device addresses.
  mutable ascend::AclTensorCache input_cache_;

  mutable ascend::AclTensorCache residual_cache_;

  mutable ascend::AclTensorCache weight_cache_;

  mutable ascend::AclTensorCache out_cache_;

  mutable ascend::AclTensorCache residual_out_cache_;

  // Host storage the `alpha_` ACL scalar points at; must outlive `alpha_`.
  float alpha_storage_ = 1.0f;

  // ACL scalar wrapping `alpha_storage_`; destroyed in the destructor.
  aclScalar* alpha_ = nullptr;

  // Shape `{batch_size_, nhead_}` used for the `rstd` side-output descriptor.
  std::vector<int64_t> rstd_shape_;

  // Byte size of the pooled `rstd` buffer (float per batch row per head).
  uint64_t rstd_size_ = 0;

  // Lazily created `rstd` descriptor; intentionally not destroyed (see dtor).
  mutable aclTensor* rstd_tensor_ = nullptr;

  // Cached repeatable executor + workspace size for the `aclnnAdd` step.
  mutable aclOpExecutor* add_exec_ = nullptr;

  mutable uint64_t add_ws_ = 0;

  // Cached repeatable executor + workspace size for the `aclnnRmsNorm` step.
  mutable aclOpExecutor* norm_exec_ = nullptr;

  mutable uint64_t norm_ws_ = 0;
};

} // namespace infini::ops

#endif
Loading
Loading