Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
e38d08b
feat(ascend): op-norm-rope group — Swiglu, SiluAndMul, CausalSoftmax,…
Apr 17, 2026
9f1d5cb
fix(ascend): norm/swiglu destructors + missing add_rms_norm custom ke…
Apr 22, 2026
10e4c76
style(ascend): rename `AddRmsNorm` parameters to PyTorch-aligned names
Apr 22, 2026
65b4ef7
style(ascend): comment + assert message audit for norm/swiglu/softmax…
Apr 22, 2026
74c9730
test(silu_and_mul): add `implementation_index` parametrize and stride…
Apr 22, 2026
36a4800
refactor(ascend/rotary_embedding): unify RotaryEmbedding and ApplyRot…
Apr 22, 2026
e207d5b
feat(scripts/generate_wrappers): emit `apply_rotary_pos_emb` Python shim
Apr 22, 2026
ee1e765
test(rotary_embedding): merge apply_rotary_pos_emb cases + cover MLA/…
Apr 22, 2026
4934199
fix(generate_wrappers): propagate scalar param defaults to pybind sig…
Apr 22, 2026
4342d55
fix(ascend/rotary_embedding): correct pre-gathered layout + revert si…
Apr 22, 2026
e76d145
test(rotary_embedding): fix GPT-J reference for partial rotary
Apr 22, 2026
f1ccd5d
merge(pr66): reviewer fixes — swiglu/silu_and_mul/causal_softmax/add_…
Apr 22, 2026
3fc0e8d
merge(pr66): rope unification — fold ApplyRotaryPosEmb into RotaryEmb…
Apr 22, 2026
8c159ad
refactor(pr66-simplify): correct `rstd_out` semantic name + clarity f…
Apr 22, 2026
b6821a5
style(tests): ruff format `test_add_rms_norm.py` after `residual_out`…
Apr 22, 2026
77300e4
build(ascend-custom): drive `build.sh` from `pip install` with proper…
Apr 22, 2026
735c128
refactor(data_type): pin `DataType` enum values explicitly
Apr 22, 2026
33e99af
feat(ascend-custom): add bf16 support + Google-style identifier renames
Apr 22, 2026
c23901a
refactor(base): align Linear/SiluAndMul/AddRmsNorm/RotaryEmbedding wi…
Apr 22, 2026
7210408
refactor(base): trim narrative comments and collapse CPU Linear ctors
Apr 23, 2026
ca52518
fix(pr66-review): address review findings 1-3
Apr 23, 2026
fdeb779
refactor(pr66): drop `apply_rotary_pos_emb` wrapper + tests
Apr 23, 2026
694506b
test(rotary_embedding): add `pre_gathered=True` coverage
Apr 23, 2026
ca6eb18
chore(pr66): drop unused headers
Apr 23, 2026
659ae35
style(pr66): sweep assert-message periods + comment backticks
Apr 23, 2026
d60c180
refactor(pr66): rename AscendC custom kernels to PascalCase + C2 para…
Apr 23, 2026
93d4f3b
refactor(pr66): trim commit-narration comments
Apr 23, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 15 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,21 @@ option(WITH_ASCEND "Enable Ascend backend" OFF)

option(WITH_TORCH "Enable PyTorch C++ backend" OFF)

# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for
# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed
# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the
# toolchain is compatible or when building via the standalone
# `src/ascend/custom/build.sh` script.
option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF)
# Custom `AscendC` kernels under `src/ascend/custom/`. `ON` by default
# so CI and routine dev builds always exercise `implementation_index=1/2`
# for `RmsNorm` / `AddRmsNorm`. Gated by `WITH_ASCEND` in
# `src/CMakeLists.txt` — non-Ascend builds ignore it. Pass
# `-DBUILD_ASCEND_CUSTOM=OFF` to skip the `ccec` build on Ascend
# machines where the custom kernels aren't needed.
#
# When `ON`, `src/CMakeLists.txt` drives the standalone
# `src/ascend/custom/build.sh` via `execute_process` at configure time
# (sidesteps a `CANN` `extract_host_stub.py` path bug that breaks
# in-tree `ascendc_library()` under `scikit-build-core` temp-dir builds)
# and links the produced `libno_workspace_kernel.a` into the `ops`
# module with `--whole-archive`. Requires `torch_npu` and the
# `AscendC` toolchain (`ccec`).
option(BUILD_ASCEND_CUSTOM "Build custom AscendC kernels" ON)

option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF)
option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF)
Expand Down
9 changes: 9 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ name = "InfiniOps"
version = "0.1.0"

[project.optional-dependencies]
# TODO: `torch` here is unconstrained. On Ascend hosts, the working
# torch is the Ascend-matched `torch 2.9.0+cpu` paired with
# `torch_npu 2.9.0.post1+…`. A `pip install -e .[dev] --force-reinstall`
# will re-resolve `torch` to the latest PyPI version (currently
# `torch 2.11.0`), which now declares `cuda-toolkit` / `nvidia-cublas` /
# `nvidia-cudnn` / … as hard deps — downloads GBs of CUDA wheels and
# kills the `torch_npu` / `vllm-ascend` pairing. Needs a platform-aware
# split (e.g. `torch; platform_machine != 'aarch64'`, or move `torch`
# out of `dev` and require it pre-installed in the container image).
dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"]

[tool.scikit-build.wheel]
Expand Down
27 changes: 25 additions & 2 deletions scripts/generate_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,29 @@ def _find_vector_tensor_params(op_name):
return set(re.findall(r"std::vector<Tensor>\s+(\w+)", source))


def _find_params_with_defaults(op_name):
    """Map base-header parameter names to their ``= <literal>`` default values.

    ``libclang``'s cursor API does not expose default arguments reliably, so
    the header source is regex-scanned instead. Only plain scalar defaults
    such as ``bool pre_gathered = false`` are handled.
    """
    header_text = (_BASE_DIR / f"{op_name}.h").read_text()

    # Scalar-typed parameter followed by `= <literal>` and a `,` or `)`.
    scalar_default_re = (
        r"\b(?:bool|int(?:64_t|32_t|8_t|16_t)?|std::size_t|std::uint\w+_t"
        r"|float|double)\s+(\w+)\s*=\s*([^,\)]+?)\s*(?:,|\))"
    )

    return {
        param: literal.strip()
        for param, literal in re.findall(scalar_default_re, header_text)
    }


def _generate_pybind11(operator):
optional_tensor_params = _find_optional_tensor_params(operator.name)
vector_tensor_params = _find_vector_tensor_params(operator.name)
params_with_defaults = _find_params_with_defaults(operator.name)

def _is_optional_tensor(arg):
if arg.spelling in optional_tensor_params:
Expand Down Expand Up @@ -186,6 +206,10 @@ def _generate_py_args(node):

if _is_optional(arg):
parts.append(f'py::arg("{arg.spelling}") = py::none()')
elif arg.spelling in params_with_defaults:
parts.append(
f'py::arg("{arg.spelling}") = {params_with_defaults[arg.spelling]}'
)
else:
parts.append(f'py::arg("{arg.spelling}")')

Expand Down Expand Up @@ -257,8 +281,7 @@ def _generate_call(op_name, call, method=True):
}})
.def_static("clear_cache", &Self::clear_cache);

{callers}
}}
{callers}}}

}} // namespace infini::ops

Expand Down
70 changes: 66 additions & 4 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,66 @@ if(WITH_ASCEND)
list(APPEND DEVICE_LIST "ascend")

# Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`).
if(BUILD_CUSTOM_KERNEL)
add_subdirectory(ascend/custom)
if(BUILD_ASCEND_CUSTOM)
# In-tree `ascendc_library()` trips the `CANN` `extract_host_stub.py`
# path-handling bug under `scikit-build-core`'s temp-dir builds
# (`KeyError` on `/./workspace/...` paths in `$<TARGET_OBJECTS>`).
# Work around it by driving the standalone `src/ascend/custom/build.sh`
# — that script invokes a separate `cmake` with
# `src/ascend/custom/` as its `SOURCE_DIR`, avoiding the buggy
# path shape. The produced `.a` is imported and linked into
# `ops` with `--whole-archive`.
set(_custom_build_dir "${CMAKE_SOURCE_DIR}/build/build_ascend_custom")
set(_custom_lib "${_custom_build_dir}/lib/libno_workspace_kernel.a")

if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "")
include(${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/cmake/detect_soc.cmake)
infiniops_detect_soc(SOC_VERSION)
endif()

# Drive `build.sh` as a build-phase target with explicit source
# dependencies so that editing any `op_host/` or `op_kernel/`
# source re-triggers the build (plain `execute_process` at
# configure time would only gate on file existence and leave
# stale `.a` files in place).
file(GLOB_RECURSE _custom_srcs CONFIGURE_DEPENDS
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.h"
"${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh")

# Scrub env inherited from the outer `scikit-build-core` invocation
# before handing control to `build.sh`:
# * `CMAKE_GENERATOR` / `CMAKE_EXPORT_COMPILE_COMMANDS` leaking
# into the inner `cmake` change the path format passed to
# `ninja`'s `_host_cpp` rule and re-trigger the `CANN`
# `extract_host_stub.py` `KeyError` (`/./workspace/...`) that
# standalone `build.sh` avoids.
# * `PYTHONPATH` from `pip`'s build-isolation overlay makes the
# child `python3` skip the system `site-packages` — child
# `cmake` modules that `import torch` (`config_envs.cmake`)
# then fail with `ModuleNotFoundError` even though `torch` is
# installed.
add_custom_command(
OUTPUT ${_custom_lib}
COMMAND ${CMAKE_COMMAND} -E env
--unset=CMAKE_GENERATOR
--unset=CMAKE_EXPORT_COMPILE_COMMANDS
--unset=CMAKE_BUILD_PARALLEL_LEVEL
--unset=PYTHONPATH
"BUILD_DIR=${_custom_build_dir}"
"CMAKE_EXE=${CMAKE_COMMAND}"
bash ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh ${SOC_VERSION}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom
DEPENDS ${_custom_srcs}
COMMENT "Building custom AscendC kernels (SOC_VERSION=${SOC_VERSION})"
VERBATIM)

add_custom_target(no_workspace_kernel_build ALL DEPENDS ${_custom_lib})

add_library(no_workspace_kernel STATIC IMPORTED GLOBAL)
set_target_properties(no_workspace_kernel PROPERTIES
IMPORTED_LOCATION "${_custom_lib}")
add_dependencies(no_workspace_kernel no_workspace_kernel_build)

# Link the compiled `AscendC` kernel objects into `infiniops` so that
# custom kernel implementations (e.g. `RmsNorm` index 1) can call
Expand Down Expand Up @@ -379,9 +437,13 @@ if(GENERATE_PYTHON_BINDINGS)
# The `Operator<..., 1>` template instantiations that call
# `aclrtlaunch_*` live in `ops.cc`, so link here with
# `--whole-archive` to ensure all launch functions are available.
if(BUILD_CUSTOM_KERNEL)
# `$<TARGET_FILE>` works for both real `ascendc_library()` targets and
# `IMPORTED` targets pointing at a pre-built `.a`.
if(BUILD_ASCEND_CUSTOM)
target_link_libraries(ops PRIVATE
-Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive)
-Wl,--whole-archive $<TARGET_FILE:no_workspace_kernel> -Wl,--no-whole-archive)
# `ops` link step must wait for `build.sh` to produce the `.a`.
add_dependencies(ops no_workspace_kernel_build)
endif()

set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN")
Expand Down
144 changes: 144 additions & 0 deletions src/ascend/add_rms_norm/kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_
#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_

#include <vector>

#include "acl/acl.h"
#include "aclnn/aclnn_base.h"
#include "aclnn_add.h"
#include "aclnn_rms_norm.h"
#include "ascend/common.h"
#include "ascend/workspace_pool_.h"
#include "base/add_rms_norm.h"
#include "operator.h"

namespace infini::ops {

// Decomposed implementation: `aclnnAdd` + `aclnnRmsNorm`.
//
// The fused `aclnnAddRmsNorm` API has ~200 us host-side launch overhead that
// dominates small-tensor dispatch. Decomposing into two fast ACLNN calls
// reduces host dispatch from ~224 us to ~56 us (4x faster) with negligible
// NPU-side impact for inference tensor sizes.
//
// Executor reuse: both ACLNN executors are built on the first `operator()`
// call, marked repeatable, and afterwards only the tensor device addresses
// are patched in. NOTE(review): the cached executors bake in whatever
// shapes/strides/dtypes (and, for the norm step, whatever `eps`) the first
// call supplied — this assumes every later call uses the same configuration;
// confirm callers construct a fresh Operator per distinct configuration.
template <>
class Operator<AddRmsNorm, Device::Type::kAscend, 0> : public AddRmsNorm {
 public:
  // Builds per-tensor ACL descriptor caches so that later calls only swap
  // device addresses instead of re-creating descriptors.
  Operator(const Tensor input, const Tensor residual, const Tensor weight,
           float eps, Tensor out, Tensor residual_out)
      : AddRmsNorm(input, residual, weight, eps, out, residual_out),
        input_cache_(input),
        residual_cache_(residual),
        weight_cache_(weight),
        out_cache_(out),
        residual_out_cache_(residual_out) {
    // Alpha scalar for `aclnnAdd` (`residual_out = input + 1.0 * residual`).
    alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT);

    // `aclnnRmsNorm` writes `rstd` as a required side output. Size is
    // computed here; the buffer is obtained from the pool in `operator()`.
    // `batch_size_` / `nhead_` presumably come from the `AddRmsNorm` base —
    // TODO confirm against `base/add_rms_norm.h`.
    rstd_shape_ = {static_cast<int64_t>(batch_size_),
                   static_cast<int64_t>(nhead_)};
    rstd_size_ = batch_size_ * nhead_ * sizeof(float);
  }

  ~Operator() {
    // Skip all ACL teardown once the runtime is gone — touching ACL handles
    // during late static destruction would be unsafe.
    if (!ascend::IsAclRuntimeAlive()) return;

    // Null cached descriptors — see `AclTensorCache::release()`.
    input_cache_.release();
    residual_cache_.release();
    weight_cache_.release();
    out_cache_.release();
    residual_out_cache_.release();

    // `rstd_tensor_` leaks with `norm_exec_` at shutdown (see `64c367c`).
    if (alpha_) aclDestroyScalar(alpha_);
  }

  // Runs `residual_out = input + residual`, then
  // `out = rms_norm(residual_out, weight, eps)` on `stream_`.
  // NOTE(review): `eps` is only consumed on the first call, when
  // `norm_exec_` is created; subsequent calls reuse the cached executor and
  // ignore the argument.
  void operator()(const Tensor input, const Tensor residual,
                  const Tensor weight, float eps, Tensor out,
                  Tensor residual_out) const override {
    // Re-bind cached ACL descriptors to the current device pointers.
    auto t_input = input_cache_.get(const_cast<void*>(input.data()));
    auto t_residual = residual_cache_.get(const_cast<void*>(residual.data()));
    auto t_weight = weight_cache_.get(const_cast<void*>(weight.data()));
    auto t_out = out_cache_.get(out.data());
    auto t_residual_out = residual_out_cache_.get(residual_out.data());
    auto stream = static_cast<aclrtStream>(stream_);

    // Step 1: `residual_out = input + residual`.
    if (!add_exec_) {
      // First call: create the executor and mark it reusable across calls.
      aclnnAddGetWorkspaceSize(t_input, t_residual, alpha_, t_residual_out,
                               &add_ws_, &add_exec_);
      aclSetAclOpExecutorRepeatable(add_exec_);
    } else {
      // Subsequent calls: only patch the device addresses into the cached
      // executor (input slots 0/1, output slot 0).
      aclSetInputTensorAddr(add_exec_, 0, t_input,
                            const_cast<void*>(input.data()));
      aclSetInputTensorAddr(add_exec_, 1, t_residual,
                            const_cast<void*>(residual.data()));
      aclSetOutputTensorAddr(add_exec_, 0, t_residual_out, residual_out.data());
    }
    auto& add_arena = ascend::GetWorkspacePool().Ensure(stream, add_ws_);
    aclnnAdd(add_arena.buf, add_ws_, add_exec_, stream);

    // Obtain shared `rstd` buffer from pool.
    auto& rstd_arena =
        ascend::GetWorkspacePool().Ensure(stream, rstd_size_, "temp");

    // Lazily create the `rstd` tensor descriptor on first call; later calls
    // only repoint it at the (possibly re-allocated) pool buffer.
    if (!rstd_tensor_) {
      rstd_tensor_ = aclCreateTensor(rstd_shape_.data(), 2, ACL_FLOAT,
                                     /*strides=*/nullptr, 0, ACL_FORMAT_ND,
                                     rstd_shape_.data(), 2, rstd_arena.buf);
    } else {
      aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf);
    }

    // Step 2: `out = rms_norm(residual_out, weight, eps)`.
    if (!norm_exec_) {
      aclnnRmsNormGetWorkspaceSize(t_residual_out, t_weight, eps, t_out,
                                   rstd_tensor_, &norm_ws_, &norm_exec_);
      aclSetAclOpExecutorRepeatable(norm_exec_);
    } else {
      // Patch addresses: inputs 0/1 = normed input + weight, outputs 0/1 =
      // result + the pool-backed `rstd` side output.
      aclSetInputTensorAddr(norm_exec_, 0, t_residual_out, residual_out.data());
      aclSetInputTensorAddr(norm_exec_, 1, t_weight,
                            const_cast<void*>(weight.data()));
      aclSetOutputTensorAddr(norm_exec_, 0, t_out, out.data());
      aclSetOutputTensorAddr(norm_exec_, 1, rstd_tensor_, rstd_arena.buf);
    }
    auto& norm_arena = ascend::GetWorkspacePool().Ensure(stream, norm_ws_);
    aclnnRmsNorm(norm_arena.buf, norm_ws_, norm_exec_, stream);
  }

 private:
  // ACL descriptor caches for the five user-visible tensors. `mutable`
  // because `operator()` is const yet re-binds device addresses.
  mutable ascend::AclTensorCache input_cache_;

  mutable ascend::AclTensorCache residual_cache_;

  mutable ascend::AclTensorCache weight_cache_;

  mutable ascend::AclTensorCache out_cache_;

  mutable ascend::AclTensorCache residual_out_cache_;

  // Host storage the `alpha_` ACL scalar points at; must outlive `alpha_`.
  float alpha_storage_ = 1.0f;

  // ACL scalar wrapping `alpha_storage_`; destroyed in the destructor.
  aclScalar* alpha_ = nullptr;

  // Shape `{batch_size_, nhead_}` used for the `rstd` side-output descriptor.
  std::vector<int64_t> rstd_shape_;

  // Byte size of the pooled `rstd` buffer (float per batch row per head).
  uint64_t rstd_size_ = 0;

  // Lazily created `rstd` descriptor; intentionally not destroyed (see dtor).
  mutable aclTensor* rstd_tensor_ = nullptr;

  // Cached repeatable executor + workspace size for the `aclnnAdd` step.
  mutable aclOpExecutor* add_exec_ = nullptr;

  mutable uint64_t add_ws_ = 0;

  // Cached repeatable executor + workspace size for the `aclnnRmsNorm` step.
  mutable aclOpExecutor* norm_exec_ = nullptr;

  mutable uint64_t norm_ws_ = 0;
};

} // namespace infini::ops

#endif
Loading
Loading