From 43011caccd742217d1ee110270afbe142f37de56 Mon Sep 17 00:00:00 2001 From: Ziminli <70735843+Ziminli@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:47:44 +0800 Subject: [PATCH] =?UTF-8?q?Revert=20"feat(ascend):=20op-norm-rope=20group?= =?UTF-8?q?=20=E2=80=94=20Swiglu,=20SiluAndMul,=20CausalSoftmax,=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 38a23cfd0adb803107672a040462349d0295af34. --- CMakeLists.txt | 21 +- pyproject.toml | 9 - scripts/generate_wrappers.py | 27 +- src/CMakeLists.txt | 70 +- src/ascend/add_rms_norm/kernel.h | 144 ---- src/ascend/add_rms_norm/kernel_custom.h | 171 ----- src/ascend/add_rms_norm/kernel_fused.h | 132 ---- src/ascend/causal_softmax/kernel.h | 173 ----- src/ascend/custom/CMakeLists.txt | 18 +- .../add_rms_norm/op_host/add_rms_norm.cpp | 19 +- .../custom/add_rms_norm/op_kernel/.clang-tidy | 9 - .../add_rms_norm/op_kernel/add_rms_norm.cpp | 350 ++++----- src/ascend/custom/build.sh | 33 +- src/ascend/custom/cmake/config_ascend.cmake | 14 +- src/ascend/custom/cmake/detect_soc.cmake | 24 - .../custom/rms_norm/op_host/rms_norm.cpp | 18 +- .../custom/rms_norm/op_kernel/rms_norm.cpp | 281 ++++--- src/ascend/linear/kernel.h | 6 - src/ascend/rms_norm/kernel.h | 100 --- src/ascend/rms_norm/kernel_custom.h | 155 ---- src/ascend/rotary_embedding/kernel.h | 373 --------- src/ascend/rotary_embedding/kernel_atb.h | 449 ----------- .../rotary_embedding/kernel_sincos_cache.h | 177 ----- src/ascend/silu_and_mul/kernel.h | 127 --- src/ascend/swiglu/kernel.h | 109 --- src/ascend/swiglu/kernel_fused.h | 202 ----- src/base/add_rms_norm.h | 41 +- src/base/linear.h | 33 +- src/base/rotary_embedding.h | 107 ++- src/base/silu_and_mul.h | 62 -- src/cpu/linear/linear.h | 4 +- src/data_type.h | 31 +- tests/test_add_rms_norm.py | 113 --- tests/test_rotary_embedding.py | 723 ------------------ tests/test_silu_and_mul.py | 76 -- 35 files changed, 438 insertions(+), 3963 deletions(-) delete mode 100644 src/ascend/add_rms_norm/kernel.h delete mode 100644 src/ascend/add_rms_norm/kernel_custom.h delete mode 100644 src/ascend/add_rms_norm/kernel_fused.h delete mode 100644 src/ascend/causal_softmax/kernel.h delete mode 100644 src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy delete mode 100644 src/ascend/custom/cmake/detect_soc.cmake delete mode 100644 src/ascend/rms_norm/kernel.h delete mode 100644 src/ascend/rms_norm/kernel_custom.h delete mode 100644 src/ascend/rotary_embedding/kernel.h delete mode 100644 src/ascend/rotary_embedding/kernel_atb.h delete mode 100644 src/ascend/rotary_embedding/kernel_sincos_cache.h delete mode 100644 src/ascend/silu_and_mul/kernel.h delete mode 100644 src/ascend/swiglu/kernel.h delete mode 100644 src/ascend/swiglu/kernel_fused.h delete mode 100644 src/base/silu_and_mul.h delete mode 100644 tests/test_add_rms_norm.py delete mode 100644 tests/test_rotary_embedding.py delete mode 100644 tests/test_silu_and_mul.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e10db2e..91c2b015 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,21 +18,12 @@ option(WITH_ASCEND "Enable Ascend backend" OFF) option(WITH_TORCH "Enable PyTorch C++ backend" OFF) -# Custom `AscendC` kernels under `src/ascend/custom/`. `ON` by default -# so CI and routine dev builds always exercise `implementation_index=1/2` -# for `RmsNorm` / `AddRmsNorm`. Gated by `WITH_ASCEND` in -# `src/CMakeLists.txt` — non-Ascend builds ignore it. 
Pass -# `-DBUILD_ASCEND_CUSTOM=OFF` to skip the `ccec` build on Ascend -# machines where the custom kernels aren't needed. -# -# When `ON`, `src/CMakeLists.txt` drives the standalone -# `src/ascend/custom/build.sh` via `execute_process` at configure time -# (sidesteps a `CANN` `extract_host_stub.py` path bug that breaks -# in-tree `ascendc_library()` under `scikit-build-core` temp-dir builds) -# and links the produced `libno_workspace_kernel.a` into the `ops` -# module with `--whole-archive`. Requires `torch_npu` and the -# `AscendC` toolchain (`ccec`). -option(BUILD_ASCEND_CUSTOM "Build custom AscendC kernels" ON) +# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for +# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed +# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the +# toolchain is compatible or when building via the standalone +# `src/ascend/custom/build.sh` script. +option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF) option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF) diff --git a/pyproject.toml b/pyproject.toml index 6b517026..959699f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,15 +7,6 @@ name = "InfiniOps" version = "0.1.0" [project.optional-dependencies] -# TODO: `torch` here is unconstrained. On Ascend hosts, the working -# torch is the Ascend-matched `torch 2.9.0+cpu` paired with -# `torch_npu 2.9.0.post1+…`. A `pip install -e .[dev] --force-reinstall` -# will re-resolve `torch` to the latest PyPI version (currently -# `torch 2.11.0`), which now declares `cuda-toolkit` / `nvidia-cublas` / -# `nvidia-cudnn` / … as hard deps — downloads GBs of CUDA wheels and -# kills the `torch_npu` / `vllm-ascend` pairing. Needs a platform-aware -# split (e.g. `torch; platform_machine != 'aarch64'`, or move `torch` -# out of `dev` and require it pre-installed in the container image). dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] [tool.scikit-build.wheel] diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py index 9810404d..49b6c199 100644 --- a/scripts/generate_wrappers.py +++ b/scripts/generate_wrappers.py @@ -112,29 +112,9 @@ def _find_vector_tensor_params(op_name): return set(re.findall(r"std::vector\s+(\w+)", source)) -def _find_params_with_defaults(op_name): - """Return ``{param_name: default_literal}`` for base-header params that - carry a `= ` default value. `libclang`'s cursor API does not - expose defaults reliably, so we regex-scan the source. Only used for - plain scalar defaults such as ``bool pre_gathered = false``. 
- """ - source = (_BASE_DIR / f"{op_name}.h").read_text() - - mapping = {} - - for name, default in re.findall( - r"\b(?:bool|int(?:64_t|32_t|8_t|16_t)?|std::size_t|std::uint\w+_t|float|double)\s+(\w+)\s*=\s*([^,\)]+?)\s*(?:,|\))", - source, - ): - mapping[name] = default.strip() - - return mapping - - def _generate_pybind11(operator): optional_tensor_params = _find_optional_tensor_params(operator.name) vector_tensor_params = _find_vector_tensor_params(operator.name) - params_with_defaults = _find_params_with_defaults(operator.name) def _is_optional_tensor(arg): if arg.spelling in optional_tensor_params: @@ -206,10 +186,6 @@ def _generate_py_args(node): if _is_optional(arg): parts.append(f'py::arg("{arg.spelling}") = py::none()') - elif arg.spelling in params_with_defaults: - parts.append( - f'py::arg("{arg.spelling}") = {params_with_defaults[arg.spelling]}' - ) else: parts.append(f'py::arg("{arg.spelling}")') @@ -281,7 +257,8 @@ def _generate_call(op_name, call, method=True): }}) .def_static("clear_cache", &Self::clear_cache); -{callers}}} +{callers} +}} }} // namespace infini::ops diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 443ac0e2..32c92949 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -241,66 +241,8 @@ if(WITH_ASCEND) list(APPEND DEVICE_LIST "ascend") # Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`). - if(BUILD_ASCEND_CUSTOM) - # In-tree `ascendc_library()` trips the `CANN` `extract_host_stub.py` - # path-handling bug under `scikit-build-core`'s temp-dir builds - # (`KeyError` on `/./workspace/...` paths in `$`). - # Work around it by driving the standalone `src/ascend/custom/build.sh` - # — that script invokes a separate `cmake` with - # `src/ascend/custom/` as its `SOURCE_DIR`, avoiding the buggy - # path shape. The produced `.a` is imported and linked into - # `ops` with `--whole-archive`. - set(_custom_build_dir "${CMAKE_SOURCE_DIR}/build/build_ascend_custom") - set(_custom_lib "${_custom_build_dir}/lib/libno_workspace_kernel.a") - - if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "") - include(${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/cmake/detect_soc.cmake) - infiniops_detect_soc(SOC_VERSION) - endif() - - # Drive `build.sh` as a build-phase target with explicit source - # dependencies so that editing any `op_host/` or `op_kernel/` - # source re-triggers the build (plain `execute_process` at - # configure time would only gate on file existence and leave - # stale `.a` files in place). - file(GLOB_RECURSE _custom_srcs CONFIGURE_DEPENDS - "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.h" - "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh") - - # Scrub env inherited from the outer `scikit-build-core` invocation - # before handing control to `build.sh`: - # * `CMAKE_GENERATOR` / `CMAKE_EXPORT_COMPILE_COMMANDS` leaking - # into the inner `cmake` change the path format passed to - # `ninja`'s `_host_cpp` rule and re-trigger the `CANN` - # `extract_host_stub.py` `KeyError` (`/./workspace/...`) that - # standalone `build.sh` avoids. - # * `PYTHONPATH` from `pip`'s build-isolation overlay makes the - # child `python3` skip the system `site-packages` — child - # `cmake` modules that `import torch` (`config_envs.cmake`) - # then fail with `ModuleNotFoundError` even though `torch` is - # installed. 
- add_custom_command( - OUTPUT ${_custom_lib} - COMMAND ${CMAKE_COMMAND} -E env - --unset=CMAKE_GENERATOR - --unset=CMAKE_EXPORT_COMPILE_COMMANDS - --unset=CMAKE_BUILD_PARALLEL_LEVEL - --unset=PYTHONPATH - "BUILD_DIR=${_custom_build_dir}" - "CMAKE_EXE=${CMAKE_COMMAND}" - bash ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh ${SOC_VERSION} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom - DEPENDS ${_custom_srcs} - COMMENT "Building custom AscendC kernels (SOC_VERSION=${SOC_VERSION})" - VERBATIM) - - add_custom_target(no_workspace_kernel_build ALL DEPENDS ${_custom_lib}) - - add_library(no_workspace_kernel STATIC IMPORTED GLOBAL) - set_target_properties(no_workspace_kernel PROPERTIES - IMPORTED_LOCATION "${_custom_lib}") - add_dependencies(no_workspace_kernel no_workspace_kernel_build) + if(BUILD_CUSTOM_KERNEL) + add_subdirectory(ascend/custom) # Link the compiled `AscendC` kernel objects into `infiniops` so that # custom kernel implementations (e.g. `RmsNorm` index 1) can call @@ -437,13 +379,9 @@ if(GENERATE_PYTHON_BINDINGS) # The `Operator<..., 1>` template instantiations that call # `aclrtlaunch_*` live in `ops.cc`, so link here with # `--whole-archive` to ensure all launch functions are available. - # `$` works for both real `ascendc_library()` targets and - # `IMPORTED` targets pointing at a pre-built `.a`. - if(BUILD_ASCEND_CUSTOM) + if(BUILD_CUSTOM_KERNEL) target_link_libraries(ops PRIVATE - -Wl,--whole-archive $ -Wl,--no-whole-archive) - # `ops` link step must wait for `build.sh` to produce the `.a`. - add_dependencies(ops no_workspace_kernel_build) + -Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive) endif() set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN") diff --git a/src/ascend/add_rms_norm/kernel.h b/src/ascend/add_rms_norm/kernel.h deleted file mode 100644 index 38b0a5ab..00000000 --- a/src/ascend/add_rms_norm/kernel.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_ -#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_ - -#include - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnn_add.h" -#include "aclnn_rms_norm.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/add_rms_norm.h" -#include "operator.h" - -namespace infini::ops { - -// Decomposed implementation: `aclnnAdd` + `aclnnRmsNorm`. -// -// The fused `aclnnAddRmsNorm` API has ~200 us host-side launch overhead that -// dominates small-tensor dispatch. Decomposing into two fast ACLNN calls -// reduces host dispatch from ~224 us to ~56 us (4x faster) with negligible -// NPU-side impact for inference tensor sizes. -template <> -class Operator : public AddRmsNorm { - public: - Operator(const Tensor input, const Tensor residual, const Tensor weight, - float eps, Tensor out, Tensor residual_out) - : AddRmsNorm(input, residual, weight, eps, out, residual_out), - input_cache_(input), - residual_cache_(residual), - weight_cache_(weight), - out_cache_(out), - residual_out_cache_(residual_out) { - // Alpha scalar for `aclnnAdd` (`residual_out = input + 1.0 * residual`). - alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT); - - // `aclnnRmsNorm` writes `rstd` as a required side output. Size is - // computed here; the buffer is obtained from the pool in `operator()`. 
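The workspace-pool idiom referenced here recurs throughout these kernels. As a minimal host-side sketch (interface as used in this file, where `Ensure` returns a per-stream arena whose `buf` member is the device pointer; this is not a spec of the pool API):

    // Sketch only: acquire `bytes` of stream-local scratch from the shared pool.
    // The "temp" tag keeps long-lived temporaries separate from per-call
    // workspaces, mirroring the calls in `operator()` below.
    void* AcquireScratch(aclrtStream stream, uint64_t bytes) {
      auto& arena = ascend::GetWorkspacePool().Ensure(stream, bytes, "temp");
      return arena.buf;  // owned by the pool; do not free
    }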
- rstd_shape_ = {static_cast(batch_size_), - static_cast(nhead_)}; - rstd_size_ = batch_size_ * nhead_ * sizeof(float); - } - - ~Operator() { - if (!ascend::IsAclRuntimeAlive()) return; - - // Null cached descriptors — see `AclTensorCache::release()`. - input_cache_.release(); - residual_cache_.release(); - weight_cache_.release(); - out_cache_.release(); - residual_out_cache_.release(); - - // `rstd_tensor_` leaks with `norm_exec_` at shutdown (see `64c367c`). - if (alpha_) aclDestroyScalar(alpha_); - } - - void operator()(const Tensor input, const Tensor residual, - const Tensor weight, float eps, Tensor out, - Tensor residual_out) const override { - auto t_input = input_cache_.get(const_cast(input.data())); - auto t_residual = residual_cache_.get(const_cast(residual.data())); - auto t_weight = weight_cache_.get(const_cast(weight.data())); - auto t_out = out_cache_.get(out.data()); - auto t_residual_out = residual_out_cache_.get(residual_out.data()); - auto stream = static_cast(stream_); - - // Step 1: `residual_out = input + residual`. - if (!add_exec_) { - aclnnAddGetWorkspaceSize(t_input, t_residual, alpha_, t_residual_out, - &add_ws_, &add_exec_); - aclSetAclOpExecutorRepeatable(add_exec_); - } else { - aclSetInputTensorAddr(add_exec_, 0, t_input, - const_cast(input.data())); - aclSetInputTensorAddr(add_exec_, 1, t_residual, - const_cast(residual.data())); - aclSetOutputTensorAddr(add_exec_, 0, t_residual_out, residual_out.data()); - } - auto& add_arena = ascend::GetWorkspacePool().Ensure(stream, add_ws_); - aclnnAdd(add_arena.buf, add_ws_, add_exec_, stream); - - // Obtain shared `rstd` buffer from pool. - auto& rstd_arena = - ascend::GetWorkspacePool().Ensure(stream, rstd_size_, "temp"); - - // Lazily create the `rstd` tensor descriptor on first call. - if (!rstd_tensor_) { - rstd_tensor_ = aclCreateTensor(rstd_shape_.data(), 2, ACL_FLOAT, - /*strides=*/nullptr, 0, ACL_FORMAT_ND, - rstd_shape_.data(), 2, rstd_arena.buf); - } else { - aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf); - } - - // Step 2: `out = rms_norm(residual_out, weight, eps)`. 
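Step 2 below repeats the cached-executor idiom already used for Step 1. Schematically (a sketch, with `aclnnFoo` standing in for `aclnnAdd`/`aclnnRmsNorm` and error handling omitted):

    // First call: plan the op once, then mark the executor reusable.
    if (!exec_) {
      aclnnFooGetWorkspaceSize(t_in, t_out, &ws_, &exec_);
      aclSetAclOpExecutorRepeatable(exec_);
    } else {
      // Later calls: shapes/strides are unchanged, so only re-bind the
      // device addresses at their argument indices.
      aclSetInputTensorAddr(exec_, 0, t_in, in_ptr);
      aclSetOutputTensorAddr(exec_, 0, t_out, out_ptr);
    }
    auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_);
    aclnnFoo(arena.buf, ws_, exec_, stream);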
- if (!norm_exec_) { - aclnnRmsNormGetWorkspaceSize(t_residual_out, t_weight, eps, t_out, - rstd_tensor_, &norm_ws_, &norm_exec_); - aclSetAclOpExecutorRepeatable(norm_exec_); - } else { - aclSetInputTensorAddr(norm_exec_, 0, t_residual_out, residual_out.data()); - aclSetInputTensorAddr(norm_exec_, 1, t_weight, - const_cast(weight.data())); - aclSetOutputTensorAddr(norm_exec_, 0, t_out, out.data()); - aclSetOutputTensorAddr(norm_exec_, 1, rstd_tensor_, rstd_arena.buf); - } - auto& norm_arena = ascend::GetWorkspacePool().Ensure(stream, norm_ws_); - aclnnRmsNorm(norm_arena.buf, norm_ws_, norm_exec_, stream); - } - - private: - mutable ascend::AclTensorCache input_cache_; - - mutable ascend::AclTensorCache residual_cache_; - - mutable ascend::AclTensorCache weight_cache_; - - mutable ascend::AclTensorCache out_cache_; - - mutable ascend::AclTensorCache residual_out_cache_; - - float alpha_storage_ = 1.0f; - - aclScalar* alpha_ = nullptr; - - std::vector rstd_shape_; - - uint64_t rstd_size_ = 0; - - mutable aclTensor* rstd_tensor_ = nullptr; - - mutable aclOpExecutor* add_exec_ = nullptr; - - mutable uint64_t add_ws_ = 0; - - mutable aclOpExecutor* norm_exec_ = nullptr; - - mutable uint64_t norm_ws_ = 0; -}; - -} // namespace infini::ops - -#endif diff --git a/src/ascend/add_rms_norm/kernel_custom.h b/src/ascend/add_rms_norm/kernel_custom.h deleted file mode 100644 index daaa8c39..00000000 --- a/src/ascend/add_rms_norm/kernel_custom.h +++ /dev/null @@ -1,171 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ -#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ - -#ifdef INFINI_HAS_CUSTOM_KERNELS - -#include -#include - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnnop/aclnn_cast.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/add_rms_norm.h" -#include "operator.h" - -// Forward-declare the `aclrtlaunch_AddRmsNorm` launch symbol defined -// by the AscendC toolchain from `custom/add_rms_norm/op_kernel/`. -extern "C" uint32_t aclrtlaunch_AddRmsNorm( - uint32_t block_dim, void* stream, void* input, void* residual, void* weight, - int64_t total_rows, int64_t dim_length, int64_t dim_length_align, - int64_t former_num, int64_t former_length, int64_t tail_length, float eps, - int64_t dtype_code, void* out, void* residual_out); - -namespace infini::ops { - -// Custom AscendC fused `AddRmsNorm` kernel (implementation index 2). -// -// A single-kernel implementation that computes `residual_out = input + -// residual` followed by `out = rms_norm(residual_out, weight, eps)` in one -// launch, avoiding the decomposed `aclnnAdd` + `aclnnRmsNorm` calls (index 0) -// or the fused `aclnnAddRmsNorm` call (index 1). Migrated from the custom -// `RmsNorm` kernel (index 1 of `RmsNorm`). -// -// Select via `implementation_index=2` in Python: -// `infini.ops.add_rms_norm(input, residual, weight, eps, out, residual_out, -// implementation_index=2, stream=s)`. -// -// Requirements: -// - Input last dimension must be 32-byte aligned (divisible by 16 for -// `float16` or 8 for `float32`). All standard LLM hidden dimensions -// satisfy this. -// - `weight` must have the same dtype as `input`. -// - The custom kernel binary must be linked (`BUILD_ASCEND_CUSTOM=ON`). 
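To make the 32-byte alignment requirement concrete, the check reduces to the following (standalone sketch; same arithmetic as the constructor below, with a hypothetical helper name):

    #include <cstdint>

    // One 32-byte block holds 16 fp16/bf16 elements or 8 fp32 elements.
    bool LastDimIsAligned(int64_t dim, int64_t elem_size) {
      int64_t align_elems = 32 / elem_size;
      return dim % align_elems == 0;
    }
    // LastDimIsAligned(4096, 2) == true   (fp16: 4096 % 16 == 0)
    // LastDimIsAligned(100, 4)  == false  (fp32: 100 % 8 != 0)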
-template <> -class Operator : public AddRmsNorm { - public: - Operator(const Tensor input, const Tensor residual, const Tensor weight, - float eps, Tensor out, Tensor residual_out) - : AddRmsNorm(input, residual, weight, eps, out, residual_out), - dtype_{input.dtype()} { - assert((dtype_ == DataType::kFloat16 || dtype_ == DataType::kBFloat16 || - dtype_ == DataType::kFloat32) && - "`AddRmsNorm` custom kernel: `input` must be `fp16`, `bf16`, or " - "`fp32`"); - - // 32-byte alignment on the last dimension — kernel relies on aligned - // `DataCopyPad` loads/stores. - int64_t align_elems = 32 / static_cast(kDataTypeToSize.at(dtype_)); - dim_length_align_ = - ((static_cast(dim_) + align_elems - 1) / align_elems) * - align_elems; - assert(static_cast(dim_) == dim_length_align_ && - "`AddRmsNorm` custom kernel: last dimension must be 32-byte " - "aligned"); - - total_rows_ = - static_cast(batch_size_) * static_cast(nhead_); - - // The custom kernel always reads `weight` as fp32. fp16 / bf16 inputs - // trigger a lazy cast in `operator()` (guarded by `last_weight_ptr_` - // so that the cast runs only when the weight pointer changes — model - // weights are typically fixed after loading). - if (dtype_ != DataType::kFloat32) { - size_t fp32_bytes = static_cast(dim_) * sizeof(float); - aclrtMalloc(&weight_fp32_data_, fp32_bytes, ACL_MEM_MALLOC_NORMAL_ONLY); - - weight_src_cache_ = ascend::AclTensorCache( - {static_cast(dim_)}, ascend::ToAclDtype(dtype_), nullptr); - weight_dst_cache_ = ascend::AclTensorCache({static_cast(dim_)}, - ACL_FLOAT, weight_fp32_data_); - } - } - - ~Operator() { - if (!ascend::IsAclRuntimeAlive()) return; - - // Null cached descriptors — see `AclTensorCache::release()`. - weight_src_cache_.release(); - weight_dst_cache_.release(); - - if (weight_fp32_data_) aclrtFree(weight_fp32_data_); - } - - void operator()(const Tensor input, const Tensor residual, - const Tensor weight, float eps, Tensor out, - Tensor residual_out) const override { - auto stream = static_cast(stream_); - - void* weight_fp32; - - if (dtype_ != DataType::kFloat32) { - const void* cur_weight = weight.data(); - - // Model weights are fixed after loading, so the cast typically runs - // once on the first call and is skipped on all subsequent calls. - if (cur_weight != last_weight_ptr_) { - auto t_src = weight_src_cache_.get(const_cast(cur_weight)); - auto t_dst = weight_dst_cache_.get(weight_fp32_data_); - - if (!cast_exec_) { - aclnnCastGetWorkspaceSize(t_src, ACL_FLOAT, t_dst, &cast_ws_, - &cast_exec_); - aclSetAclOpExecutorRepeatable(cast_exec_); - } else { - aclSetInputTensorAddr(cast_exec_, 0, t_src, - const_cast(cur_weight)); - aclSetOutputTensorAddr(cast_exec_, 0, t_dst, weight_fp32_data_); - } - - auto& arena = ascend::GetWorkspacePool().Ensure(stream, cast_ws_); - aclnnCast(arena.buf, cast_ws_, cast_exec_, stream); - last_weight_ptr_ = cur_weight; - } - - weight_fp32 = weight_fp32_data_; - } else { - weight_fp32 = const_cast(weight.data()); - } - - // Block-level tiling. Ascend 910B has 20–40 AIV cores; over-subscribing - // is safe (runtime multiplexes) but wastes one weight load per block. 
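A worked instance of the former/tail split computed below, with illustrative numbers (100 rows over at most 40 cores):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t total_rows = 100, max_block_dim = 40;
      int64_t used_cores = std::min(total_rows, max_block_dim);            // 40
      int64_t former_length = (total_rows + used_cores - 1) / used_cores;  // 3
      int64_t tail_length = former_length - 1;                             // 2
      int64_t former_num = total_rows - tail_length * used_cores;          // 20
      // 20 "former" cores take 3 rows each; the other 20 take 2 rows each.
      assert(former_num * former_length +
             (used_cores - former_num) * tail_length == total_rows);
    }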
- static constexpr int64_t kMaxBlockDim = 40; - int64_t used_cores = std::min(total_rows_, kMaxBlockDim); - int64_t former_length = (total_rows_ + used_cores - 1) / used_cores; - int64_t tail_length = former_length - 1; - int64_t former_num = total_rows_ - tail_length * used_cores; - uint32_t block_dim = static_cast(used_cores); - - aclrtlaunch_AddRmsNorm(block_dim, stream, const_cast(input.data()), - const_cast(residual.data()), weight_fp32, - total_rows_, static_cast(dim_), - dim_length_align_, former_num, former_length, - tail_length, eps, static_cast(dtype_), - out.data(), residual_out.data()); - } - - private: - DataType dtype_; - - int64_t dim_length_align_; - - int64_t total_rows_; - - void* weight_fp32_data_ = nullptr; - - mutable ascend::AclTensorCache weight_src_cache_; - - mutable ascend::AclTensorCache weight_dst_cache_; - - mutable const void* last_weight_ptr_ = nullptr; - - mutable aclOpExecutor* cast_exec_ = nullptr; - - mutable uint64_t cast_ws_ = 0; -}; - -} // namespace infini::ops - -#endif // INFINI_HAS_CUSTOM_KERNELS -#endif // INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ diff --git a/src/ascend/add_rms_norm/kernel_fused.h b/src/ascend/add_rms_norm/kernel_fused.h deleted file mode 100644 index e28d7c28..00000000 --- a/src/ascend/add_rms_norm/kernel_fused.h +++ /dev/null @@ -1,132 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_FUSED_H_ -#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_FUSED_H_ - -#include - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnnop/aclnn_add_rms_norm.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/add_rms_norm.h" -#include "operator.h" - -namespace infini::ops { - -// Fused implementation via `aclnnAddRmsNorm` (implementation index 1). -// -// Computes `residual_out = input + residual` and `out = rms_norm(residual_out, -// weight, eps)` in a single CANN launch. The fused API has higher host-side -// launch overhead (~200 us) compared to the decomposed `aclnnAdd` + -// `aclnnRmsNorm` path (~39 us), but may offer better NPU-side efficiency for -// large tensors where kernel fusion reduces memory traffic. -// -// Select via `implementation_index=1` in Python: -// `infini.ops.add_rms_norm(..., implementation_index=1, stream=s)`. -template <> -class Operator : public AddRmsNorm { - public: - Operator(const Tensor input, const Tensor residual, const Tensor weight, - float eps, Tensor out, Tensor residual_out) - : AddRmsNorm(input, residual, weight, eps, out, residual_out), - input_cache_(input), - residual_cache_(residual), - weight_cache_(weight), - out_cache_(out), - residual_out_cache_(residual_out) { - // `aclnnAddRmsNorm` requires `rstdOut` to have the same ndim as `input`, - // with the last `weight.ndim()` dimensions set to 1. For example: - // `input` (2, 32, 128), `weight` (128) -> `rstdOut` (2, 32, 1). - // `input` (64, 128), `weight` (128) -> `rstdOut` (64, 1). 
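The shape rule can be stated as a small helper (sketch only, with a hypothetical name; the constructor below builds the same shape inline):

    #include <cstdint>
    #include <vector>

    // rstd keeps input's rank; the trailing `weight_ndim` axes collapse to 1.
    std::vector<int64_t> FusedRstdShape(std::vector<int64_t> input_shape,
                                        size_t weight_ndim) {
      for (size_t i = input_shape.size() - weight_ndim; i < input_shape.size();
           ++i) {
        input_shape[i] = 1;
      }
      return input_shape;
    }
    // FusedRstdShape({2, 32, 128}, 1) -> {2, 32, 1}
    // FusedRstdShape({64, 128}, 1)    -> {64, 1}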
- fused_rstd_shape_.reserve(ndim_); - for (size_t i = 0; i < ndim_ - weight.ndim(); ++i) { - fused_rstd_shape_.push_back(static_cast(input.size(i))); - } - for (size_t i = 0; i < weight.ndim(); ++i) { - fused_rstd_shape_.push_back(1); - } - - size_t rstd_elems = 1; - for (auto d : fused_rstd_shape_) { - rstd_elems *= static_cast(d); - } - size_t rstd_bytes = rstd_elems * sizeof(float); - aclrtMalloc(&rstd_data_, rstd_bytes, ACL_MEM_MALLOC_NORMAL_ONLY); - - rstd_tensor_ = aclCreateTensor( - fused_rstd_shape_.data(), - static_cast(fused_rstd_shape_.size()), ACL_FLOAT, - /*strides=*/nullptr, 0, ACL_FORMAT_ND, fused_rstd_shape_.data(), - static_cast(fused_rstd_shape_.size()), rstd_data_); - } - - ~Operator() { - if (!ascend::IsAclRuntimeAlive()) return; - - // Null cached descriptors — see `AclTensorCache::release()`. - input_cache_.release(); - residual_cache_.release(); - weight_cache_.release(); - out_cache_.release(); - residual_out_cache_.release(); - - // `rstd_tensor_` leaks with the executor at shutdown (see `64c367c`). - if (rstd_data_) aclrtFree(rstd_data_); - } - - void operator()(const Tensor input, const Tensor residual, - const Tensor weight, float eps, Tensor out, - Tensor residual_out) const override { - auto t_input = input_cache_.get(const_cast(input.data())); - auto t_residual = residual_cache_.get(const_cast(residual.data())); - auto t_weight = weight_cache_.get(const_cast(weight.data())); - auto t_out = out_cache_.get(out.data()); - auto t_residual_out = residual_out_cache_.get(residual_out.data()); - auto stream = static_cast(stream_); - - if (!executor_) { - aclnnAddRmsNormGetWorkspaceSize( - t_input, t_residual, t_weight, static_cast(eps), t_out, - rstd_tensor_, t_residual_out, &ws_size_, &executor_); - aclSetAclOpExecutorRepeatable(executor_); - } else { - aclSetInputTensorAddr(executor_, 0, t_input, - const_cast(input.data())); - aclSetInputTensorAddr(executor_, 1, t_residual, - const_cast(residual.data())); - aclSetInputTensorAddr(executor_, 2, t_weight, - const_cast(weight.data())); - aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); - // `rstd` at output index 1 has a stable address — no update needed. 
- aclSetOutputTensorAddr(executor_, 2, t_residual_out, residual_out.data()); - } - - auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); - aclnnAddRmsNorm(arena.buf, ws_size_, executor_, stream); - } - - private: - mutable ascend::AclTensorCache input_cache_; - - mutable ascend::AclTensorCache residual_cache_; - - mutable ascend::AclTensorCache weight_cache_; - - mutable ascend::AclTensorCache out_cache_; - - mutable ascend::AclTensorCache residual_out_cache_; - - std::vector fused_rstd_shape_; - - void* rstd_data_ = nullptr; - - aclTensor* rstd_tensor_ = nullptr; - - mutable aclOpExecutor* executor_ = nullptr; - - mutable uint64_t ws_size_ = 0; -}; - -} // namespace infini::ops - -#endif diff --git a/src/ascend/causal_softmax/kernel.h b/src/ascend/causal_softmax/kernel.h deleted file mode 100644 index 975a0346..00000000 --- a/src/ascend/causal_softmax/kernel.h +++ /dev/null @@ -1,173 +0,0 @@ -#ifndef INFINI_OPS_ASCEND_CAUSAL_SOFTMAX_KERNEL_H_ -#define INFINI_OPS_ASCEND_CAUSAL_SOFTMAX_KERNEL_H_ - -#include -#include - -#include "acl/acl.h" -#include "aclnn/aclnn_base.h" -#include "aclnn_copy.h" -#include "aclnn_masked_fill_scalar.h" -#include "aclnn_softmax.h" -#include "ascend/common.h" -#include "ascend/workspace_pool_.h" -#include "base/causal_softmax.h" -#include "data_type.h" -#include "operator.h" - -namespace infini::ops { - -// CANN 8.5 has no single API covering causal-mask-then-softmax: the nearest -// candidates (`aclnnSoftmaxV2`, `aclnnScaledSoftmaxGrad`) do not accept a -// boolean mask argument, and `aclnnScaledMaskedSoftmax` requires a -// pre-scaled attention-score tensor produced inside flash-attention, not a -// standalone softmax input. Decomposing into three ACLNN calls is therefore -// unavoidable until a `aclnnCausalSoftmax` ships: -// 1. `aclnnInplaceCopy(temp, input)` — stride-aware copy to a contiguous -// `temp` buffer. -// 2. `aclnnInplaceMaskedFillScalar(temp, mask, -inf)` — apply the -// upper-triangle mask. -// 3. `aclnnSoftmax(temp, dim=-1, out)` — softmax over the last dimension. -// -// The boolean causal mask is pre-computed and uploaded to device once in the -// constructor. Its shape `(seq_len, total_seq_len)` broadcasts over the -// batch dimension. -template <> -class Operator : public CausalSoftmax { - public: - Operator(const Tensor input, Tensor out) - : CausalSoftmax(input, out), in_cache_(input), out_cache_(out) { - // Compute `temp` buffer size — allocated lazily from the pool in - // `operator()`. - size_t n_elems = input.numel(); - size_t elem_bytes = kDataTypeToSize.at(dtype_); - temp_size_ = n_elems * elem_bytes; - - // Build a contiguous `Tensor` descriptor — data pointer set on first use. - Tensor temp_t{nullptr, input.shape(), input.dtype(), input.device()}; - temp_cache_ = ascend::AclTensorCache(temp_t); - - // Causal mask: `mask[i][j] = 1` when position `j` must be masked for - // query `i`. Shape `(seq_len, total_seq_len)` broadcasts over the batch - // dimension. 
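For example (illustrative sizes: 2 new queries appended to a context of total length 4), the rule `mask[i][j] = 1` iff `j > total_seq_len - seq_len + i` yields:

    #include <cstdio>

    int main() {
      int seq_len = 2, total_seq_len = 4;
      for (int i = 0; i < seq_len; ++i) {
        for (int j = 0; j < total_seq_len; ++j) {
          printf("%d ", j > total_seq_len - seq_len + i ? 1 : 0);
        }
        printf("\n");
      }
      // Prints:  0 0 0 1   (query 0 attends positions 0..2)
      //          0 0 0 0   (query 1 attends positions 0..3)
    }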
- size_t mask_elems = seq_len_ * total_seq_len_; - std::vector mask_host(mask_elems, 0); - - for (size_t i = 0; i < seq_len_; ++i) { - auto vis_end = static_cast(total_seq_len_ - seq_len_ + i); - - for (auto j = vis_end + 1; j < static_cast(total_seq_len_); - ++j) { - mask_host[i * total_seq_len_ + j] = 1; - } - } - - aclrtMalloc(&mask_buf_, mask_elems, ACL_MEM_MALLOC_NORMAL_ONLY); - aclrtMemcpy(mask_buf_, mask_elems, mask_host.data(), mask_elems, - ACL_MEMCPY_HOST_TO_DEVICE); - - std::vector mshape = {static_cast(seq_len_), - static_cast(total_seq_len_)}; - std::vector mstrides = {static_cast(total_seq_len_), 1}; - mask_tensor_ = aclCreateTensor(mshape.data(), mshape.size(), ACL_BOOL, - mstrides.data(), 0, ACL_FORMAT_ND, - mshape.data(), mshape.size(), mask_buf_); - - // Scalar `-inf` for the masked-fill step. `aclCreateScalar` stores the - // pointer rather than copying, so `neg_inf_storage_` must stay alive - // with the object. - neg_inf_ = aclCreateScalar(&neg_inf_storage_, ACL_FLOAT); - // Workspaces are allocated lazily on the first `operator()` call. - } - - ~Operator() { - if (!ascend::IsAclRuntimeAlive()) return; - - // Null cached descriptors — see `AclTensorCache::release()`. - in_cache_.release(); - out_cache_.release(); - temp_cache_.release(); - - // `mask_tensor_` leaks with `fill_exec_` at shutdown (see `64c367c`). - if (mask_buf_) aclrtFree(mask_buf_); - if (neg_inf_) aclDestroyScalar(neg_inf_); - } - - void operator()(const Tensor input, Tensor out) const override { - auto t_in = in_cache_.get(const_cast(input.data())); - auto t_out = out_cache_.get(out.data()); - auto stream = static_cast(stream_); - - // Obtain shared `temp` buffer from the pool. - auto& temp = ascend::GetWorkspacePool().Ensure(stream, temp_size_, "temp"); - auto t_temp = temp_cache_.get(temp.buf); - - // Step 1: copy `input` (possibly non-contiguous) into a contiguous `temp`. - if (!copy_exec_) { - aclnnInplaceCopyGetWorkspaceSize(t_temp, t_in, ©_ws_, ©_exec_); - aclSetAclOpExecutorRepeatable(copy_exec_); - } else { - aclSetInputTensorAddr(copy_exec_, 0, t_temp, temp.buf); - aclSetInputTensorAddr(copy_exec_, 1, t_in, - const_cast(input.data())); - } - auto& copy_arena = ascend::GetWorkspacePool().Ensure(stream, copy_ws_); - aclnnInplaceCopy(copy_arena.buf, copy_ws_, copy_exec_, stream); - - // Step 2: mask upper-triangle positions with `-inf` in-place. - // `mask_tensor_` and `neg_inf_` have stable addresses — first-call only. - if (!fill_exec_) { - aclnnInplaceMaskedFillScalarGetWorkspaceSize( - t_temp, mask_tensor_, neg_inf_, &fill_ws_, &fill_exec_); - aclSetAclOpExecutorRepeatable(fill_exec_); - } - auto& fill_arena = ascend::GetWorkspacePool().Ensure(stream, fill_ws_); - aclnnInplaceMaskedFillScalar(fill_arena.buf, fill_ws_, fill_exec_, stream); - - // Step 3: softmax over the last dimension -> `out`. 
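Taken together, the three launches compute the following per-row reference semantics (host-side fp32 sketch, not the device path; the contiguous copy of Step 1 is implicit in taking a plain array):

    #include <cmath>
    #include <cstddef>

    void CausalSoftmaxRow(float* row, const unsigned char* mask, size_t n) {
      float mx = -INFINITY;
      for (size_t j = 0; j < n; ++j) {
        if (mask[j]) row[j] = -INFINITY;  // Step 2: masked fill
        if (row[j] > mx) mx = row[j];
      }
      float sum = 0.0f;
      for (size_t j = 0; j < n; ++j) {    // Step 3: numerically stable softmax
        row[j] = std::exp(row[j] - mx);
        sum += row[j];
      }
      for (size_t j = 0; j < n; ++j) row[j] /= sum;
    }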
- if (!softmax_exec_) { - constexpr int64_t kLastDim = -1; - aclnnSoftmaxGetWorkspaceSize(t_temp, kLastDim, t_out, &softmax_ws_, - &softmax_exec_); - aclSetAclOpExecutorRepeatable(softmax_exec_); - } else { - aclSetOutputTensorAddr(softmax_exec_, 0, t_out, out.data()); - } - auto& softmax_arena = - ascend::GetWorkspacePool().Ensure(stream, softmax_ws_); - aclnnSoftmax(softmax_arena.buf, softmax_ws_, softmax_exec_, stream); - } - - private: - mutable ascend::AclTensorCache in_cache_; - - mutable ascend::AclTensorCache out_cache_; - - mutable ascend::AclTensorCache temp_cache_; - - float neg_inf_storage_ = -std::numeric_limits::infinity(); - - uint64_t temp_size_ = 0; - - void* mask_buf_ = nullptr; - - aclTensor* mask_tensor_ = nullptr; - - aclScalar* neg_inf_ = nullptr; - - mutable aclOpExecutor* copy_exec_ = nullptr; - - mutable uint64_t copy_ws_ = 0; - - mutable aclOpExecutor* fill_exec_ = nullptr; - - mutable uint64_t fill_ws_ = 0; - - mutable aclOpExecutor* softmax_exec_ = nullptr; - - mutable uint64_t softmax_ws_ = 0; -}; - -} // namespace infini::ops - -#endif diff --git a/src/ascend/custom/CMakeLists.txt b/src/ascend/custom/CMakeLists.txt index fb900419..ca6e6883 100644 --- a/src/ascend/custom/CMakeLists.txt +++ b/src/ascend/custom/CMakeLists.txt @@ -30,6 +30,8 @@ else() endif() set(PROJECT_OP_SRC_BASE ${PROJECT_SOURCE_DIR}) +set(PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/build) +set(PROJECT_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/output) include(cmake/config_envs.cmake) include(cmake/config_ascend.cmake) @@ -41,15 +43,13 @@ if(CCACHE_PROGRAM) set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") endif() -# `CMAKE_LIBRARY_OUTPUT_DIRECTORY` is set by `build.sh` so that the -# standalone `libascend_kernel.so` lands next to `libno_workspace_kernel.a` -# under `/build/build_ascend_custom/output/`. +# Shared library output location. +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_PATH}) # Host-side files. file(GLOB OP_SRCS ${PROJECT_OP_SRC_BASE}/torch_binding.cpp ${PROJECT_OP_SRC_BASE}/rms_norm/op_host/rms_norm.cpp - ${PROJECT_OP_SRC_BASE}/add_rms_norm/op_host/add_rms_norm.cpp ) # Shared library name — consumed by `kernel_custom.h` variants and by the @@ -59,18 +59,8 @@ set(OP_PLUGIN_NAME ascend_kernel) # Kernel-side files (device code compiled by the `AscendC` toolchain). ascendc_library(no_workspace_kernel STATIC ${PROJECT_OP_SRC_BASE}/rms_norm/op_kernel/rms_norm.cpp - ${PROJECT_OP_SRC_BASE}/add_rms_norm/op_kernel/add_rms_norm.cpp ) -# The kernel translation units include `"data_type_enum.h"` from the main -# project's `src/` so that launcher and device code share one `DataType` -# enum. `ascendc_library` forwards the interface target's `INCLUDES` -# property to the nested `ExternalProject_Add` (see -# `${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake/legacy_modules/function.cmake`), -# so append the main `src/` dir here. -set_property(TARGET no_workspace_kernel_interface APPEND PROPERTY - INCLUDES ${PROJECT_OP_SRC_BASE}/../..) - # Create the shared library `libascend_kernel.so`. 
add_library(${OP_PLUGIN_NAME} SHARED ${OP_SRCS})
diff --git a/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp b/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp
index b561eaaa..b8e0d504 100644
--- a/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp
+++ b/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp
@@ -1,4 +1,4 @@
-#include "aclrtlaunch_AddRmsNorm.h"
+#include "aclrtlaunch_add_rms_norm.h"
 #include "tiling/platform/platform_ascendc.h"
 #include "torch_kernel_helper.h"
 
@@ -105,13 +105,16 @@ std::vector<at::Tensor> AddRmsNorm(const at::Tensor& x1, const at::Tensor& x2,
   float eps_float = static_cast<float>(eps);
   int64_t dtype_size_val = dtype_size;
 
-  // The first arg `AddRmsNorm` is the AscendC kernel entry-point name — it
-  // must match the `__global__ __aicore__ void AddRmsNorm(...)` definition
-  // in `op_kernel/` and the generated `aclrtlaunch_AddRmsNorm.h` header.
-  EXEC_KERNEL_CMD(AddRmsNorm, block_dim, kernel_input1, kernel_input2,
-                  weight_float, total_rows, dim_length, dim_length_align,
-                  former_num, former_length, tail_length, eps_float,
-                  dtype_size_val, kernel_output_y, kernel_output_x_out);
+  // The first arg `add_rms_norm` is the AscendC kernel entry-point name — it
+  // must match `ascendc_add_operator(OP_NAME add_rms_norm)` in `CMakeLists.txt`,
+  // the `__global__ __aicore__ void add_rms_norm(...)` definition in
+  // `op_kernel/`, and the generated `aclrtlaunch_add_rms_norm.h` header.
+  // Google C++ Style's PascalCase rule does NOT apply: this identifier is
+  // dictated by the AscendC toolchain's symbol convention.
+  EXEC_KERNEL_CMD(add_rms_norm, block_dim, kernel_input1, kernel_input2,
+                  weight_float, kernel_output_y, kernel_output_x_out,
+                  total_rows, dim_length, dim_length_align, former_num,
+                  former_length, tail_length, eps_float, dtype_size_val);
 
   // Remove padding and reshape back to original shape.
   at::Tensor output_y = kernel_output_y;
diff --git a/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy b/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy
deleted file mode 100644
index ccf13972..00000000
--- a/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy
+++ /dev/null
@@ -1,9 +0,0 @@
----
-# `op_kernel/*.cpp` is `AscendC` device code compiled by `ccec`, not by
-# the host toolchain, so it has no entry in `compile_commands.json` and
-# `clang-tidy` cannot parse it correctly (the `__aicore__` macro expands
-# unexpectedly when `kernel_operator.h` is absent). Disable all checks
-# here — the `op_host/` side and the `kernel_custom.h` launcher still
-# enforce the full ruleset.
-
- -Checks: '-*' diff --git a/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp b/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp index 4b677d35..e2a08e55 100644 --- a/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp +++ b/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp @@ -1,102 +1,98 @@ -#include "data_type.h" #include "kernel_operator.h" -constexpr int32_t kBufferNum = 2; +constexpr int32_t BUFFER_NUM = 2; template class KernelAddRmsNorm { public: __aicore__ inline KernelAddRmsNorm() {} - __aicore__ inline void Init(GM_ADDR input, GM_ADDR residual, GM_ADDR weight, - int64_t total_rows, int64_t dim_length, - int64_t dim_length_align, int64_t former_num, - int64_t former_length, int64_t tail_length, - float eps, GM_ADDR out, GM_ADDR residual_out) { - dim_length_ = dim_length; - dim_length_align_ = dim_length_align; - eps_ = eps; + __aicore__ inline void Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, + GM_ADDR x_out, int64_t totalRows, + int64_t dimLength, int64_t dimLengthAlign, + int64_t formerNum, int64_t formerLength, + int64_t tailLength, float eps) { + this->dimLength = dimLength; + this->dimLengthAlign = dimLengthAlign; + this->eps = eps; // Block-level tiling: determine row range for this core. - int64_t block_idx = AscendC::GetBlockIdx(); - int64_t row_offset; + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t rowOffset; - if (block_idx < former_num) { - block_rows_ = former_length; - row_offset = former_length * block_idx; + if (blockIdx < formerNum) { + this->blockRows = formerLength; + rowOffset = formerLength * blockIdx; } else { - block_rows_ = tail_length; - int64_t tail_idx = block_idx - former_num; - row_offset = former_length * former_num + tail_length * tail_idx; + this->blockRows = tailLength; + int64_t tailIdx = blockIdx - formerNum; + rowOffset = formerLength * formerNum + tailLength * tailIdx; } // Global memory pointers. - input_gm_.SetGlobalBuffer((__gm__ T*)input + row_offset * dim_length_align, - block_rows_ * dim_length_align); - residual_gm_.SetGlobalBuffer( - (__gm__ T*)residual + row_offset * dim_length_align, - block_rows_ * dim_length_align); - out_gm_.SetGlobalBuffer((__gm__ T*)out + row_offset * dim_length_align, - block_rows_ * dim_length_align); - residual_out_gm_.SetGlobalBuffer( - (__gm__ T*)residual_out + row_offset * dim_length_align, - block_rows_ * dim_length_align); - weight_gm_.SetGlobalBuffer((__gm__ float*)weight, dim_length_align); - - int32_t dim_len_align = static_cast(dim_length_align_); + x1Gm.SetGlobalBuffer((__gm__ T*)x1 + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + x2Gm.SetGlobalBuffer((__gm__ T*)x2 + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + yGm.SetGlobalBuffer((__gm__ T*)y + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + xOutGm.SetGlobalBuffer((__gm__ T*)x_out + rowOffset * dimLengthAlign, + this->blockRows * dimLengthAlign); + weightGm.SetGlobalBuffer((__gm__ float*)weight, dimLengthAlign); + + int32_t dimLenAlign = static_cast(this->dimLengthAlign); // I/O queues (double-buffered). 
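The queues initialized below follow the standard AscendC double-buffered pipeline: one CopyIn/Compute/CopyOut round per row, with `BUFFER_NUM = 2` letting the DMA of row r+1 overlap the vector math of row r. Schematically (comment-only sketch; names as in the surrounding code):

    // CopyIn : buf = inQueue.AllocTensor<T>(); DataCopyPad(buf, gm, ...);
    //          inQueue.EnQue(buf);
    // Compute: buf = inQueue.DeQue<T>();  ...vector ops...
    //          inQueue.FreeTensor(buf);   // recycle the input slot
    // CopyOut: out = outQueue.DeQue<T>(); DataCopyPad(gm, out, ...);
    //          outQueue.FreeTensor(out);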
- pipe_.InitBuffer(in_queue_input_, kBufferNum, - dim_len_align * static_cast(sizeof(T))); - pipe_.InitBuffer(in_queue_residual_, kBufferNum, - dim_len_align * static_cast(sizeof(T))); - pipe_.InitBuffer(out_queue_out_, kBufferNum, - dim_len_align * static_cast(sizeof(T))); - pipe_.InitBuffer(out_queue_residual_out_, kBufferNum, - dim_len_align * static_cast(sizeof(T))); + pipe.InitBuffer(inQueueX1, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(inQueueX2, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(outQueueY, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); + pipe.InitBuffer(outQueueXOut, BUFFER_NUM, + dimLenAlign * static_cast(sizeof(T))); // Weight buffer (fp32, loaded once, reused for all rows). - pipe_.InitBuffer(weight_buf_, - dim_len_align * static_cast(sizeof(float))); + pipe.InitBuffer(weightBuf, + dimLenAlign * static_cast(sizeof(float))); - // FP16/BF16 path needs extra fp32 compute buffers. - // `fp32_buf1_`: holds `x_out` in fp32 (reused from `x1_fp32` after Add). - // `fp32_buf2_`: holds `x2_fp32` initially, then `x_out^2`, then final - // result. + // FP16 path needs extra fp32 compute buffers. + // buf1: holds x_out in fp32 (reused from x1_fp32 after Add). + // buf2: holds x2_fp32 initially, then x_out^2, then final result. if constexpr (sizeof(T) == 2) { - pipe_.InitBuffer(fp32_buf1_, - dim_len_align * static_cast(sizeof(float))); - pipe_.InitBuffer(fp32_buf2_, - dim_len_align * static_cast(sizeof(float))); + pipe.InitBuffer(fp32Buf1, + dimLenAlign * static_cast(sizeof(float))); + pipe.InitBuffer(fp32Buf2, + dimLenAlign * static_cast(sizeof(float))); } - // `ReduceSum` temporary buffer (size per API formula). - constexpr int32_t kElemsPerRepeat = 256 / sizeof(float); - constexpr int32_t kElemsPerBlock = 32 / sizeof(float); - int32_t first_max_repeat = - (dim_len_align + kElemsPerRepeat - 1) / kElemsPerRepeat; - int32_t reduce_tmp_size = - ((first_max_repeat + kElemsPerBlock - 1) / kElemsPerBlock) * - kElemsPerBlock; - pipe_.InitBuffer(reduce_tmp_buf_, - reduce_tmp_size * static_cast(sizeof(float))); + // ReduceSum temporary buffer (size per API formula). + constexpr int32_t ELEMS_PER_REPEAT = 256 / sizeof(float); + constexpr int32_t ELEMS_PER_BLOCK = 32 / sizeof(float); + int32_t firstMaxRepeat = + (dimLenAlign + ELEMS_PER_REPEAT - 1) / ELEMS_PER_REPEAT; + int32_t reduceTmpSize = + ((firstMaxRepeat + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK) * + ELEMS_PER_BLOCK; + pipe.InitBuffer(reduceTmpBuf, + reduceTmpSize * static_cast(sizeof(float))); // Scalar buffer for reduction result (8 floats = 32 bytes). - pipe_.InitBuffer(sum_buf_, 32); + pipe.InitBuffer(sumBuf, 32); - // Load weight (fp32) from GM into `weight_buf_`. - AscendC::LocalTensor w_local = weight_buf_.Get(); - AscendC::DataCopyExtParams w_params{ - 1, static_cast(dim_len_align * sizeof(float)), 0, 0, 0}; - AscendC::DataCopyPadExtParams w_pad{false, 0, 0, 0.0f}; - AscendC::DataCopyPad(w_local, weight_gm_, w_params, w_pad); + // Load weight (fp32) from GM into `weightBuf`. + AscendC::LocalTensor wLocal = weightBuf.Get(); + AscendC::DataCopyExtParams wParams{ + 1, static_cast(dimLenAlign * sizeof(float)), 0, 0, 0}; + AscendC::DataCopyPadExtParams wPad{false, 0, 0, 0.0f}; + AscendC::DataCopyPad(wLocal, weightGm, wParams, wPad); // Ensure weight DMA completes before compute. 
AscendC::PipeBarrier(); } __aicore__ inline void Process() { - for (int64_t row = 0; row < block_rows_; ++row) { + for (int64_t row = 0; row < this->blockRows; ++row) { CopyIn(row); Compute(row); CopyOut(row); @@ -105,175 +101,149 @@ class KernelAddRmsNorm { private: __aicore__ inline void CopyIn(int64_t row) { - AscendC::LocalTensor input_local = in_queue_input_.AllocTensor(); - AscendC::LocalTensor residual_local = - in_queue_residual_.AllocTensor(); + AscendC::LocalTensor x1Local = inQueueX1.AllocTensor(); + AscendC::LocalTensor x2Local = inQueueX2.AllocTensor(); AscendC::DataCopyExtParams params{ - 1, static_cast(dim_length_align_ * sizeof(T)), 0, 0, 0}; + 1, static_cast(this->dimLengthAlign * sizeof(T)), 0, 0, 0}; AscendC::DataCopyPadExtParams pad{false, 0, 0, static_cast(0)}; - AscendC::DataCopyPad(input_local, input_gm_[row * dim_length_align_], - params, pad); - AscendC::DataCopyPad(residual_local, residual_gm_[row * dim_length_align_], - params, pad); - in_queue_input_.EnQue(input_local); - in_queue_residual_.EnQue(residual_local); + AscendC::DataCopyPad(x1Local, x1Gm[row * this->dimLengthAlign], params, + pad); + AscendC::DataCopyPad(x2Local, x2Gm[row * this->dimLengthAlign], params, + pad); + inQueueX1.EnQue(x1Local); + inQueueX2.EnQue(x2Local); } __aicore__ inline void Compute(int64_t row) { - AscendC::LocalTensor input_local = in_queue_input_.DeQue(); - AscendC::LocalTensor residual_local = in_queue_residual_.DeQue(); - AscendC::LocalTensor out_local = out_queue_out_.AllocTensor(); - AscendC::LocalTensor residual_out_local = - out_queue_residual_out_.AllocTensor(); + AscendC::LocalTensor x1Local = inQueueX1.DeQue(); + AscendC::LocalTensor x2Local = inQueueX2.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::LocalTensor xOutLocal = outQueueXOut.AllocTensor(); - AscendC::LocalTensor w_local = weight_buf_.Get(); - AscendC::LocalTensor r_tmp = reduce_tmp_buf_.Get(); - AscendC::LocalTensor s_local = sum_buf_.Get(); + AscendC::LocalTensor wLocal = weightBuf.Get(); + AscendC::LocalTensor rTmp = reduceTmpBuf.Get(); + AscendC::LocalTensor sLocal = sumBuf.Get(); - int32_t dim_len = static_cast(dim_length_); - int32_t dim_len_align = static_cast(dim_length_align_); + int32_t dimLen = static_cast(this->dimLength); + int32_t dimLenAlign = static_cast(this->dimLengthAlign); if constexpr (sizeof(T) == 4) { // ---- FP32 path: compute directly. ---- // Step 1: x_out = x1 + x2. - AscendC::Add(residual_out_local, input_local, residual_local, - dim_len_align); + AscendC::Add(xOutLocal, x1Local, x2Local, dimLenAlign); - // Step 2: x_out^2 into out_local (reuse output buffer temporarily). - AscendC::Mul(out_local, residual_out_local, residual_out_local, - dim_len_align); + // Step 2: x_out^2 into yLocal (reuse output buffer temporarily). + AscendC::Mul(yLocal, xOutLocal, xOutLocal, dimLenAlign); - // Step 3: ReduceSum(x_out^2) -> s_local[0]. - // `ReduceSum` may modify `out_local`, but we overwrite it below. - AscendC::ReduceSum(s_local, out_local, r_tmp, dim_len_align); + // Step 3: ReduceSum(x_out^2) -> sLocal[0]. + // ReduceSum may modify yLocal, but we overwrite it below. + AscendC::ReduceSum(sLocal, yLocal, rTmp, dimLenAlign); // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps). 
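A numeric instance of the normalization arithmetic in steps 2-7 (host-side check, with `eps = 0` and unit weight for round numbers):

    #include <cassert>
    #include <cmath>

    int main() {
      float x[4] = {3.0f, 4.0f, 0.0f, 0.0f};   // x_out, dim = 4
      float sum = 0.0f;
      for (float v : x) sum += v * v;          // 25
      float mean = sum / 4.0f;                 // 6.25
      float scale = 1.0f / std::sqrt(mean);    // 1 / 2.5 = 0.4
      assert(scale == 0.4f);
      // y[i] = x[i] * scale * weight[i]  ->  {1.2, 1.6, 0, 0} with unit weight.
    }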
- float sum_val = s_local.GetValue(0); - float mean_val = sum_val / static_cast(dim_len) + eps_; - s_local.SetValue(0, mean_val); - AscendC::Sqrt(s_local, s_local, 8); - float scale = 1.0f / s_local.GetValue(0); + float sumVal = sLocal.GetValue(0); + float meanVal = sumVal / static_cast(dimLen) + this->eps; + sLocal.SetValue(0, meanVal); + AscendC::Sqrt(sLocal, sLocal, 8); + float scale = 1.0f / sLocal.GetValue(0); // Step 6: y = x_out * scale. - AscendC::Muls(out_local, residual_out_local, scale, dim_len_align); + AscendC::Muls(yLocal, xOutLocal, scale, dimLenAlign); // Step 7: y = y * weight. - AscendC::Mul(out_local, out_local, w_local, dim_len_align); + AscendC::Mul(yLocal, yLocal, wLocal, dimLenAlign); } else { - // ---- FP16/BF16 path: cast → fp32 compute → cast back. ---- - AscendC::LocalTensor b1 = fp32_buf1_.Get(); - AscendC::LocalTensor b2 = fp32_buf2_.Get(); + // ---- FP16 path: cast → fp32 compute → cast back. ---- + AscendC::LocalTensor b1 = fp32Buf1.Get(); + AscendC::LocalTensor b2 = fp32Buf2.Get(); - // Cast inputs fp16/bf16 → fp32. - AscendC::Cast(b1, input_local, AscendC::RoundMode::CAST_NONE, - dim_len_align); - AscendC::Cast(b2, residual_local, AscendC::RoundMode::CAST_NONE, - dim_len_align); + // Cast inputs fp16 → fp32. + AscendC::Cast(b1, x1Local, AscendC::RoundMode::CAST_NONE, dimLenAlign); + AscendC::Cast(b2, x2Local, AscendC::RoundMode::CAST_NONE, dimLenAlign); // Step 1: x_out = x1 + x2 (fp32), stored in b1. - AscendC::Add(b1, b1, b2, dim_len_align); + AscendC::Add(b1, b1, b2, dimLenAlign); - // Cast `x_out` fp32 → fp16/bf16 for the residual output. - AscendC::Cast(residual_out_local, b1, AscendC::RoundMode::CAST_RINT, - dim_len_align); + // Cast x_out fp32 → fp16 for the x_out output. + AscendC::Cast(xOutLocal, b1, AscendC::RoundMode::CAST_ROUND, dimLenAlign); // Step 2: x_out^2 in fp32, stored in b2. - AscendC::Mul(b2, b1, b1, dim_len_align); + AscendC::Mul(b2, b1, b1, dimLenAlign); - // Step 3: ReduceSum(x_out^2) -> s_local[0]. - AscendC::ReduceSum(s_local, b2, r_tmp, dim_len_align); + // Step 3: ReduceSum(x_out^2) -> sLocal[0]. + AscendC::ReduceSum(sLocal, b2, rTmp, dimLenAlign); // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps). - float sum_val = s_local.GetValue(0); - float mean_val = sum_val / static_cast(dim_len) + eps_; - s_local.SetValue(0, mean_val); - AscendC::Sqrt(s_local, s_local, 8); - float scale = 1.0f / s_local.GetValue(0); + float sumVal = sLocal.GetValue(0); + float meanVal = sumVal / static_cast(dimLen) + this->eps; + sLocal.SetValue(0, meanVal); + AscendC::Sqrt(sLocal, sLocal, 8); + float scale = 1.0f / sLocal.GetValue(0); // Step 6: y = x_out * scale (fp32), reuse b2. - AscendC::Muls(b2, b1, scale, dim_len_align); + AscendC::Muls(b2, b1, scale, dimLenAlign); // Step 7: y = y * weight (fp32). - AscendC::Mul(b2, b2, w_local, dim_len_align); + AscendC::Mul(b2, b2, wLocal, dimLenAlign); - AscendC::Cast(out_local, b2, AscendC::RoundMode::CAST_RINT, - dim_len_align); + // Cast result fp32 → fp16. 
+      AscendC::Cast(yLocal, b2, AscendC::RoundMode::CAST_ROUND, dimLenAlign);
     }
 
-    in_queue_input_.FreeTensor(input_local);
-    in_queue_residual_.FreeTensor(residual_local);
-    out_queue_out_.EnQue(out_local);
-    out_queue_residual_out_.EnQue(residual_out_local);
+    inQueueX1.FreeTensor(x1Local);
+    inQueueX2.FreeTensor(x2Local);
+    outQueueY.EnQue(yLocal);
+    outQueueXOut.EnQue(xOutLocal);
   }
 
   __aicore__ inline void CopyOut(int64_t row) {
-    AscendC::LocalTensor<T> out_local = out_queue_out_.DeQue<T>();
-    AscendC::LocalTensor<T> residual_out_local =
-        out_queue_residual_out_.DeQue<T>();
+    AscendC::LocalTensor<T> yLocal = outQueueY.DeQue<T>();
+    AscendC::LocalTensor<T> xOutLocal = outQueueXOut.DeQue<T>();
     AscendC::DataCopyExtParams params{
-        1, static_cast<uint32_t>(dim_length_align_ * sizeof(T)), 0, 0, 0};
-    AscendC::DataCopyPad(out_gm_[row * dim_length_align_], out_local, params);
-    AscendC::DataCopyPad(residual_out_gm_[row * dim_length_align_],
-                         residual_out_local, params);
-    out_queue_out_.FreeTensor(out_local);
-    out_queue_residual_out_.FreeTensor(residual_out_local);
+        1, static_cast<uint32_t>(this->dimLengthAlign * sizeof(T)), 0, 0, 0};
+    AscendC::DataCopyPad(yGm[row * this->dimLengthAlign], yLocal, params);
+    AscendC::DataCopyPad(xOutGm[row * this->dimLengthAlign], xOutLocal, params);
+    outQueueY.FreeTensor(yLocal);
+    outQueueXOut.FreeTensor(xOutLocal);
   }
 
  private:
-  AscendC::TPipe pipe_;
-  AscendC::TQue<AscendC::TPosition::VECIN, kBufferNum> in_queue_input_;
-  AscendC::TQue<AscendC::TPosition::VECIN, kBufferNum> in_queue_residual_;
-  AscendC::TQue<AscendC::TPosition::VECOUT, kBufferNum> out_queue_out_;
-  AscendC::TQue<AscendC::TPosition::VECOUT, kBufferNum> out_queue_residual_out_;
-
-  AscendC::TBuf<AscendC::TPosition::VECCALC> weight_buf_;
-  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32_buf1_;
-  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32_buf2_;
-  AscendC::TBuf<AscendC::TPosition::VECCALC> reduce_tmp_buf_;
-  AscendC::TBuf<AscendC::TPosition::VECCALC> sum_buf_;
-
-  AscendC::GlobalTensor<T> input_gm_, residual_gm_, out_gm_, residual_out_gm_;
-  AscendC::GlobalTensor<float> weight_gm_;
-
-  int64_t block_rows_;
-  int64_t dim_length_;
-  int64_t dim_length_align_;
-  float eps_;
+  AscendC::TPipe pipe;
+  AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX1;
+  AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX2;
+  AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueY;
+  AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueXOut;
+
+  AscendC::TBuf<AscendC::TPosition::VECCALC> weightBuf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32Buf1;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> fp32Buf2;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> reduceTmpBuf;
+  AscendC::TBuf<AscendC::TPosition::VECCALC> sumBuf;
+
+  AscendC::GlobalTensor<T> x1Gm, x2Gm, yGm, xOutGm;
+  AscendC::GlobalTensor<float> weightGm;
+
+  int64_t blockRows;
+  int64_t dimLength;
+  int64_t dimLengthAlign;
+  float eps;
 };
 
-// `dtype_code` is `static_cast<int64_t>(infini::ops::DataType)` forwarded
-// by the host launcher. fp16 and bf16 both have `sizeof == 2` but need
-// distinct numeric paths, so dispatch is on the `DataType` tag rather
-// than the byte size.
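A host-side illustration of why the byte size cannot disambiguate the two 16-bit types: bf16 keeps fp32's 8-bit exponent with a 7-bit mantissa (a crude conversion is just the top 16 bits of the fp32 pattern, truncating instead of rounding), while fp16 has a 5-bit exponent and 10 mantissa bits.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      float f = 3.14159f;
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof bits);
      uint16_t bf16 = static_cast<uint16_t>(bits >> 16);  // truncating bf16
      std::printf("fp32=0x%08x  bf16=0x%04x\n", bits, bf16);
    }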
-extern "C" __global__ __aicore__ void AddRmsNorm( - GM_ADDR input, GM_ADDR residual, GM_ADDR weight, int64_t total_rows, - int64_t dim_length, int64_t dim_length_align, int64_t former_num, - int64_t former_length, int64_t tail_length, float eps, int64_t dtype_code, - GM_ADDR out, GM_ADDR residual_out) { - switch (static_cast(dtype_code)) { - case infini::ops::DataType::kFloat16: { - KernelAddRmsNorm op; - op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, - former_num, former_length, tail_length, eps, out, residual_out); - op.Process(); - break; - } - case infini::ops::DataType::kBFloat16: { - KernelAddRmsNorm op; - op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, - former_num, former_length, tail_length, eps, out, residual_out); - op.Process(); - break; - } - case infini::ops::DataType::kFloat32: - default: { - KernelAddRmsNorm op; - op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, - former_num, former_length, tail_length, eps, out, residual_out); - op.Process(); - break; - } +extern "C" __global__ __aicore__ void add_rms_norm( + GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, GM_ADDR x_out, + int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign, + int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps, + int64_t dtypeSize) { + if (dtypeSize == 2) { + KernelAddRmsNorm op; + op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign, + formerNum, formerLength, tailLength, eps); + op.Process(); + } else { + KernelAddRmsNorm op; + op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign, + formerNum, formerLength, tailLength, eps); + op.Process(); } } diff --git a/src/ascend/custom/build.sh b/src/ascend/custom/build.sh index 83740881..258a88e4 100755 --- a/src/ascend/custom/build.sh +++ b/src/ascend/custom/build.sh @@ -1,45 +1,30 @@ #!/bin/bash -# Build custom `AscendC` kernels into `libno_workspace_kernel.a` (+ the -# standalone `libascend_kernel.so`). -# -# Intermediate artefacts default to `/build/build_ascend_custom/` -# so the source tree under `src/` stays free of build output. Override -# via `BUILD_DIR= bash build.sh …` if needed. +# Build custom `AscendC` kernels into `libascend_kernel.so`. set -e SOC_VERSION="${1:-Ascend910_9382}" -# Use the same `cmake` the caller resolved (default: first `cmake` on -# PATH). The outer `src/CMakeLists.txt` forwards `${CMAKE_COMMAND}` -# via `CMAKE_EXE` so the child build doesn't accidentally pick up the -# PyPI `cmake` shim whose Python package only exists in `pip`'s -# build-isolation overlay. -CMAKE_EXE="${CMAKE_EXE:-cmake}" - # Detect CANN toolkit path. _CANN_TOOLKIT_INSTALL_PATH=$(grep "Toolkit_InstallPath" /etc/Ascend/ascend_cann_install.info | awk -F'=' '{print $2}') source "${_CANN_TOOLKIT_INSTALL_PATH}/set_env.sh" echo "CANN: ${ASCEND_TOOLKIT_HOME}" ASCEND_INCLUDE_DIR=${ASCEND_TOOLKIT_HOME}/$(arch)-linux/include +CURRENT_DIR=$(pwd) +OUTPUT_DIR=${CURRENT_DIR}/output +mkdir -p "${OUTPUT_DIR}" -# Resolve build directory. `