From e99d1c4f5895a6fdafcc93975202cbd19caf8e10 Mon Sep 17 00:00:00 2001
From: gehaoxuan <2310314646@qq.com>
Date: Fri, 17 Oct 2025 22:31:22 +0800
Subject: [PATCH 1/2] add example/README.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: gehaoxuan<2310314646@qq.com>


# message auto-generated for no-merge-commit merge:
!49 merge readme into master

add example/README.md

Created-by: gehaoxuan
Commit-by: gehaoxuan
Merged-by: ascend-robot
Description: ## 类型
- [ ] Bug修复
- [ ] 新功能
- [ ] 代码风格更新
- [ ] 代码重构
- [ ] 构建过程或辅助工具的变动
- [x] 文档内容更新

## 描述
请提供此 Pull Request 的详细说明。

## 如何测试
描述测试这个变更的步骤，包括哪些文件需要被修改。

## Checklist:
- [ ] 我的代码遵循这个项目的代码风格
- [ ] 我已经自己测试过我的代码
- [ ] 我已经更新了相应的文档
- [ ] 我已经根据需要更新了对应的变更日志
- [ ] 我已经在标题中正确使用了类型标签（例如：`feature:`, `fix:`）

## 其他信息
在这里可以添加任何与这个 Pull Request 相关的其他说明。


See merge request: cann/asnumpy!49
---
 examples/README.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 examples/README.md

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..36d60ac
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,30 @@
+# AsNumpy项目函数样例说明  
+样例调用本项目的函数，和Numpy的同功能函数用numpy.allclose进行结果对比，并输出运行时间，以此来展现AsNumpy的准确性和性能  
+  
+## 已实现样例
+| 文件名 | 功能描述 |
+| :--- | :-- |
+| [01_add](01_add.py) |  用asnumpy.add和numpy.add分别对输入数组 x1 和 x2 执行逐元素加法运算并对比结果，并计算它们的运行时间  |
+| [02_exp2](02_exp2.py) |  用asnumpy.exp2和numpy.exp2分别对输入数组 x 的每个元素计算 2 的幂并对比结果，并计算它们的运行时间  |
+| [03_multiply](03_multiply.py) |  用asnumpy.multiply和numpy.multiply分别对输入数组 x1 和 x2 执行逐元素乘法运算并对比结果，并计算它们的运行时间  |
+| [04_all](04_all.py) |  用asnumpy.all和numpy.all分别对输入数组 x 执行逻辑与归约操作，判断所有元素是否均为 True，并对比结果，并计算它们的运行时间  |
+| [05_divide](05_divide.py) |  用asnumpy.divide和numpy.divide分别对输入数组 x1 和 x2 执行逐元素除法并对比结果，并计算它们的运行时间  |  
+  
+## 下一步预期实现样例  
+| 函数名 | 预期功能描述 |
+| :--- | :-- |
+| sinh |  用asnumpy.sinh和numpy.sinh分别对输入数组 x 逐元素计算双曲正弦并对比结果，并计算它们的运行时间  |
+| real |  用asnumpy.real和numpy.real分别逐元素输出 x 的实数部分并对比结果，并计算它们的运行时间  |
+| square |  用asnumpy.square和numpy.square分别逐元素计算 x 的平方并对比结果，并计算它们的运行时间  |
+| sinc |  用asnumpy.sinc和numpy.sinc分别对输入数组 x 逐元素计算 sinc 函数并对比结果，并计算它们的运行时间  |
+| gcd |  用asnumpy.gcd和numpy.gcd分别对输入数组 x1 和 x2 逐元素计算最大公约数并对比结果，并计算它们的运行时间  |
+| around |  用asnumpy.around和numpy.around分别逐元素将 x 四舍五入到指定小数位数并对比结果，并计算它们的运行时间  |
+| cumsum |  用asnumpy.cumsum和numpy.cumsum分别计算 x 沿给定轴的元素累积和并对比结果，并计算它们的运行时间  |
+| arcsin |  用asnumpy.arcsin和numpy.arcsin分别对 x 进行逐元素的反正弦计算并对比结果，并计算它们的运行时间  |
+| reciprocal |  用asnumpy.reciprocal和numpy.reciprocal分别对 x 计算每个元素的倒数并对比结果，并计算它们的运行时间  |
+| binomial |  用asnumpy.binomial从二项分布中抽取足够多随机样本并用卡方分布测试是否符合分布，并计算运行时间  |  
+  
+## 更新说明  
+| 时间 | 更新事项 |
+| :--- | :-- |
+| 2025/10/14 |  新增AsNumpy项目函数样例说明  |
\ No newline at end of file

From 7717cd34d7c7cb8269b37b324784addbc51f5bfc Mon Sep 17 00:00:00 2001
From: yuanyuan14 <3263715730@qq.com>
Date: Thu, 23 Oct 2025 10:20:33 +0800
Subject: [PATCH 2/2] feat:add relu and gelu API including test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: yuanyuan14<3263715730@qq.com>


# message auto-generated for no-merge-commit merge:
!53 merge dev into master

feat:add relu and gelu API including test

Created-by: yuanyuan14
Commit-by: yuanyuan14
Merged-by: turing_project1
Description: ## 类型
- [ ] Bug修复
- [x] 新功能
- [ ] 代码风格更新
- [ ] 代码重构
- [ ] 构建过程或辅助工具的变动
- [ ] 文档内容更新

## 描述
请提供此 Pull Request 的详细说明。

## 如何测试
描述测试这个变更的步骤，包括哪些文件需要被修改。

## Checklist:
- [x] 我的代码遵循这个项目的代码风格
- [x] 我已经自己测试过我的代码
- [x] 我已经更新了相应的文档
- [x] 我已经根据需要更新了对应的变更日志
- [x] 我已经在标题中正确使用了类型标签（例如：`feature:`, `fix:`）

## 其他信息
在这里可以添加任何与这个 Pull Request 相关的其他说明。


See merge request: cann/asnumpy!53
---
 asnumpy/lib/__init__.py                |  2 +
 include/asnumpy/math/miscellaneous.hpp |  3 ++
 python/bind_math.cpp                   |  2 +
 src/math/miscellaneous.cpp             | 68 ++++++++++++++++++++++++++
 test/test_math/test_functions.py       | 12 +++++
 5 files changed, 87 insertions(+)

diff --git a/asnumpy/lib/__init__.py b/asnumpy/lib/__init__.py
index 1f08be1..85eaa92 100644
--- a/asnumpy/lib/__init__.py
+++ b/asnumpy/lib/__init__.py
@@ -121,6 +121,8 @@
     "minimum",
     "fmax",
     "fmin",
+    "relu",
+    "gelu",
     "pareto",
     "rayleigh",
     "normal",
diff --git a/include/asnumpy/math/miscellaneous.hpp b/include/asnumpy/math/miscellaneous.hpp
index f30add8..db979d7 100644
--- a/include/asnumpy/math/miscellaneous.hpp
+++ b/include/asnumpy/math/miscellaneous.hpp
@@ -52,4 +52,7 @@ NPUArray Fmax(const NPUArray& x1, const NPUArray& x2, std::optional<py::dtype> d
 
 NPUArray Fmin(const NPUArray& x1, const NPUArray& x2, std::optional<py::dtype> dtype = std::nullopt);
 
+NPUArray Relu(const NPUArray& x, std::optional<py::dtype> dtype = std::nullopt);
+
+NPUArray Gelu(const NPUArray& x, std::optional<py::dtype> dtype = std::nullopt);
 }
\ No newline at end of file
diff --git a/python/bind_math.cpp b/python/bind_math.cpp
index c7fcfac..70cb8c0 100644
--- a/python/bind_math.cpp
+++ b/python/bind_math.cpp
@@ -98,6 +98,8 @@ void bind_miscellaneous(py::module_& math){
     math.def("minimum", &Minimum, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none());
     math.def("fmax", &Fmax, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none());
     math.def("fmin", &Fmin, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none());
+    math.def("relu", &Relu, py::arg("x"), py::arg("dtype") = py::none());
+    math.def("gelu", &Gelu, py::arg("x"), py::arg("dtype") = py::none());
 }
 
 void bind_arithmetic_operations(py::module_& math) {
diff --git a/src/math/miscellaneous.cpp b/src/math/miscellaneous.cpp
index 88d3c6b..4f5fcfa 100644
--- a/src/math/miscellaneous.cpp
+++ b/src/math/miscellaneous.cpp
@@ -25,6 +25,8 @@
 #include <aclnnop/aclnn_convolution.h>
 #include <aclnnop/aclnn_clamp.h>
 #include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_relu.h>
+#include <aclnnop/aclnn_gelu.h> 
 #include <aclnnop/aclnn_nan_to_num.h>
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_sign.h>
@@ -750,4 +752,70 @@ NPUArray Fmin(const NPUArray& x1, const NPUArray& x2, std::optional<py::dtype> d
     return out;
 }
 
+
+/**
+ * @brief Compute element-wise Rectified Linear Unit (ReLU), max(0, x), like numpy.maximum(x, 0).
+ *
+ * The temporary workspace is released before any status check, so it is not leaked on failure.
+ *
+ * @param x Input array.
+ * @param dtype Optional target numpy dtype for the output array. If not provided, uses input dtype.
+ * @return NPUArray Array with element-wise ReLU values.
+ * @throws std::runtime_error If ACL operation or memory allocation fails.
+ */
+NPUArray Relu(const NPUArray& x, std::optional<py::dtype> dtype) {
+    py::dtype out_dtype = dtype.has_value() ? dtype.value() : x.dtype;
+    auto out = NPUArray(x.shape, out_dtype);
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor = nullptr;
+    auto error = aclnnReluGetWorkspaceSize(x.tensorPtr, out.tensorPtr, &workspaceSize, &executor);
+    CheckGetWorkspaceSizeAclnnStatus(error);
+    void* workspaceAddr = nullptr;
+    if(workspaceSize > 0) {
+        error = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CheckMallocAclnnStatus(error);
+    }
+    error = aclnnRelu(workspaceAddr, workspaceSize, executor, nullptr);
+    // Synchronize, then free, then check: the workspace must be released even if a check throws.
+    auto syncError = aclrtSynchronizeDevice();
+    if (workspaceAddr) aclrtFree(workspaceAddr);
+    CheckAclnnStatus(error, "aclnnRelu error");
+    CheckSynchronizeDeviceAclnnStatus(syncError);
+    return out;
+}
+
+
+/**
+ * @brief Compute element-wise Gaussian Error Linear Unit (GELU).
+ *
+ * GELU(x) = x * Φ(x), where Φ(x) is the cumulative distribution function of the standard
+ * normal distribution. The computation is delegated to the aclnnGelu operator.
+ *
+ * The temporary workspace is released before any status check, so it is not leaked on failure.
+ *
+ * @param x Input array.
+ * @param dtype Optional target numpy dtype for the output array. If not provided, uses input dtype.
+ * @return NPUArray Array with element-wise GELU values.
+ * @throws std::runtime_error If ACL operation or memory allocation fails.
+ */
+NPUArray Gelu(const NPUArray& x, std::optional<py::dtype> dtype) {
+    py::dtype out_dtype = dtype.has_value() ? dtype.value() : x.dtype;
+    auto out = NPUArray(x.shape, out_dtype);
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor = nullptr;
+    auto error = aclnnGeluGetWorkspaceSize(x.tensorPtr, out.tensorPtr, &workspaceSize, &executor);
+    CheckGetWorkspaceSizeAclnnStatus(error);
+    void* workspaceAddr = nullptr;
+    if(workspaceSize > 0) {
+        error = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CheckMallocAclnnStatus(error);
+    }
+    error = aclnnGelu(workspaceAddr, workspaceSize, executor, nullptr);
+    // Synchronize, then free, then check: the workspace must be released even if a check throws.
+    auto syncError = aclrtSynchronizeDevice();
+    if (workspaceAddr) aclrtFree(workspaceAddr);
+    CheckAclnnStatus(error, "aclnnGelu error");
+    CheckSynchronizeDeviceAclnnStatus(syncError);
+    return out;
+}
 }
\ No newline at end of file
diff --git a/test/test_math/test_functions.py b/test/test_math/test_functions.py
index c231ede..f08966b 100644
--- a/test/test_math/test_functions.py
+++ b/test/test_math/test_functions.py
@@ -42,6 +42,18 @@
         ap.fabs,
         UNARY_TEST_CASES
     ),
+        (
+        "relu",
+        lambda x: np.maximum(x, 0),  # NumPy 没有内置 relu，用 maximum 模拟
+        ap.relu,
+        UNARY_TEST_CASES
+    ),
+        (
+        "gelu",
+        lambda x: x * 0.5 * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3)))),  # GELU 近似公式
+        ap.gelu,
+        UNARY_TEST_CASES
+    ),
 ]
 
 # 双操作数函数注册表 (函数名, numpy函数, asnumpy函数, 测试用例列表)