diff --git a/asnumpy/lib/__init__.py b/asnumpy/lib/__init__.py
index 1f08be1..85eaa92 100644
--- a/asnumpy/lib/__init__.py
+++ b/asnumpy/lib/__init__.py
@@ -121,6 +121,8 @@
     "minimum",
     "fmax",
     "fmin",
+    "relu",
+    "gelu",
     "pareto",
     "rayleigh",
     "normal",
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..36d60ac
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,30 @@
+# AsNumpy项目函数样例说明  
+每个样例调用本项目的函数，使用 numpy.allclose 将其结果与 NumPy 同功能函数的结果进行对比，并输出运行时间，以展示 AsNumpy 的准确性和性能  
+  
+## 已实现样例
+| 文件名 | 功能描述 |
+| :--- | :-- |
+| [01_add](01_add.py) |  用asnumpy.add和numpy.add分别对输入数组 x1 和 x2 执行逐元素加法运算并对比结果，并计算它们的运行时间  |
+| [02_exp2](02_exp2.py) |  用asnumpy.exp2和numpy.exp2分别对输入数组 x 的每个元素计算 2 的幂并对比结果，并计算它们的运行时间  |
+| [03_multiply](03_multiply.py) |  用asnumpy.multiply和numpy.multiply分别对输入数组 x1 和 x2 执行逐元素乘法运算并对比结果，并计算它们的运行时间  |
+| [04_all](04_all.py) |  用asnumpy.all和numpy.all分别对输入数组 x 执行逻辑与归约操作，判断所有元素是否均为 True，并对比结果，并计算它们的运行时间  |
+| [05_divide](05_divide.py) |  用asnumpy.divide和numpy.divide分别对输入数组 x1 和 x2 执行逐元素除法并对比结果，并计算它们的运行时间  |  
+  
+## 下一步预期实现样例  
+| 函数名 | 预期功能描述 |
+| :--- | :-- |
+| sinh |  用asnumpy.sinh和numpy.sinh分别对输入数组 x 逐元素计算双曲正弦并对比结果，并计算它们的运行时间  |
+| real |  用asnumpy.real和numpy.real分别逐元素输出 x 的实数部分并对比结果，并计算它们的运行时间  |
+| square |  用asnumpy.square和numpy.square分别逐元素计算 x 的平方并对比结果，并计算它们的运行时间  |
+| sinc |  用asnumpy.sinc和numpy.sinc分别对输入数组 x 逐元素计算 sinc 函数并对比结果，并计算它们的运行时间  |
+| gcd |  用asnumpy.gcd和numpy.gcd分别对输入数组 x1 和 x2 逐元素计算最大公约数并对比结果，并计算它们的运行时间  |
+| around |  用asnumpy.around和numpy.around分别逐元素将 x 四舍五入到指定小数位数并对比结果，并计算它们的运行时间  |
+| cumsum |  用asnumpy.cumsum和numpy.cumsum分别计算 x 沿给定轴的元素累积和并对比结果，并计算它们的运行时间  |
+| arcsin |  用asnumpy.arcsin和numpy.arcsin分别对 x 进行逐元素的反正弦计算并对比结果，并计算它们的运行时间  |
+| reciprocal |  用asnumpy.reciprocal和numpy.reciprocal分别对 x 计算每个元素的倒数并对比结果，并计算它们的运行时间  |
+| binomial |  用asnumpy.binomial从二项分布中抽取足够多随机样本并用卡方分布测试是否符合分布，并计算运行时间  |  
+  
+## 更新说明  
+| 时间 | 更新事项 |
+| :--- | :-- |
+| 2025/10/14 |  新增AsNumpy项目函数样例说明  |
\ No newline at end of file
diff --git a/include/asnumpy/math/miscellaneous.hpp b/include/asnumpy/math/miscellaneous.hpp
index f30add8..db979d7 100644
--- a/include/asnumpy/math/miscellaneous.hpp
+++ b/include/asnumpy/math/miscellaneous.hpp
@@ -52,4 +52,7 @@ NPUArray Fmax(const NPUArray& x1, const NPUArray& x2, std::optional<py::dtype> d
 
 NPUArray Fmin(const NPUArray& x1, const NPUArray& x2, std::optional<py::dtype> dtype = std::nullopt);
 
+NPUArray Relu(const NPUArray& x, std::optional<py::dtype> dtype = std::nullopt);  // Element-wise ReLU; the implementation uses x's dtype for the result when `dtype` is omitted.
+
+NPUArray Gelu(const NPUArray& x, std::optional<py::dtype> dtype = std::nullopt);  // Element-wise GELU; the implementation uses x's dtype for the result when `dtype` is omitted.
 }
\ No newline at end of file
diff --git a/python/bind_math.cpp b/python/bind_math.cpp
index c7fcfac..70cb8c0 100644
--- a/python/bind_math.cpp
+++ b/python/bind_math.cpp
@@ -98,6 +98,8 @@ void bind_miscellaneous(py::module_& math){
     math.def("minimum", &Minimum, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none());
     math.def("fmax", &Fmax, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none());
     math.def("fmin", &Fmin, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none());
+    math.def("relu", &Relu, py::arg("x"), py::arg("dtype") = py::none());
+    math.def("gelu", &Gelu, py::arg("x"), py::arg("dtype") = py::none());
 }
 
 void bind_arithmetic_operations(py::module_& math) {
diff --git a/src/math/miscellaneous.cpp b/src/math/miscellaneous.cpp
index 88d3c6b..4f5fcfa 100644
--- a/src/math/miscellaneous.cpp
+++ b/src/math/miscellaneous.cpp
@@ -25,6 +25,8 @@
 #include <aclnnop/aclnn_convolution.h>
 #include <aclnnop/aclnn_clamp.h>
 #include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_relu.h>
+#include <aclnnop/aclnn_gelu.h> 
 #include <aclnnop/aclnn_nan_to_num.h>
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_sign.h>
@@ -750,4 +752,70 @@ NPUArray Fmin(const NPUArray& x1, const NPUArray& x2, std::optional<py::dtype> d
     return out;
 }
 
+
+/**
+ * @brief Compute element-wise Rectified Linear Unit (ReLU) via the aclnnRelu operator.
+ *
+ * Applies ReLU element-wise, max(0, x) (cf. numpy.maximum(x, 0)), into a newly allocated
+ * array of x's shape, then waits on aclrtSynchronizeDevice() before returning.
+ * NOTE(review): the workspace is freed only on the success path; it leaks if a Check* helper throws — confirm.
+ * @param x Input array; passed to the operator through x.tensorPtr.
+ * @param dtype Optional target numpy dtype for the output array. If not provided, uses input dtype.
+ * @return NPUArray Array with element-wise ReLU values.
+ * @throws std::runtime_error If ACL operation or memory allocation fails (presumably raised by the Check* helpers).
+ */
+ NPUArray Relu(const NPUArray& x, std::optional<py::dtype> dtype) {
+    py::dtype out_dtype = dtype.has_value() ? dtype.value() : x.dtype;  // explicit dtype wins, else inherit from x
+    auto out = NPUArray(x.shape, out_dtype);
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor = nullptr;
+    auto error = aclnnReluGetWorkspaceSize(x.tensorPtr, out.tensorPtr, &workspaceSize, &executor);  // phase 1: fills workspaceSize and executor
+    CheckGetWorkspaceSizeAclnnStatus(error);
+    void* workspaceAddr = nullptr;
+    if(workspaceSize > 0) {  // allocate scratch memory only when the operator asks for some
+        error = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CheckMallocAclnnStatus(error);
+    }
+    error = aclnnRelu(workspaceAddr, workspaceSize, executor, nullptr);  // phase 2: run; trailing nullptr is presumably the stream — TODO confirm
+    CheckAclnnStatus(error, "aclnnRelu error");
+    error = aclrtSynchronizeDevice();  // wait before releasing the workspace and returning `out`
+    CheckSynchronizeDeviceAclnnStatus(error);
+    if (workspaceAddr) aclrtFree(workspaceAddr);  // nothing to free when no workspace was allocated
+    return out;
+}
+
+
+/**
+ * @brief Compute element-wise Gaussian Error Linear Unit (GELU) via the aclnnGelu operator.
+ *
+ * Applies GELU activation function element-wise: GELU(x) = x * Φ(x)
+ * where Φ(x) is the cumulative distribution function of the standard normal distribution.
+ * NOTE(review): whether aclnnGelu evaluates this exact form or the tanh approximation is not
+ * visible here; the unit test in this change compares against the tanh approximation — confirm.
+ * The result is written to a newly allocated array of x's shape; waits on aclrtSynchronizeDevice() before returning.
+ * NOTE(review): the workspace is freed only on the success path; it leaks if a Check* helper throws — confirm.
+ * @param x Input array; passed to the operator through x.tensorPtr.
+ * @param dtype Optional target numpy dtype for the output array. If not provided, uses input dtype.
+ * @return NPUArray Array with element-wise GELU values.
+ * @throws std::runtime_error If ACL operation or memory allocation fails (presumably raised by the Check* helpers).
+ */
+ NPUArray Gelu(const NPUArray& x, std::optional<py::dtype> dtype) {
+    py::dtype out_dtype = dtype.has_value() ? dtype.value() : x.dtype;  // explicit dtype wins, else inherit from x
+    auto out = NPUArray(x.shape, out_dtype);
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor = nullptr;
+    auto error = aclnnGeluGetWorkspaceSize(x.tensorPtr, out.tensorPtr, &workspaceSize, &executor);  // phase 1: fills workspaceSize and executor
+    CheckGetWorkspaceSizeAclnnStatus(error);
+    void* workspaceAddr = nullptr;
+    if(workspaceSize > 0) {  // allocate scratch memory only when the operator asks for some
+        error = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CheckMallocAclnnStatus(error);
+    }
+    error = aclnnGelu(workspaceAddr, workspaceSize, executor, nullptr);  // phase 2: run; trailing nullptr is presumably the stream — TODO confirm
+    CheckAclnnStatus(error, "aclnnGelu error");
+    error = aclrtSynchronizeDevice();  // wait before releasing the workspace and returning `out`
+    CheckSynchronizeDeviceAclnnStatus(error);
+    if (workspaceAddr) aclrtFree(workspaceAddr);  // nothing to free when no workspace was allocated
+    return out;
+}
 }
\ No newline at end of file
diff --git a/test/test_math/test_functions.py b/test/test_math/test_functions.py
index c231ede..f08966b 100644
--- a/test/test_math/test_functions.py
+++ b/test/test_math/test_functions.py
@@ -42,6 +42,18 @@
         ap.fabs,
         UNARY_TEST_CASES
     ),
+        (
+        "relu",
+        lambda x: np.maximum(x, 0),  # NumPy 没有内置 relu，用 maximum 模拟
+        ap.relu,
+        UNARY_TEST_CASES
+    ),
+        (
+        "gelu",
+        lambda x: x * 0.5 * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3)))),  # GELU 近似公式
+        ap.gelu,
+        UNARY_TEST_CASES
+    ),
 ]
 
 # 双操作数函数注册表 (函数名, numpy函数, asnumpy函数, 测试用例列表)