diff --git a/asnumpy/lib/__init__.py b/asnumpy/lib/__init__.py index 1f08be1..85eaa92 100644 --- a/asnumpy/lib/__init__.py +++ b/asnumpy/lib/__init__.py @@ -121,6 +121,8 @@ "minimum", "fmax", "fmin", + "relu", + "gelu", "pareto", "rayleigh", "normal", diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..36d60ac --- /dev/null +++ b/examples/README.md @@ -0,0 +1,30 @@ +# AsNumpy项目函数样例说明 +样例调用本项目的函数,和Numpy的同功能函数用numpy.allclose进行结果对比,并输出运行时间,以此来展现AsNumpy的准确性和性能 + +## 已实现样例 +| 文件名 | 功能描述 | +| :--- | :-- | +| [01_add](01_add.py) | 用asnumpy.add和numpy.add分别对输入数组 x1 和 x2 执行逐元素加法运算并对比结果,并计算它们的运行时间 | +| [02_exp2](02_exp2.py) | 用asnumpy.exp2和numpy.exp2分别对输入数组 x 的每个元素计算 2 的幂并对比结果,并计算它们的运行时间 | +| [03_multiply](03_multiply.py) | 用asnumpy.multiply和numpy.multiply分别对输入数组 x1 和 x2 执行逐元素乘法运算并对比结果,并计算它们的运行时间 | +| [04_all](04_all.py) | 用asnumpy.all和numpy.all分别对输入数组 x 执行逻辑与归约操作,判断所有元素是否均为 True 并对比结果,并计算它们的运行时间 | +| [05_divide](05_divide.py) | 用asnumpy.divide和numpy.divide分别对输入数组 x1 和 x2 执行逐元素除法并对比结果,并计算它们的运行时间 | + +## 下一步预期实现样例 +| 函数名 | 预期功能描述 | +| :--- | :-- | +| sinh | 用asnumpy.sinh和numpy.sinh分别对输入数组 x 逐元素计算双曲正弦并对比结果,并计算它们的运行时间 | +| real | 用asnumpy.real和numpy.real分别逐元素输出 x 的实数部分并对比结果,并计算它们的运行时间 | +| square | 用asnumpy.square和numpy.square分别逐元素计算 x 的平方并对比结果,并计算它们的运行时间 | +| sinc | 用asnumpy.sinc和numpy.sinc分别对输入数组 x 逐元素计算 sinc 函数并对比结果,并计算它们的运行时间 | +| gcd | 用asnumpy.gcd和numpy.gcd分别对输入数组 x1 和 x2 逐元素计算最大公约数并对比结果,并计算它们的运行时间 | +| around | 用asnumpy.around和numpy.around分别逐元素将 x 四舍五入到指定小数位数并对比结果,并计算它们的运行时间 | +| cumsum | 用asnumpy.cumsum和numpy.cumsum分别计算 x 沿给定轴的元素累积和并对比结果,并计算它们的运行时间 | +| arcsin | 用asnumpy.arcsin和numpy.arcsin分别对 x 进行逐元素的反正弦计算并对比结果,并计算它们的运行时间 | +| reciprocal | 用asnumpy.reciprocal和numpy.reciprocal分别对 x 计算每个元素的倒数并对比结果,并计算它们的运行时间 | +| binomial | 用asnumpy.binomial从二项分布中抽取足够多随机样本并用卡方分布测试是否符合分布,并计算运行时间 | + +## 更新说明 +| 时间 | 更新事项 | +| :--- | :-- | +| 2025/10/14 | 新增AsNumpy项目函数样例说明 | \ No newline at end of file diff --git 
a/include/asnumpy/math/miscellaneous.hpp b/include/asnumpy/math/miscellaneous.hpp index f30add8..db979d7 100644 --- a/include/asnumpy/math/miscellaneous.hpp +++ b/include/asnumpy/math/miscellaneous.hpp @@ -52,4 +52,7 @@ NPUArray Fmax(const NPUArray& x1, const NPUArray& x2, std::optional d NPUArray Fmin(const NPUArray& x1, const NPUArray& x2, std::optional dtype = std::nullopt); +NPUArray Relu(const NPUArray& x, std::optional dtype = std::nullopt); + +NPUArray Gelu(const NPUArray& x, std::optional dtype = std::nullopt); } \ No newline at end of file diff --git a/python/bind_math.cpp b/python/bind_math.cpp index c7fcfac..70cb8c0 100644 --- a/python/bind_math.cpp +++ b/python/bind_math.cpp @@ -98,6 +98,8 @@ void bind_miscellaneous(py::module_& math){ math.def("minimum", &Minimum, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none()); math.def("fmax", &Fmax, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none()); math.def("fmin", &Fmin, py::arg("x1"), py::arg("x2"), py::arg("dtype") = py::none()); + math.def("relu", &Relu, py::arg("x"), py::arg("dtype") = py::none()); + math.def("gelu", &Gelu, py::arg("x"), py::arg("dtype") = py::none()); } void bind_arithmetic_operations(py::module_& math) { diff --git a/src/math/miscellaneous.cpp b/src/math/miscellaneous.cpp index 88d3c6b..4f5fcfa 100644 --- a/src/math/miscellaneous.cpp +++ b/src/math/miscellaneous.cpp @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -750,4 +752,70 @@ NPUArray Fmin(const NPUArray& x1, const NPUArray& x2, std::optional d return out; } + +/** + * @brief Compute element-wise Rectified Linear Unit (ReLU). + * + * Applies ReLU activation function element-wise: max(0, x). + * Equivalent to numpy.maximum(x, 0). + * + * @param x Input array. + * @param dtype Optional target numpy dtype for the output array. If not provided, uses input dtype. + * @return NPUArray Array with element-wise ReLU values. 
+ * @throws std::runtime_error If ACL operation or memory allocation fails; the device workspace is released before the launch and synchronize statuses are checked. + */ +NPUArray Relu(const NPUArray& x, std::optional dtype) { + py::dtype out_dtype = dtype.has_value() ? dtype.value() : x.dtype; + auto out = NPUArray(x.shape, out_dtype); + uint64_t workspaceSize = 0; + aclOpExecutor* executor = nullptr; + auto error = aclnnReluGetWorkspaceSize(x.tensorPtr, out.tensorPtr, &workspaceSize, &executor); + CheckGetWorkspaceSizeAclnnStatus(error); + void* workspaceAddr = nullptr; + if(workspaceSize > 0) { + error = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + CheckMallocAclnnStatus(error); + } + error = aclnnRelu(workspaceAddr, workspaceSize, executor, nullptr); + auto syncError = aclrtSynchronizeDevice(); /* synchronize before releasing the workspace the launch may still be using */ + if (workspaceAddr) aclrtFree(workspaceAddr); /* release before the status checks below, which may throw */ + CheckAclnnStatus(error, "aclnnRelu error"); + CheckSynchronizeDeviceAclnnStatus(syncError); + return out; +} + + +/** + * @brief Compute element-wise Gaussian Error Linear Unit (GELU). + * + * Applies GELU activation function element-wise: GELU(x) = x * Φ(x) + * where Φ(x) is the cumulative distribution function of the standard normal distribution. + * + * GELU is commonly used in models like BERT and GPT. It provides smoother gradients + * compared to ReLU and incorporates probabilistic properties. + * + * @param x Input array. + * @param dtype Optional target numpy dtype for the output array. If not provided, uses input dtype. + * @return NPUArray Array with element-wise GELU values. + * @throws std::runtime_error If ACL operation or memory allocation fails; the device workspace is released before the launch and synchronize statuses are checked. + */ +NPUArray Gelu(const NPUArray& x, std::optional dtype) { + py::dtype out_dtype = dtype.has_value() ? 
dtype.value() : x.dtype; + auto out = NPUArray(x.shape, out_dtype); + uint64_t workspaceSize = 0; + aclOpExecutor* executor = nullptr; + auto error = aclnnGeluGetWorkspaceSize(x.tensorPtr, out.tensorPtr, &workspaceSize, &executor); + CheckGetWorkspaceSizeAclnnStatus(error); + void* workspaceAddr = nullptr; + if(workspaceSize > 0) { + error = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + CheckMallocAclnnStatus(error); + } + error = aclnnGelu(workspaceAddr, workspaceSize, executor, nullptr); + auto syncError = aclrtSynchronizeDevice(); /* synchronize before releasing the workspace the launch may still be using */ + if (workspaceAddr) aclrtFree(workspaceAddr); /* release before the status checks below, which may throw */ + CheckAclnnStatus(error, "aclnnGelu error"); + CheckSynchronizeDeviceAclnnStatus(syncError); + return out; +} } \ No newline at end of file diff --git a/test/test_math/test_functions.py b/test/test_math/test_functions.py index c231ede..f08966b 100644 --- a/test/test_math/test_functions.py +++ b/test/test_math/test_functions.py @@ -42,6 +42,18 @@ ap.fabs, UNARY_TEST_CASES ), + ( + "relu", + lambda x: np.maximum(x, 0), # NumPy has no built-in relu; emulate it with maximum + ap.relu, + UNARY_TEST_CASES + ), + ( + "gelu", + lambda x: x * 0.5 * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3)))), # GELU tanh approximation formula + ap.gelu, + UNARY_TEST_CASES + ), ] # 双操作数函数注册表 (函数名, numpy函数, asnumpy函数, 测试用例列表)