From 461218e8140747fe84426088ce764ff835a610a8 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Fri, 13 Mar 2026 15:04:54 +0300 Subject: [PATCH 01/13] [CPU] rv64: Add rvv_gemm_convolution.cpp --- src/cpu/rv64/rvv_gemm_convolution.cpp | 497 +++++ src/cpu/rv64/rvv_gemm_convolution.hpp | 149 ++ src/cpu/rv64/rvv_gemm_convolution_utils.cpp | 2185 +++++++++++++++++++ src/cpu/rv64/rvv_gemm_convolution_utils.hpp | 142 ++ 4 files changed, 2973 insertions(+) create mode 100644 src/cpu/rv64/rvv_gemm_convolution.cpp create mode 100644 src/cpu/rv64/rvv_gemm_convolution.hpp create mode 100644 src/cpu/rv64/rvv_gemm_convolution_utils.cpp create mode 100644 src/cpu/rv64/rvv_gemm_convolution_utils.hpp diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp new file mode 100644 index 00000000000..fc20fb2fecf --- /dev/null +++ b/src/cpu/rv64/rvv_gemm_convolution.cpp @@ -0,0 +1,497 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include +#include + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" +#include "cpu/rv64/rvv_gemm_convolution.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace rv64 { + +using namespace dnnl::impl::status; +using namespace dnnl::impl::memory_tracking::names; +using namespace dnnl::impl::utils; + +namespace { +struct im_pos_t { + im_pos_t() : n {0}, g {0}, od {0}, sp {0}, ic {0}, oc {0} {} + dim_t n, g, od, sp, ic, oc; + bool do_im2col(const im_pos_t &prev) const { + return true + && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp + || ic != prev.ic); + } +}; +} // namespace + +status_t riscv_gemm_convolution_fwd_t::execute_forward_nspc( + const exec_ctx_t &ctx) const { + auto src_base = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC); + auto wei_base = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS); + auto bia_base = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS); + auto dst_base = CTX_OUT_MEM(data_t *, DNNL_ARG_DST); + + auto scratchpad = ctx.get_scratchpad_grantor(); + const conv_gemm_conf_t &jcp = pd()->jcp_; + std::atomic st(status::success); + + parallel(jcp.nthr, [&](const int ithr, const int nthr) { + status_t st_thr = execute_forward_thr_nspc(ctx, ithr, nthr, src_base, + wei_base, bia_base, dst_base, scratchpad); + if (st_thr != status::success) st = st_thr; + }); + + return st; +} + +status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( + const exec_ctx_t &ctx, const int ithr, const int nthr, + const data_t *src_base, const data_t *wei_base, const data_t *bia_base, + data_t *dst_base, const memory_tracking::grantor_t &scratchpad) const { + const conv_gemm_conf_t &jcp = pd()->jcp_; + + // Src Format: mb-spatial-groups-input_channels + const dim_t src_mb_stride = jcp.id * jcp.ih * jcp.iw * jcp.ngroups * jcp.ic; + const dim_t src_g_stride = jcp.ic; + // Wei 
Format: spatial-input_channels-groups-output_channels + const dim_t wei_g_stride = pd()->with_groups() ? jcp.oc : 0; + + // Dst Format: mb-spatial-groups-output_channels + const dim_t dst_mb_stride = jcp.od * jcp.oh * jcp.ow * jcp.ngroups * jcp.oc; + const dim_t dst_g_stride = jcp.oc; + const dim_t dst_os_stride = jcp.ngroups * jcp.oc; + + data_t *__restrict col = scratchpad.get(key_conv_gemm_col) + + (ptrdiff_t)ithr * jcp.im2col_sz; + data_t *__restrict imtr = scratchpad.get(key_conv_gemm_imtr) + + (ptrdiff_t)ithr * jcp.is * jcp.ic; + + dim_t g {0}, n {0}, ohb {0}, owb {0}; + dim_t start = 0, end = 0; + const bool is_problem_3d = pd()->ndims() == 5; + + assert(IMPLICATION(is_problem_3d, + jcp.oh_block == jcp.oh && jcp.ow_block == jcp.ow + && jcp.ic_block == jcp.ic)); + assert(IMPLICATION(jcp.ow_block != jcp.ow, jcp.oh_block == 1)); + + const dim_t nb_oh = div_up(jcp.oh, jcp.oh_block); + const dim_t nb_ow = div_up(jcp.ow, jcp.ow_block); + // threads share work across mini-batch, groups, and blocked width/height + const dim_t work_amount = jcp.mb * jcp.ngroups * nb_oh * nb_ow; + balance211(work_amount, nthr, ithr, start, end); + nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow); + + if (jcp.im2col_sz && is_problem_3d) { + // jit_gemm_convolution_utils::im2col_dt_3d() requires external + // data initialization by zeroes + + ptrdiff_t i = 0; + while (i < jcp.im2col_sz) { + size_t vl = __riscv_vsetvl_e32m1(jcp.im2col_sz - i); + vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vl); + __riscv_vse32_v_f32m1(col + i, v_zero, vl); + i += vl; + } + } + + for (dim_t iwork = start; iwork < end; ++iwork) { + dim_t oh = ohb * jcp.oh_block; + dim_t ow = owb * jcp.ow_block; + const data_t *__restrict src + = src_base + n * src_mb_stride + g * src_g_stride; + const data_t *__restrict wei = wei_base + g * wei_g_stride; + + const int h_step = nstl::min(jcp.oh_block, jcp.oh - oh); + const int w_step = nstl::min(jcp.ow_block, jcp.ow - ow); + if (jcp.im2col_sz 
&& is_problem_3d) { + jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr); + } + + for (int od = 0; od < jcp.od; od++) { + data_t *__restrict dst = dst_base + n * dst_mb_stride + + g * dst_g_stride + + ((od * jcp.oh + oh) * jcp.ow + ow) * dst_os_stride; + if (jcp.im2col_sz) { + if (is_problem_3d) + jit_gemm_convolution_utils::im2col_dt_3d( + jcp, imtr, col, od); + else + jit_gemm_convolution_utils::im2col_dt( + jcp, src, imtr, col, oh, h_step, ow, w_step); + } + + const dim_t M = jcp.oc; + const dim_t K = jcp.ks * jcp.ic; + const dim_t N = h_step * w_step; + const dim_t LDA = M * jcp.ngroups; + const dim_t LDB = jcp.im2col_sz ? N : K * jcp.ngroups; + const dim_t LDC = M * jcp.ngroups; + const char *BT = jcp.im2col_sz ? "T" : "N"; + const data_t onef = 1.f; + const float beta = jcp.with_sum ? 1.0f : 0.0f; + const data_t *__restrict src_od + = src + od * jcp.oh * jcp.ow * jcp.ngroups * jcp.ic; + status_t st = extended_sgemm("N", BT, &M, &N, &K, &onef, wei, &LDA, + jcp.im2col_sz ? col : (data_t *)src_od, &LDB, &beta, dst, + &LDC); + if (st != status::success) return st; + + if (jcp.with_bias || jcp.with_eltwise || jcp.with_binary) { + parallel(0, [&](int ithr, int nthr) { + dim_t start, end; + balance211(N * jcp.oc, nthr, ithr, start, end); + + const size_t first_oc = start % jcp.oc; + const size_t last_oc = (end - 1) % jcp.oc; + const size_t first_os = start / jcp.oc; + const size_t last_os = (end - 1) / jcp.oc; + + for (size_t os = first_os; os <= last_os; ++os) { + const size_t start_oc = (os == first_os) ? first_oc : 0; + const size_t end_oc + = (os == last_os) ? last_oc : jcp.oc - 1; + + const data_t *__restrict bia_arr + = bia_base ? 
bia_base + g * jcp.oc : nullptr; + data_t *__restrict dst_arr = dst + os * dst_os_stride; + + if (jcp.with_bias) { + size_t n_elems = end_oc - start_oc + 1; + if (n_elems > 0) { + size_t oc = 0; + const data_t *b_ptr = bia_arr + start_oc; + data_t *d_ptr = dst_arr + start_oc; + + while (oc < n_elems) { + size_t vl = __riscv_vsetvl_e32m1( + n_elems - oc); + vfloat32m1_t v_dst = __riscv_vle32_v_f32m1( + d_ptr + oc, vl); + vfloat32m1_t v_bias = __riscv_vle32_v_f32m1( + b_ptr + oc, vl); + v_dst = __riscv_vfadd_vv_f32m1( + v_dst, v_bias, vl); + __riscv_vse32_v_f32m1( + d_ptr + oc, v_dst, vl); + oc += vl; + } + } + } + + if (jcp.with_eltwise || jcp.with_binary) { + bool fast_relu_done = false; + if (jcp.with_eltwise && jcp.post_ops.len() == 1) { + // fast branch for ReLU case + const auto &eltwise + = jcp.post_ops.entry_.back().eltwise; + + if (eltwise.alg == alg_kind::eltwise_relu) { + const auto alpha = eltwise.alpha; + const auto scale = eltwise.scale; + PRAGMA_OMP_SIMD() + for (size_t oc = start_oc; oc <= end_oc; + oc++) { + if (dst_arr[oc] < 0) + dst_arr[oc] *= alpha; + dst_arr[oc] *= scale; + } + fast_relu_done = true; + } + } + if (!fast_relu_done) { + ref_post_ops_t::args_t args; + args.ctx = &ctx; + args.dst_md = pd()->dst_md(); + + for (size_t oc = start_oc; oc <= end_oc; oc++) { + // jcp.od is not part of jcp.os, so multiply + // jcp.od to get spatial offset. 
+ args.l_offset = (g * jcp.oc + oc) + * (jcp.os * jcp.od); + post_ops_->execute(dst_arr[oc], args); + } + } + } + } + }); + } + } + nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow); + } + return status::success; +} + +status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( + const exec_ctx_t &ctx) const { + auto src = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC); + auto weights = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS); + auto bias = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS); + auto dst = CTX_OUT_MEM(data_t *, DNNL_ARG_DST); + + auto col = ctx.get_scratchpad_grantor().get(key_conv_gemm_col); + + const conv_gemm_conf_t &jcp = this->pd()->jcp_; + + const memory_desc_wrapper src_d(pd()->src_md()); + const memory_desc_wrapper dst_d(pd()->dst_md()); + + // The second arg in template means sub_offset0 = true + // See `blk_off` method definition. + const size_t src_mb_stride = src_d.blk_off(1); + const size_t src_g_stride = src_d.blk_off(0, 1) * jcp.ic; + + const size_t dst_mb_stride = dst_d.blk_off(1); + const size_t dst_g_stride = dst_d.blk_off(0, 1) * jcp.oc; + + const size_t weights_oc_size = jcp.ic * jcp.ks; + const size_t weights_g_size = weights_oc_size * jcp.oc; + const bool is_problem_3d = pd()->ndims() == 5; + + src += src_d.off_l(0); + dst += dst_d.off_l(0); + + assert(IMPLICATION(is_problem_3d, + jcp.os_block == jcp.os && jcp.ic_block == jcp.ic + && jcp.os_nb_block == 1)); + + status_t st = status::success; + parallel(jcp.nthr, [&](const int ithr, const int nthr) { + data_t *_col = col + (ptrdiff_t)ithr * jcp.im2col_sz; + + // non-blocked jit_gemm_convolution_utils::im2col_3d() requires + // external data initialization by zeroes + const bool outer_padding = jcp.os_nb_block == 1; + if (outer_padding && is_problem_3d) { + for (ptrdiff_t i = 0; i < jcp.im2col_sz; i++) + _col[i] = (data_t)0; + } + auto inner_ker = [&](int spatial, const im_pos_t &curr, im_pos_t &prev, + im_pos_t &step, const im_pos_t &end) { + const data_t *_src + = src + 
curr.n * src_mb_stride + curr.g * src_g_stride; + step.oc = nstl::min( + jcp.oc_block, nstl::min(jcp.oc, end.oc) - curr.oc); + step.sp = nstl::min(jcp.os_block, + nstl::min(jcp.os - curr.sp, end.sp - spatial)); + step.ic = nstl::min( + jcp.ic_block, nstl::min(jcp.ic, end.ic) - curr.ic); + bool do_im2col = curr.do_im2col(prev); + prev = curr; + + if (jcp.im2col_sz && do_im2col) { + if (!is_problem_3d) + jit_gemm_convolution_utils::im2col(jcp, _src, _col, + curr.sp, step.sp, curr.ic, step.ic); + else + jit_gemm_convolution_utils::im2col_3d( + jcp, _src, _col, curr.od, 0, jcp.os); + } + const data_t one = 1.0; + + const dim_t M = jcp.os * jcp.od; + const dim_t m = step.sp; + const dim_t LDA = jcp.im2col_sz ? m : M; + data_t *_dst = dst + curr.n * dst_mb_stride + curr.g * dst_g_stride + + curr.oc * M + curr.od * jcp.os + curr.sp; + const dim_t K = step.ic * jcp.ks; + const dim_t LDB = jcp.ic * jcp.ks; + const dim_t N = step.oc; + + const float beta + = (curr.ic == 0) ? (jcp.with_sum ? 1.0f : 0.0f) : one; + const float *_source = jcp.im2col_sz + ? _col + : _src + curr.ic * M + curr.od * jcp.os + curr.sp; + const data_t *_weights = weights + curr.g * weights_g_size + + curr.oc * weights_oc_size + curr.ic * jcp.ks; + + status_t st = extended_sgemm("N", "N", &m, &N, &K, &one, _source, + &LDA, _weights, &LDB, &beta, _dst, &M); + if (st != status::success) return st; + + if (curr.ic == jcp.ic - step.ic) { + // TODO: for "outer threading" we have parallel section within + // outermost "parallel". It is not good. Consider to use + // "parallel" here with number of threads passed as parameter + const int oc_start = curr.g * jcp.oc + curr.oc; + if (jcp.with_eltwise || jcp.with_binary) { + bool fast_relu_done = false; + if (jcp.with_eltwise && jcp.post_ops.len() == 1) { + // fast branch for ReLU case + const auto &eltwise + = jcp.post_ops.entry_.back().eltwise; + if (eltwise.alg == alg_kind::eltwise_relu) { + parallel_nd(step.oc, [&](dim_t oc) { + data_t b = jcp.with_bias ? 
bias[oc_start + oc] + : 0; + data_t *d_ = _dst + oc * M; + + if (eltwise.alpha == 0.0f) { + int oS = 0; + while (oS < m) { + size_t vl + = __riscv_vsetvl_e32m1(m - oS); + vfloat32m1_t v_d + = __riscv_vle32_v_f32m1( + d_ + oS, vl); + v_d = __riscv_vfadd_vf_f32m1( + v_d, b, vl); // Add bias + + v_d = __riscv_vfmax_vf_f32m1( + v_d, 0.0f, vl); + + if (eltwise.scale != 1.0f) { + v_d = __riscv_vfmul_vf_f32m1( + v_d, eltwise.scale, vl); + } + + __riscv_vse32_v_f32m1(d_ + oS, v_d, vl); + oS += vl; + } + } else { + int oS = 0; + while (oS < m) { + size_t vl + = __riscv_vsetvl_e32m1(m - oS); + vfloat32m1_t v_d + = __riscv_vle32_v_f32m1( + d_ + oS, vl); + v_d = __riscv_vfadd_vf_f32m1( + v_d, b, vl); // Add bias + vbool32_t mask + = __riscv_vmflt_vf_f32m1_b32( + v_d, 0.0f, vl); + v_d = __riscv_vfmul_vf_f32m1_mu( + mask, v_d, v_d, eltwise.alpha, vl); // _mu: lanes >= 0 keep v_d (plain _m leaves them unspecified) + v_d = __riscv_vfmul_vf_f32m1( + v_d, eltwise.scale, vl); + __riscv_vse32_v_f32m1(d_ + oS, v_d, vl); + oS += vl; + } + } + }); + fast_relu_done = true; + } + } + if (!fast_relu_done) { + parallel_nd(step.oc, [&](dim_t oc) { + data_t b = jcp.with_bias ?
bias[oc_start + oc] : 0; + data_t *d_ = _dst + oc * M; + + ref_post_ops_t::args_t args; + args.ctx = &ctx; + args.dst_md = pd()->dst_md(); + args.l_offset = d_ - dst; + + for (int oS = 0; oS < m; ++oS) { + d_[oS] += b; + post_ops_->execute(d_[oS], args); + args.l_offset++; + } + }); + } + + } else if (jcp.with_bias) { + parallel_nd(step.oc, [&](dim_t oc) { + data_t b = bias[oc_start + oc]; + data_t *d_ = _dst + oc * M; + + int oS = 0; + while (oS < m) { + size_t vl = __riscv_vsetvl_e32m1(m - oS); + vfloat32m1_t v_d + = __riscv_vle32_v_f32m1(d_ + oS, vl); + v_d = __riscv_vfadd_vf_f32m1(v_d, b, vl); + __riscv_vse32_v_f32m1(d_ + oS, v_d, vl); + oS += vl; + } + }); + } + } + + return status::success; + }; + im_pos_t start, end; + end.ic = jcp.ic; + + if (!is_problem_3d) { + dim_t sp_work = jcp.mb * jcp.ngroups * jcp.od * jcp.os; + balance2D(nthr, ithr, sp_work, start.sp, end.sp, jcp.oc, start.oc, + end.oc, dim_t(jcp.nthr_oc)); + } else { + dim_t sp_work = jcp.mb * jcp.ngroups * jcp.od; + balance2D(nthr, ithr, sp_work, start.sp, end.sp, jcp.oc, start.oc, + end.oc, dim_t(jcp.nthr_oc)); + start.sp *= jcp.os; + end.sp *= jcp.os; + } + + im_pos_t curr, prev, step; + prev.n = prev.g = prev.od = prev.sp = prev.ic = -1; + step.oc = jcp.oc_block; + step.sp = jcp.os_block; + step.ic = jcp.ic_block; + + if (jcp.loop_order == gemm_loop_rlb) + for (curr.ic = 0; curr.ic < jcp.ic; curr.ic += step.ic) + for (int spatial = start.sp; spatial < end.sp; + spatial += step.sp) { + nd_iterator_init(spatial, curr.n, jcp.mb, curr.g, + jcp.ngroups, curr.od, jcp.od, curr.sp, jcp.os); + for (curr.oc = start.oc; curr.oc < end.oc; + curr.oc += step.oc) { + status_t st_thr + = inner_ker(spatial, curr, prev, step, end); + if (st_thr != status::success) { + st = st_thr; + return; + } + } + } + else if (jcp.loop_order == gemm_loop_lrb) + for (int spatial = start.sp; spatial < end.sp; spatial += step.sp) { + nd_iterator_init(spatial, curr.n, jcp.mb, curr.g, jcp.ngroups, + curr.od, jcp.od, curr.sp, 
jcp.os); + for (curr.ic = 0; curr.ic < jcp.ic; curr.ic += step.ic) + for (curr.oc = start.oc; curr.oc < end.oc; + curr.oc += step.oc) { + status_t st_thr + = inner_ker(spatial, curr, prev, step, end); + if (st_thr != status::success) { + st = st_thr; + return; + } + } + } + else + st = status::unimplemented; + }); + + return st; +} + +} // namespace rv64 +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp new file mode 100644 index 00000000000..7bcda8e9462 --- /dev/null +++ b/src/cpu/rv64/rvv_gemm_convolution.hpp @@ -0,0 +1,149 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef CPU_RV64_RVV_GEMM_CONVOLUTION_HPP +#define CPU_RV64_RVV_GEMM_CONVOLUTION_HPP + +#include "common/broadcast_strategy.hpp" +#include "common/c_types_map.hpp" +#include "common/memory_tracking.hpp" +#include "common/primitive.hpp" +#include "common/utils.hpp" + +#include "cpu/binary_injector_utils.hpp" +#include "cpu/cpu_convolution_pd.hpp" +#include "cpu/gemm/gemm.hpp" +#include "cpu/primitive_attr_postops.hpp" +#include "cpu/rv64/rvv_gemm_convolution_utils.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace rv64 { + +struct riscv_gemm_convolution_fwd_t : public primitive_t { + struct pd_t : public cpu_convolution_fwd_pd_t { + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; + + DECLARE_COMMON_PD_T(GEMM_IMPL_STR, riscv_gemm_convolution_fwd_t, + USE_GLOBAL_SCRATCHPAD); + + status_t init(engine_t *engine) { + using namespace data_type; + + VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND); + + if (with_bias()) { + VDISPATCH_CONV(expect_data_types(f32, f32, f32, f32, f32), + VERBOSE_UNSUPPORTED_DT_CFG); + } else { + VDISPATCH_CONV( + expect_data_types(f32, f32, data_type::undef, f32, f32), + VERBOSE_UNSUPPORTED_DT_CFG); + } + + VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), + VERBOSE_BAD_ALGORITHM); + VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); + VDISPATCH_CONV( + attr()->has_default_values( + primitive_attr_t::skip_mask_t::post_ops, f32), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); + + auto scratchpad = scratchpad_registry().registrar(); + + // TODO: make `init_conf` assign initialized object to `jcp_` + jcp_ = conv_gemm_conf_t(); + return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, + *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_, + dnnl_get_max_threads()); + } + + conv_gemm_conf_t jcp_ = utils::zero(); + + protected: + bool post_ops_ok() const 
{ + auto const &po = attr()->post_ops_; + auto is_sum_ok = [&](int idx) { + return IMPLICATION(po.entry_[idx].kind == primitive_kind::sum, + idx == 0 && po.entry_[idx].is_sum()); + }; + auto is_binary + = [&](int idx) { return po.entry_[idx].is_binary(); }; + auto is_prelu = [&](int idx) { return po.entry_[idx].is_prelu(); }; + auto is_binary_or_prelu_supported = [&](int idx) { + bool ok = dnnl::impl::get_rhs_arg_broadcasting_strategy( + binary_injector_utils::get_src1_desc( + po.entry_[idx], dst_md_), + dst_md_, + {broadcasting_strategy_t::scalar, + broadcasting_strategy_t::per_oc}) + != broadcasting_strategy_t::unsupported; + return ok; + }; + + if (!ref_post_ops_t::post_ops_ok(attr()->post_ops_)) return false; + + for (int idx = 0; idx < po.len(); idx++) { + bool ok = is_sum_ok(idx) + && IMPLICATION(is_binary(idx) || is_prelu(idx), + is_binary_or_prelu_supported(idx)); + if (!ok) return false; + } + + return true; + } + }; + + riscv_gemm_convolution_fwd_t(const pd_t *apd) + : primitive_t(apd), post_ops_(nullptr) {} + + status_t init(engine_t *engine) override { + const auto &jcp = pd()->jcp_; + + if (jcp.with_eltwise || jcp.with_binary) { + CHECK(safe_ptr_assign(post_ops_, new ref_post_ops_t(jcp.post_ops))); + CHECK(post_ops_->init(pd()->dst_md())); + } + return status::success; + } + + using data_t = typename prec_traits_t::type; + + status_t execute(const exec_ctx_t &ctx) const override { + bool is_nspc = pd()->jcp_.is_nspc; + return is_nspc ? 
execute_forward_nspc(ctx) : execute_forward_ncsp(ctx); + } + +private: + status_t execute_forward_ncsp(const exec_ctx_t &ctx) const; + status_t execute_forward_nspc(const exec_ctx_t &ctx) const; + status_t execute_forward_thr_nspc(const exec_ctx_t &ctx, const int ithr, + const int nthr, const data_t *src_base, const data_t *wei_base, + const data_t *bia_base, data_t *dst_base, + const memory_tracking::grantor_t &scratchpad) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + + std::unique_ptr post_ops_; +}; + +} // namespace rv64 +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp new file mode 100644 index 00000000000..2ce81d0a738 --- /dev/null +++ b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp @@ -0,0 +1,2185 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "cpu/rv64/rvv_gemm_convolution_utils.hpp" +#include "common/bfloat16.hpp" +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" +#include "cpu/scale_utils.hpp" + +#include "cpu/platform.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace rv64 { + +using namespace dnnl::impl::status; +using namespace dnnl::impl::utils; +using namespace prop_kind; +using namespace data_type; + +single_gemm_conv_chunk_desc_t::single_gemm_conv_chunk_desc_t(dim_t d_off, + dim_t d_size, dim_t h_off, dim_t h_size, dim_t w_off, dim_t w_size) + : d_off_(d_off) + , d_size_(d_size) + , h_off_(h_off) + , h_size_(h_size) + , w_off_(w_off) + , w_size_(w_size) {} + +namespace jit_gemm_convolution_utils { + +template +void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, + data_type_t *col, dim_t od, int spatial_step, int spatial_block) { + using data_t = + typename conditional::data_type == bf16, + uint16_t, data_type_t>::type; + const data_t *__restrict _im + = reinterpret_cast(im); + data_t *__restrict _col = reinterpret_cast(col); + + const size_t OHW = spatial_block; + const size_t im_step = jcp.ih * jcp.iw * jcp.id; + const size_t col_step = jcp.ks * OHW; + + auto compute_im2col_outer_padding = [&](dim_t ic) { + const data_t *__restrict im_loc = _im + ic * im_step; + data_t *__restrict col_loc = _col + ic * col_step; + dim_t id = od * jcp.stride_d - jcp.f_pad; + for (dim_t kd = 0; kd < jcp.kd; ++kd) { + data_t *__restrict col_ = col_loc + kd * jcp.kh * jcp.kw * OHW; + if (id < 0 || id >= jcp.id) { + dim_t ih_ = -jcp.t_pad; + for (dim_t kh = 0; kh < jcp.kh; ++kh) { + dim_t ih = ih_; + for (dim_t oh = 0; oh < jcp.oh; ++oh) { + if (ih < 0 || ih >= jcp.ih) { + ih += jcp.stride_h; + continue; + } + dim_t iw_ = -jcp.l_pad; + for (dim_t kw = 0; kw < jcp.kw; ++kw) { + dim_t iw = iw_; + for (dim_t 
ow = 0; ow < jcp.ow; ++ow) { + if (iw < 0 || iw >= jcp.iw) { + iw += jcp.stride_w; + continue; + } + + const size_t col_idx + = kw * OHW + oh * jcp.ow + ow; + + col_[col_idx] = 0; + iw += jcp.stride_w; + } + iw_ += (1 + jcp.dilate_w); + } + ih += jcp.stride_h; + } + ih_ += (1 + jcp.dilate_h); + col_ += jcp.kw * OHW; + } + } else { + const data_t *__restrict im_ = im_loc + id * jcp.ih * jcp.iw; + dim_t ih_ = -jcp.t_pad; + for (dim_t kh = 0; kh < jcp.kh; ++kh) { + dim_t ih = ih_; + for (dim_t oh = 0; oh < jcp.oh; ++oh) { + if (ih < 0 || ih >= jcp.ih) { + ih += jcp.stride_h; + continue; + } + dim_t iw_ = -jcp.l_pad; + for (dim_t kw = 0; kw < jcp.kw; ++kw) { + dim_t iw = iw_; + for (dim_t ow = 0; ow < jcp.ow; ++ow) { + if (iw < 0 || iw >= jcp.iw) { + iw += jcp.stride_w; + continue; + } + + const size_t col_idx + = kw * OHW + oh * jcp.ow + ow; + const size_t im_idx = ih * jcp.iw + iw; + + col_[col_idx] = im_[im_idx]; + iw += jcp.stride_w; + } + iw_ += (1 + jcp.dilate_w); + } + ih += jcp.stride_h; + } + ih_ += (1 + jcp.dilate_h); + col_ += jcp.kw * OHW; + } + } + id += (1 + jcp.dilate_d); + } + }; + auto compute_im2col_padding = [&](dim_t ic) { + const dim_t first_oh = spatial_step / jcp.ow; + const dim_t last_oh = (spatial_step + spatial_block - 1) / jcp.ow; + const dim_t oh_begin = first_oh; + const dim_t oh_end = last_oh + 1; + const dim_t first_ow = spatial_step % jcp.ow; + const dim_t last_ow = (spatial_step + spatial_block - 1) % jcp.ow; + + const data_t *__restrict im_loc = _im + ic * im_step; + data_t *__restrict col_loc = _col + ic * col_step; + dim_t id = od * jcp.stride_d - jcp.f_pad; + for (dim_t kd = 0; kd < jcp.kd; ++kd) { + data_t *__restrict col_ = col_loc + kd * jcp.kh * jcp.kw * OHW; + if (id < 0 || id >= jcp.id) { + for (dim_t kh = 0; kh < jcp.kh; ++kh) { + for (dim_t oh = oh_begin; oh < oh_end; ++oh) { + const dim_t ow_begin = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end + = (oh == last_oh) ? 
(last_ow + 1) : jcp.ow; + for (dim_t kw = 0; kw < jcp.kw; ++kw) { + for (dim_t ow = ow_begin; ow < ow_end; ++ow) { + const size_t col_idx = kw * OHW + oh * jcp.ow + + ow - spatial_step; + col_[col_idx] = 0; + } + } + } + col_ += jcp.kw * OHW; + } + } else { + const data_t *__restrict im_ = im_loc + id * jcp.ih * jcp.iw; + dim_t ih_ = oh_begin * jcp.stride_h - jcp.t_pad; + for (dim_t kh = 0; kh < jcp.kh; ++kh) { + dim_t ih = ih_; + for (dim_t oh = oh_begin; oh < oh_end; ++oh) { + const dim_t ow_begin = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end + = (oh == last_oh) ? (last_ow + 1) : jcp.ow; + if (ih < 0 || ih >= jcp.ih) { + for (dim_t kw = 0; kw < jcp.kw; ++kw) { + for (dim_t ow = ow_begin; ow < ow_end; ++ow) { + const size_t col_idx = kw * OHW + + oh * jcp.ow + ow - spatial_step; + col_[col_idx] = 0; + } + } + ih += jcp.stride_h; + continue; + } + dim_t iw_ = ow_begin * jcp.stride_w - jcp.l_pad; + for (dim_t kw = 0; kw < jcp.kw; ++kw) { + dim_t iw = iw_; + for (dim_t ow = ow_begin; ow < ow_end; ++ow) { + const size_t col_idx = kw * OHW + oh * jcp.ow + + ow - spatial_step; + if (iw < 0 || iw >= jcp.iw) { + col_[col_idx] = 0; + iw += jcp.stride_w; + continue; + } + const size_t im_idx = ih * jcp.iw + iw; + col_[col_idx] = im_[im_idx]; + iw += jcp.stride_w; + } + iw_ += (1 + jcp.dilate_w); + } + ih += jcp.stride_h; + } + ih_ += (1 + jcp.dilate_h); + col_ += jcp.kw * OHW; + } + } + id += (1 + jcp.dilate_d); + } + }; + + // zero padding is handled outside im2col + const bool outer_padding = jcp.os_nb_block == 1; + if (outer_padding) + parallel_nd(jcp.ic, compute_im2col_outer_padding); + else + parallel_nd(jcp.ic, compute_im2col_padding); +} + +template void im2col_3d(const conv_gemm_conf_t &jcp, const float *im, + float *col, dim_t od, int spatial_step, int spatial_block); + +template void im2col_3d(const conv_gemm_conf_t &jcp, const bfloat16_t *im, + bfloat16_t *col, dim_t od, int spatial_step, int spatial_block); + +/* imtr[ic][id][ih][iw] <--
im[id][ih][iw][ic]*/ +template +void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im, + T *__restrict imtr) { + uint8_t shift = jcp.signed_input ? 128 : 0; + const dim_t ic_stride = jcp.id * jcp.ih * jcp.iw; + const dim_t IC = jcp.ngroups * jcp.ic; + const dim_t IHW = jcp.ih * jcp.iw; + constexpr dim_t ic_block = platform::get_cache_line_size(); + const dim_t nb_ic = jcp.ic / ic_block; + const dim_t ic_blocked = nb_ic * ic_block; + parallel_nd(jcp.id, jcp.ih, [&](dim_t id, dim_t ih) { + const T *__restrict im_h = im + id * IHW * IC + ih * jcp.iw * IC; + T *__restrict imtr_h = imtr + id * IHW + ih * jcp.iw; + for (dim_t iw = 0; iw < jcp.iw; iw++) { + const T *__restrict im_w = im_h + iw * IC; + T *__restrict imtr_w = imtr_h + iw; + for (dim_t icb = 0; icb < nb_ic; icb++) { + const T *__restrict im_icb = im_w + icb * ic_block; + T *__restrict imtr_icb = imtr_w + icb * ic_block * ic_stride; + PRAGMA_OMP_SIMD() + for (dim_t ic = 0; ic < ic_block; ic++) { + imtr_icb[ic * ic_stride] = im_icb[ic] + shift; + } + } + for (dim_t ic = ic_blocked; ic < jcp.ic; ic++) { + imtr_w[ic * ic_stride] = im_w[ic] + shift; + } + } + }); +} + +template void transpose_dt(const conv_gemm_conf_t &jcp, + const int8_t *__restrict im, int8_t *__restrict imtr); +template void transpose_dt(const conv_gemm_conf_t &jcp, + const uint8_t *__restrict im, uint8_t *__restrict imtr); +template void transpose_dt(const conv_gemm_conf_t &jcp, + const char *__restrict im, char *__restrict imtr); +template void transpose_dt(const conv_gemm_conf_t &jcp, + const float *__restrict im, float *__restrict imtr); +template void transpose_dt(const conv_gemm_conf_t &jcp, + const bfloat16_t *__restrict im, bfloat16_t *__restrict imtr); + +/* col[kd][kh][kw][g][ic][od][oh][ow] <-- im2col_dt_3d(im[id][ih][iw][g][ic]) */ +template +void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, + orig_col_dt *__restrict _col, dim_t od) { + // For performance reasons, use uint16_t as a proxy 
for bfloat16_t + using im_dt = + typename utils::conditional::data_type + == bf16, + uint16_t, orig_im_dt>::type; + using col_dt = + typename utils::conditional::data_type + == bf16, + uint16_t, orig_col_dt>::type; + const im_dt *__restrict imtr + = reinterpret_cast(_imtr); + col_dt *__restrict col = reinterpret_cast(_col); + + col_dt shift = static_cast(jcp.signed_input ? 128 : 0); + const dim_t dd = 1 + jcp.dilate_d; + const dim_t dh = 1 + jcp.dilate_h; + const dim_t dw = 1 + jcp.dilate_w; + const dim_t sd = jcp.stride_d; + const dim_t sh = jcp.stride_h; + const dim_t sw = jcp.stride_w; + const dim_t fp = jcp.f_pad; + const dim_t tp = jcp.t_pad; + const dim_t lp = jcp.l_pad; + const dim_t col_ic_s = jcp.oh * jcp.ow; + const dim_t col_kw_s = jcp.ic * col_ic_s; + const dim_t col_kh_s = jcp.kw * col_kw_s; + const dim_t col_kd_s = jcp.kh * col_kh_s; + const dim_t IHW = jcp.ih * jcp.iw; + const dim_t OHW = jcp.oh * jcp.ow; + + if (sd == 1 && sh == 1 && sw == 1 && dd == 1 && dh == 1 && dw == 1) + parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, + [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { + col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s + + kw * col_kw_s + ic * col_ic_s; + const dim_t id = od - fp + kd; + if (id < 0 || id >= jcp.id) { + for (ptrdiff_t i = 0; i < OHW; i++) + col_loc[i] = shift; + return; + } + const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW; + const dim_t oh_start = saturate(dim_t(0), jcp.oh, tp - kh); + const dim_t oh_end = saturate(dim_t(0), jcp.oh, jcp.ih + tp - kh); + const dim_t ow_start = saturate(dim_t(0), jcp.ow, lp - kw); + const dim_t ow_end = saturate(dim_t(0), jcp.ow, jcp.iw + lp - kw); + for (dim_t oh = oh_start, ih = oh_start - tp + kh; oh < oh_end; + oh++, ih++) { + col_dt *__restrict col_h = col_loc + oh * jcp.ow; + const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; + for (dim_t ow = ow_start, iw = ow_start - lp + kw; ow < ow_end; + ow++, iw++) { + col_h[ow] = imtr_h[iw]; + } + } + }); + else if 
(sd == 2 && sh == 2 && sw == 2 && dd == 1 && dh == 1 && dw == 1) + parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, + [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { + col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s + + kw * col_kw_s + ic * col_ic_s; + const dim_t id = od * 2 - fp + kd; + if (id < 0 || id >= jcp.id) { + for (ptrdiff_t i = 0; i < OHW; i++) + col_loc[i] = shift; + return; + } + const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW; + const dim_t oh_start + = saturate(dim_t(0), jcp.oh, div_up(tp - kh, 2)); + const dim_t oh_end + = saturate(dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh, 2)); + const dim_t ow_start + = saturate(dim_t(0), jcp.ow, div_up(lp - kw, 2)); + const dim_t ow_end + = saturate(dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw, 2)); + for (dim_t oh = oh_start, ih = oh_start * 2 - tp + kh; oh < oh_end; + ++oh, ih += 2) { + col_dt *__restrict col_h = col_loc + oh * jcp.ow; + const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; + for (dim_t ow = ow_start, iw = ow_start * 2 - lp + kw; + ow < ow_end; ++ow, iw += 2) { + col_h[ow] = imtr_h[iw]; + } + } + }); + else + parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, + [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { + col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s + + kw * col_kw_s + ic * col_ic_s; + const dim_t id = od * sd - fp + kd * dd; + if (id < 0 || id >= jcp.id) { + for (ptrdiff_t i = 0; i < OHW; i++) + col_loc[i] = shift; + return; + } + const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW; + const dim_t oh_start + = saturate(dim_t(0), jcp.oh, div_up(tp - kh * dh, sh)); + const dim_t oh_end = saturate( + dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh * dh, sh)); + const dim_t ow_start + = saturate(dim_t(0), jcp.ow, div_up(lp - kw * dw, sw)); + const dim_t ow_end = saturate( + dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw * dw, sw)); + for (dim_t oh = oh_start, ih = oh_start * sh - tp + kh * dh; + oh < oh_end; ++oh, ih += sh) { + col_dt *__restrict 
col_h = col_loc + oh * jcp.ow; + const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; + for (dim_t ow = ow_start, iw = ow_start * sw - lp + kw * dw; + ow < ow_end; ++ow, iw += sw) { + col_h[ow] = imtr_h[iw]; + } + } + }); +} + +template void im2col_dt_3d(const conv_gemm_conf_t &jcp, + const void *__restrict im, uint8_t *__restrict col, dim_t od); +template void im2col_dt_3d(const conv_gemm_conf_t &jcp, + const void *__restrict im, uint8_t *__restrict col, dim_t od); +template void im2col_dt_3d(const conv_gemm_conf_t &jcp, + const void *__restrict im, float *__restrict col, dim_t od); +template void im2col_dt_3d(const conv_gemm_conf_t &jcp, + const void *__restrict im, bfloat16_t *__restrict col, dim_t od); + +/* col[ic][kh][kw][oh][ow] <-- im2col(im[ic][ih][iw]) */ +template +void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, + data_type_t *__restrict col, dim_t ss, dim_t sb, dim_t cs, dim_t cb) { + + using data_t = + typename utils::conditional::data_type + == bf16, + uint16_t, data_type_t>::type; + const data_t *__restrict _im + = reinterpret_cast(im); + data_t *__restrict _col = reinterpret_cast(col); + + const size_t im_step = jcp.is; + const size_t col_step = jcp.ks * sb; + const dim_t dh = 1 + jcp.dilate_h; + const dim_t dw = 1 + jcp.dilate_w; + const dim_t sh = jcp.stride_h; + const dim_t sw = jcp.stride_w; + const dim_t tp = jcp.t_pad; + const dim_t lp = jcp.l_pad; + const dim_t first_oh = ss / jcp.ow; + const dim_t last_oh = (ss + sb - 1) / jcp.ow; + const dim_t oh_begin = first_oh; + const dim_t oh_end = last_oh + 1; + const dim_t first_ow = ss % jcp.ow; + const dim_t last_ow = (ss + sb - 1) % jcp.ow; + + const data_t zero_val = 0; + + if (jcp.outer_threading) { + if (sw == 1) { + // Generated code is more optimized for stride_w == 1 + // because innermost loop is by width + for (dim_t ic = 0; ic < cb; ic++) { + const data_t *__restrict im_ic = _im + (ic + cs) * im_step; + for_(dim_t kh = 0; kh < jcp.kh; kh++) + for (dim_t kw = 
0; kw < jcp.kw; kw++) { + data_t *__restrict col_k + = _col + ic * col_step + (kh * jcp.kw + kw) * sb; + for (dim_t oh = oh_begin; oh < oh_end; oh++) { + const dim_t ih = oh * sh - tp + kh * dh; + const data_t *__restrict im_ + = im_ic + ih * jcp.iw - lp + kw * dw; + const dim_t ow_begin = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end + = (oh == last_oh) ? (last_ow + 1) : jcp.ow; + data_t *__restrict col_ = col_k + oh * jcp.ow - ss; + if (ih < 0 || ih >= jcp.ih) + for (dim_t ow = ow_begin; ow < ow_end; ow++) + col_[ow] = zero_val; + else { + for (dim_t ow = ow_begin; ow < ow_end; ++ow) { + const dim_t iw = ow; + if (iw < lp - kw * dw + || iw >= jcp.iw + lp - kw * dw) + col_[ow] = zero_val; + else + col_[ow] = im_[iw]; + } + } + } + } + } + } else { + for (dim_t ic = 0; ic < cb; ic++) { + const data_t *__restrict im_ = _im + (ic + cs) * im_step; + for_(dim_t kh = 0; kh < jcp.kh; kh++) + for (dim_t kw = 0; kw < jcp.kw; kw++) { + data_t *__restrict col_k + = _col + ic * col_step + (kh * jcp.kw + kw) * sb; + for (dim_t oh = oh_begin; oh < oh_end; oh++) { + const dim_t ih = oh * sh - tp + kh * dh; + const dim_t ow_begin = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end + = (oh == last_oh) ? 
(last_ow + 1) : jcp.ow; + data_t *__restrict col_oh = col_k + oh * jcp.ow - ss; + if (ih < 0 || ih >= jcp.ih) + for (dim_t ow = ow_begin; ow < ow_end; ow++) + col_oh[ow] = zero_val; + else + for (dim_t ow = ow_begin; ow < ow_end; ow++) { + const dim_t iw = ow * sw - lp + kw * dw; + if (iw < 0 || iw >= jcp.iw) + col_oh[ow] = zero_val; + else { + const ptrdiff_t im_idx = ih * jcp.iw + iw; + col_oh[ow] = im_[im_idx]; + } + } + } + } + } + } + } else { + // TODO: optimize threading if jcp.ic*jcp.kh*jcp.kw*oh_range is small + // comparing to number of threads + const dim_t oh_range = oh_end - oh_begin; + // Generated code is more optimized for stride_w == 1 + // because innermost loop is by width + if (sw == 1) + parallel_nd(cb, jcp.kh, jcp.kw, oh_range, + [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) { + const dim_t oh = ohr + oh_begin; + const dim_t ih = oh * sh - tp + kh * dh; + const dim_t ow_start = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow; + data_t *__restrict col_oh = _col + ic * col_step + + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss; + const data_t *__restrict im_ + = _im + (ic + cs) * im_step + ih * jcp.iw; + const dim_t iw_shift = kw * dw - lp; + if (ih < 0 || ih >= jcp.ih) + for (dim_t ow = ow_start; ow < ow_end; ow++) + col_oh[ow] = zero_val; + else + for (dim_t ow = ow_start; ow < ow_end; ow++) { + const dim_t iw = ow + iw_shift; + if (iw < 0 || iw >= jcp.iw) + col_oh[ow] = zero_val; + else + col_oh[ow] = im_[iw]; + } + }); + else + parallel_nd(cb, jcp.kh, jcp.kw, oh_range, + [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) { + const dim_t oh = ohr + oh_begin; + const dim_t ih = oh * sh - tp + kh * dh; + const dim_t ow_start = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end = (oh == last_oh) ? 
(last_ow + 1) : jcp.ow; + data_t *__restrict col_oh = _col + ic * col_step + + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss; + const data_t *__restrict im_ = _im + (ic + cs) * im_step; + if (ih < 0 || ih >= jcp.ih) + for (dim_t ow = ow_start; ow < ow_end; ow++) + col_oh[ow] = zero_val; + else + for (dim_t ow = ow_start; ow < ow_end; ow++) { + const dim_t iw = ow * sw - lp + kw * dw; + if (iw < 0 || iw >= jcp.iw) + col_oh[ow] = zero_val; + else { + const ptrdiff_t im_idx = ih * jcp.iw + iw; + col_oh[ow] = im_[im_idx]; + } + } + }); + } +} + +template void im2col(const conv_gemm_conf_t &jcp, const float *__restrict im, + float *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); + +template void im2col(const conv_gemm_conf_t &jcp, + const bfloat16_t *__restrict im, bfloat16_t *__restrict col, dim_t hs, + dim_t hb, dim_t ws, dim_t wb); + +/* col[kh][kw][ic][oh][ow] <-- im2col_dt(im[ih][iw][ic]) */ +template +void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, + void *__restrict _imtr, orig_col_dt *__restrict _col, dim_t hs, + dim_t hb, dim_t ws, dim_t wb) { + // For performance reasons, use uint16_t as a proxy for bfloat16_t + using im_dt = + typename utils::conditional::data_type + == bf16, + uint16_t, orig_im_dt>::type; + using col_dt = + typename utils::conditional::data_type + == bf16, + uint16_t, orig_col_dt>::type; + const im_dt *__restrict im = reinterpret_cast(_im); + im_dt *__restrict imtr = reinterpret_cast(_imtr); + col_dt *__restrict col = reinterpret_cast(_col); + + col_dt shift = static_cast(jcp.signed_input ? 
128 : 0); + const dim_t dh = 1 + jcp.dilate_h; + const dim_t dw = 1 + jcp.dilate_w; + const dim_t sh = jcp.stride_h; + const dim_t sw = jcp.stride_w; + const dim_t im_iw_stride = jcp.ic * jcp.ngroups; + const dim_t im_ih_stride = jcp.iw * im_iw_stride; + const dim_t tp = jcp.t_pad; + const dim_t lp = jcp.l_pad; + + if (jcp.outer_threading && sh == 1 && sw == 1 && dh == 1 && dw == 1) { + /* im[ih][iw][ic] --> imtr[ic][ih][iw] --> col[kh][kw][ic][oh][ow] */ + const dim_t hp = hs - tp; + const dim_t wp = ws - lp; + const dim_t ih_start = saturate(dim_t(0), jcp.ih, hp); + const dim_t ih_end = saturate(dim_t(0), jcp.ih, hp + hb + jcp.kh); + const dim_t iw_start = saturate(dim_t(0), jcp.iw, wp); + const dim_t iw_end = saturate(dim_t(0), jcp.iw, wp + wb + jcp.kw); + + const dim_t ihb = ih_end - ih_start; + const dim_t iwb = iw_end - iw_start; + + const dim_t imtr_ic_stride = ihb * iwb; + const ptrdiff_t imtr_idx_shift = ih_start * iwb + iw_start; + for (dim_t ic = 0; ic < jcp.ic; ic++) { + const ptrdiff_t imtr_idx_ic = ic * imtr_ic_stride - imtr_idx_shift; + for (dim_t ih = ih_start; ih < ih_end; ih++) { + const ptrdiff_t im_idx_ih = ic + ih * im_ih_stride; + const ptrdiff_t imtr_idx_ih = imtr_idx_ic + ih * iwb; + for (dim_t iw = iw_start; iw < iw_end; iw++) + imtr[imtr_idx_ih + iw] = im[im_idx_ih + iw * im_iw_stride]; + } + } + + const dim_t col_ic_str = hb * wb; + const dim_t col_kw_stride = jcp.ic * col_ic_str; + const dim_t col_kh_stride = jcp.kw * col_kw_stride; + + const dim_t oh_init = ih_start - hp; + const dim_t ow_init = iw_start - wp; + for (dim_t kh = 0; kh < jcp.kh; kh++) { + const ptrdiff_t col_idx_kh = kh * col_kh_stride; + const dim_t oh_kh = oh_init - kh; + const dim_t oh_start = saturate(dim_t(0), hb, oh_kh); + const dim_t oh_end = saturate(dim_t(0), hb, oh_kh + ihb); + for (dim_t kw = 0; kw < jcp.kw; kw++) { + const ptrdiff_t col_idx_kw + = col_idx_kh + kw * jcp.ic * col_ic_str; + const dim_t ow_kw = ow_init - kw; + const dim_t imtr_shift = oh_kh * iwb 
+ ow_kw; + const dim_t ow_start = saturate(dim_t(0), wb, ow_kw); + const dim_t ow_end = saturate(dim_t(0), wb, ow_kw + iwb); + for (dim_t ic = 0; ic < jcp.ic; ic++) { + const ptrdiff_t col_idx_ic = col_idx_kw + ic * col_ic_str; + const dim_t imtr_idx_ic = ic * imtr_ic_stride - imtr_shift; + for (dim_t oh = 0; oh < oh_start; oh++) { + const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb; + for (dim_t ow = 0; ow < wb; ++ow) + col[col_idx_oh + ow] = shift; + } + for (dim_t oh = oh_start; oh < oh_end; oh++) { + const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb; + const ptrdiff_t imtr_idx_oh = imtr_idx_ic + oh * iwb; + for (dim_t ow = 0; ow < ow_start; ++ow) + col[col_idx_oh + ow] = shift; + for (dim_t ow = ow_start; ow < ow_end; ++ow) + col[col_idx_oh + ow] + = imtr[imtr_idx_oh + ow] + shift; + for (dim_t ow = ow_end; ow < wb; ++ow) + col[col_idx_oh + ow] = shift; + } + for (dim_t oh = oh_end; oh < hb; oh++) { + const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb; + for (dim_t ow = 0; ow < wb; ++ow) + col[col_idx_oh + ow] = shift; + } + } + } + } + } else { + parallel_nd(jcp.kh, jcp.kw, jcp.ic, hb, + [&](dim_t kh, dim_t kw, dim_t ic, dim_t oh) { + const dim_t hp = tp - kh * dh; + const dim_t ih = (oh + hs) * sh - hp; + const ptrdiff_t col_idx_base + = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh) * wb; + if (ih < 0 || ih >= jcp.ih) + for (dim_t ow = 0; ow < wb; ow++) + col[col_idx_base + ow] = shift; + else { + const dim_t wp = lp - kw * dw; + const dim_t ow_start + = saturate(dim_t(0), wb, div_up(wp, sw) - ws); + const dim_t ow_end + = saturate(dim_t(0), wb, div_up(jcp.iw + wp, sw) - ws); + for (dim_t ow = 0; ow < ow_start; ow++) + col[col_idx_base + ow] = shift; + const dim_t iw_base = ws * sw - wp; + const ptrdiff_t im_idx_base = ih * im_ih_stride + ic; + for (dim_t ow = ow_start; ow < ow_end; ow++) { + const dim_t iw = iw_base + ow * sw; + const ptrdiff_t im_idx = im_idx_base + iw * im_iw_stride; + col[col_idx_base + ow] = im[im_idx] + shift; + } + for (dim_t ow = 
ow_end; ow < wb; ow++) + col[col_idx_base + ow] = shift; + } + }); + } +} + +template void im2col_dt(const conv_gemm_conf_t &jcp, + const void *__restrict im, void *__restrict imtr, + uint8_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); +template void im2col_dt(const conv_gemm_conf_t &jcp, + const void *__restrict im, void *__restrict imtr, + uint8_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); +template void im2col_dt(const conv_gemm_conf_t &jcp, + const void *__restrict im, void *__restrict imtr, float *__restrict col, + dim_t hs, dim_t hb, dim_t ws, dim_t wb); + +template void im2col_dt(const conv_gemm_conf_t &jcp, + const void *__restrict im, void *__restrict imtr, + bfloat16_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); + +/* im[id][ih][iw][ic] <-- col2im_dt_3d(col[od][oh][ow][kd][kh][kw][ic]) */ +template +void col2im_dt(const conv_gemm_conf_t &jcp, const orig_T *__restrict _col, + orig_T *__restrict _im) { + // For performance reasons, use uint16_t as a proxy for bfloat16_t + using T = typename utils::conditional< + data_traits_t::data_type == bf16, uint16_t, orig_T>::type; + const T *__restrict col = reinterpret_cast(_col); + T *__restrict im = reinterpret_cast(_im); + + parallel(0, [&](const int ithr, const int nthr) { + dim_t d_nthr = nstl::min(jcp.id, dim_t(nthr)); + dim_t h_nthr = nstl::min(jcp.ih, dim_t(nthr) / d_nthr); + dim_t w_nthr = nstl::min(jcp.iw, dim_t(nthr) / (d_nthr * h_nthr)); + dim_t d_ithr = 1, d_s = 0, d_e = 0, h_ithr = 1, h_s = 0, h_e = 0, + w_ithr = 1, w_s = 0, w_e = 0; + if (ithr < d_nthr * h_nthr * w_nthr) { + d_ithr = ithr / (h_nthr * w_nthr); + h_ithr = (ithr % (h_nthr * w_nthr)) / w_nthr; + w_ithr = (ithr % (h_nthr * w_nthr)) % w_nthr; + balance211(jcp.id, d_nthr, d_ithr, d_s, d_e); + balance211(jcp.ih, h_nthr, h_ithr, h_s, h_e); + balance211(jcp.iw, w_nthr, w_ithr, w_s, w_e); + } else { + d_nthr = h_ithr = w_ithr = -ithr; + d_s = d_e = h_s = h_e = w_s = w_e = -1; + } + + for_(dim_t id = d_s; 
id < d_e; ++id) + for_(dim_t ih = h_s; ih < h_e; ++ih) + for (dim_t iw = w_s; iw < w_e; ++iw) { + PRAGMA_OMP_SIMD() + for (dim_t ic = 0; ic < jcp.ic; ++ic) { + im[((id * jcp.ih + ih) * jcp.iw + iw) * jcp.ic + ic] = 0; + } + } + + // TODO: reduce region: [0.. oh] --> [h_s * sh .. h_e * sh] + for_(dim_t od = 0; od < jcp.od; ++od) + for_(dim_t oh = 0; oh < jcp.oh; ++oh) + for_(dim_t ow = 0; ow < jcp.ow; ++ow) + for (dim_t kd = 0; kd < jcp.kd; ++kd) { + const dim_t id + = od * jcp.stride_d - jcp.f_pad + kd * (1 + jcp.dilate_d); + if (id < d_s || id >= d_e) continue; + + for (dim_t kh = 0; kh < jcp.kh; ++kh) { + const dim_t ih = oh * jcp.stride_h - jcp.t_pad + + kh * (1 + jcp.dilate_h); + if (ih < h_s || ih >= h_e) continue; + + for (dim_t kw = 0; kw < jcp.kw; ++kw) { + const dim_t iw = ow * jcp.stride_w - jcp.l_pad + + kw * (1 + jcp.dilate_w); + if (iw < w_s || iw >= w_e) continue; + + const size_t col_idx + = (((((od * jcp.oh + oh) * jcp.ow + ow) * jcp.kd + + kd) * jcp.kh + + kh) * jcp.kw + + kw) + * jcp.ic; + const size_t im_idx + = ((id * jcp.ih + ih) * jcp.iw + iw) * jcp.ic; + PRAGMA_OMP_SIMD() + for (dim_t ic = 0; ic < jcp.ic; ++ic) { + im[im_idx + ic] += col[col_idx + ic]; + } + } + } + } + }); +} + +template void col2im_dt(const conv_gemm_conf_t &jcp, + const int32_t *__restrict col, int32_t *__restrict im); + +template void col2im_dt(const conv_gemm_conf_t &jcp, + const float *__restrict col, float *__restrict im); + +template void col2im_dt(const conv_gemm_conf_t &jcp, + const bfloat16_t *__restrict col, bfloat16_t *__restrict im); + +void col2im_3d(const conv_gemm_conf_t &jcp, const float *col, float *im, + dim_t od, int spatial_step, int spatial_block) { + + auto sp_blocked_ker = [&](dim_t ic) { + const size_t col_step = jcp.ks * spatial_block; + const float *__restrict col_ = col + ic * col_step; + float *__restrict im_ic = im + ic * jcp.ih * jcp.iw * jcp.id; + + const dim_t first_oh = spatial_step / jcp.ow; + const dim_t last_oh = (spatial_step + 
spatial_block - 1) / jcp.ow; + const dim_t oh_begin = first_oh; + const dim_t oh_end = last_oh + 1; + const dim_t first_ow = spatial_step % jcp.ow; + const dim_t last_ow = (spatial_step + spatial_block - 1) % jcp.ow; + const dim_t wei_stride + = nstl::min(jcp.ow * jcp.oh, dim_t(spatial_block)); + + dim_t id = od * jcp.stride_d - jcp.f_pad; + for (dim_t kd = 0; kd < jcp.kd; ++kd) { + if (id < 0 || id >= jcp.id) { + col_ += jcp.kh * jcp.kw * wei_stride; + id += (1 + jcp.dilate_d); + continue; + } + + float *__restrict im_ = im_ic + (size_t)id * jcp.ih * jcp.iw; + for_(dim_t kh = 0; kh < jcp.kh; ++kh) + for_(dim_t kw = 0; kw < jcp.kw; ++kw) + for (dim_t oh = oh_begin, col_off = 0; oh < oh_end; ++oh) { + + const dim_t ow_begin = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow; + const dim_t ow_work = ow_end - ow_begin; + + const dim_t ih = oh * jcp.stride_h - jcp.t_pad + + kh * (1 + jcp.dilate_h); + if (ih < 0 || ih >= jcp.ih) { + col_off += ow_work; + continue; + } + + for (dim_t ow = ow_begin; ow < ow_end; ++ow, ++col_off) { + const dim_t iw = ow * jcp.stride_w - jcp.l_pad + + kw * (1 + jcp.dilate_w); + if (iw < 0 || iw >= jcp.iw) { continue; } + + const size_t col_idx + = (kh * jcp.kw + kw) * wei_stride + col_off; + const size_t im_idx = ih * jcp.iw + iw; + im_[im_idx] += col_[col_idx]; + } + } + col_ += jcp.kh * jcp.kw * wei_stride; + id += (1 + jcp.dilate_d); + } + }; + + auto ker = [&](dim_t ic) { + const float *__restrict col_ = col + (size_t)ic * jcp.ks * jcp.os; + float *__restrict im_ic = im + (size_t)ic * jcp.ih * jcp.iw * jcp.id; + + dim_t id = od * jcp.stride_d - jcp.f_pad; + for (dim_t kd = 0; kd < jcp.kd; ++kd) { + if (id < 0 || id >= jcp.id) { + col_ += jcp.kh * jcp.kw * jcp.os; + id += (1 + jcp.dilate_d); + continue; + } + + float *__restrict im_ = im_ic + (size_t)id * jcp.ih * jcp.iw; + + for_(dim_t oh = 0; oh < jcp.oh; ++oh) + for (dim_t kh = 0; kh < jcp.kh; ++kh) { + const dim_t ih = oh * jcp.stride_h 
- jcp.t_pad + + kh * (1 + jcp.dilate_h); + if (ih < 0 || ih >= jcp.ih) continue; + + for_(dim_t ow = 0; ow < jcp.ow; ++ow) + for (dim_t kw = 0; kw < jcp.kw; ++kw) { + const dim_t iw = ow * jcp.stride_w - jcp.l_pad + + kw * (1 + jcp.dilate_w); + if (iw < 0 || iw >= jcp.iw) continue; + + const size_t col_idx + = ((kh * jcp.kw + kw) * jcp.oh + oh) * jcp.ow + ow; + const size_t im_idx = ih * jcp.iw + iw; + im_[im_idx] += col_[col_idx]; + } + } + + col_ += jcp.kh * jcp.kw * jcp.os; + id += (1 + jcp.dilate_d); + } + }; + + const bool blocked_kernel = jcp.os_nb_block > 1; + if (blocked_kernel) + parallel_nd(jcp.ic, sp_blocked_ker); + else + parallel_nd(jcp.ic, ker); +} + +void col2im(const conv_gemm_conf_t &jcp, const float *col, float *im, + int spatial_step, int spatial_block) { + const size_t col_step = jcp.ks * spatial_block; + const size_t im_step = jcp.ih * jcp.iw; + const dim_t iS = jcp.ih * jcp.iw; + + auto sp_blocked_ker = [&](dim_t ic) { + const dim_t wei_stride + = nstl::min(jcp.ow * jcp.oh, dim_t(spatial_block)); + const dim_t first_oh = spatial_step / jcp.ow; + const dim_t last_oh = (spatial_step + spatial_block - 1) / jcp.ow; + const dim_t oh_begin = first_oh; + const dim_t oh_end = last_oh + 1; + const dim_t first_ow = spatial_step % jcp.ow; + const dim_t last_ow = (spatial_step + spatial_block - 1) % jcp.ow; + + float *__restrict img_ithr = im + ic * im_step; + const float *__restrict col_icb = col + ic * col_step; + + if (spatial_step == 0) { + PRAGMA_OMP_SIMD() + for (dim_t is = 0; is < iS; ++is) + img_ithr[is] = 0.; + } + + float *__restrict img_kh = img_ithr; + for (dim_t kh = 0; kh < jcp.kh; ++kh) { + float *__restrict im_ = img_kh; + for (dim_t kw = 0; kw < jcp.kw; ++kw) { + const float *__restrict col_ = col_icb; + for (dim_t oh = oh_begin; oh < oh_end; ++oh) { + const dim_t ow_begin = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end + = (oh == last_oh) ? 
(last_ow + 1) : jcp.ow; + const dim_t ow_work = ow_end - ow_begin; + + const dim_t ih = oh * jcp.stride_h - jcp.t_pad; + const dim_t ih_ = ih + kh * (1 + jcp.dilate_h); + if (ih_ < 0 || ih_ >= jcp.ih) { + col_ += ow_work; + continue; + } + for (dim_t ow = ow_begin; ow < ow_end; ++ow, ++col_) { + const dim_t iw = ow * jcp.stride_w - jcp.l_pad; + const dim_t iw_ = iw + kw * (1 + jcp.dilate_w); + if (iw_ < 0 || iw_ >= jcp.iw) continue; + + const size_t im_idx = ih * jcp.iw + iw; + im_[im_idx] += *col_; + } + } + col_icb += wei_stride; + im_ += (1 + jcp.dilate_w); + } + img_kh += (jcp.iw * (1 + jcp.dilate_h)); + } + }; + + auto ker = [&](dim_t ic) { + float *__restrict im_ = im + ic * im_step; + const float *__restrict col_ = col + ic * col_step; + PRAGMA_OMP_SIMD() + for (dim_t is = 0; is < iS; ++is) + im_[is] = 0.; + + for_(dim_t kh = 0; kh < jcp.kh; ++kh) + for (dim_t oh = 0; oh < jcp.oh; ++oh) { + const dim_t ih + = oh * jcp.stride_h - jcp.t_pad + kh * (1 + jcp.dilate_h); + if (ih < 0 || ih >= jcp.ih) continue; + + for_(dim_t kw = 0; kw < jcp.kw; ++kw) + for (dim_t ow = 0; ow < jcp.ow; ++ow) { + const dim_t iw = ow * jcp.stride_w - jcp.l_pad + + kw * (1 + jcp.dilate_w); + if (iw < 0 || iw >= jcp.iw) continue; + + const size_t col_idx + = ((kh * jcp.kw + kw) * jcp.oh + oh) * jcp.ow + ow; + const size_t im_idx = ih * jcp.iw + iw; + im_[im_idx] += col_[col_idx]; + } + } + }; + + const bool blocked_kernel = jcp.os_nb_block > 1; + if (blocked_kernel) + parallel_nd(jcp.ic, sp_blocked_ker); + else + parallel_nd(jcp.ic, ker); +} + +status_t init_conf(conv_gemm_conf_t &jcp, + memory_tracking::registrar_t &scratchpad, const convolution_desc_t &cd, + memory_desc_t &src_md, memory_desc_t &weights_md, memory_desc_t &dst_md, + memory_desc_t &bias_md, primitive_attr_t &attr, int max_threads, + bool check_postops) { + const memory_desc_wrapper src_d(&src_md); + const memory_desc_wrapper weights_d(&weights_md); + const memory_desc_wrapper dst_d(&dst_md); + + const bool with_groups 
= weights_d.ndims() == src_d.ndims() + 1; + const int ndims = src_d.ndims(); + const int is_1d = ndims == 3; + const int is_3d = ndims == 5; + + jcp.prop_kind = cd.prop_kind; + + jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; + jcp.mb = src_d.dims()[0]; + + jcp.oc = dst_d.dims()[1] / jcp.ngroups; + jcp.ic = src_d.dims()[1] / jcp.ngroups; + jcp.id = is_3d ? src_d.dims()[2] : 1; + jcp.ih = is_1d ? 1 : src_d.dims()[ndims - 2]; + jcp.iw = src_d.dims()[ndims - 1]; + jcp.od = is_3d ? dst_d.dims()[2] : 1; + jcp.oh = is_1d ? 1 : dst_d.dims()[ndims - 2]; + jcp.ow = dst_d.dims()[ndims - 1]; + + jcp.kd = is_3d ? weights_d.dims()[with_groups + 2] : 1; + jcp.kh = is_1d ? 1 : weights_d.dims()[with_groups + ndims - 2]; + jcp.kw = weights_d.dims()[with_groups + ndims - 1]; + + jcp.f_pad = is_3d ? cd.padding[0][0] : 0; + jcp.t_pad = is_1d ? 0 : cd.padding[0][ndims - 4]; + jcp.l_pad = cd.padding[0][ndims - 3]; + + jcp.stride_d = is_3d ? cd.strides[0] : 1; + jcp.stride_h = is_1d ? 1 : cd.strides[ndims - 4]; + jcp.stride_w = cd.strides[ndims - 3]; + + jcp.dilate_d = is_3d ? cd.dilates[0] : 0; + jcp.dilate_h = is_1d ? 
0 : cd.dilates[ndims - 4]; + jcp.dilate_w = cd.dilates[ndims - 3]; + + jcp.with_bias = cd.bias_desc.format_kind != format_kind::undef + || cd.diff_bias_desc.format_kind != format_kind::undef; + + jcp.is = jcp.ih * jcp.iw; + jcp.os = jcp.oh * jcp.ow; + jcp.ks = jcp.kh * jcp.kw * jcp.kd; + + jcp.signed_input = src_d.data_type() == data_type::s8; + + jcp.outer_threading = false; + + jcp.zp = zero_point_config_t(attr); + jcp.b_pad = nstl::max((jcp.oh - 1) * jcp.stride_h + + (jcp.kh - 1) * (jcp.dilate_h + 1) + - (jcp.ih + jcp.t_pad - 1), + dim_t(0)); + jcp.r_pad = nstl::max((jcp.ow - 1) * jcp.stride_w + + (jcp.kw - 1) * (jcp.dilate_w + 1) + - (jcp.iw + jcp.l_pad - 1), + dim_t(0)); + jcp.e_pad = nstl::max((jcp.od - 1) * jcp.stride_d + + (jcp.kd - 1) * (jcp.dilate_d + 1) + - (jcp.id + jcp.f_pad - 1), + dim_t(0)); + + const bool zp_src_with_padding = jcp.zp.src_exists && padding_exists(jcp); + + if (zp_src_with_padding) { + jcp.zp.src_pad_comp = zero_point_pad_comp_config_t(jcp.f_pad, jcp.e_pad, + jcp.t_pad, jcp.b_pad, jcp.l_pad, jcp.r_pad, jcp.stride_d, + jcp.stride_h, jcp.stride_w, jcp.od, jcp.oh, jcp.ow); + } + + const auto set_or_check_tags + = [&](format_tag_t desired_src_tag, format_tag_t desired_dst_tag, + bool is_src_s8) -> status_t { + using namespace format_tag; + auto src_tag = any, dst_tag = any; + + if (src_d.format_kind() == format_kind::any) { + CHECK(memory_desc_init_by_tag(src_md, desired_src_tag)); + src_tag = desired_src_tag; + } else { + src_tag = src_d.mb_stride_relaxed_match( + nwc, nhwc, ndhwc, ncw, nchw, ncdhw); + } + + if (dst_d.format_kind() == format_kind::any) { + CHECK(memory_desc_init_by_tag(dst_md, desired_dst_tag)); + dst_tag = desired_dst_tag; + } else { + dst_tag = dst_d.mb_stride_relaxed_match( + nwc, nhwc, ndhwc, ncw, nchw, ncdhw); + } + + if (src_tag == format_tag::undef || dst_tag == format_tag::undef) + return status::unimplemented; + if (src_tag != dst_tag) return status::unimplemented; + + if (jcp.with_bias && bias_md.format_kind == 
format_kind::any) + CHECK(memory_desc_init_by_tag(bias_md, x)); + + const bool is_nspc = utils::one_of(src_tag, nwc, nhwc, ndhwc); + jcp.is_nspc = is_nspc; + + memory_desc_t want_wei_md = weights_md; + auto wei_tag = is_nspc + ? (with_groups ? utils::pick(ndims - 3, wigo, hwigo, dhwigo) + : utils::pick(ndims - 3, wio, hwio, dhwio)) + : (with_groups ? utils::pick(ndims - 3, goiw, goihw, goidhw) + : utils::pick(ndims - 3, oiw, oihw, oidhw)); + CHECK(memory_desc_init_by_tag(want_wei_md, wei_tag)); + + if (is_src_s8) { + want_wei_md.extra.flags = 0 + | memory_extra_flags::compensation_conv_s8s8 + | memory_extra_flags::scale_adjust; + want_wei_md.extra.compensation_mask + = (1 << 0) + (with_groups ? (1 << 1) : 0); + want_wei_md.extra.scale_adjust + = platform::s8s8_weights_scale_factor(); + } + + if (jcp.zp.src_exists) set_zp_src_comp_flags(want_wei_md, with_groups); + + if (weights_md.format_kind == format_kind::any) { + weights_md = want_wei_md; + return status::success; + } + return (want_wei_md == weights_md) ? status::success + : status::unimplemented; + }; + + const bool is_bwd_d = jcp.prop_kind == backward_data; + const bool is_bwd_w = jcp.prop_kind == backward_weights; + const bool is_fwd = !is_bwd_d && !is_bwd_w; + + const auto dst_max_size + = static_cast(jcp.iw) * jcp.ih * jcp.id * jcp.ic * 4; + const auto src_max_size + = static_cast(jcp.ow) * jcp.oh * jcp.od * jcp.oc * 4; + VDISPATCH_CONV_IC(dst_max_size <= INT_MAX && src_max_size <= INT_MAX, + VERBOSE_UNSUPPORTED_FEATURE, + "dst/scr size > INT_MAX is not supported"); + + bool is_int8_conv = (is_fwd ? utils::one_of(src_d.data_type(), s8, u8) + : utils::one_of(dst_d.data_type(), s8, u8)) + && weights_d.data_type() == s8; + + auto default_dat_tag = is_int8_conv + ? 
utils::pick(ndims - 3, format_tag::nwc, format_tag::nhwc, + format_tag::ndhwc) + : utils::pick(ndims - 3, format_tag::ncw, format_tag::nchw, + format_tag::ncdhw); + const status_t check_tag_status = set_or_check_tags(default_dat_tag, + default_dat_tag, src_md.data_type == data_type::s8); + VDISPATCH_CONV_IC(check_tag_status == status::success, + VERBOSE_UNSUPPORTED_TAG_S, "src"); + + // Does int8 conv ever need to support ncsp input format + VDISPATCH_CONV_IC( + !(is_int8_conv && !src_d.matches_one_of_tag(default_dat_tag)), + VERBOSE_UNSUPPORTED_DT); + + CHECK(attr.set_default_formats(&dst_md)); + + jcp.post_ops = attr.post_ops_; + + const int eltwise_ind = jcp.post_ops.find(primitive_kind::eltwise); + jcp.with_eltwise = eltwise_ind != -1; + const int binary_ind = jcp.post_ops.find(primitive_kind::binary); + const int prelu_ind = jcp.post_ops.find(primitive_kind::prelu); + jcp.with_binary = !everyone_is(-1, binary_ind, prelu_ind); + const int sum_ind = jcp.post_ops.find(primitive_kind::sum); + jcp.with_sum = sum_ind != -1; + + bool is_bf16_conv = false + || (is_fwd + && utils::everyone_is( + bf16, src_d.data_type(), weights_d.data_type())) + || (is_bwd_d + && utils::everyone_is( + bf16, dst_d.data_type(), weights_d.data_type())) + || (is_bwd_w + && utils::everyone_is( + bf16, src_d.data_type(), dst_d.data_type())); + VDISPATCH_CONV_IC(!(is_bf16_conv && !platform::has_data_type_support(bf16)), + VERBOSE_UNSUPPORTED_DT); + + const int vlen = std::max(platform::get_vector_register_size(), 4); + const int data_size = (is_int8_conv ? 1 : (is_bf16_conv ? 2 : 4)); + const int simd_w = vlen / data_size; + + jcp.os_block = jcp.os; + jcp.os_nb_block = 1; + jcp.oc_block = jcp.oc; + jcp.ic_block = jcp.ic; + jcp.loop_order = gemm_loop_rlb; + jcp.nthr_oc = 1; + + jcp.oh_block = is_fwd ? jcp.oh : jcp.ih; + jcp.ow_block = is_fwd ? 
jcp.ow : jcp.iw; + + using namespace memory_tracking::names; + bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; + + // TODO: maybe mitigate blocking restriction + const auto L2 = platform::get_per_core_cache_size(2) / data_size; + const int gemm_thrld = 64 * 1024; + + // Heuristic threshold for requested scratchpad memory to avoid + // possible crash on memory allocation: + // 1Gb or size of the buffers already used for this convolution proportional + // to the number of threads and multiplied by a heuristic coefficient (15) + const size_t zp_src_pad_comp_size = zp_src_with_padding + ? (jcp.oc * jcp.ngroups * jcp.zp.src_pad_comp.d + * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w) + : 0u; + const size_t zp_src_comp_size = jcp.zp.src_is_common + ? utils::rnd_up(jcp.oc * jcp.ngroups, + platform::get_cache_line_size() / sizeof(int)) + : 0u; + + const size_t weights_size = weights_d.size() + + (zp_src_comp_size + zp_src_pad_comp_size) * sizeof(int32_t); + + static constexpr size_t scratchpad_limit_by_absolute_value = (size_t)1 + << 30; // 1Gb + const size_t scratchpad_limit_by_tensor_sizes + = 15 * max_threads * (src_d.size() + weights_size + dst_d.size()); + const size_t scratchpad_limit + = nstl::min(scratchpad_limit_by_absolute_value, + scratchpad_limit_by_tensor_sizes); + + if (is_int8_conv) { + if (is_fwd) { + jcp.im2col_sz + = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, + !jcp.signed_input) + ? 
(ptrdiff_t)jcp.ic * jcp.ks * jcp.os + : 0; + + dim_t wei_size = jcp.oc * jcp.ic * jcp.kh * jcp.kw; + bool is_blocking_applicable = true && is_fwd && jcp.im2col_sz + && !is_3d && jcp.dilate_h == 0 && jcp.dilate_w == 0 + && !is_depthwise && wei_size < L2 / 2; + if (is_blocking_applicable) { + // looking for oh and ow blocking + dim_t h_block {jcp.oh_block}, w_block {jcp.ow_block}; + dim_t ic = jcp.ic; + dim_t oc = jcp.oc; + dim_t iw = jcp.iw; + dim_t ow = jcp.ow; + dim_t oh = jcp.oh; + dim_t os = oh * ow; + + // 1. cache requirement + dim_t row_size = ic * ow * jcp.ks + 2 * (ic * iw + oc * ow); + // Heuristic rule: gemm needed a lot of memory for internal + // usage + row_size *= 5; + // memory for accumulators + row_size += oc * ow * sizeof(uint32_t); + // memory for transposition + row_size += ic * iw; + + h_block = nstl::max( + dim_t(1), nstl::min(oh, div_up(dim_t(L2), row_size))); + if (h_block == 1) { + dim_t col_size = ic * jcp.ks + 2 * (ic + oc); + if (is_int8_conv) { + col_size *= 5; + col_size += oc * sizeof(uint32_t); + col_size += ic; + } + w_block = nstl::max(dim_t(1), + nstl::min(ow, div_up(dim_t(L2), col_size))); + } + + // 2. 
threading requirement + if (h_block != oh) + h_block = nstl::max(dim_t(1), rnd_dn(h_block, dim_t(4))); + if (w_block != ow) + w_block = nstl::max(dim_t(1), rnd_dn(w_block, simd_w)); + + float thr_eff = 0.f; + float thr_eff_treshold = 0.9f; + if (w_block == ow) { + do { + dim_t nb_h = div_up(oh, h_block); + dim_t work = jcp.ngroups * jcp.mb * jcp.od * nb_h; + float disb = (float)oh / rnd_up(oh, h_block); + thr_eff = (float)work / rnd_up(work, max_threads); + thr_eff = (thr_eff + disb) / 2.f; + if (thr_eff >= thr_eff_treshold) break; + h_block = rnd_dn(h_block - 4, 4); + } while (h_block > 0); + } + if (thr_eff + < thr_eff_treshold) // we didn't find suitable h_block + { + h_block = 1; + int nb_h = oh; + do { + dim_t nb_w = div_up(ow, w_block); + dim_t work_amount = jcp.ngroups * jcp.mb * nb_h * nb_w; + float disb = (float)ow / rnd_up(ow, w_block); + thr_eff = (float)work_amount + / rnd_up(work_amount, max_threads); + thr_eff = (thr_eff + disb) / 2.f; + if (thr_eff > thr_eff_treshold) break; + w_block = rnd_dn(w_block - simd_w, simd_w); + } while (w_block > 0); + } + h_block = nstl::max(dim_t(1), h_block); + w_block = nstl::max(dim_t(1), w_block); + dim_t inner_work = div_up(os, simd_w) * div_up(oc, simd_w); + const float inner_thr_eff + = (float)inner_work / rnd_up(inner_work, max_threads); + if (thr_eff >= inner_thr_eff / 2 && h_block > 0 + && w_block > 0) { + jcp.oh_block = h_block; + jcp.ow_block = w_block; + jcp.outer_threading = true; + } + // updating jcp.im2col_sz + if (jcp.oh_block != 1) jcp.ow_block = ow; + jcp.im2col_sz + = (ptrdiff_t)ic * jcp.ks * jcp.oh_block * jcp.ow_block; + } + // For threading selection in bwd_d we do: + // 1. Rough estimation of efficiency for inner and outer threading. + // 2. Gemm size estimation in assumption that it does not work + // so effectively for small sizes. + // 64K - this is heuristic gemm size per thread threshold. 
+ const int gemm_thrld = 64 * 1024; + if (!jcp.outer_threading && !is_3d) { + bool is_depthwise + = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; + const dim_t outer_work = jcp.ngroups * jcp.mb; + const float outer_thr_eff + = (float)outer_work / rnd_up(outer_work, max_threads); + const size_t inner_work + = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w); + const float inner_thr_eff + = (float)inner_work / rnd_up(inner_work, max_threads); + jcp.outer_threading + = (is_depthwise + || (jcp.is / max_threads < 64 && jcp.mb != 1)) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (jcp.os * jcp.ic * jcp.oc) / max_threads + < gemm_thrld); + } + jcp.nthr = jcp.outer_threading ? max_threads : 1; + scratchpad.book( + key_conv_gemm_col, jcp.nthr * jcp.im2col_sz); + scratchpad.book(key_conv_int_dat_in_acc_dt, + jcp.nthr * jcp.oh_block * jcp.ow_block * jcp.oc); + scratchpad.book( + key_conv_gemm_imtr, jcp.nthr * jcp.id * jcp.is * jcp.ic); + } else if (is_bwd_d) { + jcp.im2col_sz + = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, + !jcp.signed_input) + ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os * jcp.od + : 0; + + bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; + const size_t outer_work = jcp.ngroups * jcp.mb; + const float outer_thr_eff + = (float)outer_work / rnd_up(outer_work, max_threads); + const size_t inner_work + = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w); + const float inner_thr_eff + = (float)inner_work / rnd_up(inner_work, max_threads); + jcp.outer_threading = !is_3d + && (is_depthwise + || (jcp.is / max_threads < 64 && jcp.mb != 1)) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (jcp.is * jcp.ic * jcp.oc) / max_threads + < gemm_thrld); + + jcp.nthr = jcp.outer_threading ? 
max_threads : 1; + scratchpad.book( + key_conv_gemm_col, jcp.nthr * jcp.im2col_sz); + scratchpad.book(key_conv_int_dat_in_acc_dt, + jcp.nthr * jcp.is * jcp.id * jcp.ic); + } else if (is_bwd_w) { + assert(!"unimplemented prop_kind"); + return status::unimplemented; + } + } else { + jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, + jcp.ks == 1, !jcp.signed_input) + ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os + : 0; + if (jcp.is_nspc && is_fwd) { + const size_t wei_size + = static_cast(jcp.oc) * jcp.ic * jcp.kh * jcp.kw; + bool is_blocking_applicable = true && is_fwd && jcp.im2col_sz + && !is_3d && jcp.dilate_h == 0 && jcp.dilate_w == 0 + && !is_depthwise && wei_size < static_cast(L2) / 2; + // Logic for blocking for f32_nspc gemm convolution follows that of + // int8_nspc gemm convolution. Currently, not optimized for f32 + // data type. + if (is_blocking_applicable) { + // looking for oh and ow blocking + size_t h_block = jcp.oh_block; + size_t w_block = jcp.ow_block; + + const size_t ic = jcp.ic; + const size_t oc = jcp.oc; + const size_t iw = jcp.iw; + const size_t ow = jcp.ow; + const size_t oh = jcp.oh; + const size_t os = oh * ow; + + // 1. cache requirement + size_t row_size = ic * ow * jcp.ks * data_size + + 2 * (ic * iw + oc * ow) * data_size; + // Heuristic rule: gemm needed a lot of memory for internal + // usage + row_size *= 5; + // memory for accumulators + row_size += oc * ow * data_size; + // memory for transposition + row_size += ic * iw * data_size; + + const size_t L2_rows = div_up(L2, row_size); + h_block = saturate(size_t {1}, L2_rows, oh); + if (h_block == 1) { + size_t col_size = ic * jcp.ks * data_size + + 2 * (ic + oc) * data_size; + const size_t L2_cols = div_up(L2, col_size); + w_block = saturate(size_t {1}, L2_cols, ow); + } + + // 2. 
threading requirement + if (h_block != oh) + h_block = nstl::max(size_t {1}, rnd_dn(h_block, 4)); + if (w_block != ow) + w_block = nstl::max(size_t {1}, rnd_dn(w_block, simd_w)); + + float thr_eff = 0.f; + float thr_eff_treshold = 0.9f; + if (w_block == ow) { + do { + size_t nb_h = div_up(oh, h_block); + size_t work = jcp.ngroups * jcp.mb * jcp.od * nb_h; + float disb = (float)oh / rnd_up(oh, h_block); + thr_eff = (float)work / rnd_up(work, max_threads); + thr_eff = (thr_eff + disb) / 2.f; + if (thr_eff >= thr_eff_treshold) break; + + if (h_block < 4) + h_block = 0; + else + h_block = rnd_dn(h_block - 4, 4); + } while (h_block > 0); + } + if (thr_eff + < thr_eff_treshold) // we didn't find suitable h_block + { + h_block = 1; + size_t nb_h = oh; + do { + size_t nb_w = div_up(ow, w_block); + size_t work_amount = jcp.ngroups * jcp.mb * nb_h * nb_w; + float disb = (float)ow / rnd_up(ow, w_block); + thr_eff = (float)work_amount + / rnd_up(work_amount, max_threads); + thr_eff = (thr_eff + disb) / 2.f; + if (thr_eff > thr_eff_treshold) break; + + if (w_block < static_cast(simd_w)) + w_block = 0; + else + w_block = rnd_dn(w_block - simd_w, simd_w); + } while (w_block > 0); + } + h_block = nstl::max(size_t {1}, h_block); + w_block = nstl::max(size_t {1}, w_block); + const size_t inner_work + = div_up(os, simd_w) * div_up(oc, simd_w); + const float inner_thr_eff + = (float)inner_work / rnd_up(inner_work, max_threads); + if (thr_eff >= inner_thr_eff / 2 && h_block > 0 + && w_block > 0) { + jcp.oh_block = static_cast(h_block); + jcp.ow_block = static_cast(w_block); + jcp.outer_threading = true; + } + // updating jcp.im2col_sz + if (jcp.oh_block != 1) jcp.ow_block = static_cast(ow); + jcp.im2col_sz + = (ptrdiff_t)ic * jcp.ks * jcp.oh_block * jcp.ow_block; + } + // For threading selection in fwd_d we do: + // 1. Rough estimation of efficiency for inner and outer threading. + // 2. Gemm size estimation in assumption that it does not work + // so effectively for small sizes. 
+ // 64K - this is heuristic gemm size per thread threshold. + constexpr size_t gemm_thrld = 64 * 1024; + if (!jcp.outer_threading && !is_3d) { + bool is_depthwise + = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; + const size_t outer_work = jcp.ngroups * jcp.mb; + const float outer_thr_eff + = (float)outer_work / rnd_up(outer_work, max_threads); + const size_t inner_work + = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w); + const float inner_thr_eff + = (float)inner_work / rnd_up(inner_work, max_threads); + jcp.outer_threading + = (is_depthwise + || (jcp.is / max_threads < 64 && jcp.mb != 1)) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (static_cast(jcp.os) * jcp.ic + * jcp.oc) + / max_threads + < gemm_thrld); + } + jcp.nthr = jcp.outer_threading ? max_threads : 1; + const size_t gemm_col_datatype_size + = is_bf16_conv ? sizeof(bfloat16_t) : sizeof(float); + + scratchpad.book(key_conv_gemm_col, jcp.nthr * jcp.im2col_sz, + gemm_col_datatype_size); + if (is_bf16_conv) { + scratchpad.book(key_conv_gemm_acc, + jcp.nthr * static_cast(jcp.oh_block) + * jcp.ow_block * jcp.oc); + } + + scratchpad.book(key_conv_gemm_imtr, + jcp.nthr * static_cast(jcp.id) * jcp.is * jcp.ic, + gemm_col_datatype_size); + if (is_bf16_conv && jcp.with_bias + && one_of(data_type::bf16, cd.diff_bias_desc.data_type, + cd.bias_desc.data_type)) { + scratchpad.book( + key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc); + } + + } else if (!jcp.is_nspc && is_fwd) { + const dim_t sh = jcp.stride_h; + const dim_t sw = jcp.stride_w; + const dim_t spatial = jcp.mb * jcp.ngroups * jcp.od * jcp.os; + dim_t K = jcp.ic * jcp.ks; + + // There is some heuristics in the definition of + // inner/outer threading cross point due to the nature of the + // gemm implementation which we cannot control + bool is_blocking_applicable = true && !is_3d + && (!jcp.im2col_sz + // spatial is small + || spatial >= max_threads * simd_w + // inner threading work is greater then outer + // threading work + || jcp.os < 
jcp.mb * jcp.ngroups * jcp.od + // im2col is big + || (sw == 1 && K <= 0.05 * jcp.oc)) + // heuristic condition + && (jcp.im2col_sz + || (jcp.ic / jcp.oc < 42 + && jcp.ic * jcp.oc * jcp.is < 1024)); + + if (is_blocking_applicable) { + const dim_t min_oc_block = 8; + const dim_t min_os_block = simd_w; + const float non_cache_access = 20; + const float strided_im2col_k = 8; + const float thr_disb_k = 8; + const float thr_mem_eff_k {1}, oc_disb_k {1}, os_disb_k {1}, + ic_disb_k {1}, reg_osb_disb_k {1}, gemm_eff_k {0.5}, + gemm_calc_eff_k {1}; + const float k_sum = thr_disb_k + oc_disb_k + os_disb_k + + ic_disb_k + reg_osb_disb_k + thr_mem_eff_k + + gemm_eff_k + gemm_calc_eff_k; + + auto calc_max_icb + = [=](dim_t nthr_oc, dim_t ocb, dim_t osb, + dim_t oc_per_thr, dim_t os_per_thr) { + const dim_t block_out_size = ocb * osb; + // TODO: need more precise calculation if stride more than + // kernel size + const dim_t inp_row_size = sh * sw * osb; + dim_t max_icb = 1; + if (jcp.im2col_sz) { + const dim_t col_row_size = jcp.ks * osb; + if (osb >= os_per_thr) { // one pass by os + const dim_t wei_col_size = jcp.ks * ocb; + max_icb = L2 / (inp_row_size + col_row_size); + if (ocb < oc_per_thr) { + max_icb = nstl::min(max_icb, + (L2 - block_out_size) + / (col_row_size + + wei_col_size)); + } + } else { + const dim_t wei_col_size = jcp.ks * oc_per_thr; + max_icb = (L2 - block_out_size) + / (inp_row_size + col_row_size + + wei_col_size); + } + } else { + if (osb >= os_per_thr) + max_icb = L2 / inp_row_size; + else { + const dim_t wei_col_size = jcp.ks * oc_per_thr; + max_icb = L2 / (inp_row_size + wei_col_size); + } + } + if (max_icb < jcp.ic) { + if (jcp.im2col_sz) { + const dim_t col_row_size = jcp.ks * osb; + const dim_t wei_col_size = jcp.ks * oc_per_thr; + max_icb = (L2 - block_out_size) + / (inp_row_size + col_row_size + + wei_col_size); + } + } + return max_icb; + }; + + dim_t best_ocb {1}, best_osb {1}; + dim_t best_nthr_oc {1}; + dim_t best_icb {jcp.ic}; + float 
best_thr_eff = 0; + + auto try_cfg = [&](dim_t nthr_oc, dim_t ocb, dim_t osb) { + // for given nthr_oc, oc block: + // 1. find ic block to fit into cache + // 2. estimate efficiency basing on rules and heuristic: + // - Minimize im2col cost + // - ratio of FMA number to data size + // - gemm works better if M divided by 48 and N divided by 8 + + const dim_t max_oc = div_up(jcp.oc, nthr_oc); + const dim_t min_oc = nstl::max(dim_t(1), jcp.oc / nthr_oc); + const dim_t max_os + = div_up(spatial, (dim_t)(max_threads / nthr_oc)); + ocb = utils::saturate(min_oc_block, max_oc, ocb); + osb = utils::saturate(min_os_block, max_os, osb); + + // The computation of max_thr_size and min_thr_size is + // based on work balance using: + // balance2D(max_threads, i, spatial, sp_start, sp_end, + // jcp.oc, oc_start, oc_end, nthr_oc); + size_t max_thr_size = 1; + { + const dim_t min_os = div_up( + spatial, (dim_t)div_up(max_threads, nthr_oc)); + /* --- compute max_thr_size ------------ + may not necessarily be (max_oc * max_os) + thr_size = thr_oc * (spatial /nthrs_in_slice); + with spatial as const, thr_size has maxima when + (A: thr_oc is max) and (B: nthrs_in_slice is min) + */ + if (jcp.oc % nthr_oc > max_threads % nthr_oc) { + // If (A) and (B) are true together, then it is the + // global max + max_thr_size = max_oc * max_os; + } else { + const size_t oc_max_os_min = max_oc * min_os; + const size_t oc_min_os_max = min_oc * max_os; + max_thr_size + = nstl::max(oc_max_os_min, oc_min_os_max); + } + } + + size_t min_thr_size {1}; + { + const dim_t min_os = nstl::max(dim_t(1), + spatial / div_up(max_threads, nthr_oc)); + /* --- compute min_thr_size ------------ + may not necessarily be (min_oc * min_y) + thr_size = thr_oc * (spatial /nthrs_in_slice); + with spatial as const, thr_size has minima when + (A: thr_oc is min) and (B: nthrs_in_slice is max) + */ + if (max_threads % nthr_oc > jcp.oc % nthr_oc) { + // If (A) and (B) are true together, then it is the + // global min + 
min_thr_size = min_oc * min_os; + } else { + const size_t oc_max_os_min = max_oc * min_os; + const size_t oc_min_os_max = min_oc + * (size_t)(spatial + / (dim_t)(max_threads / nthr_oc)); + min_thr_size + = nstl::min(oc_max_os_min, oc_min_os_max); + } + } + auto thr_disb = (float)min_thr_size / max_thr_size; + + const dim_t oc_per_thr = max_oc; + const dim_t os_per_thr = max_os; + ocb = nstl::min(oc_per_thr, ocb); + const dim_t os_max = nstl::min(jcp.os, os_per_thr); + osb = nstl::min(os_max, osb); + + // -- selecting icb --------------------- + dim_t max_ic_block = calc_max_icb( + nthr_oc, ocb, osb, oc_per_thr, os_per_thr); + // if we don't fit into cache then access to memory is + // expensive + dim_t mem_access_cost + = (max_ic_block < 1) ? non_cache_access : 1; + max_ic_block = nstl::max(dim_t(1), max_ic_block); + dim_t icb = nstl::max( + dim_t(1), jcp.ic / div_up(jcp.ic, max_ic_block)); + dim_t nb_ic = div_up(jcp.ic, icb); + dim_t kb = icb * jcp.ks; + dim_t kb_caligned = rnd_up(kb, simd_w); + + // -- mem efficiency ------------ + const size_t out_size + = oc_per_thr * rnd_up(os_per_thr, simd_w); + const size_t out_ops = mem_access_cost * out_size + * ((icb == jcp.ic) ? 1 : (2 * nb_ic - 1)); + const dim_t osb_caligned = rnd_up(osb, simd_w); + const size_t inp_size + = jcp.ic * rnd_up(os_per_thr * sh * sw, simd_w); + size_t inp_ops = 0; + size_t col_ops = 0; + // TODO: simplify calculations + if (jcp.im2col_sz) { + inp_ops = mem_access_cost * jcp.ks * inp_size; + const float col_tail_koeff = (float)osb_caligned / osb; + col_ops = mem_access_cost + * (jcp.ks * inp_size * col_tail_koeff + + jcp.ks * inp_size * col_tail_koeff); + if (sw != 1) // im2col with strides is much slower + col_ops *= strided_im2col_k; + } else { + inp_ops = mem_access_cost * jcp.ks * inp_size; + } + // TODO: what about groups? 
+ const size_t wei_size = oc_per_thr * rnd_up(K, simd_w); + const size_t wei_ops = mem_access_cost * wei_size; + // ratio of real FMA to number of memory ops + const float thr_mem_eff + = (((float)os_per_thr / simd_w) * oc_per_thr * K) + / (inp_ops + col_ops + wei_ops + out_ops); + + auto oc_disb = (float)oc_per_thr / rnd_up(oc_per_thr, ocb); + auto os_disb = (float)os_max / rnd_up(os_max, osb); + auto ic_disb = (float)jcp.ic / rnd_up(jcp.ic, icb); + + auto reg_osb_disb = (float)osb / rnd_up(osb, 3 * simd_w); + + // Heuristics + const float gemm_eff = ((float)osb * ocb * kb) + / ((float)oc_per_thr * os_per_thr * K); + + // number of FMA to memory size + const float gemm_calc_eff + = (((float)osb / simd_w) * ocb * kb) + / (osb_caligned * kb + ocb * kb_caligned + + ocb * osb_caligned); + // optimization: remove pow, when corresponding weight is 1 + const float res_eff = pow(pow(thr_disb, thr_disb_k) + * oc_disb // pow(oc_disb, oc_disb_k) + * os_disb // pow(os_disb, os_disb_k) + * ic_disb // pow(ic_disb, ic_disb_k) + // pow(reg_osb_disb, reg_osb_disb_k) + * reg_osb_disb + //pow(thr_mem_eff, thr_mem_eff_k) + * thr_mem_eff + //pow(gemm_calc_eff, gemm_calc_eff_k) + * pow(gemm_eff, gemm_eff_k) * gemm_calc_eff, + 1.f / k_sum); + + if (res_eff > best_thr_eff) { + best_thr_eff = res_eff; + best_nthr_oc = nthr_oc; + best_ocb = ocb; + best_osb = osb; + best_icb = icb; + } + }; + + auto explore_cfg = [&](dim_t nthr_oc, dim_t ocb, dim_t osb) { + try_cfg(nthr_oc, ocb, osb); + // few combinations to try, as the eff is better when ocb is + // multiple of 8 and osb is multiple of 48 or min_os_block. 
+ try_cfg(nthr_oc, rnd_dn(ocb, 8), rnd_dn(osb, 48)); + try_cfg(nthr_oc, rnd_up(ocb, 8), rnd_dn(osb, 48)); + try_cfg(nthr_oc, rnd_up(ocb, 8), rnd_up(osb, min_os_block)); + try_cfg(nthr_oc, rnd_up(ocb, 8), rnd_up(osb, 48)); + }; + + for (dim_t nthr_oc = 1; nthr_oc <= max_threads; ++nthr_oc) { + const dim_t max_oc_per_thr = div_up(jcp.oc, nthr_oc); + dim_t max_os_per_thr + = div_up(spatial, max_threads / nthr_oc); + dim_t ocb {1}, osb {1}, icb {1}; + if (jcp.im2col_sz) { + try_cfg(nthr_oc, max_oc_per_thr, max_os_per_thr); + if ((best_ocb == max_oc_per_thr) + && (best_osb == max_os_per_thr) + && (best_icb == jcp.ic)) { + // best case scenario + continue; + } + + /* + memory eq from calc_max_icb(): + max_icb = (L2 - block_out_size) + / (inp_row_size + col_row_size + + wei_col_size); + icb*sh*sw*osb + icb*jcp.ks*osb + + jcp.ks*max_oc_per_thr*icb + osb *ocb = L2 + + a_k*icb*osb + b_k*icb + osb*ocb = L2 + We would like to maximize icb*osb*ocb (FMA). + + Unfortunately, above eq and constraint doesn't have + a single solution. So, based on experiments we try + few scenarios. + 1. icb = jcp.ic + 2. Solving the constraint eq we get + osb = (L2 - 2*b_k*icb)/(2*a_k*icb) >= min_oc_block + => icb <= (L2)/(2* min_oc_block * a_k + 2 * b_k) + 3. Maximize channel compute: + ocb = max_oc_per_thr; + icb = jcp.ic; + */ + dim_t a_k = sh * sw + jcp.ks; + dim_t b_k = jcp.ks * max_oc_per_thr; + + // Note 1: + icb = jcp.ic; + ocb = utils::saturate(min_oc_block, max_oc_per_thr, + (L2 - a_k * icb * min_os_block - b_k * icb) + / min_os_block); + osb = utils::saturate(min_os_block, max_os_per_thr, + (L2 - b_k * icb) / (a_k * icb + ocb)); + explore_cfg(nthr_oc, ocb, osb); + + // Note 2: + const dim_t icb_max = nstl::max(dim_t(1), + L2 / (2 * min_oc_block * a_k + 2 * b_k)); + if (icb_max < jcp.ic) { + // adjust icb, such that it is evenly distributed. 
+ icb = jcp.ic + / nstl::max(dim_t(1), jcp.ic / icb_max); + osb = nstl::max(dim_t(1), + (L2 - 2 * b_k * icb) / (2 * icb * a_k)); + ocb = L2 / 2 / osb; + + if (ocb > max_oc_per_thr) { + ocb = max_oc_per_thr; + // reduce mem eq by making ocb constant. we get + osb = utils::saturate(min_os_block, + max_os_per_thr, + (L2 - b_k * icb) / (a_k * icb + ocb)); + } else if (osb > max_os_per_thr) { + // reduce mem eq by making osb constant. we get + osb = max_os_per_thr; + ocb = utils::saturate(min_oc_block, + max_oc_per_thr, + (L2 - a_k * icb * osb - b_k * icb) + / (osb)); + } + + explore_cfg(nthr_oc, ocb, osb); + } + + // Note 3: + ocb = max_oc_per_thr; + icb = jcp.ic; + osb = nstl::max(min_os_block, + rnd_dn((L2 - b_k * icb) / (a_k * icb + ocb), + min_os_block)); + explore_cfg(nthr_oc, ocb, osb); + + } else { + // from calc_max_icb, memory eq is independent of ocb. + // So, set it to maximum. + ocb = max_oc_per_thr; + osb = (L2 - jcp.ks * jcp.ic) / (sh * sw * jcp.ic); + explore_cfg(nthr_oc, ocb, osb); + } + } + jcp.outer_threading = true; + jcp.nthr_oc = best_nthr_oc; + jcp.oc_block = best_ocb; + jcp.os_block = best_osb; + jcp.ic_block = best_icb; + + // TODO: define loop order + // if im2col then gemm_loop_rlb and gemm_loop_lrb looks + // preferable otherwise other loop orders are possible + jcp.loop_order = gemm_loop_rlb; + } else { + const size_t outer_work_amount = jcp.ngroups * jcp.mb * jcp.od; + const float outer_thr_eff = (float)outer_work_amount + / rnd_up(outer_work_amount, max_threads); + const size_t inner_work_amount + = div_up(jcp.os, simd_w) * div_up(jcp.oc, simd_w); + const float inner_thr_eff = (float)inner_work_amount + / rnd_up(inner_work_amount, max_threads); + jcp.outer_threading = jcp.os / max_threads < 512 + && IMPLICATION( + jcp.od == 1, jcp.mb != 1 || jcp.ngroups > 2) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (jcp.os * jcp.ic * jcp.oc) / max_threads + < gemm_thrld); + } + jcp.os_nb_block = div_up(jcp.os, jcp.os_block); + + // BF16: other loops 
should be explored for potential + // performance speedup, but BF16-dst post-processing implementation + // would require enabling this support. + if (is_bf16_conv) jcp.loop_order = gemm_loop_lbr; + + if (jcp.im2col_sz) + jcp.im2col_sz = (ptrdiff_t)jcp.ic_block * jcp.ks * jcp.os_block; + } else if (jcp.is_nspc && is_bwd_d) { + jcp.im2col_sz + = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, + !jcp.signed_input) + ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os * jcp.od + : 0; + + bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; + const size_t outer_work = jcp.ngroups * jcp.mb; + const float outer_thr_eff + = (float)outer_work / rnd_up(outer_work, max_threads); + const size_t inner_work + = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w); + const float inner_thr_eff + = (float)inner_work / rnd_up(inner_work, max_threads); + jcp.outer_threading = !is_3d + && (is_depthwise + || (jcp.is / max_threads < 64 && jcp.mb != 1)) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (static_cast(jcp.is) * jcp.ic * jcp.oc) + / max_threads + < gemm_thrld); + + jcp.nthr = jcp.outer_threading ? 
max_threads : 1; + scratchpad.book(key_conv_gemm_col, jcp.nthr * jcp.im2col_sz); + if (jcp.ngroups > 1 || is_bf16_conv) + scratchpad.book(key_conv_gemm_acc, + jcp.nthr * static_cast(jcp.is) * jcp.id + * jcp.ic); + } else if (!jcp.is_nspc && is_bwd_d) { + const size_t outer_work_amount = jcp.ngroups * jcp.mb; + const float outer_thr_eff = (float)outer_work_amount + / rnd_up(outer_work_amount, max_threads); + const size_t inner_work + = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w); + const float inner_thr_eff + = (float)inner_work / rnd_up(inner_work, max_threads); + jcp.outer_threading = (jcp.os / max_threads < 512 || jcp.ks < 64) + && (jcp.mb != 1 || jcp.ngroups > 2) + && (outer_thr_eff / inner_thr_eff >= 1.f + || (jcp.is * jcp.ic * jcp.oc) / max_threads + < gemm_thrld); + } else if (jcp.is_nspc && is_bwd_w) { + jcp.im2col_sz + = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, + !jcp.signed_input) + ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os + : 0; + const size_t gemm_col_datatype_size + = is_bf16_conv ? sizeof(bfloat16_t) : sizeof(float); + + // Potential scratchpad memory requirement when outer threading is + // enabled during f32/bf16 BWD_W nspc convolution + size_t thr_mem_estimate = max_threads + * (gemm_col_datatype_size * jcp.im2col_sz + + gemm_col_datatype_size * jcp.id * jcp.is * jcp.ic + + sizeof(float) * weights_d.size()); + if (is_bf16_conv) { + thr_mem_estimate += sizeof(float) * weights_d.size(); + if (jcp.with_bias + && one_of(data_type::bf16, cd.diff_bias_desc.data_type, + cd.bias_desc.data_type)) + thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc; + } + const bool outer_threading_mem_ok + = thr_mem_estimate < scratchpad_limit; + + jcp.outer_threading = outer_threading_mem_ok + && jcp.os / max_threads < 256 + && (jcp.mb != 1 || jcp.ngroups > 2); + jcp.nthr = jcp.outer_threading ? 
max_threads : 1; + + scratchpad.book(key_conv_gemm_col, jcp.nthr * jcp.im2col_sz, + gemm_col_datatype_size); + + jcp.need_wei_reduction = jcp.mb != 1 && jcp.nthr != 1; + scratchpad.book( + key_conv_wei_reduction, jcp.nthr * weights_d.size()); + scratchpad.book(key_conv_gemm_imtr, + static_cast(jcp.nthr) * jcp.id * jcp.is * jcp.ic, + gemm_col_datatype_size); + if (is_bf16_conv) { + size_t conv_acc_buffer_size = weights_d.size(); + scratchpad.book( + key_conv_int_dat_in_acc_dt, conv_acc_buffer_size); + } + if ((is_bf16_conv) && jcp.with_bias + && one_of(data_type::bf16, cd.diff_bias_desc.data_type, + cd.bias_desc.data_type)) + scratchpad.book( + key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc); + } else if (!jcp.is_nspc && is_bwd_w) { + // Potential scratchpad memory requirement when outer threading is + // enabled during f32/bf16 BWD_W blocked convolution + size_t thr_mem_estimate + = sizeof(float) * max_threads * weights_d.size(); + if (is_bf16_conv) { + thr_mem_estimate += sizeof(float) * weights_d.size(); + if (jcp.with_bias + && one_of(data_type::bf16, cd.diff_bias_desc.data_type, + cd.bias_desc.data_type)) + thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc; + } + const size_t gemm_col_datatype_size + = is_bf16_conv ? sizeof(bfloat16_t) : sizeof(float); + // Minimum memory requirement as os_block >= simd_w + thr_mem_estimate += gemm_col_datatype_size * max_threads * jcp.ic + * jcp.ks * simd_w; + + const bool outer_threading_mem_ok + = thr_mem_estimate < scratchpad_limit; + jcp.outer_threading = outer_threading_mem_ok + && jcp.os / max_threads < 256 + && (jcp.mb != 1 || jcp.ngroups > 2); + } + + if (!jcp.is_nspc) { + jcp.nthr = jcp.outer_threading ? 
max_threads : 1; + const int sizeof_cacheline_float = 16; + if (is_bwd_w) { + jcp.need_wei_reduction = jcp.mb != 1 && jcp.nthr != 1; + scratchpad.book( + key_conv_wei_reduction, jcp.nthr * weights_d.size()); + } + + if (is_bf16_conv) { + size_t conv_acc_buffer_size = 0; + if (is_fwd) + conv_acc_buffer_size = jcp.nthr + * rnd_up(jcp.oc_block * jcp.os_block, + sizeof_cacheline_float); + else if (is_bwd_d) + conv_acc_buffer_size = jcp.nthr + * rnd_up(jcp.ic * jcp.ih * jcp.iw * jcp.id, + sizeof_cacheline_float); + else if (is_bwd_w) + conv_acc_buffer_size = weights_d.size(); + scratchpad.book( + key_conv_int_dat_in_acc_dt, conv_acc_buffer_size); + if ((is_fwd || is_bwd_w) && jcp.with_bias + && one_of(data_type::bf16, cd.diff_bias_desc.data_type, + cd.bias_desc.data_type)) + scratchpad.book(key_conv_bias_bf16_convert_wsp, + jcp.ngroups * jcp.oc); + } + + const size_t gemm_col_datatype_size = is_bf16_conv && !is_bwd_d + ? sizeof(bfloat16_t) + : sizeof(float); + size_t gemm_col_memory_sz = jcp.nthr * jcp.im2col_sz; + + if (is_bwd_d || is_bwd_w) { + // check available memory + VDISPATCH_CONV_IC(scratchpad_limit >= scratchpad.size(), + VERBOSE_SCRATCHPAD_LIMIT); + + const size_t available_mem + = scratchpad_limit - scratchpad.size(); + if (available_mem + < gemm_col_memory_sz * gemm_col_datatype_size) { + // Required memory in this scenario overflows the + // available memory due to the large dimensions. + const int min_os_block = simd_w; + const int max_os_block = (int)available_mem + / ((int)gemm_col_datatype_size * jcp.nthr + * (jcp.im2col_sz / jcp.os)); + // Choose an arbitrary small coeficient reduce spatial + // dimensions. + // TODO: better heuristic to determine os_block based + // on cache efficiency + float _coef = is_bwd_w ? 
0.05 : 0.1; + jcp.os_block = nstl::max( + min_os_block, (int)(max_os_block * _coef)); + jcp.os_nb_block = div_up(jcp.os, jcp.os_block); + jcp.im2col_sz = (ptrdiff_t)jcp.ic * jcp.ks * jcp.os_block; + gemm_col_memory_sz = jcp.nthr * jcp.im2col_sz; + } + } + scratchpad.book(key_conv_gemm_col, gemm_col_memory_sz, + gemm_col_datatype_size); + } + } + + jcp.bias_data_type = cd.bias_desc.data_type; + jcp.dst_data_type = dst_md.data_type; + jcp.sum_data_type = jcp.post_ops.get_sum_dt(jcp.dst_data_type); + jcp.dst_os_stride = dst_d.is_blocking_desc() + ? dst_d.blocking_desc().strides[ndims - 1] + : 0; + jcp.scale_idx_mult = attr.scales_.get_mask(DNNL_ARG_WEIGHTS) > 0; + jcp.with_dst_scale = !attr.scales_.has_default_values(DNNL_ARG_DST); + book_precomputed_scales(scratchpad, attr.scales_, jcp.ngroups * jcp.oc); + + if (jcp.zp.src_exists) { + const auto size = zp_src_comp_size + zp_src_pad_comp_size; + if (size) scratchpad.book(key_conv_gemm_zp_src_comp, size); + } + + VDISPATCH_CONV_IC( + scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT); + + return status::success; +} + +void bwd_weights_balance(int ithr, int nthr, int ngroups, int mb, int &ithr_g, + int &nthr_g, int &ithr_mb, int &nthr_mb) { + nthr_g = nstl::min(ngroups, nthr); + nthr_mb = nstl::min(mb, nthr / nthr_g); + if (ithr / nthr_mb >= ngroups) { + ithr_g = ithr_mb = -1; + } else { + ithr_g = ithr / nthr_mb; + ithr_mb = ithr % nthr_mb; + } +} + +void bwd_weights_reduction_par_ncsp(int ithr, int nthr, + const conv_gemm_conf_t &jcp, const float *weights_reduce_ws, + float *weights) { + const size_t weights_g_size = jcp.ic * jcp.oc * jcp.ks; + + size_t weights_start {0}, weights_end {0}; + balance211(weights_g_size, nthr, ithr, weights_start, weights_end); + + for (int i = 0; i < nthr; ++i) { + const float *ws_i = weights_reduce_ws + i * weights_g_size; + for (size_t s = weights_start; s < weights_end; ++s) + weights[s] = (i == 0 ? 
0 : weights[s]) + ws_i[s]; + } +} + +void bwd_weights_reduction_par_nspc(int ithr, int nthr, size_t g_start, + size_t g_end, const conv_gemm_conf_t &jcp, + const float *weights_reduce_base, float *diff_weights) { + const dim_t weights_g_size = jcp.oc; + dim_t weights_start {0}, weights_end {0}; + balance211(jcp.ks * jcp.ic, nthr, ithr, weights_start, weights_end); + + // Threads divide work w.r.t. min-batch and groups, therefore + // - weights_reduce_base format: spatial-input_channels-output_channels + // - diff_weights format: spatial-input_channels-groups-output_channels + for (auto tidx = 0; tidx < nthr; ++tidx) { + const float *ws_base + = weights_reduce_base + tidx * weights_g_size * jcp.ks * jcp.ic; + for_(auto w = weights_start; w < weights_end; ++w) + for (auto g = g_start; g < g_end; ++g) { + float *__restrict dwei_ptr + = diff_weights + (w * jcp.ngroups + g) * jcp.oc; + const float *__restrict ws_ptr = ws_base + w * jcp.oc; + if (tidx == 0) { + PRAGMA_OMP_SIMD() + for (auto oc = 0; oc < jcp.oc; ++oc) { + dwei_ptr[oc] = ws_ptr[oc]; + } + } else { + PRAGMA_OMP_SIMD() + for (auto oc = 0; oc < jcp.oc; ++oc) { + dwei_ptr[oc] += ws_ptr[oc]; + } + } + } + } +} + +bool padding_exists(const conv_gemm_conf_t &jcp) noexcept { + return jcp.l_pad || jcp.t_pad || jcp.f_pad || jcp.e_pad || jcp.b_pad + || jcp.r_pad; +} + +} // namespace jit_gemm_convolution_utils +} // namespace rv64 +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/rv64/rvv_gemm_convolution_utils.hpp b/src/cpu/rv64/rvv_gemm_convolution_utils.hpp new file mode 100644 index 00000000000..0659c7c91fa --- /dev/null +++ b/src/cpu/rv64/rvv_gemm_convolution_utils.hpp @@ -0,0 +1,142 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_RV64_RVV_GEMM_CONVOLUTION_UTILS_HPP +#define CPU_RV64_RVV_GEMM_CONVOLUTION_UTILS_HPP + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/memory_tracking.hpp" + +#include "cpu/cpu_convolution_pd.hpp" +#include "cpu/cpu_engine.hpp" +#include "cpu/zero_point_utils.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace rv64 { + +enum conv_gemm_loop_order_t { gemm_loop_rlb, gemm_loop_lrb, gemm_loop_lbr }; +struct conv_gemm_conf_t { + prop_kind_t prop_kind; + + dim_t mb; + dim_t ngroups, ic, oc; + dim_t iw, ih, id, ow, oh, od; + dim_t l_pad, t_pad, f_pad, e_pad, b_pad, r_pad; + dim_t kh, kw, kd; + dim_t stride_h, stride_w, stride_d; + dim_t dilate_h, dilate_w, dilate_d; + bool with_bias; + bool with_eltwise; + bool with_binary; + bool with_sum; + post_ops_t post_ops; + bool is_nspc; + + dim_t is, os, ks; + dim_t ic_block, oc_block; + + int nthr; + ptrdiff_t im2col_sz; + bool need_wei_reduction; + bool signed_input; + dim_t oh_block; + dim_t ow_block; + dim_t os_block, os_nb_block; + bool outer_threading; + conv_gemm_loop_order_t loop_order; + int nthr_oc; + + zero_point_config_t zp; + + data_type_t bias_data_type; + data_type_t dst_data_type; + data_type_t sum_data_type; + size_t dst_os_stride; + size_t scale_idx_mult; + bool with_dst_scale; +}; + +struct single_gemm_conv_chunk_desc_t { + single_gemm_conv_chunk_desc_t() = default; + single_gemm_conv_chunk_desc_t(dim_t d_off, dim_t d_size, dim_t h_off, + dim_t 
h_size, dim_t w_off, dim_t w_size); + + dim_t d_off_ = 0; + dim_t d_size_ = 0; + dim_t h_off_ = 0; + dim_t h_size_ = 0; + dim_t w_off_ = 0; + dim_t w_size_ = 0; +}; + +namespace jit_gemm_convolution_utils { +template +void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, + data_type_t *col, dim_t od, int spatial_step, int spatial_block); + +template +void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im, + T *__restrict imtr); + +template +void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict im, + col_dt *__restrict col, dim_t od); + +template +void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, + data_type_t *__restrict col, dim_t ss, dim_t sb, dim_t cs, dim_t cb); + +template +void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im, + void *__restrict imtr, col_dt *__restrict col, dim_t hs, dim_t hb, + dim_t ws, dim_t wb); + +template +void col2im_dt( + const conv_gemm_conf_t &jcp, const T *__restrict col, T *__restrict im); +void col2im_3d(const conv_gemm_conf_t &jcp, const float *col, float *im, + dim_t od, int spatial_step, int spatial_block); +void col2im(const conv_gemm_conf_t &jcp, const float *col, float *im, + int spatial_step, int spatial_block); + +status_t init_conf(conv_gemm_conf_t &jcp, + memory_tracking::registrar_t &scratchpad, const convolution_desc_t &cd, + memory_desc_t &src_md, memory_desc_t &weights_md, memory_desc_t &dst_md, + memory_desc_t &bias_md, primitive_attr_t &attr, int max_threads, + bool check_postops = false); + +void bwd_weights_balance(int ithr, int nthr, int ngroups, int mb, int &ithr_g, + int &nthr_g, int &ithr_mb, int &nthr_mb); +void bwd_weights_reduction_par_ncsp(int ithr, int nthr, + const conv_gemm_conf_t &jcp, const float *weights_reduce_ws, + float *weights); +void bwd_weights_reduction_par_nspc(int ithr, int nthr, size_t g_start, + size_t g_end, const conv_gemm_conf_t &jcp, + const float *weights_reduce_base, float *diff_weights); + +bool 
padding_exists(const conv_gemm_conf_t &jcp) noexcept; + +} // namespace jit_gemm_convolution_utils + +} // namespace rv64 +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif From 6e5fca86fca17ff81097930144e5471b60d1f369 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Fri, 20 Mar 2026 08:48:26 +0300 Subject: [PATCH 02/13] [CPU][RV64] Add: xbyak_riscv, jit_rvv_1x1 --- src/CMakeLists.txt | 2 +- src/cpu/cpu_convolution_list.cpp | 15 +- src/cpu/rv64/cpu_isa_traits.cpp | 44 + src/cpu/rv64/cpu_isa_traits.hpp | 107 ++ src/cpu/rv64/jit_generator.hpp | 137 ++ src/cpu/rv64/jit_primitive_conf.hpp | 97 ++ src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp | 581 +++++++ src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp | 109 ++ src/cpu/rv64/jit_rvv_1x1_convolution.cpp | 144 ++ src/cpu/rv64/jit_rvv_1x1_convolution.hpp | 170 ++ third_party/xbyak_riscv/xbyak_riscv.hpp | 1383 +++++++++++++++++ third_party/xbyak_riscv/xbyak_riscv_csr.hpp | 112 ++ .../xbyak_riscv/xbyak_riscv_mnemonic.hpp | 231 +++ third_party/xbyak_riscv/xbyak_riscv_util.hpp | 271 ++++ third_party/xbyak_riscv/xbyak_riscv_v.hpp | 776 +++++++++ 15 files changed, 4175 insertions(+), 4 deletions(-) create mode 100644 src/cpu/rv64/cpu_isa_traits.cpp create mode 100644 src/cpu/rv64/cpu_isa_traits.hpp create mode 100644 src/cpu/rv64/jit_generator.hpp create mode 100644 src/cpu/rv64/jit_primitive_conf.hpp create mode 100644 src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp create mode 100644 src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp create mode 100644 src/cpu/rv64/jit_rvv_1x1_convolution.cpp create mode 100644 src/cpu/rv64/jit_rvv_1x1_convolution.hpp create mode 100644 third_party/xbyak_riscv/xbyak_riscv.hpp create mode 100644 third_party/xbyak_riscv/xbyak_riscv_csr.hpp create mode 100644 third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp create mode 100644 third_party/xbyak_riscv/xbyak_riscv_util.hpp create mode 100644 third_party/xbyak_riscv/xbyak_riscv_v.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 
08d882bfee0..e69a804d39a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -77,7 +77,7 @@ if(DNNL_EXPERIMENTAL)
 endif()
 
 if(DNNL_EXPERIMENTAL_UKERNEL)
-    if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64")
+    if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "RISCV64")
         message(STATUS "Experimental functionality for ukernels is enabled")
     else()
         message(FATAL_ERROR "ukernel API isn't supported for ${DNNL_TARGET_ARCH}.")
diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index a43ab39d7d0..350ac8e14e4 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -76,6 +76,10 @@ using namespace dnnl::impl::cpu::aarch64;
 #include "cpu/acl/acl_depthwise_convolution.hpp"
 #include "cpu/acl/acl_winograd_convolution.hpp"
 using namespace dnnl::impl::cpu::acl;
+#elif DNNL_RV64
+#include "cpu/rv64/jit_rvv_1x1_convolution.hpp"
+#include "cpu/rv64/rvv_gemm_convolution.hpp"
+using namespace dnnl::impl::cpu::rv64;
 #endif
 
 namespace dnnl {
@@ -175,9 +179,12 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t, sve_128)
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_128)
             // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t)
+            // TODO(review): jit_rvv_1x1_convolution_fwd_t is added by this
+            // patch but intentionally not registered yet; enable it here.
+            CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t)
             CPU_INSTANCE(gemm_convolution_fwd_t)
             CPU_INSTANCE(ref_convolution_fwd_t)
             CPU_INSTANCE(ref_fused_convolution_fwd_t)
             nullptr,
         }},
         {{forward, f32, f16, f32}, {
diff --git a/src/cpu/rv64/cpu_isa_traits.cpp b/src/cpu/rv64/cpu_isa_traits.cpp
new file mode 100644
index 00000000000..b8c3fc658e0
--- /dev/null
+++ b/src/cpu/rv64/cpu_isa_traits.cpp
@@ -0,0 +1,44 @@
+/*******************************************************************************
+* Copyright
2019 Intel Corporation
+* Copyright 2025 Institute of Software, Chinese Academy of Sciences
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/rv64/cpu_isa_traits.hpp"
+#include "cpu/platform.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+namespace {
+
+// Probes the ISA levels with mayiuse(), zvfh first and then v, and returns
+// the first one that is accepted; isa_undef when neither is available.
+cpu_isa_t probe_max_cpu_isa() {
+    if (mayiuse(zvfh)) return zvfh;
+    if (mayiuse(v)) return v;
+    return isa_undef;
+}
+} // namespace
+
+cpu_isa_t get_max_cpu_isa() {
+    return probe_max_cpu_isa();
+}
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/rv64/cpu_isa_traits.hpp b/src/cpu/rv64/cpu_isa_traits.hpp
new file mode 100644
index 00000000000..be5a4fc1d49
--- /dev/null
+++ b/src/cpu/rv64/cpu_isa_traits.hpp
@@ -0,0 +1,107 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+* Copyright 2025 Institute of Software, Chinese Academy of Sciences
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_CPU_ISA_TRAITS_HPP
+#define CPU_RV64_CPU_ISA_TRAITS_HPP
+
+#include <type_traits>
+
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+#include "common/z_magic.hpp"
+#include "dnnl_types.h"
+
+#ifndef XBYAK_RISCV_V
+#define XBYAK_RISCV_V 1
+#endif
+
+#include "xbyak_riscv/xbyak_riscv_util.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+enum cpu_isa_bit_t : unsigned {
+    v_bit = 1u << 0,
+    zvfh_bit = 1u << 1,
+};
+
+enum cpu_isa_t : unsigned {
+    isa_undef = 0u,
+    v = v_bit,
+    zvfh = zvfh_bit | v,
+    isa_all = ~0u,
+};
+
+// Singleton caching the V / Zvfh flags reported by Xbyak_riscv::CPU.
+struct Riscv64Cpu {
+public:
+    static Riscv64Cpu &getInstance() {
+        static Riscv64Cpu instance;
+        return instance;
+    }
+
+    Riscv64Cpu(const Riscv64Cpu &) = delete;
+    Riscv64Cpu &operator=(const Riscv64Cpu &) = delete;
+
+    bool get_has_v() const { return has_v; }
+    bool get_has_zvfh() const { return has_zvfh; }
+
+private:
+    bool has_v = false;
+    bool has_zvfh = false;
+
+    Riscv64Cpu() {
+        const auto &xbyak_cpu = Xbyak_riscv::CPU::getInstance();
+        has_v = xbyak_cpu.hasExtension(Xbyak_riscv::RISCVExtension::V);
+        has_zvfh = has_v
+                && xbyak_cpu.hasExtension(Xbyak_riscv::RISCVExtension::Zvfh);
+    }
+};
+
+// Returns whether `cpu_isa` is usable on the running CPU: isa_undef always
+// is, isa_all never is. `soft` is ignored.
+inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
+    MAYBE_UNUSED(soft);
+    const Riscv64Cpu &cpu = Riscv64Cpu::getInstance();
+
+    switch (cpu_isa) {
+        case v: return cpu.get_has_v();
+        case zvfh: return cpu.get_has_v() && cpu.get_has_zvfh();
+        case isa_undef: return true;
+        case isa_all: return false;
+    }
+    return false;
+}
+
+cpu_isa_t get_max_cpu_isa();
+
+/* clang-format off */
+#define JIT_IMPL_NAME_HELPER(prefix, isa, suffix_if_any) \
+    ((isa) == isa_undef ? prefix STRINGIFY(any) : \
+    ((isa) == v ? prefix STRINGIFY(rvv) : \
+    ((isa) == zvfh ? prefix STRINGIFY(rvv_zvfh) : \
+    prefix suffix_if_any)))
+/* clang-format on */
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/jit_generator.hpp b/src/cpu/rv64/jit_generator.hpp
new file mode 100644
index 00000000000..c795aba8c61
--- /dev/null
+++ b/src/cpu/rv64/jit_generator.hpp
@@ -0,0 +1,137 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_JIT_GENERATOR_HPP
+#define CPU_RV64_JIT_GENERATOR_HPP
+
+#include <cassert>
+#include <cstdint>
+#include <type_traits>
+#include <utility>
+
+#include "common/c_types_map.hpp"
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+#include "cpu/jit_utils/jit_utils.hpp"
+
+#include "cpu/rv64/cpu_isa_traits.hpp"
+#include "xbyak_riscv/xbyak_riscv.hpp"
+
+#define DECLARE_CPU_JIT_AUX_FUNCTIONS(gen_name) \
+    const char *name() const override { \
+        return STRINGIFY(gen_name); \
+    } \
+    const char *source_file() const override { \
+        return __FILE__; \
+    }
+
+#define JIT_ASSERT(condition) \
+    do { \
+        assert(condition); \
+        if (!(condition)) XBYAK_RISCV_THROW(Xbyak_riscv::ERR_INTERNAL); \
+    } while (false)
+
+#define JIT_ASSERT_RET(condition, ret) \
+    do { \
+        assert(condition); \
+        if (!(condition)) \
+            XBYAK_RISCV_THROW_RET(Xbyak_riscv::ERR_INTERNAL, ret); \
+    } while (false)
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+// Returns true when every bit set in `isa` is also set in `max_isa`.
+inline bool is_subset(cpu_isa_t isa, cpu_isa_t max_isa) {
+    using u_t = typename std::underlying_type<cpu_isa_t>::type;
+    return (static_cast<u_t>(isa) & static_cast<u_t>(max_isa))
+            == static_cast<u_t>(isa);
+}
+
+// Minimal RV64 JIT generator base class.
+class jit_generator_t : public Xbyak_riscv::CodeGenerator, public c_compatible {
+public:
+    using c_compatible::operator new;
+    using c_compatible::operator new[];
+    using c_compatible::operator delete;
+    using c_compatible::operator delete[];
+
+    // All JIT kernels must override these to provide a stable name used for
+    // debug/logging and jit code registration.
+    virtual const char *name() const = 0;
+    virtual const char *source_file() const = 0;
+
+    explicit jit_generator_t(const char * /*unused_name*/,
+            cpu_isa_t max_cpu_isa = get_max_cpu_isa())
+        : Xbyak_riscv::CodeGenerator(max_code_size)
+        , max_cpu_isa_(max_cpu_isa) {}
+
+    ~jit_generator_t() override = default;
+
+    const uint8_t *jit_ker() const { return jit_ker_; }
+
+    template <typename... kernel_args_t>
+    void operator()(kernel_args_t... args) const {
+        using jit_kernel_func_t = void (*)(const kernel_args_t...);
+        // This const_cast is required for Clang.
+        // Clang rejects reinterpret_cast from const uint8_t* to function pointer.
+        auto *fptr = reinterpret_cast<jit_kernel_func_t>(
+                const_cast<uint8_t *>(jit_ker_));
+        (*fptr)(std::forward<kernel_args_t>(args)...);
+    }
+
+    // Generates the code, finalizes it and registers it through jit_utils.
+    virtual status_t create_kernel() {
+        try {
+            generate();
+            // NOTE(review): ready() is kept inside the try block on the
+            // assumption that it can throw too - confirm against xbyak_riscv.
+            this->ready(Xbyak_riscv::CodeArray::PROTECT_RWE);
+        } catch (...) { return status::runtime_error; }
+
+        jit_ker_ = Xbyak_riscv::CodeGenerator::getCode();
+        if (!jit_ker_) return status::runtime_error;
+        jit_utils::register_jit_code(jit_ker_,
+                Xbyak_riscv::CodeArray::getSize(), name(), source_file());
+        return status::success;
+    }
+
+    inline cpu_isa_t max_cpu_isa() const noexcept { return max_cpu_isa_; }
+
+    // Helper to check that a requested ISA is both within the per-kernel limit
+    // and supported by the current CPU.
+    inline bool is_valid_isa(cpu_isa_t isa) const {
+        return is_subset(isa, max_cpu_isa_) && mayiuse(isa);
+    }
+
+protected:
+    virtual void generate() = 0;
+
+private:
+    static constexpr unsigned max_code_size = 256 * 1024;
+
+    const cpu_isa_t max_cpu_isa_;
+    const uint8_t *jit_ker_ = nullptr;
+};
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/jit_primitive_conf.hpp b/src/cpu/rv64/jit_primitive_conf.hpp
new file mode 100644
index 00000000000..dde5afb8d32
--- /dev/null
+++ b/src/cpu/rv64/jit_primitive_conf.hpp
@@ -0,0 +1,97 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_JIT_PRIMITIVE_CONF_HPP
+#define CPU_RV64_JIT_PRIMITIVE_CONF_HPP
+
+#include "common/c_types_map.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+struct jit_1x1_conv_conf_t { // filled by jit_rvv_1x1_conv_kernel_t::init_conf
+    prop_kind_t prop_kind;
+    int mb; // src dims[0]
+    int ngroups, ic, oc, oc_without_padding, ic_without_padding; // per group
+    int iw, ih, id;
+    int ow, oh, od;
+    int os, is; // od * oh * ow and id * ih * iw
+    int kw, kh, kd;
+    int stride_w, stride_h, stride_d;
+    int t_pad, l_pad, f_pad;
+
+    int ic_block, oc_block; // both set to simd_w (4) by init_conf
+    int load_block, reduce_block; // oc_block and ic_block
+    int bcast_block; // equals ur
+
+    dim_t load_dim, bcast_dim, reduce_dim; // unpadded oc, os, unpadded ic
+
+    int ur, ur_tail; // spatial unroll and bcast_dim % ur
+    int load_loop_blk; // oc blocks handled per load-loop step (1 or 2)
+    int reduce_loop_unroll;
+    int nthr;
+    int nb_bcast, nb_load, nb_reduce, load_grp_count;
+    int nb_load_blocking, nb_load_blocking_max;
+    int nb_bcast_blocking, nb_bcast_blocking_max;
+    int nb_reduce_blocking;
+
+    dim_t reduce_loop_bcast_step; // typesize_in
+    int reduce_loop_load_step; // oc_block * typesize_in
+    int bcast_loop_bcast_step; // ngroups * ic_without_padding * typesize_in
+    int bcast_loop_output_step; // ngroups * oc_without_padding * typesize_out
+    int load_loop_load_step; // ic_without_padding * oc_block * typesize_in
+    int load_loop_iter_step; // oc_block (an element count, not bytes)
+
+    bool with_bias;
+    bool with_sum;
+    bool with_eltwise;
+    bool with_binary;
+    bool with_dw_conv;
+
+    int typesize_in;
+    int typesize_out;
+    int typesize_bia;
+    int typesize_acc;
+
+    format_tag_t src_tag, wei_tag, dst_tag;
+};
+
+struct jit_1x1_conv_args_t { // read by the generated kernel via GET_OFF + ld
+    const void *bcast_data;
+    const void *load_data;
+    const void *output_data;
+    const void *bias_data;
+
+    size_t load_dim;
+    size_t bcast_dim;
+    size_t reduce_dim;
+
+    size_t first_last_flag; // tested against FLAG_REDUCE_FIRST by the kernel
+};
+
+enum {
+    FLAG_REDUCE_FIRST = 1 << 0,
+    FLAG_REDUCE_LAST = 1 << 1,
+};
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
new file mode 100644
index 00000000000..c63a375d13b
--- /dev/null
+++ b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
@@ -0,0 +1,581 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <cstddef>
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/memory.hpp"
+#include "common/utils.hpp"
+
+#include "cpu/rv64/jit_rvv_1x1_conv_kernel.hpp"
+
+#define GET_OFF(field) \
+    static_cast<int32_t>(offsetof(jit_1x1_conv_args_t, field))
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+using namespace dnnl::impl::format_tag;
+using namespace dnnl::impl::prop_kind;
+using namespace dnnl::impl::utils;
+using namespace Xbyak_riscv;
+
+jit_rvv_1x1_conv_kernel_t::jit_rvv_1x1_conv_kernel_t(
+        const jit_1x1_conv_conf_t &ajcp, const primitive_attr_t &attr,
+        const memory_desc_t &dst_md)
+    : jit_generator_t("jit_rvv_1x1_conv_kernel"), jcp(ajcp), attr_(attr) {
+    create_kernel();
+}
+
+// Derives sizes, blocking, unrolling and byte strides for the f32 kernel.
+status_t jit_rvv_1x1_conv_kernel_t::init_conf(jit_1x1_conv_conf_t &jcp,
+        const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+        const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d,
+        const primitive_attr_t &attr, int nthreads, bool reduce_src) {
+
+    const int ndims = src_d.ndims();
+
+    jcp.prop_kind = cd.prop_kind;
+    jcp.nthr = nthreads;
+
+    jcp.with_bias = cd.bias_desc.format_kind != format_kind::undef;
+
+    // Initialize dimensions
+    jcp.mb = src_d.dims()[0];
+    jcp.ngroups
+            = weights_d.ndims() == src_d.ndims() + 1 ? weights_d.dims()[0] : 1;
+    jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups;
+    jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups;
+    jcp.oc = jcp.oc_without_padding;
+    jcp.ic = jcp.ic_without_padding;
+
+    // Targeting SEW=32 (float), LMUL=1, VLEN=128 -> simd_w = 4
+    const int simd_w = 4;
+
+    // OC is padded to match oc_block in weights format (Oihw4o)
+    // IC is not padded; kernel handles IC tail processing
+    jcp.oc = rnd_up(jcp.oc, simd_w);
+
+    // 3D convolution support
+    jcp.id = (ndims == 5) ? src_d.dims()[2] : 1;
+    jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1;
+
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims - 2];
+    jcp.iw = src_d.dims()[ndims - 1];
+    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[ndims - 2];
+    jcp.ow = dst_d.dims()[ndims - 1];
+
+    // Spatial dimensions: D*H*W
+    jcp.os = jcp.od * jcp.oh * jcp.ow;
+    jcp.is = jcp.id * jcp.ih * jcp.iw;
+
+    jcp.oc_block = simd_w;
+    jcp.ic_block = simd_w;
+
+    // Dynamic parameter calculation
+    // Register constraint: (ur * load_loop_blk) + (unroll * load_loop_blk) + 1 <= 32
+    jcp.reduce_loop_unroll = 4;
+
+    const int SMALL_SPATIAL = 10;
+    const int BIG_SPATIAL = 65;
+    const int BIG_LOAD_DIM = (jcp.ic >= 512) ? 256 : 512;
+
+    // Initial load_loop_blk selection
+    jcp.load_loop_blk
+            = (jcp.oc % (2 * jcp.oc_block) == 0 && jcp.os >= 11) ? 2 : 1;
+
+    // Dynamic ur selection algorithm
+    int max_regs, min_regs;
+    const int size_threshold = 14;
+
+    const int spatial = jcp.od * jcp.oh;
+
+    // Select register range based on batch size and thread count
+    if ((8 * jcp.mb) / jcp.nthr >= 1 || jcp.mb == 1) {
+        max_regs = 9;
+        min_regs = 6;
+
+        // Special shape optimization
+        if (jcp.oc > 128 && jcp.oc < BIG_LOAD_DIM && spatial > SMALL_SPATIAL
+                && spatial < BIG_SPATIAL && jcp.ic < 256) {
+            max_regs = 6;
+            min_regs = 5;
+        }
+    } else {
+        max_regs = 30;
+        min_regs = 9;
+    }
+
+    // Initial ur
+    jcp.ur = 1;
+
+    // First pass: find largest ur that divides spatial evenly
+    for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) {
+        if ((spatial >= size_threshold && spatial % ur_w == 0)
+                || (spatial < size_threshold && jcp.os % ur_w == 0)) {
+            jcp.ur = ur_w;
+            break;
+        }
+    }
+
+    // If first pass fails, use heuristic
+    if (jcp.ur == 1) {
+        jcp.ur = nstl::min(max_regs, jcp.os);
+        int os_tail = jcp.os % max_regs;
+        for (int i = max_regs; i >= min_regs; i--) {
+            int i_tail = jcp.os % i;
+            if (i_tail > os_tail || i_tail == 0) {
+                jcp.ur = i;
+                os_tail = i_tail;
+                if (i_tail == 0) break;
+            }
+        }
+    }
+
+    // Adjust ur based on load_loop_blk (ensure register limit)
+    // Register constraint: ur * load_loop_blk + unroll * load_loop_blk + 1 <= 32
+    int max_ur_for_blk = (32 - 1 - jcp.reduce_loop_unroll * jcp.load_loop_blk)
+            / jcp.load_loop_blk;
+    if (jcp.ur > max_ur_for_blk) {
+        jcp.ur = max_ur_for_blk;
+        if (jcp.ur < 1) jcp.ur = 1;
+    }
+
+    jcp.load_block = jcp.oc_block;
+    jcp.reduce_block = jcp.ic_block;
+
+    jcp.bcast_block = jcp.ur;
+    jcp.load_dim = jcp.oc_without_padding;
+    jcp.bcast_dim = jcp.os;
+    jcp.reduce_dim = jcp.ic_without_padding;
+
+    jcp.ur_tail = jcp.bcast_dim % jcp.ur;
+
+    jcp.nb_bcast = div_up(jcp.os, jcp.bcast_block);
+    jcp.nb_load = div_up(jcp.oc_without_padding, jcp.load_block);
+    jcp.nb_reduce = div_up(jcp.ic_without_padding, jcp.reduce_block);
+    jcp.load_grp_count = 1;
+
+    // Blocking strategy for NHWC layout
+    jcp.nb_reduce_blocking = jcp.nb_reduce;
+    jcp.nb_load_blocking = jcp.nb_load;
+    jcp.nb_load_blocking_max = jcp.nb_load;
+
+    // Spatial dimension blocking (in ur units)
+    int target_bcast_blocking = 735;
+    jcp.nb_bcast_blocking
+            = nstl::min(jcp.nb_bcast, div_up(target_bcast_blocking, jcp.ur));
+    if (jcp.nb_bcast_blocking == 0) jcp.nb_bcast_blocking = 1;
+    jcp.nb_bcast_blocking_max = jcp.nb_bcast_blocking;
+
+    // Layout-dependent stride parameters (for NHWC)
+    jcp.typesize_in = sizeof(float);
+    jcp.typesize_out = sizeof(float);
+
+    jcp.reduce_loop_bcast_step = jcp.typesize_in;
+    jcp.reduce_loop_load_step = jcp.oc_block * jcp.typesize_in;
+
+    // Strides within bcast_loop (spatial dimensions)
+    jcp.bcast_loop_bcast_step
+            = jcp.ngroups * jcp.ic_without_padding * jcp.typesize_in;
+    jcp.bcast_loop_output_step
+            = jcp.ngroups * jcp.oc_without_padding * jcp.typesize_out;
+
+    // Strides within load_loop (OC dimension)
+    jcp.load_loop_load_step
+            = jcp.ic_without_padding * jcp.oc_block * jcp.typesize_in;
+    jcp.load_loop_iter_step = jcp.oc_block;
+
+    return status::success;
+}
+
+void jit_rvv_1x1_conv_kernel_t::init_scratchpad(
+        memory_tracking::registrar_t &scratchpad,
+        const jit_1x1_conv_conf_t &jcp) {
+    // Not implemented
+}
+
+void jit_rvv_1x1_conv_kernel_t::balance(jit_1x1_conv_conf_t &jcp) {
+    // Not implemented
+}
+
+// Emits the kernel body: the outer loop over output-channel blocks.
+void jit_rvv_1x1_conv_kernel_t::generate() {
+    preamble();
+
+    // Set initial VL to oc_block (4)
+    li(reg_tmp_imm, jcp.oc_block);
+    vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32,
+            Xbyak_riscv::LMUL::m1);
+
+    // Load parameters
+    ld(reg_bcast_data, reg_param, GET_OFF(bcast_data));
+    ld(reg_load_data, reg_param, GET_OFF(load_data));
+    ld(reg_output_data, reg_param, GET_OFF(output_data));
+    if (jcp.with_bias) ld(reg_bias_data, reg_param, GET_OFF(bias_data));
+
+    ld(reg_load_loop_work, reg_param, GET_OFF(load_dim));
+    ld(reg_bcast_loop_work, reg_param, GET_OFF(bcast_dim));
+    ld(reg_reduce_loop_work, reg_param, GET_OFF(reduce_dim));
+    ld(reg_reduce_pos_flag, reg_param, GET_OFF(first_last_flag));
+
+    // Main loop generation
+    auto load_loop_body = [=](int load_loop_blk) {
+        bcast_loop(load_loop_blk);
+
+        // Update pointers and work counters
+        li(reg_tmp_imm, load_loop_blk * jcp.load_loop_load_step);
+        add(reg_load_data, reg_load_data, reg_tmp_imm);
+
+        if (jcp.with_bias) {
+            li(reg_tmp_imm, load_loop_blk * jcp.oc_block * jcp.typesize_out);
+            add(reg_bias_data, reg_bias_data, reg_tmp_imm);
+        }
+
+        li(reg_tmp_imm, load_loop_blk * jcp.oc_block * jcp.typesize_out);
+        add(reg_output_data, reg_output_data, reg_tmp_imm);
+
+        li(reg_tmp_imm, load_loop_blk * jcp.load_loop_iter_step);
+        sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm);
+    };
+
+    Label load_loop_label, load_loop_end, load_loop_tail;
+
+    if (jcp.load_loop_blk > 1) {
+        L(load_loop_label);
+        li(reg_tmp_imm, jcp.load_loop_blk * jcp.oc_block);
+        blt(reg_load_loop_work, reg_tmp_imm, load_loop_tail);
+
+        // Ensure VL is full
+        li(reg_tmp_imm, jcp.oc_block);
+        vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32,
+                Xbyak_riscv::LMUL::m1);
+
+        load_loop_body(jcp.load_loop_blk);
+        jal(x0, load_loop_label);
+    }
+
+    L(load_loop_tail);
+    {
+        Label tail_loop;
+        L(tail_loop);
+        blez(reg_load_loop_work, load_loop_end);
+
+        // Last block may be partial: VL = min(work, oc_block). The clamp keeps
+        // VL within one weights block on hardware where VLMAX > oc_block.
+        Label avl_ready;
+        mv(reg_tmp_addr, reg_load_loop_work);
+        li(reg_tmp_imm, jcp.oc_block);
+        bge(reg_tmp_imm, reg_tmp_addr, avl_ready);
+        mv(reg_tmp_addr, reg_tmp_imm);
+        L(avl_ready);
+        vsetvli(reg_tmp_imm, reg_tmp_addr, Xbyak_riscv::SEW::e32,
+                Xbyak_riscv::LMUL::m1);
+
+        bcast_loop(1);
+
+        // Update pointers and work counters (tail loop)
+        li(reg_tmp_imm, jcp.load_loop_load_step);
+        add(reg_load_data, reg_load_data, reg_tmp_imm);
+        if (jcp.with_bias) {
+            li(reg_tmp_imm, jcp.oc_block * jcp.typesize_out);
+            add(reg_bias_data, reg_bias_data, reg_tmp_imm);
+        }
+        li(reg_tmp_imm, jcp.oc_block * jcp.typesize_out);
+        add(reg_output_data, reg_output_data, reg_tmp_imm);
+
+        li(reg_tmp_imm, jcp.oc_block);
+        sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm);
+
+        jal(x0, tail_loop);
+    }
+    L(load_loop_end);
+
+    postamble();
+}
+
+void jit_rvv_1x1_conv_kernel_t::preamble() {
+    addi(sp, sp, -64);
+    sd(ra, sp, 56);
+    sd(s0, sp, 48);
+    sd(s1, sp, 40);
+    sd(s2, sp, 32);
+    sd(s3, sp, 24);
+    sd(s4, sp, 16);
+    sd(s5, sp, 8);
+}
+
+void jit_rvv_1x1_conv_kernel_t::postamble() {
+    ld(ra, sp, 56);
+    ld(s0, sp, 48);
+    ld(s1, sp, 40);
+    ld(s2, sp, 32);
+    ld(s3, sp, 24);
+    ld(s4, sp, 16);
+    ld(s5, sp, 8);
+    addi(sp, sp, 64);
+    ret();
+}
+
+// Emits the spatial loop: chunks of jcp.ur points, then one ur_tail chunk.
+void jit_rvv_1x1_conv_kernel_t::bcast_loop(int load_loop_blk) {
+    mv(reg_bcast_loop_iter, reg_bcast_loop_work);
+    mv(aux1_reg_bcast_data, reg_bcast_data);
+    mv(aux_reg_output_data, reg_output_data);
+
+    Label bcast_loop_label, bcast_loop_tail;
+
+    li(reg_tmp_imm, jcp.ur);
+    blt(reg_bcast_loop_iter, reg_tmp_imm, bcast_loop_tail);
+
+    L(bcast_loop_label);
+    {
+        reduce_loop(load_loop_blk, jcp.ur);
+
+        li(reg_tmp_imm, jcp.ur * jcp.bcast_loop_bcast_step);
+        add(aux1_reg_bcast_data, aux1_reg_bcast_data, reg_tmp_imm);
+
+        li(reg_tmp_imm, jcp.ur * jcp.bcast_loop_output_step);
+        add(aux_reg_output_data, aux_reg_output_data, reg_tmp_imm);
+
+        addi(reg_bcast_loop_iter, reg_bcast_loop_iter, -jcp.ur);
+        li(reg_tmp_imm, jcp.ur);
+        bge(reg_bcast_loop_iter, reg_tmp_imm, bcast_loop_label);
+    }
+
+    L(bcast_loop_tail);
+    if (jcp.ur_tail > 0) {
+        Label bcast_loop_tail_end;
+        blez(reg_bcast_loop_iter, bcast_loop_tail_end);
+
+        reduce_loop(load_loop_blk, jcp.ur_tail);
+
+        L(bcast_loop_tail_end);
+    }
+}
+
+// Emits accumulator init, the reduction over input channels, and the store.
+void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
+    mv(aux_reg_load_data, reg_load_data);
+    mv(aux_reg_bcast_data, aux1_reg_bcast_data);
+
+    auto init = [=]() {
+        Label init_zero, init_done;
+        andi(reg_tmp_imm, reg_reduce_pos_flag, FLAG_REDUCE_FIRST);
+        bnez(reg_tmp_imm, init_zero);
+
+        // Load from dst for accumulation
+        mv(reg_tmp_addr, aux_reg_output_data);
+        for (int i_ur = 0; i_ur < ur; ++i_ur) {
+            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                vle32_v(vreg_accum(i_load, i_ur), reg_tmp_addr);
+                if (i_load + 1 < load_loop_blk)
+                    addi(reg_tmp_addr, reg_tmp_addr,
+                            jcp.load_block * jcp.typesize_out);
+            }
+            li(reg_tmp_imm,
+                    jcp.bcast_loop_output_step
+                            - (load_loop_blk - 1) * jcp.load_block
+                                    * jcp.typesize_out);
+            add(reg_tmp_addr, reg_tmp_addr, reg_tmp_imm);
+        }
+        jal(x0, init_done);
+
+        L(init_zero);
+        for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+            if (jcp.with_bias) {
+                size_t bias_off
+                        = (size_t)i_load * jcp.oc_block * jcp.typesize_out;
+                if (bias_off == 0) {
+                    vle32_v(vreg_load(0), reg_bias_data);
+                } else {
+                    li(reg_tmp_addr, bias_off);
+                    add(reg_tmp_addr, reg_tmp_addr, reg_bias_data);
+                    vle32_v(vreg_load(0), reg_tmp_addr);
+                }
+            }
+            for (int i_ur = 0; i_ur < ur; ++i_ur) {
+                if (jcp.with_bias) {
+                    vmv_v_v(vreg_accum(i_load, i_ur), vreg_load(0));
+                } else {
+                    vxor_vv(vreg_accum(i_load, i_ur), vreg_accum(i_load, i_ur),
+                            vreg_accum(i_load, i_ur));
+                }
+            }
+        }
+        L(init_done);
+    };
+
+    auto store = [=]() {
+        mv(reg_tmp_addr, aux_reg_output_data);
+        for (int i_ur = 0; i_ur < ur; ++i_ur) {
+            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                vse32_v(vreg_accum(i_load, i_ur), reg_tmp_addr);
+                if (i_load + 1 < load_loop_blk)
+                    addi(reg_tmp_addr, reg_tmp_addr,
+                            jcp.load_block * jcp.typesize_out);
+            }
+            li(reg_tmp_imm,
+                    jcp.bcast_loop_output_step
+                            - (load_loop_blk - 1) * jcp.load_block
+                                    * jcp.typesize_out);
+            add(reg_tmp_addr, reg_tmp_addr, reg_tmp_imm);
+        }
+    };
+
+    auto fma_block = [=](int current_unroll, bool last_block) {
+        for (int i_unroll = 0; i_unroll < current_unroll; ++i_unroll) {
+            flw(freg_bcast, aux_reg_bcast_data, 0);
+
+            for (int i_ur = 0; i_ur < ur; ++i_ur) {
+                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                    vfmacc_vf(vreg_accum(i_load, i_ur), freg_bcast,
+                            vreg_load(i_load, i_unroll));
+                }
+
+                if (i_ur + 1 < ur) {
+                    size_t offset
+                            = (size_t)(i_ur + 1) * jcp.bcast_loop_bcast_step;
+                    if (offset <= 2047) {
+                        flw(freg_bcast, aux_reg_bcast_data, offset);
+                    } else {
+                        li(reg_tmp_addr, offset);
+                        add(reg_tmp_addr, reg_tmp_addr, aux_reg_bcast_data);
+                        flw(freg_bcast, reg_tmp_addr, 0);
+                    }
+                }
+            }
+            addi(aux_reg_bcast_data, aux_reg_bcast_data,
+                    jcp.reduce_loop_bcast_step);
+        }
+
+        // Update weight pointer to next unroll block
+        li(reg_tmp_imm, jcp.reduce_loop_unroll * jcp.reduce_loop_load_step);
+        add(aux_reg_load_data, aux_reg_load_data, reg_tmp_imm);
+
+        // Prefetch weights for next iteration
+        if (!last_block) {
+            for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll;
+                    ++i_unroll) {
+                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                    size_t weight_off
+                            = (size_t)i_unroll * jcp.reduce_loop_load_step
+                            + (size_t)i_load * jcp.load_loop_load_step;
+                    li(reg_tmp_addr, weight_off);
+                    add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
+                    vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr);
+                }
+            }
+        }
+    };
+
+    init();
+
+    mv(reduce_loop_iter, reg_reduce_loop_work);
+    Label reduce_loop_label, reduce_loop_tail;
+    li(reg_tmp_imm, jcp.reduce_loop_unroll);
+    blt(reduce_loop_iter, reg_tmp_imm, reduce_loop_tail);
+
+    // Load first round of weights (IC=0..unroll-1). Done only when a full
+    // unroll group exists, so short reductions never read past the weights.
+    for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) {
+        for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+            size_t weight_off = (size_t)i_unroll * jcp.reduce_loop_load_step
+                    + (size_t)i_load * jcp.load_loop_load_step;
+            if (weight_off == 0) {
+                vle32_v(vreg_load(i_load, i_unroll), aux_reg_load_data);
+            } else {
+                li(reg_tmp_addr, weight_off);
+                add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
+                vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr);
+            }
+        }
+    }
+
+    L(reduce_loop_label);
+    {
+        li(reg_tmp_imm, jcp.reduce_loop_unroll);
+        sub(reg_tmp_imm, reduce_loop_iter, reg_tmp_imm);
+        li(reg_tmp_addr, jcp.reduce_loop_unroll);
+        Label is_last, do_fma;
+        blt(reg_tmp_imm, reg_tmp_addr, is_last);
+        fma_block(jcp.reduce_loop_unroll, false);
+        jal(x0, do_fma);
+        L(is_last);
+        fma_block(jcp.reduce_loop_unroll, true);
+        L(do_fma);
+
+        addi(reduce_loop_iter, reduce_loop_iter, -jcp.reduce_loop_unroll);
+        li(reg_tmp_imm, jcp.reduce_loop_unroll);
+        bge(reduce_loop_iter, reg_tmp_imm, reduce_loop_label);
+    }
+
+    L(reduce_loop_tail);
+    {
+        Label tail_done;
+        blez(reduce_loop_iter, tail_done);
+        Label tail_loop;
+        L(tail_loop);
+        {
+            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                size_t weight_off = (size_t)i_load * jcp.load_loop_load_step;
+                if (weight_off == 0) {
+                    vle32_v(vreg_load(i_load, 0), aux_reg_load_data);
+                } else {
+                    li(reg_tmp_addr, weight_off);
+                    add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
+                    vle32_v(vreg_load(i_load, 0), reg_tmp_addr);
+                }
+            }
+
+            flw(freg_bcast, aux_reg_bcast_data, 0);
+            for (int i_ur = 0; i_ur < ur; ++i_ur) {
+                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                    vfmacc_vf(vreg_accum(i_load, i_ur), freg_bcast,
+                            vreg_load(i_load, 0));
+                }
+                if (i_ur + 1 < ur) {
+                    size_t offset
+                            = (size_t)(i_ur + 1) * jcp.bcast_loop_bcast_step;
+                    if (offset <= 2047) {
+                        flw(freg_bcast, aux_reg_bcast_data, offset);
+                    } else {
+                        li(reg_tmp_addr, offset);
+                        add(reg_tmp_addr, reg_tmp_addr, aux_reg_bcast_data);
+                        flw(freg_bcast, reg_tmp_addr, 0);
+                    }
+                }
+            }
+
+            addi(aux_reg_bcast_data, aux_reg_bcast_data,
+                    jcp.reduce_loop_bcast_step);
+            addi(aux_reg_load_data, aux_reg_load_data,
+                    jcp.reduce_loop_load_step);
+            addi(reduce_loop_iter, reduce_loop_iter, -1);
+            bnez(reduce_loop_iter, tail_loop);
+        }
+        L(tail_done);
+    }
+
+    store();
+}
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp
new file mode 100644
index 00000000000..0fcd9774aec
--- /dev/null
+++ b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp
@@
-0,0 +1,119 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_JIT_RVV_1X1_CONV_KERNEL_HPP
+#define CPU_RV64_JIT_RVV_1X1_CONV_KERNEL_HPP
+
+#include "common/c_types_map.hpp"
+#include "common/memory_desc_wrapper.hpp"
+#include "common/memory_tracking.hpp"
+#include "common/primitive_attr.hpp"
+
+#include "cpu/rv64/jit_generator.hpp"
+#include "cpu/rv64/jit_primitive_conf.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+using namespace Xbyak_riscv;
+
+// JIT generator for the 1x1 convolution micro-kernel. The generated code
+// receives a pointer to jit_1x1_conv_args_t in a0 (see reg_param).
+struct jit_rvv_1x1_conv_kernel_t : public jit_generator_t {
+    jit_rvv_1x1_conv_kernel_t(const jit_1x1_conv_conf_t &ajcp,
+            const primitive_attr_t &attr, const memory_desc_t &dst_md);
+
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_rvv_1x1_conv_kernel)
+
+    static status_t init_conf(jit_1x1_conv_conf_t &jcp,
+            const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+            const memory_desc_wrapper &weights_d,
+            const memory_desc_wrapper &dst_d, const primitive_attr_t &attr,
+            int nthreads, bool reduce_src);
+
+    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+            const jit_1x1_conv_conf_t &jcp);
+
+    static void balance(jit_1x1_conv_conf_t &jcp);
+
+    jit_1x1_conv_conf_t jcp;
+    // Held by reference; the attribute object must outlive this kernel.
+    const primitive_attr_t &attr_;
+
+private:
+    using Reg = Xbyak_riscv::Reg;
+    using VReg = Xbyak_riscv::VReg;
+    using FReg = Xbyak_riscv::FReg;
+
+    // Kernel argument pointer and the base pointers loaded from it.
+    const Reg reg_param = a0;
+    const Reg reg_bcast_data = a1;
+    const Reg reg_load_data = a2;
+    const Reg reg_output_data = a3;
+    const Reg reg_bias_data = a4;
+
+    // Remaining work per loop level.
+    const Reg reg_load_loop_work = t0;
+    const Reg reg_bcast_loop_work = t1;
+    const Reg reg_reduce_loop_work = t2;
+
+    // Running pointers used inside bcast_loop() / reduce_loop().
+    const Reg aux_reg_bcast_data = t3;
+    const Reg aux_reg_load_data = t4;
+    const Reg aux_reg_output_data = t5;
+    const Reg aux1_reg_bcast_data = t6;
+
+    // Loop counters and the first/last reduction flag.
+    const Reg reduce_loop_iter = s0;
+    const Reg reg_bcast_loop_iter = s1;
+    const Reg reg_reduce_pos_flag = s2;
+
+    // Scratch registers for immediates and address arithmetic.
+    const Reg reg_tmp_imm = s4;
+    const Reg reg_tmp_addr = s5;
+
+    // Accumulator for output-channel block i_load at spatial position i_ur.
+    VReg vreg_accum(int i_load, int i_ur) {
+        // Avoid v0, start from v1
+        return VReg(1 + i_ur * jcp.load_loop_blk + i_load);
+    }
+
+    // Weights register for block i_load at unroll step i_unroll.
+    VReg vreg_load(int i_load, int i_unroll = 0) {
+        // Allocate after accum to avoid conflicts
+        // accum uses v1 to v(ur * load_loop_blk)
+        return VReg(1 + jcp.ur * jcp.load_loop_blk
+                + i_unroll * jcp.load_loop_blk + i_load);
+    }
+
+    // Scalar source element multiplied into the accumulators.
+    const FReg freg_bcast = fa0;
+
+    void generate() override;
+    void preamble();
+    void postamble();
+    void bcast_loop(int load_loop_blk);
+    void reduce_loop(int load_loop_blk, int ur);
+};
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/jit_rvv_1x1_convolution.cpp b/src/cpu/rv64/jit_rvv_1x1_convolution.cpp
new file mode 100644
index 00000000000..f744419990a
--- /dev/null
+++ b/src/cpu/rv64/jit_rvv_1x1_convolution.cpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" + +#include "cpu/rv64/jit_rvv_1x1_convolution.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace rv64 { + +using namespace dnnl::impl::status; +using namespace dnnl::impl::utils; + +void jit_rvv_1x1_convolution_fwd_t::execute_forward( + const exec_ctx_t &ctx) const { + auto src = CTX_IN_MEM(const float *, DNNL_ARG_SRC); + auto weights = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS); + auto bias = CTX_IN_MEM(const float *, DNNL_ARG_BIAS); + auto dst = CTX_OUT_MEM(float *, DNNL_ARG_DST); + + const auto &scratchpad = ctx.get_scratchpad_grantor(); + + parallel(pd()->jcp_.nthr, [&](const int ithr, const int nthr) { + execute_forward_thr(ithr, nthr, src, weights, bias, dst, scratchpad); + }); +} + +void jit_rvv_1x1_convolution_fwd_t::execute_forward_thr(const int ithr, + const int nthr, const float *src, const float *weights, + const float *bias, float *dst, + const memory_tracking::grantor_t &scratchpad) const { + + const memory_desc_wrapper src_d(pd()->src_md()); + const memory_desc_wrapper dst_d(pd()->dst_md()); + const memory_desc_wrapper weights_d(pd()->weights_md(0)); + + const auto &jcp = pd()->jcp_; + + auto step = [](int default_step, int remaining, int tail_step) { + assert(default_step <= tail_step); + return remaining < tail_step ? remaining : default_step; + }; + + // RVV 1x1 convolution uses NHWC layout. 
+ // Spatial dimensions are collapsed into 'os'. + // Threading is balanced over (MB * groups * nb_bcast) and (nb_load). + + const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; + int bcast_start {0}, bcast_end {0}, ocb_start {0}, ocb_end {0}; + + balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, jcp.nb_load, + ocb_start, ocb_end, jcp.load_grp_count); + + if (bcast_start >= bcast_end || ocb_start >= ocb_end) return; + + auto p = jit_1x1_conv_args_t(); + + auto ker_1x1 = [&](int ocb, int load_step, int icb, int n, int g, int osb, + int bcast_step) { + const int oc_off = g * jcp.oc_without_padding + ocb * jcp.oc_block; + const size_t dst_off + = (size_t)n * jcp.os * jcp.ngroups * jcp.oc_without_padding + + (size_t)osb * jcp.bcast_block * jcp.ngroups + * jcp.oc_without_padding + + oc_off; + + p.output_data = &dst[dst_off]; + p.bias_data = bias ? &bias[oc_off] : nullptr; + + const size_t wei_off = (size_t)g * jcp.oc * jcp.ic_without_padding + + (size_t)ocb * jcp.ic_without_padding * jcp.oc_block + + (size_t)icb * jcp.ic_block * jcp.oc_block; + p.load_data = &weights[wei_off]; + + const int ic_off = g * jcp.ic_without_padding + icb * jcp.ic_block; + const size_t src_off + = (size_t)n * jcp.is * jcp.ngroups * jcp.ic_without_padding + + (size_t)osb * jcp.bcast_block * jcp.ngroups + * jcp.ic_without_padding + + ic_off; + p.bcast_data = &src[src_off]; + + p.bcast_dim = this_block_size( + osb * jcp.bcast_block, jcp.os, bcast_step * jcp.bcast_block); + p.load_dim = this_block_size(ocb * jcp.oc_block, jcp.oc_without_padding, + load_step * jcp.oc_block); + p.reduce_dim = this_block_size(icb * jcp.ic_block, + jcp.ic_without_padding, jcp.nb_reduce_blocking * jcp.ic_block); + + p.first_last_flag = (icb == 0 ? FLAG_REDUCE_FIRST : 0) + | (icb + jcp.nb_reduce_blocking >= jcp.nb_reduce + ? FLAG_REDUCE_LAST + : 0); + + (*kernel_)(&p); + }; + + // Loop order: Load -> Bcast -> Reduce (LBR) + // This order keeps weights in registers/L1 while iterating over spatial. 
+ for (int ocb = ocb_start; ocb < ocb_end;) { + int load_step = step( + jcp.nb_load_blocking, ocb_end - ocb, jcp.nb_load_blocking_max); + int iwork = bcast_start; + while (iwork < bcast_end) { + int n {0}, g {0}, osb {0}; + nd_iterator_init( + iwork, n, jcp.mb, g, jcp.ngroups, osb, jcp.nb_bcast); + + int bcast_step = step(jcp.nb_bcast_blocking, bcast_end - iwork, + jcp.nb_bcast_blocking_max); + bcast_step = nstl::min(bcast_step, jcp.nb_bcast - osb); + + for (int icb = 0; icb < jcp.nb_reduce; + icb += jcp.nb_reduce_blocking) { + ker_1x1(ocb, load_step, icb, n, g, osb, bcast_step); + } + iwork += bcast_step; + } + ocb += load_step; + } +} + +} // namespace rv64 +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/rv64/jit_rvv_1x1_convolution.hpp b/src/cpu/rv64/jit_rvv_1x1_convolution.hpp new file mode 100644 index 00000000000..2d379cc6ec9 --- /dev/null +++ b/src/cpu/rv64/jit_rvv_1x1_convolution.hpp @@ -0,0 +1,170 @@ +/******************************************************************************* +* Copyright 2025 ZTE Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef CPU_RV64_JIT_RVV_1X1_CONVOLUTION_HPP +#define CPU_RV64_JIT_RVV_1X1_CONVOLUTION_HPP + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/memory_tracking.hpp" +#include "common/primitive.hpp" +#include "common/utils.hpp" + +#include "cpu/cpu_convolution_pd.hpp" +#include "cpu/platform.hpp" + +#include "cpu/rv64/jit_rvv_1x1_conv_kernel.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace rv64 { + +struct jit_rvv_1x1_convolution_fwd_t : public primitive_t { + struct pd_t : public cpu_convolution_fwd_pd_t { + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; + + DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", v, ""), + jit_rvv_1x1_convolution_fwd_t); + + // Dispatch requirements checked below: forward propagation, f32 data + // types, no post-ops, nxc src/dst, 4o-blocked weights, RVV available, + // 1x1 kernel, unit strides and zero padding on both sides. + status_t init(engine_t *engine) { + using namespace utils; + using namespace format_tag; + + const memory_desc_wrapper src_d(src_md()); + const memory_desc_wrapper weights_d(weights_md()); + const memory_desc_wrapper dst_d(dst_md()); + + VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND); + VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), + VERBOSE_BAD_ALGORITHM); + VDISPATCH_CONV( + expect_data_types(data_type::f32, data_type::f32, + data_type::f32, data_type::f32, data_type::undef), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(attr()->has_default_values( + primitive_attr_t::skip_mask_t::post_ops), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); + + // Only support: data = nwc/nhwc/ndhwc, weights = blocked formats (Oiw4o/gOiw4o/etc) + const int n = ndims(); + const bool g = with_groups(); + const auto dat_tag_nxc = utils::pick(n - 3, nwc, nhwc, ndhwc); + const auto wei_tag_blocked = utils::pick(2 * n - 6 + (g ? 
1 : 0), + Oiw4o, gOiw4o, Oihw4o, gOihw4o, Oidhw4o, gOidhw4o); + + // Check if src/dst match supported format (nxc) + // Only accept format_kind::any as a fallback, reject explicit + // unsupported formats + VDISPATCH_CONV(IMPLICATION(src_d.matches_one_of_tag(dat_tag_nxc) + != dat_tag_nxc, + src_d.format_kind() == format_kind::any), + VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_CONV(IMPLICATION(dst_d.matches_one_of_tag(dat_tag_nxc) + != dat_tag_nxc, + dst_d.format_kind() == format_kind::any), + VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_CONV( + IMPLICATION(weights_d.matches_one_of_tag(wei_tag_blocked) + != wei_tag_blocked, + weights_d.format_kind() == format_kind::any), + VERBOSE_UNSUPPORTED_TAG); + + // Set default formats if format_kind == any + VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); + + // ISA check + VDISPATCH_CONV(mayiuse(v), VERBOSE_UNSUPPORTED_ISA); + + // 1x1 convolution check: unit kernel, unit stride and no padding on + // either side (padding[0] is the begin side, padding[1] the end side) + const int ndims = src_d.ndims(); + const int weights_ndims = weights_d.ndims(); + for (int i = 0; i < ndims - 2; ++i) { + VDISPATCH_CONV( + weights_d.dims()[weights_ndims - (ndims - 2) + i] == 1, + VERBOSE_UNSUPPORTED_FEATURE, + "only 1x1 convolution is supported"); + VDISPATCH_CONV(desc()->strides[i] == 1, + VERBOSE_UNSUPPORTED_FEATURE, + "only stride 1 is supported"); + VDISPATCH_CONV(desc()->padding[0][i] == 0 && desc()->padding[1][i] == 0, + VERBOSE_UNSUPPORTED_FEATURE, + "padding is not supported"); + } + + VDISPATCH_CONV_SC(jit_rvv_1x1_conv_kernel_t::init_conf(jcp_, + *desc(), src_d, weights_d, dst_d, *attr(), + dnnl_get_max_threads(), false), + VERBOSE_UNSUPPORTED_FEATURE, "init_conf failed"); + + auto scratchpad = scratchpad_registry().registrar(); + jit_rvv_1x1_conv_kernel_t::init_scratchpad(scratchpad, jcp_); + + return status::success; + } + + jit_1x1_conv_conf_t jcp_ = utils::zero(); + + protected: + bool post_ops_ok() const { + // TODO: Post-ops support is not implemented yet. 
+ return attr()->post_ops_.len() == 0; + } + bool set_default_formats() { + using namespace format_tag; + const int n = ndims(); + const bool g = with_groups(); + const auto dat_tag = utils::pick(n - 3, nwc, nhwc, ndhwc); + const auto wei_tag = utils::pick(2 * n - 6 + (g ? 1 : 0), Oiw4o, + gOiw4o, Oihw4o, gOihw4o, Oidhw4o, gOidhw4o); + + return set_default_formats_common(dat_tag, wei_tag, dat_tag); + } + }; + + jit_rvv_1x1_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t init(engine_t *engine) override { + CHECK(safe_ptr_assign(kernel_, + new jit_rvv_1x1_conv_kernel_t( + pd()->jcp_, *pd()->attr(), *pd()->dst_md()))); + return kernel_->create_kernel(); + } + + status_t execute(const exec_ctx_t &ctx) const override { + execute_forward(ctx); + return status::success; + } + +private: + void execute_forward(const exec_ctx_t &ctx) const; + void execute_forward_thr(const int ithr, const int nthr, const float *src, + const float *weights, const float *bias, float *dst, + const memory_tracking::grantor_t &scratchpad) const; + + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + + std::unique_ptr kernel_; +}; + +} // namespace rv64 +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/third_party/xbyak_riscv/xbyak_riscv.hpp b/third_party/xbyak_riscv/xbyak_riscv.hpp new file mode 100644 index 00000000000..249553a36f9 --- /dev/null +++ b/third_party/xbyak_riscv/xbyak_riscv.hpp @@ -0,0 +1,1383 @@ +#pragma once +/*! 
+ @file xbyak_riscv.hpp + @brief Xbyak_riscv ; JIT assembler for RISC-V + @author herumi + @url https://github.com/herumi/xbyak_riscv + @note modified new BSD license + http://opensource.org/licenses/BSD-3-Clause +*/ + +// Copyright (C), 2023, KNS Group LLC (YADRO) + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include + #include +#elif defined(__GNUC__) + #include + #include + #include +#endif +#if defined(__APPLE__) + #define XBYAK_RISCV_USE_MAP_JIT + #include + #ifndef MAP_JIT + #define MAP_JIT 0x800 + #endif +#endif + +#if defined(__GNUC__) && !defined(__MINGW32__) + #define XBYAK_RISCV_USE_MMAP_ALLOCATOR +#endif + +#ifdef NDEBUG + #define XBYAK_RISCV_ASSERT(x) +#else + #define XBYAK_RISCV_ASSERT(x) assert(x) +#endif + +// MFD_CLOEXEC defined only linux 3.17 or later. +// Android wraps the memfd_create syscall from API version 30. +#if !defined(MFD_CLOEXEC) || (defined(__ANDROID__) && __ANDROID_API__ < 30) + #undef XBYAK_RISCV_USE_MEMFD +#endif + +#if defined(_WIN64) || defined(__MINGW64__) || (defined(__CYGWIN__) && defined(__x86_64__)) + #define XBYAK_RISCV64_WIN +#elif defined(__x86_64__) + #define XBYAK_RISCV64_GCC +#endif +#if !defined(XBYAK_RISCV64) && !defined(XBYAK_RISCV32) + #if defined(XBYAK_RISCV64_GCC) || defined(XBYAK_RISCV64_WIN) + #define XBYAK_RISCV64 + #else + #define XBYAK_RISCV32 + #endif +#endif + +#ifdef _MSC_VER + #pragma warning(push) + #pragma warning(disable : 4514) /* remove inline function */ + #pragma warning(disable : 4786) /* identifier is too long */ + #pragma warning(disable : 4503) /* name is too long */ + #pragma warning(disable : 4127) /* constant expresison */ +#endif + +#include "xbyak_riscv_csr.hpp" + +#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910) + #define XBYAK_RISCV_CONSTEXPR constexpr +#else + #define 
XBYAK_RISCV_CONSTEXPR +#endif + +namespace Xbyak_riscv { + +enum { + DEFAULT_MAX_CODE_SIZE = 4096, + VERSION = 0x1010 /* 0xABCD = A.BC.D */ +}; + +inline uint32_t getVersion() { return VERSION; } + +enum { + ERR_NONE = 1, + ERR_OFFSET_IS_TOO_BIG, + ERR_CODE_IS_TOO_BIG, + ERR_IMM_IS_TOO_BIG, + ERR_INVALID_IMM_OF_JAL, + ERR_INVALID_IMM_OF_BTYPE, + ERR_LABEL_IS_NOT_FOUND, + ERR_LABEL_IS_REDEFINED, + ERR_LABEL_IS_TOO_FAR, + ERR_LABEL_IS_NOT_SET_BY_L, + ERR_LABEL_IS_ALREADY_SET_BY_L, + ERR_CANT_PROTECT, + ERR_CANT_ALLOC, + ERR_BAD_PARAMETER, + ERR_MUNMAP, + ERR_BAD_ALIGN, + ERR_INTERNAL // Put it at last. +}; + +inline const char *ConvertErrorToString(int err) +{ + static const char *errTbl[] = { + "none", + "offset is too big", + "code is too big", + "imm is too big", + "invalid imm of jal", + "invalid imm of Btype", + "label is not found", + "label is redefined", + "label is too far", + "label is not set by L", + "label is already set by L", + "can't protect", + "can't alloc", + "bad parameter", + "munmap", + "bad align", + "internal error" + }; + assert(ERR_INTERNAL == sizeof(errTbl) / sizeof(*errTbl)); + return err <= ERR_INTERNAL ? 
errTbl[err] : "unknown err"; +} + +#ifdef XBYAK_RISCV_NO_EXCEPTION +namespace local { + +inline int& GetErrorRef() { + static thread_local int err = 0; + return err; +} + +inline void SetError(int err) { + if (local::GetErrorRef()) return; // keep the first err code + local::GetErrorRef() = err; +} + +} // local + +inline void ClearError() { + local::GetErrorRef() = 0; +} +inline int GetError() { return Xbyak_riscv::local::GetErrorRef(); } + +#define XBYAK_RISCV_THROW(err) { Xbyak_riscv::local::SetError(err); return; } +#define XBYAK_RISCV_THROW_RET(err, r) { Xbyak_riscv::local::SetError(err); return r; } + +#else +class Error : public std::exception { + int err_; +public: + explicit Error(int err) : err_(err) + { + if (err_ < 0 || err_ > ERR_INTERNAL) { + err_ = ERR_INTERNAL; + } + } + operator int() const { return err_; } + const char *what() const noexcept override + { + return ConvertErrorToString(err_); + } +}; + +// dummy functions +inline void ClearError() { } +inline int GetError() { return 0; } + +inline const char *ConvertErrorToString(const Error& err) +{ + return err.what(); +} + +#define XBYAK_RISCV_THROW(err) { throw Error(err); } +#define XBYAK_RISCV_THROW_RET(err, r) { throw Error(err); } + +#endif + +inline void *AlignedMalloc(size_t size, size_t alignment) +{ +#ifdef __MINGW32__ + return __mingw_aligned_malloc(size, alignment); +#elif defined(_WIN32) + return _aligned_malloc(size, alignment); +#else + void *p; + int ret = posix_memalign(&p, alignment, size); + return (ret == 0) ? p : 0; +#endif +} + +inline void AlignedFree(void *p) +{ +#ifdef __MINGW32__ + __mingw_aligned_free(p); +#elif defined(_MSC_VER) + _aligned_free(p); +#else + free(p); +#endif +} + +namespace local { + +static const size_t ALIGN_PAGE_SIZE = 4096; + +inline XBYAK_RISCV_CONSTEXPR uint32_t mask(size_t n) +{ + XBYAK_RISCV_ASSERT(n <= 32); + return n == 32 ? 0xffffffff : (1u << n) - 1; +} +// is x <= mask(n) ? 
+inline XBYAK_RISCV_CONSTEXPR bool inBit(uint32_t x, size_t n) +{ + return x <= mask(n); +} + +// is x a signed n-bit integer? +inline XBYAK_RISCV_CONSTEXPR bool inSBit(int x, int n) +{ + return -(1 << (n-1)) <= x && x < (1 << (n-1)); +} + +// split x to hi20bits and low12bits +// return false if x in 12-bit signed integer +inline bool split32bit(int *pH, int* pL, int x) { + if (inSBit(x, 12)) return false; + int H = (x >> 12) & mask(20); + int L = x & mask(12); + if (x & (1 << 11)) { + H++; + L = L | (mask(20) << 12); + } + *pH = H; + *pL = L; + return true; +} + +// @@@ embedded by bit_pattern.py (DON'T DELETE THIS LINE) +inline size_t get20_10to1_11_19to12_z12(size_t v) { return ((v & (1<<20)) << 11)| ((v & (1023<<1)) << 20)| ((v & (1<<11)) << 9)| (v & (255<<12)); } +inline size_t get12_10to5_z13_4to1_11_z7(size_t v) { return ((v & (1<<12)) << 19)| ((v & (63<<5)) << 20)| ((v & (15<<1)) << 7)| ((v & (1<<11)) >> 4); } +inline size_t get5to4_9to6_2_3_z5(size_t v) { return ((v & (3<<4)) << 7)| ((v & (15<<6)) << 1)| ((v & (1<<2)) << 4)| ((v & (1<<3)) << 2); } +inline size_t get9_z5_4_6_8to7_5_z2(size_t v) { return ((v & (1<<9)) << 3)| ((v & (1<<4)) << 2)| ((v & (1<<6)) >> 1)| ((v & (3<<7)) >> 4)| ((v & (1<<5)) >> 3); } +inline size_t get5to3_z3_2_6_z5(size_t v) { return ((v & (7<<3)) << 7)| ((v & (1<<2)) << 4)| ((v & (1<<6)) >> 1); } +inline size_t get5to3_z3_7_6_z5(size_t v) { return ((v & (7<<3)) << 7)| ((v & (1<<7)) >> 1)| ((v & (1<<6)) >> 1); } +inline size_t get5_z5_4to0_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & 31) << 2); } +inline size_t get11_4_9to8_10_6_7_3to1_5_z2(size_t v) { return ((v & (1<<11)) << 1)| ((v & (1<<4)) << 7)| ((v & (3<<8)) << 1)| ((v & (1<<10)) >> 2)| ((v & (1<<6)) << 1)| ((v & (1<<7)) >> 1)| ((v & (7<<1)) << 2)| ((v & (1<<5)) >> 3); } +inline size_t get17_z5_16to12_z2(size_t v) { return ((v & (1<<17)) >> 5)| ((v & (31<<12)) >> 10); } +inline size_t get5_z5_4to2_7to6_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & (7<<2)) << 2)| ((v 
& (3<<6)) >> 4); } +inline size_t get5_z5_4to3_8to6_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & (3<<3)) << 2)| ((v & (7<<6)) >> 4); } +inline size_t get5to2_7to6_z7(size_t v) { return ((v & (15<<2)) << 7)| ((v & (3<<6)) << 1); } +inline size_t get5to3_8to6_z7(size_t v) { return ((v & (7<<3)) << 7)| ((v & (7<<6)) << 1); } +// @@@ embedded by bit_pattern.py (DON'T DELETE THIS LINE) + +} // local + +/* + custom allocator +*/ +struct Allocator { + explicit Allocator(const std::string& = "") {} // same interface with MmapAllocator + virtual uint8_t *alloc(size_t size) { return reinterpret_cast(AlignedMalloc(size, local::ALIGN_PAGE_SIZE)); } + virtual void free(uint8_t *p) { AlignedFree(p); } + virtual ~Allocator() {} + /* override to return false if you call protect() manually */ + virtual bool useProtect() const { return true; } +}; + +#ifdef XBYAK_RISCV_USE_MMAP_ALLOCATOR +#ifdef XBYAK_RISCV_USE_MAP_JIT +namespace local { + +inline int getMacOsVersionPure() +{ + char buf[64]; + size_t size = sizeof(buf); + int err = sysctlbyname("kern.osrelease", buf, &size, NULL, 0); + if (err != 0) return 0; + char *endp; + int major = strtol(buf, &endp, 10); + if (*endp != '.') return 0; + return major; +} + +inline int getMacOsVersion() +{ + static const int version = getMacOsVersionPure(); + return version; +} + +} // local +#endif +class MmapAllocator : public Allocator { + struct Allocation { + size_t size; +#if defined(XBYAK_RISCV_USE_MEMFD) + // fd_ is only used with XBYAK_RISCV_USE_MEMFD. We keep the file open + // during the lifetime of each allocation in order to support + // checkpoint/restore by unprivileged users. 
+ int fd; +#endif + }; + const std::string name_; // only used with XBYAK_RISCV_USE_MEMFD + typedef std::unordered_map AllocationList; + AllocationList allocList_; +public: + explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {} + uint8_t *alloc(size_t size) override + { + const size_t alignedSizeM1 = local::ALIGN_PAGE_SIZE - 1; + size = (size + alignedSizeM1) & ~alignedSizeM1; +#if defined(MAP_ANONYMOUS) + int mode = MAP_PRIVATE | MAP_ANONYMOUS; +#elif defined(MAP_ANON) + int mode = MAP_PRIVATE | MAP_ANON; +#else + #error "not supported" +#endif +#if defined(XBYAK_RISCV_USE_MAP_JIT) + const int mojaveVersion = 18; + if (local::getMacOsVersion() >= mojaveVersion) mode |= MAP_JIT; +#endif + int fd = -1; +#if defined(XBYAK_RISCV_USE_MEMFD) + fd = memfd_create(name_.c_str(), MFD_CLOEXEC); + if (fd != -1) { + mode = MAP_SHARED; + if (ftruncate(fd, size) != 0) { + close(fd); + XBYAK_RISCV_THROW_RET(ERR_CANT_ALLOC, 0) + } + } +#endif + void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, fd, 0); + if (p == MAP_FAILED) { + if (fd != -1) close(fd); + XBYAK_RISCV_THROW_RET(ERR_CANT_ALLOC, 0) + } + assert(p); + Allocation &alloc = allocList_[(uintptr_t)p]; + alloc.size = size; +#if defined(XBYAK_RISCV_USE_MEMFD) + alloc.fd = fd; +#endif + return (uint8_t*)p; + } + void free(uint8_t *p) override + { + if (p == 0) return; + AllocationList::iterator i = allocList_.find((uintptr_t)p); + if (i == allocList_.end()) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER) + if (munmap((void*)i->first, i->second.size) < 0) XBYAK_RISCV_THROW(ERR_MUNMAP) +#if defined(XBYAK_RISCV_USE_MEMFD) + if (i->second.fd != -1) close(i->second.fd); +#endif + allocList_.erase(i); + } +}; +#endif + +namespace local { + +// Register Interface +class IReg { +public: + enum Kind { + GPR = 1, // General purpose register + FReg = 1 << 1, // Floating-point register + VECTOR = 1 << 2, // Vector register + }; +protected: + uint32_t idx_; + Kind kind_; +public: + XBYAK_RISCV_CONSTEXPR IReg(uint32_t idx 
= 0, Kind kind = GPR) + : idx_(idx), kind_(kind) + { + XBYAK_RISCV_ASSERT(local::inBit(idx, 5)); + } + XBYAK_RISCV_CONSTEXPR int getIdx() const { return idx_; } + const char *toString() const + { + if (kind_ == GPR) { + static const char tbl[][4] = { + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28", "x29", "x30", "x31", + }; + return tbl[idx_]; + } else if (kind_ == FReg) { + static const char tbl[][4] = { + "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", + "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", + "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", + "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", + }; + return tbl[idx_]; + } else if (kind_ == VECTOR) { + static const char tbl[][4] = { + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", + }; + return tbl[idx_]; + } + XBYAK_RISCV_THROW_RET(ERR_INTERNAL, 0); + } + bool operator==(const IReg& rhs) const + { + return idx_ == rhs.idx_ && kind_ == rhs.kind_; + } + bool operator!=(const IReg& rhs) const { return !operator==(rhs); } + +}; + +} // local + +// General Purpose Register +struct Reg : public local::IReg { + explicit XBYAK_RISCV_CONSTEXPR Reg(int idx = 0) : local::IReg(idx, IReg::Kind::GPR) { } +}; + +static XBYAK_RISCV_CONSTEXPR Reg x0(0), x1(1), x2(2), x3(3), x4(4), x5(5), x6(6), x7(7); +static XBYAK_RISCV_CONSTEXPR Reg x8(8), x9(9), x10(10), x11(11), x12(12), x13(13), x14(14), x15(15); +static XBYAK_RISCV_CONSTEXPR Reg x16(16), x17(17), x18(18), x19(19), x20(20), x21(21), x22(22), x23(23); +static XBYAK_RISCV_CONSTEXPR Reg x24(24), x25(25), x26(26), x27(27), x28(28), x29(29), x30(30), x31(31); + +static XBYAK_RISCV_CONSTEXPR Reg zero(x0); +static XBYAK_RISCV_CONSTEXPR 
Reg ra(x1); +static XBYAK_RISCV_CONSTEXPR Reg sp(x2); +static XBYAK_RISCV_CONSTEXPR Reg gp(x3); +static XBYAK_RISCV_CONSTEXPR Reg tp(x4); +static XBYAK_RISCV_CONSTEXPR Reg t0(x5); +static XBYAK_RISCV_CONSTEXPR Reg t1(x6); +static XBYAK_RISCV_CONSTEXPR Reg t2(x7); +static XBYAK_RISCV_CONSTEXPR Reg fp(x8); +static XBYAK_RISCV_CONSTEXPR Reg s0(x8); +static XBYAK_RISCV_CONSTEXPR Reg s1(x9); +static XBYAK_RISCV_CONSTEXPR Reg a0(x10), a1(x11), a2(x12), a3(x13), a4(x14), a5(x15), a6(x16), a7(x17); +static XBYAK_RISCV_CONSTEXPR Reg s2(x18), s3(x19), s4(x20), s5(x21), s6(x22), s7(x23), s8(x24), s9(x25); +static XBYAK_RISCV_CONSTEXPR Reg s10(x26), s11(x27); +static XBYAK_RISCV_CONSTEXPR Reg t3(x28), t4(x29), t5(x30), t6(x31); + +// Floating Point Register +struct FReg : public local::IReg { + explicit XBYAK_RISCV_CONSTEXPR FReg(int idx = 0) : local::IReg(idx, IReg::Kind::FReg) { } +}; + +static XBYAK_RISCV_CONSTEXPR FReg f0(0), f1(1), f2(2), f3(3), f4(4), f5(5), f6(6), f7(7); +static XBYAK_RISCV_CONSTEXPR FReg f8(8), f9(9), f10(10), f11(11), f12(12), f13(13), f14(14), f15(15); +static XBYAK_RISCV_CONSTEXPR FReg f16(16), f17(17), f18(18), f19(19), f20(20), f21(21), f22(22), f23(23); +static XBYAK_RISCV_CONSTEXPR FReg f24(24), f25(25), f26(26), f27(27), f28(28), f29(29), f30(30), f31(31); +// ABI name +static XBYAK_RISCV_CONSTEXPR FReg ft0(0), ft1(1), ft2(2), ft3(3), ft4(4), ft5(5), ft6(6), ft7(7); +static XBYAK_RISCV_CONSTEXPR FReg fs0(8), fs1(9), fa0(10), fa1(11), fa2(12), fa3(13), fa4(14), fa5(15), fa6(16), fa7(f17); +static XBYAK_RISCV_CONSTEXPR FReg fs2(18), fs3(19), fs4(20), fs5(21), fs6(22), fs7(23), fs8(24), fs9(25), fs10(26), fs11(27); +static XBYAK_RISCV_CONSTEXPR FReg ft8(28), ft9(29), ft10(30), ft11(31); + +#if defined(XBYAK_RISCV_V) && XBYAK_RISCV_V == 1 +// Vector Register +struct VReg : public local::IReg { + explicit XBYAK_RISCV_CONSTEXPR VReg(int idx = 0) : local::IReg(idx, IReg::Kind::VECTOR) { } +}; + +static XBYAK_RISCV_CONSTEXPR VReg v0(0), v1(1), v2(2), 
v3(3), v4(4), v5(5), v6(6), v7(7); +static XBYAK_RISCV_CONSTEXPR VReg v8(8), v9(9), v10(10), v11(11), v12(12), v13(13), v14(14), v15(15); +static XBYAK_RISCV_CONSTEXPR VReg v16(16), v17(17), v18(18), v19(19), v20(20), v21(21), v22(22), v23(23); +static XBYAK_RISCV_CONSTEXPR VReg v24(24), v25(25), v26(26), v27(27), v28(28), v29(29), v30(30), v31(31); +#endif + +// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc) +void *const DontSetProtectRWE = (void*)2; //-V566 + +class CodeArray { + enum Type { + USER_BUF = 1, // use userPtr(non alignment, non protect) + ALLOC_BUF // use new(alignment, protect) + }; + CodeArray(const CodeArray& rhs); + void operator=(const CodeArray&); + bool isAllocType() const { return type_ == ALLOC_BUF; } + const Type type_; +#ifdef XBYAK_RISCV_USE_MMAP_ALLOCATOR + MmapAllocator defaultAllocator_; +#else + Allocator defaultAllocator_; +#endif + Allocator *alloc_; +protected: + size_t maxSize_; + uint8_t *top_; + size_t size_; + + bool useProtect() const { return alloc_->useProtect(); } +public: + enum ProtectMode { + PROTECT_RW = 0, // read/write + PROTECT_RWE = 1, // read/write/exec + PROTECT_RE = 2 // read/exec + }; + explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0) + : type_((userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF) + , alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_) + , maxSize_(maxSize) + , top_(type_ == USER_BUF ? 
reinterpret_cast(userPtr) : alloc_->alloc((std::max)(maxSize, 1))) + , size_(0) + { + if (maxSize_ > 0 && top_ == 0) XBYAK_RISCV_THROW(ERR_CANT_ALLOC) + if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) { + alloc_->free(top_); + XBYAK_RISCV_THROW(ERR_CANT_PROTECT) + } + } + virtual ~CodeArray() + { + if (isAllocType()) { + if (useProtect()) setProtectModeRW(false); + alloc_->free(top_); + } + } + bool setProtectMode(ProtectMode mode, bool throwException = true) + { + bool isOK = protect(top_, maxSize_, mode); + if (isOK) return true; + if (throwException) XBYAK_RISCV_THROW_RET(ERR_CANT_PROTECT, false) + return false; + } + bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); } + bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); } + void resetSize() + { + size_ = 0; + } + void writeBytes(size_t offset, uint64_t v, size_t n) + { + if (n > 8) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER) + if (offset + n > maxSize_) XBYAK_RISCV_THROW(ERR_CODE_IS_TOO_BIG) + uint8_t *const p = top_ + offset; + for (size_t i = 0; i < n; i++) { + p[i] = static_cast(v >> (i * 8)); + } + } + void writeBytes(const uint8_t *addr, uint64_t v, size_t n) + { + writeBytes(addr - top_, v, n); + } + void appendBytes(uint64_t v, size_t n) + { + writeBytes(size_, v, n); + size_ += n; + } + void append4B(uint32_t code) { appendBytes(code, 4); } + void append2B(uint32_t code) { appendBytes(code, 2); } + void append1B(uint32_t code) { appendBytes(code, 1); } + void write4B(size_t offset, uint32_t v) { writeBytes(offset, v, 4); } + void dump(bool separate = false) const + { + const uint8_t *p = getCode(); + const size_t bufSize = getSize(); + if (separate) { + size_t pos = 0; + while (pos < bufSize) { + uint32_t v = p[pos]; + size_t n = (v & 3) == 3 ? 
4 : 2; + if (pos + n <= bufSize) { + for (size_t i = 0; i < n; i++) { + printf("%02x", p[pos + n - 1 - i]); + } + printf("\n"); + pos += n; + } else { + printf("%02x error\n", v); + return; + } + } + return; + } + size_t remain = bufSize; + for (int i = 0; i < 4; i++) { + size_t disp = 16; + if (remain < 16) { + disp = remain; + } + for (size_t j = 0; j < 16; j++) { + if (j < disp) { + printf("%02x", p[i * 16 + j]); + } + } + putchar('\n'); + remain -= disp; + if (remain == 0) { + break; + } + } + } + const uint8_t *getCode() const { return top_; } + template + const F getCode() const { return reinterpret_cast(top_); } + const uint8_t *getCurr() const { return &top_[size_]; } + template + const F getCurr() const { return reinterpret_cast(&top_[size_]); } + size_t getSize() const { return size_; } + void setSize(size_t size) + { + if (size > maxSize_) XBYAK_RISCV_THROW(ERR_OFFSET_IS_TOO_BIG) + size_ = size; + } + /** + change exec permission of memory + @param addr [in] buffer address + @param size [in] buffer size + @param protectMode [in] mode(RW/RWE/RE) + @return true(success), false(failure) + */ + static inline bool protect(const void *addr, size_t size, int protectMode) + { +#if defined(_WIN32) + const DWORD c_rw = PAGE_READWRITE; + const DWORD c_rwe = PAGE_EXECUTE_READWRITE; + const DWORD c_re = PAGE_EXECUTE_READ; + DWORD mode; +#else + const int c_rw = PROT_READ | PROT_WRITE; + const int c_rwe = PROT_READ | PROT_WRITE | PROT_EXEC; + const int c_re = PROT_READ | PROT_EXEC; + int mode; +#endif + switch (protectMode) { + case PROTECT_RW: mode = c_rw; break; + case PROTECT_RWE: mode = c_rwe; break; + case PROTECT_RE: mode = c_re; break; + default: + return false; + } +#if defined(_WIN32) + DWORD oldProtect; + return VirtualProtect(const_cast(addr), size, mode, &oldProtect) != 0; +#elif defined(__GNUC__) + size_t pageSize = sysconf(_SC_PAGESIZE); + size_t iaddr = reinterpret_cast(addr); + size_t roundAddr = iaddr & ~(pageSize - static_cast(1)); + return 
mprotect(reinterpret_cast(roundAddr), size + (iaddr - roundAddr), mode) == 0; +#else + return true; +#endif + } + /** + get aligned memory pointer + @param addr [in] address + @param alignedSize [in] power of two + @return addr rounded up to a multiple of alignedSize + */ + static inline uint8_t *getAlignedAddress(uint8_t *addr, size_t alignedSize = 16) + { + return reinterpret_cast((reinterpret_cast(addr) + alignedSize - 1) & ~(alignedSize - static_cast(1))); + } +}; + +struct Jmp { + enum Type { + tJal, + tBtype, + tRawAddress, + } type; + const uint8_t* from; /* address of the jmp mnemonic */ + uint32_t encoded; + size_t encSize() const + { + return (type == tRawAddress) ? sizeof(size_t) : 4; + } + // jal + Jmp(const uint8_t *from, uint32_t opcode, const Reg& rd) + : type(tJal) + , from(from) + , encoded((rd.getIdx() << 7) | opcode) + { + } + // B-type + Jmp(const uint8_t* from, uint32_t opcode, uint32_t funct3, const Reg& src1, const Reg& src2) + : type(tBtype) + , from(from) + , encoded((src2.getIdx() << 20) | (src1.getIdx() << 15) | (funct3 << 12) | opcode) + { + } + // raw address + explicit Jmp(const uint8_t* from) + : type(tRawAddress) + , from(from) + , encoded(0) + { + } + static inline bool isValidImm(size_t imm, size_t maskBit) + { + const size_t M = local::mask(maskBit); + return (imm < M || ~M <= imm) && (imm & 1) == 0; + } + size_t encode(const uint8_t* addr) const + { + if (addr == 0) return 0; + if (type == tRawAddress) return size_t(addr); + const size_t imm = addr - from; + if (type == tJal) { + if (!isValidImm(imm, 20)) XBYAK_RISCV_THROW(ERR_INVALID_IMM_OF_JAL) + return local::get20_10to1_11_19to12_z12(imm) | encoded; + } else { + if (!isValidImm(imm, 12)) XBYAK_RISCV_THROW(ERR_INVALID_IMM_OF_JAL) + return local::get12_10to5_z13_4to1_11_z7(imm) | encoded; + } + } + // update jmp address by base->getCurr() + void update(CodeArray *base) const + { + base->writeBytes(from, encode(base->getCurr()), encSize()); + } + // append jmp opcode with addr + void 
appendCode(CodeArray *base, const uint8_t *addr) const + { + base->appendBytes(encode(addr), encSize()); + } +}; + +class LabelManager; + +class Label { + mutable LabelManager *mgr; + mutable int id; + friend class LabelManager; +public: + Label() : mgr(0), id(0) {} + Label(const Label& rhs); + Label& operator=(const Label& rhs); + ~Label(); + void clear() { mgr = 0; id = 0; } + int getId() const { return id; } + const uint8_t *getAddress() const; +}; + +class LabelManager { + // for Label class + struct ClabelVal { + ClabelVal(const uint8_t* addr = 0) : addr(addr), refCount(1) {} + const uint8_t* addr; + int refCount; + }; + typedef std::unordered_map ClabelDefList; + typedef std::unordered_multimap ClabelUndefList; + typedef std::unordered_set LabelPtrList; + + CodeArray *base_; + mutable int labelId_; + ClabelDefList clabelDefList_; + ClabelUndefList clabelUndefList_; + LabelPtrList labelPtrList_; + + int getId(const Label& label) const + { + if (label.id == 0) label.id = labelId_++; + return label.id; + } + void define_inner(ClabelDefList& defList, ClabelUndefList& undefList, int labelId, const uint8_t* addr) + { + // add label + ClabelDefList::value_type item(labelId, addr); + std::pair ret = defList.insert(item); + if (!ret.second) XBYAK_RISCV_THROW(ERR_LABEL_IS_REDEFINED) + // search undefined label + for (;;) { + ClabelUndefList::iterator itr = undefList.find(labelId); + if (itr == undefList.end()) break; + const Jmp& jmp = itr->second; + jmp.update(base_); + undefList.erase(itr); + } + } + friend class Label; + void incRefCount(int id, Label *label) + { + clabelDefList_[id].refCount++; + labelPtrList_.insert(label); + } + void decRefCount(int id, Label *label) + { + labelPtrList_.erase(label); + ClabelDefList::iterator i = clabelDefList_.find(id); + if (i == clabelDefList_.end()) return; + if (i->second.refCount == 1) { + clabelDefList_.erase(id); + } else { + --i->second.refCount; + } + } + template + bool hasUndefinedLabel_inner(const T& list) const + { 
+ return !list.empty(); + } + // detach all labels linked to LabelManager + void resetLabelPtrList() + { + for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) { + (*i)->clear(); + } + labelPtrList_.clear(); + } +public: + LabelManager() + { + reset(); + } + ~LabelManager() + { + resetLabelPtrList(); + } + void reset() + { + base_ = 0; + labelId_ = 1; + clabelDefList_.clear(); + clabelUndefList_.clear(); + resetLabelPtrList(); + } + void set(CodeArray *base) { base_ = base; } + void defineClabel(Label& label) + { + define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getCurr()); + label.mgr = this; + labelPtrList_.insert(&label); + } + void assign(Label& dst, const Label& src) // define dst at the address already recorded for src; throws if src is not defined + { + ClabelDefList::const_iterator i = clabelDefList_.find(src.id); + if (i == clabelDefList_.end()) XBYAK_RISCV_THROW(ERR_LABEL_IS_NOT_SET_BY_L) + define_inner(clabelDefList_, clabelUndefList_, getId(dst), i->second.addr); // getId() gives a never-referenced dst a real id instead of registering it under key 0 + dst.mgr = this; + labelPtrList_.insert(&dst); + } + // return 0 unless label exists + const uint8_t* getAddr(const Label& label) const + { + ClabelDefList::const_iterator i = clabelDefList_.find(getId(label)); + if (i == clabelDefList_.end()) return 0; + return i->second.addr; + } + void addUndefinedLabel(const Label& label, const Jmp& jmp) + { + clabelUndefList_.insert(ClabelUndefList::value_type(label.id, jmp)); + } + bool hasUndefClabel() const { return hasUndefinedLabel_inner(clabelUndefList_); } + const uint8_t *getCode() const { return base_->getCode(); } +}; + +inline Label::Label(const Label& rhs) +{ + id = rhs.id; + mgr = rhs.mgr; + if (mgr) mgr->incRefCount(id, this); +} +inline Label& Label::operator=(const Label& rhs) +{ + if (id) XBYAK_RISCV_THROW_RET(ERR_LABEL_IS_ALREADY_SET_BY_L, *this) + id = rhs.id; + mgr = rhs.mgr; + if (mgr) mgr->incRefCount(id, this); + return *this; +} +inline Label::~Label() +{ + if (id && mgr) mgr->decRefCount(id, this); +} +inline const uint8_t* Label::getAddress() const +{ + 
if (mgr == 0) return 0; + return mgr->getAddr(*this); +} + +namespace local { + +template +struct Bit { + uint32_t v; + Bit(uint32_t v) + : v(v) + { + XBYAK_RISCV_ASSERT(inBit(v, n)); + } + Bit(const IReg& r) + : v(r.getIdx()) + { + } + Bit(VM vm) + : v(static_cast(vm)) + { + } + Bit(CSR csr) + : v(static_cast(csr)) + { + } + Bit(RM rm) + : v(static_cast(rm)) + { + } +}; + +} // local + +class CodeGenerator : public CodeArray { +public: + enum AqRlType { + T_aq = 2, + T_rl = 1, + T_aqrl = 3, + }; + typedef local::Bit<1> Bit1; + typedef local::Bit<2> Bit2; + typedef local::Bit<3> Bit3; + typedef local::Bit<5> Bit5; + typedef local::Bit<6> Bit6; + typedef local::Bit<7> Bit7; + typedef local::Bit<12> Bit12; + typedef local::Bit<32> Bit32; +private: + CodeGenerator operator=(const CodeGenerator&) = delete; + LabelManager labelMgr_; + int XLEN_; + bool isRV32_; + bool supportRVC_; + void opJmp(const Label& label, const Jmp& jmp) + { + const uint8_t* addr = labelMgr_.getAddr(label); + jmp.appendCode(this, addr); + if (addr) return; + labelMgr_.addUndefinedLabel(label, jmp); + } + uint32_t enc2(uint32_t a, uint32_t b) const { return (a<<7) | (b<<15); } + uint32_t enc3(uint32_t a, uint32_t b, uint32_t c) const { return enc2(a, b) | (c<<20); } + void Rtype(Bit7 opcode, Bit3 funct3, Bit7 funct7, Bit5 rd, Bit5 rs1, Bit5 rs2) + { + uint32_t v = (funct7.v<<25) | (funct3.v<<12) | opcode.v | enc3(rd.v, rs1.v, rs2.v); + append4B(v); + } + void Itype(Bit7 opcode, Bit3 funct3, Bit5 rd, Bit5 rs1, int imm) + { + if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) + uint32_t v = (imm<<20) | (funct3.v<<12) | opcode.v | enc2(rd.v, rs1.v); + append4B(v); + } + void Stype(Bit7 opcode, Bit3 funct3, Bit5 rs1, Bit5 rs2, int imm) + { + if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) + uint32_t v = ((imm>>5)<<25) | (funct3.v<<12) | opcode.v | enc3(imm & local::mask(5), rs1.v, rs2.v); + append4B(v); + } + void Utype(Bit7 opcode, Bit5 rd, uint32_t imm) + { + if 
(imm >= (1u << 20)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) + uint32_t v = (imm<<12) | opcode.v | (rd.v<<7); + append4B(v); + } + void opShift(Bit7 pre, Bit3 funct3, Bit7 opcode, Bit5 rd, Bit5 rs1, uint32_t shamt, int range = 0) + { + if (range == 0) range = isRV32_ ? 5 : 6; + if (shamt >= (1u << range)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) + uint32_t v = (pre.v<<25) | (funct3.v<<12) | opcode.v | enc3(rd.v, rs1.v, shamt); + append4B(v); + } + void opAtomic(Bit5 rd, Bit5 rs2, Bit5 addr, Bit5 funct5, Bit3 funct3, uint32_t flag) + { + assert(flag <= 3); + Rtype(0x2f, funct3.v, (funct5.v << 2) | flag, rd, addr, rs2); + } + void opIVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 vd) + { + /* + 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + func6 vm vs2 vs1 func3 vd opcode + + func6, func3, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (vd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opFVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 d) + { + /* + 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + func6 vm vs2 vs1 func3 vd/rd opcode + + func6, func3, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (d.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opMVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 d) + { + /* + 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + func6 vm vs2 vs1 func3 vd/rd opcode + + func6, func3, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (d.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opIVI(Bit32 baseValue, Bit1 vm, Bit5 vs2, uint32_t imm, Bit5 vd) + { + /* + 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 
0 + func6 vm vs2 imm func3 vd opcode + + func6, func3, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (vs2.v<<20) | ((imm & local::mask(5))<<15) | (vd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opIVX(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 vd) + { + /* + 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + func6 vm vs2 rs1 func3 vd opcode + + func6, func3, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (vd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opFVF(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 vd) + { + /* + 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + func6 vm vs2 rs1 func3 vd opcode + + func6, func3, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (vd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opMVX(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 d) + { + /* + 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + func6 vm vs2 rs1 func3 vd/rd opcode + + func6, func3, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (d.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opVectorLoad(Bit32 baseValue, Bit1 vm, Bit5 rs2_vs2, Bit5 rs1, Bit5 vd) + { + /* + 31 .. 29 | 28 | 27 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + nf mew mop vm lumop/rs2/vs2 rs1 width vd opcode + + mew, mop, width, lumop, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (rs2_vs2.v<<20) | (rs1.v<<15) | (vd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opVectorStore(Bit32 baseValue, Bit1 vm, Bit5 rs2_vs2, Bit5 rs1, Bit5 vs3) + { + /* + 31 .. 29 | 28 | 27 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 
7 | 6 .. 0 + nf mew mop vm sumop/rs2/vs2 rs1 width vd opcode + + mew, mop, width, sumop, and opcode must be encoded in the baseValue + */ + uint32_t v = (vm.v<<25) | (rs2_vs2.v<<20) | (rs1.v<<15) | (vs3.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opCSR(Bit32 baseValue, Bit12 csr, Bit5 rs1_uimm, Bit5 rd) + { + /* + 31 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + csr rs1_uimm func3 rd opcode + + func3 and opcode must be encoded in the baseValue + */ + uint32_t v = (csr.v<<20) | (rs1_uimm.v<<15) | (rd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opLoadFP(Bit32 baseValue, int imm, Bit5 rs1, Bit5 rd) + { + if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) + /* + 31 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + imm[11:0] rs1 width rd opcode + + width and opcode must be encoded in the baseValue + */ + uint32_t v = (imm<<20) | (rs1.v<<15) | (rd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opStoreFP(Bit32 baseValue, int imm, Bit5 rs2, Bit5 rs1) + { + if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) + /* + 31 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + imm[11:5] rs2 rs1 width imm[4:0] opcode + + width and opcode must be encoded in the baseValue + */ + uint32_t imm_11_5 = imm & (local::mask(7)<<5); + uint32_t imm_4_0 = imm & local::mask(5); + uint32_t v = (imm_11_5<<20) | (rs2.v<<20) | (rs1.v<<15) | (imm_4_0<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opFP(Bit32 baseValue, Bit5 rs2, Bit5 rs1, Bit3 rm, Bit5 rd) + { + /* + 31 .. 27 | 26 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 
0 + func5 fmt rs2 rs1 rm rd opcode + + func5, fmt, and opcode must be encoded in the baseValue + */ + uint32_t v = (rs2.v<<20) | (rs1.v<<15) | (rm.v<<12) | (rd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + void opR4(Bit32 baseValue, Bit5 rs3, Bit5 rs2, Bit5 rs1, Bit3 rm, Bit5 rd) + { + /* + 31 .. 27 | 26 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 + rs3 fmt rs2 rs1 rm rd opcode + + fmt and opcode must be encoded in the baseValue + */ + uint32_t v = (rs3.v<<27) | (rs2.v<<20) | (rs1.v<<15) | (rm.v<<12) | (rd.v<<7); + v |= baseValue.v; // force-encode base value + append4B(v); + } + bool isValiCidx(uint32_t idx) const { return 8 <= idx && idx < 16; } + // c_addi, c_addiw + bool c_addi_inner(const Reg& rd, const Reg& rs, uint32_t imm, uint32_t funct3) + { + uint32_t dIdx = rd.getIdx(); + uint32_t sIdx = rs.getIdx(); + if (sIdx == 0 && c_li(rd, imm, 2, 1)) return true; + if (dIdx == 0 || dIdx != sIdx || !local::inSBit(imm, 6)) return false; + uint32_t v = (funct3<<13) | ((imm & (1<<5))<<7) | (dIdx<<7) | ((imm & 31)<<2)| 1; + append2B(v); + return true; + } + bool c_addi16sp(const Reg& rd, const Reg& rs, uint32_t imm) + { + if (rd != sp || rs != sp || (imm % 16) != 0 || (496 < imm && imm < ~512u) || imm == 0) return false; + uint32_t v = (3<<13) | (2<<7) | 1 | local::get9_z5_4_6_8to7_5_z2(imm); + append2B(v); + return true; + } + // c_li, c_slli + bool c_li(const Reg& rd, uint32_t imm, uint32_t funct3, uint32_t op) + { + if (rd == x0 || !local::inSBit(imm, 6)) return false; + uint32_t v = (funct3<<13) | (rd.getIdx() << 7) | op | local::get5_z5_4to0_z2(imm); + append2B(v); + return true; + } + bool c_lui(const Reg& rd, uint32_t imm) + { + if (rd == x0 || rd == x2 || imm == 0 || (32 <= imm && imm < (1<<20)-32)) return false; + uint32_t v = (3<<13) | (rd.getIdx()<<7) | 1 | local::get5_z5_4to0_z2(imm); + append2B(v); + return true; + } + bool c_addi(const Reg& rd, const Reg& rs, uint32_t imm) + { + uint32_t dIdx = rd.getIdx(); + if 
(imm == 0 && c_mv(rd, rs, 0)) return true; + if (c_addi_inner(rd, rs, imm, 0)) return true; + if (c_addi16sp(rd, rs, imm)) return true; + // c.addi4spn(rd, imm) = c.addi(rd, x2, imm) + if (rs != sp || !isValiCidx(dIdx) || imm == 0 || (imm % 4) != 0 || imm >= 1024) return false; + uint32_t v = ((dIdx-8)<<2) | local::get5to4_9to6_2_3_z5(imm); + append2B(v); + return true; + } + uint32_t creg2(uint32_t a, uint32_t b) { return ((a-8)<<7) | ((b-8)<<2); } + // c_lw, c_sw + bool c_lsw(const Reg& rd, const Reg& rs, int imm, uint32_t funct3) + { + uint32_t dIdx = rd.getIdx(); + uint32_t sIdx = rs.getIdx(); + if (!isValiCidx(dIdx) || !isValiCidx(sIdx) || (imm % 4) != 0 || imm < 0 || imm >= (1 << 7)) return false; + uint32_t v = (funct3<<13) | creg2(sIdx, dIdx) | local::get5to3_z3_2_6_z5(imm); + append2B(v); + return true; + } + // c_ld, c_sd + bool c_lsd(const Reg& rd, const Reg& rs, int imm, uint32_t funct3) + { + uint32_t dIdx = rd.getIdx(); + uint32_t sIdx = rs.getIdx(); + if (!isValiCidx(dIdx) || !isValiCidx(sIdx) || (imm % 8) != 0 || imm < 0 || imm >= (1 << 8)) return false; + uint32_t v = (funct3<<13) | creg2(sIdx, dIdx) | local::get5to3_z3_7_6_z5(imm); + append2B(v); + return true; + } + // c_srli, c_srai (funct2 = 0, 1: imm is a shift amount, 0..31 on RV32 and 0..63 otherwise), c_andi (funct2 = 2: imm is a sign-extended 6-bit value); returns false when no compressed form applies + bool c_srli(const Reg& rd, const Reg& rs, int imm, uint32_t funct2, bool allowImm0 = false) + { + uint32_t dIdx = rd.getIdx(); + uint32_t sIdx = rs.getIdx(); + if (dIdx != sIdx || !isValiCidx(dIdx) || (!allowImm0 && imm == 0) || (funct2 == 2 ? !local::inSBit(imm, 6) : (imm < 0 || imm >= (1 << (isRV32_ ? 5 : 6))))) return false; + uint32_t v = (4<<13) | (funct2<<10) | ((dIdx-8)<<7) | local::get5_z5_4to0_z2(imm) | 1; + append2B(v); + return true; + } + // rd = rs1 + // c_sub, c_xor, c_or, c_and, c_subw + bool c_noimm(const Reg& rd, const Reg& rs1, const Reg& rs2, uint32_t funct3, uint32_t funct2) + { + uint32_t dIdx = rd.getIdx(); + uint32_t sIdx = rs2.getIdx(); + if (rd.getIdx() != rs1.getIdx() || !isValiCidx(dIdx) || !isValiCidx(sIdx)) return false; + uint32_t v = (funct3<<10) | ((dIdx-8)<<7) | (funct2<<5) | ((sIdx-8)<<2) | 
1; + append2B(v); + return true; + } + // c_lwsp, c_flwsp; c.lwsp (funct3 == 2) with rd == x0 is a reserved encoding, so it is rejected here + bool c_lwsp(const Reg& rd, const Reg& addr, int imm, uint32_t funct3) + { + uint32_t idx = rd.getIdx(); + if (addr != sp || (imm % 4) != 0 || (imm >> 8) || (funct3 == 2 && idx == 0)) return false; + uint32_t v = (funct3<<13) | (idx<<7) | local::get5_z5_4to2_7to6_z2(imm) | 2; + append2B(v); + return true; + } + // c_ldsp; c.ldsp (funct3 == 3) with rd == x0 is a reserved encoding, so it is rejected here + bool c_ldsp(const Reg& rd, const Reg& addr, int imm, uint32_t funct3) + { + uint32_t idx = rd.getIdx(); + if (addr != sp || (imm % 8) != 0 || (imm >> 9) || (funct3 == 3 && idx == 0)) return false; + uint32_t v = (funct3<<13) | (idx<<7) | local::get5_z5_4to3_8to6_z2(imm) | 2; + append2B(v); + return true; + } + // c.mv, c.add + bool c_mv(const Reg& rd, const Reg& rs, uint32_t funct1) + { + if (rd == x0 || rs == x0) return false; + uint32_t v = (4<<13) | (funct1<<12) | (rd.getIdx()<<7) | (rs.getIdx()<<2) | 2; + append2B(v); + return true; + } + bool c_swsp(const Reg& rs, const Reg& addr, int imm, uint32_t funct3) + { + if (addr != sp || (imm % 4) != 0 || (imm >> 8)) return false; + uint32_t v = (funct3<<13) | (rs.getIdx()<<2) | local::get5to2_7to6_z7(imm) | 2; + append2B(v); + return true; + } + bool c_sdsp(const Reg& rs, const Reg& addr, int imm, uint32_t funct3) + { + if (addr != sp || (imm % 8) != 0 || (imm >> 9)) return false; + uint32_t v = (funct3<<13) | (rs.getIdx()<<2) | local::get5to3_8to6_z7(imm) | 2; + append2B(v); + return true; + } +public: + void L(Label& label) { labelMgr_.defineClabel(label); } + Label L() { Label label; L(label); return label; } + /* + assign src to dst + require + dst : is not used by L() + src : used by L() + */ + void assignL(Label& dst, const Label& src) { labelMgr_.assign(dst, src); } + /* + put the absolute address of label to buffer + @note the put size is 4(32-bit), 8(64-bit) + */ + void putL(const Label &label) + { + Jmp jmp(getCurr()); + opJmp(label, jmp); + } + + // constructor + CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = DontSetProtectRWE, Allocator *allocator = 0) + : 
CodeArray(maxSize, userPtr, allocator) + , XLEN_(64) + , isRV32_(false) + , supportRVC_(false) + { + labelMgr_.set(this); + } + void reset() + { + ClearError(); + resetSize(); + labelMgr_.reset(); + labelMgr_.set(this); + XLEN_ = 64; + isRV32_ = false; + supportRVC_ = false; + } + void setRV32(bool on = true) + { + isRV32_ = on; + XLEN_ = on ? 32 : 64; + } + void supportRVC(bool on = true) + { + supportRVC_ = on; + } + bool hasUndefinedLabel() const { return labelMgr_.hasUndefClabel(); } + static inline void clearCache(void *p, size_t n) + { +#ifdef _WIN32 + FlushInstructionCache(GetCurrentProcess(), begin, n); +#elif defined(__APPLE__) + sys_icache_invalidate(begin, n); +#else + __builtin___clear_cache((char *)p, (char *)p + n); +#endif + } + /* + MUST call ready() to complete generating code if you use AutoGrow mode. + It is not necessary for the other mode if hasUndefinedLabel() is true. + */ + void ready(ProtectMode mode = PROTECT_RWE) + { + if (hasUndefinedLabel()) XBYAK_RISCV_THROW(ERR_LABEL_IS_NOT_FOUND) + if (useProtect()) setProtectMode(mode); + clearCache(top_, size_); + } + // set read/exec + void readyRE() { return ready(PROTECT_RE); } + + void align(size_t x) + { + if (x == 1) return; + if (x < 4 || (x & (x - 1))) XBYAK_RISCV_THROW(ERR_BAD_ALIGN) + size_t remain = size_t(getCurr()) % x; + if (remain % 4) XBYAK_RISCV_THROW(ERR_INTERNAL) + if (remain) { + for (size_t i = 0; i < (x - remain) / 4; i++) { + nop(); + } + } + } + +#include "xbyak_riscv_mnemonic.hpp" +#if defined(XBYAK_RISCV_V) && XBYAK_RISCV_V == 1 +#include "xbyak_riscv_v.hpp" +#endif +}; + +#ifdef _MSC_VER + #pragma warning(pop) +#endif +} // Xbyak_riscv + diff --git a/third_party/xbyak_riscv/xbyak_riscv_csr.hpp b/third_party/xbyak_riscv/xbyak_riscv_csr.hpp new file mode 100644 index 00000000000..5f04ed441a1 --- /dev/null +++ b/third_party/xbyak_riscv/xbyak_riscv_csr.hpp @@ -0,0 +1,112 @@ +/****************************************************************************** +* Copyright (C), 
2023, KNS Group LLC (YADRO) +* +* Licensed under the 3-Clause BSD License +* You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/ +*******************************************************************************/ + +#pragma once +namespace Xbyak_riscv { + +// Control and Status Register +enum class CSR : uint32_t { + // FP CSRs + fflags = 0x001, // Floating-Point Accrued Exceptions + frm = 0x002, // Floating-Point Dynamic Rounding Mode + fcsr = 0x003, // Floating-Point Control and Status register + // vector CSRs + vstart = 0x008, // Vector start position + vxsat = 0x009, // Fixed-Point Saturate Flag + vxrm = 0x00A, // Fixed-Point Rounding Mode + vcsr = 0x00F, // Vector control and status register + vl = 0xC20, // Vector length + vtype = 0xC21, // Vector data type register + vlenb = 0xC22, // VLEN/8 (vector register length in bytes) +}; + + +// Selected Element Width +enum class SEW : uint32_t { + e8 = 0x0, + e16 = 0x1, + e32 = 0x2, + e64 = 0x3 +}; + +// Vector Length Multiplier +enum class LMUL : uint32_t { + mf8 = 0x5, + mf4 = 0x6, + mf2 = 0x7, + m1 = 0x0, + m2 = 0x1, + m4 = 0x2, + m8 = 0x3 +}; + +// Vector Mask Agnostic +enum class VMA : uint32_t { + mu = 0, // undisturbed + ma = 1, // agnostic +}; + +// Vector Tail Agnostic +enum class VTA : uint32_t { + tu = 0, // undisturbed + ta = 1, // agnostic +}; + +enum class VectorAddressingMode : uint32_t { + unitStride = 0x0, + indexedUnordered = 0x1, + strided = 0x2, + indexedOrdered = 0x3 + // other encodings are reserved +}; + +enum class UnitStrideVectorAddressingModeLoad : uint32_t { + load = 0x0, // unit-stride load + wholeRegisterLoad = 0x8, // unit-stride, whole register load + maskLoad = 0xb, // unit-stride, mask load, EEW=8 + faultOnlyFirst = 0x10 // unit-stride fault-only-first + // other encodings are reserved +}; + +enum class UnitStrideVectorAddressingModeStore : uint32_t { + store = 0x0, // unit-stride store + wholeRegisterStore = 0x8, // unit-stride, whole register store 
+ maskStore = 0xb // unit-stride, mask store, EEW=8 + // other encodings are reserved +}; + +enum class WidthEncoding : uint32_t { + e8 = 0x0, // Vector 8-bit element + e16 = 0x5, // Vector 16-bit element + e32 = 0x6, // Vector 32-bit element + e64 = 0x7, // Vector 64-bit element +}; + +enum class VM : uint32_t { + unmasked = 1, + masked = 0 +}; + +enum class RM : uint32_t { + rne = 0x0, // Round to Nearest, ties to Even + rtz = 0x1, // Round towards Zero + rdn = 0x2, // Round Down (towards -infinity) + rup = 0x3, // Round Up (towards +infinity) + rmm = 0x4, // Round to Nearest, ties to Max Magnitude + dyn = 0x7 // In instruction's rm field, selects dynamic rounding mode; + // In Rounding Mode register, reserved. +}; + +enum class FFlags : uint32_t { + NV = 0x01, // Invalid Operation + DZ = 0x02, // Divide by Zero + OF = 0x04, // Overflow + UF = 0x08, // Underflow + NX = 0x10 // Inexact +}; + +} // Xbyak_riscv diff --git a/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp b/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp new file mode 100644 index 00000000000..b050d46cc75 --- /dev/null +++ b/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp @@ -0,0 +1,231 @@ +const char *getVersionString() const { return "1.01"; } +void add(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && rd == rs1 && c_mv(rd, rs2, 1)) return; Rtype(0x33, 0, 0x0, rd, rs1, rs2); } +void sub(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 0)) return; Rtype(0x33, 0, 0x20, rd, rs1, rs2); } +void sll(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 1, 0x0, rd, rs1, rs2); } +void slt(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 2, 0x0, rd, rs1, rs2); } +void sltu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 3, 0x0, rd, rs1, rs2); } +void xor_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 1)) return; Rtype(0x33, 4, 0x0, rd, rs1, rs2); } +void 
srl(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x0, rd, rs1, rs2); } +void sra(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x20, rd, rs1, rs2); } +void or_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 2)) return; Rtype(0x33, 6, 0x0, rd, rs1, rs2); } +void and_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 3)) return; Rtype(0x33, 7, 0x0, rd, rs1, rs2); } +void addw(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x27, 1)) return; Rtype(0x3b, 0, 0x0, rd, rs1, rs2); } +void subw(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x27, 0)) return; Rtype(0x3b, 0, 0x20, rd, rs1, rs2); } +void sllw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 1, 0x0, rd, rs1, rs2); } +void srlw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 5, 0x0, rd, rs1, rs2); } +void sraw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 5, 0x20, rd, rs1, rs2); } +void mul(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 0, 0x1, rd, rs1, rs2); } +void mulh(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 1, 0x1, rd, rs1, rs2); } +void mulhsu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 2, 0x1, rd, rs1, rs2); } +void mulhu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 3, 0x1, rd, rs1, rs2); } +void div(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 4, 0x1, rd, rs1, rs2); } +void divu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x1, rd, rs1, rs2); } +void rem(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 6, 0x1, rd, rs1, rs2); } +void remu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 7, 0x1, rd, rs1, rs2); } +void mulw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 0, 0x1, rd, rs1, rs2); } +void divw(const Reg& 
rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 4, 0x1, rd, rs1, rs2); } +void remw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 6, 0x1, rd, rs1, rs2); } +void remuw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 7, 0x1, rd, rs1, rs2); } +void addi(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_addi(rd, rs1, imm)) return; Itype(0x13, 0, rd, rs1, imm); } +void slti(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 2, rd, rs1, imm); } +void sltiu(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 3, rd, rs1, imm); } +void xori(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 4, rd, rs1, imm); } +void ori(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 6, rd, rs1, imm); } +void andi(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_srli(rd, rs1, imm, 2, true)) return; Itype(0x13, 7, rd, rs1, imm); } +void addiw(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_addi_inner(rd, rs1, imm, 1)) return; Itype(0x1b, 0, rd, rs1, imm); } +// load-op rd, imm(addr); rd = addr[imm]; +void jalr(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x67, 0, rd, addr, imm); } +void lb(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 0, rd, addr, imm); } +void lh(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 1, rd, addr, imm); } +void lw(const Reg& rd, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_lwsp(rd, addr, imm, 2) || c_lsw(rd, addr, imm, 2))) return; Itype(0x3, 2, rd, addr, imm); } +void lbu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 4, rd, addr, imm); } +void lhu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 5, rd, addr, imm); } +void lwu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 6, rd, addr, imm); } +void ld(const Reg& rd, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_ldsp(rd, addr, imm, 3) || c_lsd(rd, addr, imm, 3))) return; Itype(0x3, 3, rd, addr, imm); } +void auipc(const Reg& rd, uint32_t imm) { 
Utype(0x17, rd, imm); } +void lui(const Reg& rd, uint32_t imm) { if (supportRVC_ && c_lui(rd, imm)) return; Utype(0x37, rd, imm); } +void slli(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && rd == rs1 && shamt != 0 && c_li(rd, shamt, 0, 2)) return; opShift(0x0, 1, 0x13, rd, rs1, shamt); } +void srli(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && c_srli(rd, rs1, shamt, 0)) return; opShift(0x0, 5, 0x13, rd, rs1, shamt); } +void srai(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && c_srli(rd, rs1, shamt, 1)) return; opShift(0x20, 5, 0x13, rd, rs1, shamt); } +void slliw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x0, 1, 0x1b, rd, rs1, shamt, 5); } +void srliw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x0, 5, 0x1b, rd, rs1, shamt, 5); } +void sraiw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x20, 5, 0x1b, rd, rs1, shamt, 5); } +void fence_rw_rw() { append4B(0x330000f); } +void fence_tso() { append4B(0x8330000f); } +void fence_rw_w() { append4B(0x310000f); } +void fence_r_rw() { append4B(0x230000f); } +void fence_r_r() { append4B(0x220000f); } +void fence_w_w() { append4B(0x110000f); } +void fence_i() { append4B(0x100f); } +void ecall() { append4B(0x73); } +void ebreak() { if (supportRVC_) append2B(0x9002); else append4B(0x00100073); } +// store-op rs, imm(addr) ; addr[imm] = rs; +void sb(const Reg& rs, const Reg& addr, int imm = 0) { Stype(0x23, 0, addr, rs, imm); } +void sh(const Reg& rs, const Reg& addr, int imm = 0) { Stype(0x23, 1, addr, rs, imm); } +void sw(const Reg& rs, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_swsp(rs, addr, imm, 6) || c_lsw(rs, addr, imm, 6))) return; Stype(0x23, 2, addr, rs, imm); } +void sd(const Reg& rs, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_sdsp(rs, addr, imm, 7) || c_lsd(rs, addr, imm, 7))) return; Stype(0x23, 3, addr, rs, imm); } +void beq(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp 
jmp(getCurr(), 0x63, 0, rs1, rs2); opJmp(label, jmp); } +void bne(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 1, rs1, rs2); opJmp(label, jmp); } +void blt(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 4, rs1, rs2); opJmp(label, jmp); } +void bge(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 5, rs1, rs2); opJmp(label, jmp); } +void bltu(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 6, rs1, rs2); opJmp(label, jmp); } +void bgeu(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 7, rs1, rs2); opJmp(label, jmp); } +void beqz(const Reg& rs, const Label& label) { beq(rs, x0, label); } +void bnez(const Reg& rs, const Label& label) { bne(rs, x0, label); } +void blez(const Reg& rs, const Label& label) { bge(x0, rs, label); } +void bgez(const Reg& rs, const Label& label) { bge(rs, x0, label); } +void bltz(const Reg& rs, const Label& label) { blt(rs, x0, label); } +void bgtz(const Reg& rs, const Label& label) { blt(x0, rs, label); } +void bgt(const Reg& rs, const Reg& rt, const Label& label) { blt(rt, rs, label); } +void ble(const Reg& rs, const Reg& rt, const Label& label) { bge(rt, rs, label); } +void bgtu(const Reg& rs, const Reg& rt, const Label& label) { bltu(rt, rs, label); } +void bleu(const Reg& rs, const Reg& rt, const Label& label) { bgeu(rt, rs, label); } +// amos**, rd, rs2, (addr) +void sc_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x3, 2, flag); } +void sc_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x3, 3, flag); } +void amoswap_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1, 2, flag); } +void amoswap_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1, 3, flag); } +void amoadd_w(const Reg& rd, const 
Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x0, 2, flag); } +void amoadd_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x0, 3, flag); } +void amoxor_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x4, 2, flag); } +void amoxor_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x4, 3, flag); } +void amoand_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0xc, 2, flag); } +void amoand_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0xc, 3, flag); } +void amoor_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x8, 2, flag); } +void amoor_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x8, 3, flag); } +void amomin_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x10, 2, flag); } +void amomin_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x10, 3, flag); } +void amomax_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x14, 2, flag); } +void amomax_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x14, 3, flag); } +void amominu_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x18, 2, flag); } +void amominu_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x18, 3, flag); } +void amomaxu_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1c, 2, flag); } +void amomaxu_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1c, 3, flag); } +void csrrw(const Reg& 
rd, CSR csr, const Reg& rs1) { opCSR(0x1073, csr, rs1, rd); } +void csrrs(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x2073, csr, rs1, rd); } +void csrrc(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x3073, csr, rs1, rd); } +void csrrwi(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x5073, csr, imm, rd); } +void csrrsi(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x6073, csr, imm, rd); } +void csrrci(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x7073, csr, imm, rd); } +void fadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x53, rs2, rs1, rm, rd); } +void fclass_s(const Reg& rd, const FReg& rs1) { opFP(0xe0001053, 0, rs1, 0, rd); } +void fcvt_s_w(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0000053, 0, rs1, rm, rd); } +void fcvt_s_wu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0100053, 0, rs1, rm, rd); } +void fcvt_w_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0000053, 0, rs1, rm, rd); } +void fcvt_wu_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0100053, 0, rs1, rm, rd); } +void fdiv_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x18000053, rs2, rs1, rm, rd); } +void feq_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0002053, rs2, rs1, 0, rd); } +void fle_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0000053, rs2, rs1, 0, rd); } +void flt_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0001053, rs2, rs1, 0, rd); } +void fmax_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x28001053, rs2, rs1, 0, rd); } +void fmin_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x28000053, rs2, rs1, 0, rd); } +void fmul_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x10000053, rs2, rs1, rm, rd); } +void fmv_w_x(const FReg& rd, const Reg& rs1) { opFP(0xf0000053, 0, rs1, 0, rd); } +void fmv_x_w(const Reg& rd, const FReg& rs1) { opFP(0xe0000053, 0, rs1, 0, rd); } +void 
fsgnj_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20000053, rs2, rs1, 0, rd); } +void fsgnjn_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20001053, rs2, rs1, 0, rd); } +void fsgnjx_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20002053, rs2, rs1, 0, rd); } +void fsqrt_s(const FReg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0x58000053, 0, rs1, rm, rd); } +void fsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x8000053, rs2, rs1, rm, rd); } +void fcvt_l_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0200053, 0, rs1, rm, rd); } +void fcvt_lu_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0300053, 0, rs1, rm, rd); } +void fcvt_s_l(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0200053, 0, rs1, rm, rd); } +void fcvt_s_lu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0300053, 0, rs1, rm, rd); } +void fadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x4000053, rs2, rs1, rm, rd); } +void fclass_h(const Reg& rd, const FReg& rs1) { opFP(0xe4001053, 0, rs1, 0, rd); } +void fcvt_h_s(const Reg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0x44000053, 0, rs1, rm, rd); } +void fcvt_h_w(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4000053, 0, rs1, rm, rd); } +void fcvt_h_wu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4100053, 0, rs1, rm, rd); } +void fcvt_s_h(const Reg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0x40200053, 0, rs1, rm, rd); } +void fcvt_w_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4000053, 0, rs1, rm, rd); } +void fcvt_wu_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4100053, 0, rs1, rm, rd); } +void fdiv_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x1c000053, rs2, rs1, rm, rd); } +void feq_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4002053, rs2, rs1, 0, rd); } +void fle_h(const Reg& rd, const FReg& rs1, const FReg& 
rs2) { opFP(0xa4000053, rs2, rs1, 0, rd); } +void flt_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4001053, rs2, rs1, 0, rd); } +void fmax_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x2c001053, rs2, rs1, 0, rd); } +void fmin_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x2c000053, rs2, rs1, 0, rd); } +void fmul_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x14000053, rs2, rs1, rm, rd); } +void fmv_h_x(const FReg& rd, const Reg& rs1) { opFP(0xf4000053, 0, rs1, 0, rd); } +void fmv_x_h(const Reg& rd, const FReg& rs1) { opFP(0xe4000053, 0, rs1, 0, rd); } +void fsgnj_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24000053, rs2, rs1, 0, rd); } +void fsgnjn_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24001053, rs2, rs1, 0, rd); } +void fsgnjx_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24002053, rs2, rs1, 0, rd); } +void fsqrt_h(const FReg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0x5c000053, 0, rs1, rm, rd); } +void fsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0xc000053, rs2, rs1, rm, rd); } +void fcvt_h_l(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4200053, 0, rs1, rm, rd); } +void fcvt_h_lu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4300053, 0, rs1, rm, rd); } +void fcvt_l_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4200053, 0, rs1, rm, rd); } +void fcvt_lu_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4300053, 0, rs1, rm, rd); } + +void fmadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x43, rs3, rs2, rs1, rm, rd); } +void fmsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x47, rs3, rs2, rs1, rm, rd); } +void fnmsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4b, rs3, rs2, rs1, rm, rd); } +void fnmadd_s(const FReg& 
rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4f, rs3, rs2, rs1, rm, rd); } + +void fmadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4000043, rs3, rs2, rs1, rm, rd); } +void fmsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4000047, rs3, rs2, rs1, rm, rd); } +void fnmsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x400004b, rs3, rs2, rs1, rm, rd); } +void fnmadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x400004f, rs3, rs2, rs1, rm, rd); } + + +void flq(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x4007, imm12, rs1, rd); } +void fsq(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x4027, imm12, rs2, rs1); } +void fld(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x3007, imm12, rs1, rd); } +void fsd(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x3027, imm12, rs2, rs1); } +void flw(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x2007, imm12, rs1, rd); } +void fsw(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x2027, imm12, rs2, rs1); } +void flh(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x1007, imm12, rs1, rd); } +void fsh(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x1027, imm12, rs2, rs1); } + + +void nop() { if (supportRVC_) { append2B(0x0001); return; } addi(x0, x0, 0); } +void li(const Reg& rd, uint32_t imm) +{ + if (imm && (imm & local::mask(12)) == 0) { // lower 12 bits of imm are zero + lui(rd, uint32_t(imm >> 12)); + return; + } + int H, L; + if (!local::split32bit(&H, &L, imm)) { + addi(rd, zero, imm); + return; + } + lui(rd, H); + if (isRV32_) { + addi(rd, rd, L); + } else { + addiw(rd, rd, L); + } +} +void mv(const Reg& rd, const Reg& rs) { addi(rd, rs, 0); } +void not_(const Reg& rd, const 
Reg& rs) { xori(rd, rs, -1); } +void neg(const Reg& rd, const Reg& rs) { sub(rd, x0, rs); } +void negw(const Reg& rd, const Reg& rs) { subw(rd, x0, rs); } +void sext_b(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 8); srai(rd, rd, XLEN_ - 8); } +void sext_h(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 16); srai(rd, rd, XLEN_ - 16); } +void sext_w(const Reg& rd, const Reg& rs) { addiw(rd, rs, 0); } +void zext_b(const Reg& rd, const Reg& rs) { andi(rd, rs, 255); } +void zext_h(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 16); srli(rd, rd, XLEN_ - 16); } +void zext_w(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 32); srli(rd, rd, XLEN_ - 32); } +void seqz(const Reg& rd, const Reg& rs) { sltiu(rd, rs, 1); } +void snez(const Reg& rd, const Reg& rs) { sltu(rd, x0, rs); } +void sltz(const Reg& rd, const Reg& rs) { slt(rd, rs, x0); } +void sgtz(const Reg& rd, const Reg& rs) { slt(rd, x0, rs); } +void fence() { append4B(0x0ff0000f); } +void j_(const Label& label) { jal(x0, label); } +void jal(const Reg& rd, const Label& label) { Jmp jmp(getCurr(), 0x6f, rd); opJmp(label, jmp); } +void jr(const Reg& rs) { jalr(x0, rs, 0); } +void jalr(const Reg& rs) { jalr(x1, rs, 0); } +void ret() { jalr(x0, x1); } +// lr rd, (addr) +void lr_w(const Reg& rd, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, 0, addr, 2, 2, flag); } +void lr_d(const Reg& rd, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, 0, addr, 2, 3, flag); } +void csrr(const Reg& rd, CSR csr) { csrrs(rd, csr, x0); } +void csrw(CSR csr, const Reg& rs) { csrrw(x0, csr, rs); } +void csrs(CSR csr, const Reg& rs) { csrrs(x0, csr, rs); } +void csrc(CSR csr, const Reg& rs) { csrrc(x0, csr, rs); } +void csrwi(CSR csr, uint32_t imm) { csrrwi(x0, csr, imm); } +void csrsi(CSR csr, uint32_t imm) { csrrsi(x0, csr, imm); } +void csrci(CSR csr, uint32_t imm) { csrrci(x0, csr, imm); } + diff --git a/third_party/xbyak_riscv/xbyak_riscv_util.hpp b/third_party/xbyak_riscv/xbyak_riscv_util.hpp new file mode 
100644 index 00000000000..6fdeab13b0e --- /dev/null +++ b/third_party/xbyak_riscv/xbyak_riscv_util.hpp @@ -0,0 +1,271 @@ +/****************************************************************************** +* Copyright (C), 2023, KNS Group LLC (YADRO) +* +* Licensed under the 3-Clause BSD License +* You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/ +*******************************************************************************/ + +#pragma once + +#include +#include +#include +#include "xbyak_riscv_csr.hpp" +#include "xbyak_riscv.hpp" + +#if defined(__linux__) && defined(__riscv) +#include +#include +#include +#include +#include +#include +#endif + +namespace Xbyak_riscv { + +// Legacy HWCAP constants +#ifndef COMPAT_HWCAP_ISA_I +#define COMPAT_HWCAP_ISA_I (1U << ('I' - 'A')) +#endif + +#ifndef COMPAT_HWCAP_ISA_M +#define COMPAT_HWCAP_ISA_M (1U << ('M' - 'A')) +#endif + +#ifndef COMPAT_HWCAP_ISA_A +#define COMPAT_HWCAP_ISA_A (1U << ('A' - 'A')) +#endif + +#ifndef COMPAT_HWCAP_ISA_F +#define COMPAT_HWCAP_ISA_F (1U << ('F' - 'A')) +#endif + +#ifndef COMPAT_HWCAP_ISA_D +#define COMPAT_HWCAP_ISA_D (1U << ('D' - 'A')) +#endif + +#ifndef COMPAT_HWCAP_ISA_C +#define COMPAT_HWCAP_ISA_C (1U << ('C' - 'A')) +#endif + +#ifndef COMPAT_HWCAP_ISA_V +#define COMPAT_HWCAP_ISA_V (1U << ('V' - 'A')) +#endif + +#if defined(__linux__) && defined(__riscv) +// Definitions for riscv_hwprobe (Linux 6.4+) +#ifndef __NR_riscv_hwprobe +#define __NR_riscv_hwprobe 258 +#endif + +#ifndef RISCV_HWPROBE_KEY_IMA_EXT_0 +#define RISCV_HWPROBE_KEY_IMA_EXT_0 4 +#endif + +#ifndef RISCV_HWPROBE_IMA_V +#define RISCV_HWPROBE_IMA_V (1ULL << 2) +#endif + +#ifndef RISCV_HWPROBE_EXT_ZVBB +#define RISCV_HWPROBE_EXT_ZVBB (1ULL << 17) +#endif + +#ifndef RISCV_HWPROBE_EXT_ZVBC +#define RISCV_HWPROBE_EXT_ZVBC (1ULL << 18) +#endif + +#ifndef RISCV_HWPROBE_EXT_ZVKG +#define RISCV_HWPROBE_EXT_ZVKG (1ULL << 20) +#endif + +#ifndef RISCV_HWPROBE_EXT_ZVFH +#define RISCV_HWPROBE_EXT_ZVFH 
(1ULL << 30) +#endif + +struct riscv_hwprobe { + int64_t key; + uint64_t value; +}; +#endif + +enum class RISCVExtension : uint64_t { + // 0-25: Legacy single-letter map (matches HWCAP for convenience) + I = COMPAT_HWCAP_ISA_I, + M = COMPAT_HWCAP_ISA_M, + A = COMPAT_HWCAP_ISA_A, + F = COMPAT_HWCAP_ISA_F, + D = COMPAT_HWCAP_ISA_D, + C = COMPAT_HWCAP_ISA_C, + V = COMPAT_HWCAP_ISA_V, + + // 26+: Extended Z-extensions + // Adding new extensions here is safe and conflict-free + Zvfh = 1ULL << 26, + Zvbb = 1ULL << 27, + Zvbc = 1ULL << 28, + Zvkg = 1ULL << 29 +}; + +template +struct CSRReader : public CodeGenerator { + // Buffer capacity exactly for 2 instructions. + static constexpr size_t capacity = 8; + + CSRReader() : CodeGenerator(capacity) { + csrrs(a0, csr, x0); + ret(); + } +}; + +/** + * Class that detects information about a RISC-V CPU. + */ +class CPU final { +public: + static const CPU& getInstance() { + static const CPU cpu; + return cpu; + } + + CPU() { + hwcapFeatures = 0; + xlen = sizeof(void*) * 8; // Fallback if sysconf fails + +#if defined(__linux__) && defined(__riscv) + // Set hwcapFeatures with AT_HWCAP value from + // the Linux auxiliary vector to check for base extensions support. 
+ hwcapFeatures = getauxval(AT_HWCAP) & ( + COMPAT_HWCAP_ISA_I | + COMPAT_HWCAP_ISA_M | + COMPAT_HWCAP_ISA_A | + COMPAT_HWCAP_ISA_F | + COMPAT_HWCAP_ISA_D | + COMPAT_HWCAP_ISA_C | + COMPAT_HWCAP_ISA_V + ); + + // Try to use riscv_hwprobe to detect Z-extensions + struct riscv_hwprobe requests[] = { + {RISCV_HWPROBE_KEY_IMA_EXT_0, 0} + }; + + int ret = syscall(__NR_riscv_hwprobe, &requests, sizeof(requests) / sizeof(requests[0]), 0, NULL, 0); + + if (ret == 0) { + uint64_t v = requests[0].value; + // Update V support from hwprobe if present + if (v & RISCV_HWPROBE_IMA_V) hwcapFeatures |= static_cast(RISCVExtension::V); + + // Detect Z-extensions using the table + const struct { + RISCVExtension id; + uint64_t hwprobe_bit; // Bit in RISCV_HWPROBE_KEY_IMA_EXT_0 + } table[] = { + { RISCVExtension::Zvfh, RISCV_HWPROBE_EXT_ZVFH }, + { RISCVExtension::Zvbb, RISCV_HWPROBE_EXT_ZVBB }, + { RISCVExtension::Zvbc, RISCV_HWPROBE_EXT_ZVBC }, + { RISCVExtension::Zvkg, RISCV_HWPROBE_EXT_ZVKG } + }; + for (const auto& entry : table) { + if (v & entry.hwprobe_bit) { + hwcapFeatures |= static_cast(entry.id); + } + } + } + + // Set xlen, number of cores, cache info + xlen = sysconf(_SC_LONG_BIT); + numCores = sysconf(_SC_NPROCESSORS_ONLN); + + dataCacheSize_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE); + dataCacheSize_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE); + dataCacheSize_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE); + dataCacheSize_[3] = sysconf(_SC_LEVEL4_CACHE_SIZE); + + dataCacheLineSize_[0] = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + dataCacheLineSize_[1] = sysconf(_SC_LEVEL2_CACHE_LINESIZE); + dataCacheLineSize_[2] = sysconf(_SC_LEVEL3_CACHE_LINESIZE); + dataCacheLineSize_[3] = sysconf(_SC_LEVEL4_CACHE_LINESIZE); +#endif + + // Set vlen + if(hasExtension(RISCVExtension::V)) { + CSRReader csrReaderGenerator; + csrReaderGenerator.ready(); + const auto csrReader = csrReaderGenerator.getCode(); + vlen = csrReader() * 8 /* bit */; + } + + // Set flen (bit) + if (hasExtension(RISCVExtension::D)) { + flen 
= 64; + } else if (hasExtension(RISCVExtension::F)) { + flen = 32; + } + } + + /** + * Checks if a particular RISC-V extension is available. + * + * @param extension The extension to check. + */ + bool hasExtension(RISCVExtension extension) const { + return (hwcapFeatures & static_cast(extension)) != 0; + } + + /** + * Get vector register width in bits + */ + uint32_t getVlen() const { + return vlen; + } + + /** + * Get general purpose register width in bits + */ + uint32_t getXlen() const { + return xlen; + }; + + /** + * Get floating-point register width in bits + */ + uint32_t getFlen() const { + return flen; + } + + uint32_t getNumCores() const { + return numCores; + } + + /** + * Get data cache size in bytes + * @param lvl Cache level 1..4 + */ + uint32_t getDataCacheSize(uint32_t lvl) const { + if (lvl == 0 || lvl > maxNumberCacheLevels) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER); + return dataCacheSize_[lvl - 1]; + } + + /** + * Get data cache line size in bytes + * @param lvl Cache level 1..4 + */ + uint32_t getDataCacheLineSize(uint32_t lvl) const { + if (lvl == 0 || lvl > maxNumberCacheLevels) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER); + return dataCacheLineSize_[lvl - 1]; + } + +private: + uint64_t hwcapFeatures = 0; + static constexpr size_t maxNumberCacheLevels = 4; + uint32_t dataCacheSize_[maxNumberCacheLevels] = {0, 0, 0, 0}; + uint32_t dataCacheLineSize_[maxNumberCacheLevels] = {0, 0, 0, 0}; + uint32_t numCores = 0; + uint32_t xlen = 0; + uint32_t vlen = 0; + uint32_t flen = 0; +}; + +} // Xbyak_riscv diff --git a/third_party/xbyak_riscv/xbyak_riscv_v.hpp b/third_party/xbyak_riscv/xbyak_riscv_v.hpp new file mode 100644 index 00000000000..7bff4daf391 --- /dev/null +++ b/third_party/xbyak_riscv/xbyak_riscv_v.hpp @@ -0,0 +1,776 @@ +/* + Copyright (C), 2023, MITSUNARI Shigeo + Copyright (C), 2023, KNS Group LLC (YADRO) + Licensed under the 3-Clause BSD License + You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/ +*/ +void 
vaadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x24002057, vm, vs2, vs1, vd); } +void vaadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x24006057, vm, vs2, rs1, vd); } +void vaaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x20002057, vm, vs2, vs1, vd); } +void vaaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x20006057, vm, vs2, rs1, vd); } +void vadc_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x40003057, 0, vs2, simm5, vd); } +void vadc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x40000057, 0, vs2, vs1, vd); } +void vadc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x40004057, 0, vs2, rs1, vd); } +void vadd_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x3057, vm, vs2, simm5, vd); } +void vadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x57, vm, vs2, vs1, vd); } +void vadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x4057, vm, vs2, rs1, vd); } +void vand_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x24003057, vm, vs2, simm5, vd); } +void vand_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x24000057, vm, vs2, vs1, vd); } +void vand_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x24004057, vm, vs2, rs1, vd); } +void vasub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x2c002057, vm, vs2, vs1, vd); } +void vasub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x2c006057, vm, vs2, rs1, vd); } +void vasubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x28002057, vm, vs2, vs1, vd); } +void vasubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM 
vm=VM::unmasked) { opMVX(0x28006057, vm, vs2, rs1, vd); } +void vcompress_vm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opMVV(0x5e002057, 0, vs2, vs1, vd); } +void vcpop_m(const Reg& rd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x40082057, vm, vs2, 0, rd); } +void vdiv_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x84002057, vm, vs2, vs1, vd); } +void vdiv_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x84006057, vm, vs2, rs1, vd); } +void vdivu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x80002057, vm, vs2, vs1, vd); } +void vdivu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x80006057, vm, vs2, rs1, vd); } +void vfadd_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x5057, vm, vs2, rs1, vd); } +void vfadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x1057, vm, vs2, vs1, vd); } +void vfclass_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c081057, vm, vs2, 0, vd); } +void vfcvt_f_x_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48019057, vm, vs2, 0, vd); } +void vfcvt_f_xu_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48011057, vm, vs2, 0, vd); } +void vfcvt_rtz_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48039057, vm, vs2, 0, vd); } +void vfcvt_rtz_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48031057, vm, vs2, 0, vd); } +void vfcvt_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48009057, vm, vs2, 0, vd); } +void vfcvt_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48001057, vm, vs2, 0, vd); } +void vfdiv_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x80005057, vm, vs2, rs1, vd); } +void vfdiv_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM 
vm=VM::unmasked) { opFVV(0x80001057, vm, vs2, vs1, vd); } +void vfirst_m(const Reg& rd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4008a057, vm, vs2, 0, rd); } +void vfmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb0005057, vm, vs2, rs1, vd); } +void vfmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb0001057, vm, vs2, vs1, vd); } +void vfmadd_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa0005057, vm, vs2, rs1, vd); } +void vfmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa0001057, vm, vs2, vs1, vd); } +void vfmax_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x18005057, vm, vs2, rs1, vd); } +void vfmax_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x18001057, vm, vs2, vs1, vd); } +void vfmerge_vfm(const VReg& vd, const VReg& vs2, const FReg& rs1) { opFVF(0x5c005057, 0, vs2, rs1, vd); } +void vfmin_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x10005057, vm, vs2, rs1, vd); } +void vfmin_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x10001057, vm, vs2, vs1, vd); } +void vfmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb8005057, vm, vs2, rs1, vd); } +void vfmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb8001057, vm, vs2, vs1, vd); } +void vfmsub_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa8005057, vm, vs2, rs1, vd); } +void vfmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa8001057, vm, vs2, vs1, vd); } +void vfmul_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x90005057, vm, vs2, rs1, vd); } +void vfmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM 
vm=VM::unmasked) { opFVV(0x90001057, vm, vs2, vs1, vd); } +void vfmv_f_s(const FReg& rd, const VReg& vs2) { opFVV(0x42001057, 0, vs2, 0, rd); } +void vfmv_s_f(const VReg& vd, const FReg& rs1) { opFVF(0x42005057, 0, 0, rs1, vd); } +void vfmv_v_f(const VReg& vd, const FReg& rs1) { opFVF(0x5e005057, 0, 0, rs1, vd); } +void vfncvt_f_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480a1057, vm, vs2, 0, vd); } +void vfncvt_f_x_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48099057, vm, vs2, 0, vd); } +void vfncvt_f_xu_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48091057, vm, vs2, 0, vd); } +void vfncvt_rod_f_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480a9057, vm, vs2, 0, vd); } +void vfncvt_rtz_x_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480b9057, vm, vs2, 0, vd); } +void vfncvt_rtz_xu_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480b1057, vm, vs2, 0, vd); } +void vfncvt_x_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48089057, vm, vs2, 0, vd); } +void vfncvt_xu_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48081057, vm, vs2, 0, vd); } +void vfnmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb4005057, vm, vs2, rs1, vd); } +void vfnmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb4001057, vm, vs2, vs1, vd); } +void vfnmadd_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa4005057, vm, vs2, rs1, vd); } +void vfnmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa4001057, vm, vs2, vs1, vd); } +void vfnmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xbc005057, vm, vs2, rs1, vd); } +void vfnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xbc001057, vm, vs2, vs1, vd); } +void 
vfnmsub_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xac005057, vm, vs2, rs1, vd); } +void vfnmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xac001057, vm, vs2, vs1, vd); } +void vfrdiv_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x84005057, vm, vs2, rs1, vd); } +void vfrec7_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c029057, vm, vs2, 0, vd); } +void vfredmax_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x1c001057, vm, vs2, vs1, vd); } +void vfredmin_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x14001057, vm, vs2, vs1, vd); } +void vfredosum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc001057, vm, vs2, vs1, vd); } +void vfredusum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x4001057, vm, vs2, vs1, vd); } +void vfrsqrt7_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c021057, vm, vs2, 0, vd); } +void vfrsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x9c005057, vm, vs2, rs1, vd); } +void vfsgnj_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x20005057, vm, vs2, rs1, vd); } +void vfsgnj_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x20001057, vm, vs2, vs1, vd); } +void vfsgnjn_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x24005057, vm, vs2, rs1, vd); } +void vfsgnjn_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x24001057, vm, vs2, vs1, vd); } +void vfsgnjx_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x28005057, vm, vs2, rs1, vd); } +void vfsgnjx_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x28001057, vm, vs2, vs1, vd); 
} +void vfslide1down_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x3c005057, vm, vs2, rs1, vd); } +void vfslide1up_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x38005057, vm, vs2, rs1, vd); } +void vfsqrt_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c001057, vm, vs2, 0, vd); } +void vfsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x8005057, vm, vs2, rs1, vd); } +void vfsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x8001057, vm, vs2, vs1, vd); } +void vfwadd_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xc0005057, vm, vs2, rs1, vd); } +void vfwadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc0001057, vm, vs2, vs1, vd); } +void vfwadd_wf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xd0005057, vm, vs2, rs1, vd); } +void vfwadd_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xd0001057, vm, vs2, vs1, vd); } +void vfwcvt_f_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48061057, vm, vs2, 0, vd); } +void vfwcvt_f_x_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48059057, vm, vs2, 0, vd); } +void vfwcvt_f_xu_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48051057, vm, vs2, 0, vd); } +void vfwcvt_rtz_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48079057, vm, vs2, 0, vd); } +void vfwcvt_rtz_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48071057, vm, vs2, 0, vd); } +void vfwcvt_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48049057, vm, vs2, 0, vd); } +void vfwcvt_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48041057, vm, vs2, 0, vd); } +void vfwmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM 
vm=VM::unmasked) { opFVF(0xf0005057, vm, vs2, rs1, vd); } +void vfwmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf0001057, vm, vs2, vs1, vd); } +void vfwmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xf8005057, vm, vs2, rs1, vd); } +void vfwmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf8001057, vm, vs2, vs1, vd); } +void vfwmul_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xe0005057, vm, vs2, rs1, vd); } +void vfwmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xe0001057, vm, vs2, vs1, vd); } +void vfwnmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xf4005057, vm, vs2, rs1, vd); } +void vfwnmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf4001057, vm, vs2, vs1, vd); } +void vfwnmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xfc005057, vm, vs2, rs1, vd); } +void vfwnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xfc001057, vm, vs2, vs1, vd); } +void vfwredosum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xcc001057, vm, vs2, vs1, vd); } +void vfwredusum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc4001057, vm, vs2, vs1, vd); } +void vfwsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xc8005057, vm, vs2, rs1, vd); } +void vfwsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc8001057, vm, vs2, vs1, vd); } +void vfwsub_wf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xd8005057, vm, vs2, rs1, vd); } +void vfwsub_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xd8001057, vm, vs2, vs1, vd); } +void vid_v(const 
VReg& vd, VM vm=VM::unmasked) { opMVV(0x5008a057, vm, 0, 0, vd); } +void viota_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x50082057, vm, vs2, 0, vd); } +void vl1re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); } /* vl<N>re<EEW>.v: register count N is encoded in nf (bits 31:29) as N-1; nf=000 here */ +void vl1re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); } +void vl1re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); } +void vl1re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); } +void vl2re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x22805007, 0, 0, rs1, vd); } /* nf=001: loads 2 registers (was identical to vl1re16_v) */ +void vl2re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x22806007, 0, 0, rs1, vd); } /* nf=001 */ +void vl2re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x22807007, 0, 0, rs1, vd); } /* nf=001 */ +void vl2re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x22800007, 0, 0, rs1, vd); } /* nf=001 */ +void vl4re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x62805007, 0, 0, rs1, vd); } /* nf=011: loads 4 registers */ +void vl4re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x62806007, 0, 0, rs1, vd); } /* nf=011 */ +void vl4re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x62807007, 0, 0, rs1, vd); } /* nf=011 */ +void vl4re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x62800007, 0, 0, rs1, vd); } /* nf=011 */ +void vl8re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0xe2805007, 0, 0, rs1, vd); } /* nf=111: loads 8 registers */ +void vl8re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0xe2806007, 0, 0, rs1, vd); } /* nf=111 */ +void vl8re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0xe2807007, 0, 0, rs1, vd); } /* nf=111 */ +void vl8re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0xe2800007, 0, 0, rs1, vd); } /* nf=111 */ +void vlseg1e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10007007, vm, 0, rs1, vd); } +void vlseg2e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30007007, vm, 0, rs1, vd); } +void vlseg3e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50007007, vm, 0, rs1, vd); } 
+void vlseg4e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70007007, vm, 0, rs1, vd); } +void vlseg5e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90007007, vm, 0, rs1, vd); } +void vlseg6e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0007007, vm, 0, rs1, vd); } +void vlseg7e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0007007, vm, 0, rs1, vd); } +void vlseg8e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0007007, vm, 0, rs1, vd); } +void vle1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10007007, vm, 0, rs1, vd); } +void vlseg1e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11007007, vm, 0, rs1, vd); } +void vlseg2e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31007007, vm, 0, rs1, vd); } +void vlseg3e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51007007, vm, 0, rs1, vd); } +void vlseg4e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71007007, vm, 0, rs1, vd); } +void vlseg5e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91007007, vm, 0, rs1, vd); } +void vlseg6e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1007007, vm, 0, rs1, vd); } +void vlseg7e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1007007, vm, 0, rs1, vd); } +void vlseg8e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1007007, vm, 0, rs1, vd); } +void vle1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11007007, vm, 0, rs1, vd); } +void vlseg1e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10000007, vm, 0, rs1, vd); } +void vlseg2e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30000007, vm, 
0, rs1, vd); } +void vlseg3e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50000007, vm, 0, rs1, vd); } +void vlseg4e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70000007, vm, 0, rs1, vd); } +void vlseg5e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90000007, vm, 0, rs1, vd); } +void vlseg6e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0000007, vm, 0, rs1, vd); } +void vlseg7e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0000007, vm, 0, rs1, vd); } +void vlseg8e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0000007, vm, 0, rs1, vd); } +void vle128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10000007, vm, 0, rs1, vd); } +void vlseg1e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11000007, vm, 0, rs1, vd); } +void vlseg2e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31000007, vm, 0, rs1, vd); } +void vlseg3e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51000007, vm, 0, rs1, vd); } +void vlseg4e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71000007, vm, 0, rs1, vd); } +void vlseg5e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91000007, vm, 0, rs1, vd); } +void vlseg6e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1000007, vm, 0, rs1, vd); } +void vlseg7e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1000007, vm, 0, rs1, vd); } +void vlseg8e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1000007, vm, 0, rs1, vd); } +void vle128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11000007, vm, 0, rs1, vd); } +void vlseg1e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x5007, vm, 0, 
rs1, vd); } +void vlseg2e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20005007, vm, 0, rs1, vd); } +void vlseg3e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40005007, vm, 0, rs1, vd); } +void vlseg4e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60005007, vm, 0, rs1, vd); } +void vlseg5e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80005007, vm, 0, rs1, vd); } +void vlseg6e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0005007, vm, 0, rs1, vd); } +void vlseg7e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0005007, vm, 0, rs1, vd); } +void vlseg8e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0005007, vm, 0, rs1, vd); } +void vle16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x5007, vm, 0, rs1, vd); } +void vlseg1e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1005007, vm, 0, rs1, vd); } +void vlseg2e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21005007, vm, 0, rs1, vd); } +void vlseg3e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41005007, vm, 0, rs1, vd); } +void vlseg4e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61005007, vm, 0, rs1, vd); } +void vlseg5e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81005007, vm, 0, rs1, vd); } +void vlseg6e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1005007, vm, 0, rs1, vd); } +void vlseg7e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1005007, vm, 0, rs1, vd); } +void vlseg8e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1005007, vm, 0, rs1, vd); } +void vle16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1005007, vm, 0, rs1, vd); } +void 
vlseg1e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10005007, vm, 0, rs1, vd); } +void vlseg2e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30005007, vm, 0, rs1, vd); } +void vlseg3e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50005007, vm, 0, rs1, vd); } +void vlseg4e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70005007, vm, 0, rs1, vd); } +void vlseg5e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90005007, vm, 0, rs1, vd); } +void vlseg6e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0005007, vm, 0, rs1, vd); } +void vlseg7e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0005007, vm, 0, rs1, vd); } +void vlseg8e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0005007, vm, 0, rs1, vd); } +void vle256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10005007, vm, 0, rs1, vd); } +void vlseg1e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11005007, vm, 0, rs1, vd); } +void vlseg2e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31005007, vm, 0, rs1, vd); } +void vlseg3e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51005007, vm, 0, rs1, vd); } +void vlseg4e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71005007, vm, 0, rs1, vd); } +void vlseg5e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91005007, vm, 0, rs1, vd); } +void vlseg6e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1005007, vm, 0, rs1, vd); } +void vlseg7e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1005007, vm, 0, rs1, vd); } +void vlseg8e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1005007, vm, 0, rs1, vd); } 
+void vle256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11005007, vm, 0, rs1, vd); } +void vlseg1e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x6007, vm, 0, rs1, vd); } +void vlseg2e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20006007, vm, 0, rs1, vd); } +void vlseg3e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40006007, vm, 0, rs1, vd); } +void vlseg4e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60006007, vm, 0, rs1, vd); } +void vlseg5e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80006007, vm, 0, rs1, vd); } +void vlseg6e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0006007, vm, 0, rs1, vd); } +void vlseg7e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0006007, vm, 0, rs1, vd); } +void vlseg8e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0006007, vm, 0, rs1, vd); } +void vle32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x6007, vm, 0, rs1, vd); } +void vlseg1e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1006007, vm, 0, rs1, vd); } +void vlseg2e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21006007, vm, 0, rs1, vd); } +void vlseg3e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41006007, vm, 0, rs1, vd); } +void vlseg4e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61006007, vm, 0, rs1, vd); } +void vlseg5e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81006007, vm, 0, rs1, vd); } +void vlseg6e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1006007, vm, 0, rs1, vd); } +void vlseg7e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1006007, vm, 0, rs1, vd); } +void vlseg8e32ff_v(const 
VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1006007, vm, 0, rs1, vd); } +void vle32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1006007, vm, 0, rs1, vd); } +void vlseg1e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10006007, vm, 0, rs1, vd); } +void vlseg2e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30006007, vm, 0, rs1, vd); } +void vlseg3e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50006007, vm, 0, rs1, vd); } +void vlseg4e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70006007, vm, 0, rs1, vd); } +void vlseg5e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90006007, vm, 0, rs1, vd); } +void vlseg6e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0006007, vm, 0, rs1, vd); } +void vlseg7e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0006007, vm, 0, rs1, vd); } +void vlseg8e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0006007, vm, 0, rs1, vd); } +void vle512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10006007, vm, 0, rs1, vd); } +void vlseg1e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11006007, vm, 0, rs1, vd); } +void vlseg2e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31006007, vm, 0, rs1, vd); } +void vlseg3e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51006007, vm, 0, rs1, vd); } +void vlseg4e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71006007, vm, 0, rs1, vd); } +void vlseg5e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91006007, vm, 0, rs1, vd); } +void vlseg6e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1006007, vm, 0, rs1, vd); } +void vlseg7e512ff_v(const 
VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1006007, vm, 0, rs1, vd); } +void vlseg8e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1006007, vm, 0, rs1, vd); } +void vle512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11006007, vm, 0, rs1, vd); } +void vlseg1e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7007, vm, 0, rs1, vd); } +void vlseg2e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20007007, vm, 0, rs1, vd); } +void vlseg3e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40007007, vm, 0, rs1, vd); } +void vlseg4e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60007007, vm, 0, rs1, vd); } +void vlseg5e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80007007, vm, 0, rs1, vd); } +void vlseg6e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0007007, vm, 0, rs1, vd); } +void vlseg7e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0007007, vm, 0, rs1, vd); } +void vlseg8e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0007007, vm, 0, rs1, vd); } +void vle64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7007, vm, 0, rs1, vd); } +void vlseg1e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1007007, vm, 0, rs1, vd); } +void vlseg2e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21007007, vm, 0, rs1, vd); } +void vlseg3e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41007007, vm, 0, rs1, vd); } +void vlseg4e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61007007, vm, 0, rs1, vd); } +void vlseg5e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81007007, vm, 0, rs1, vd); } +void vlseg6e64ff_v(const VReg& vd, const Reg& rs1, 
VM vm=VM::unmasked) { opVectorLoad(0xa1007007, vm, 0, rs1, vd); } +void vlseg7e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1007007, vm, 0, rs1, vd); } +void vlseg8e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1007007, vm, 0, rs1, vd); } +void vle64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1007007, vm, 0, rs1, vd); } +void vlseg1e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7, vm, 0, rs1, vd); } +void vlseg2e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20000007, vm, 0, rs1, vd); } +void vlseg3e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40000007, vm, 0, rs1, vd); } +void vlseg4e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60000007, vm, 0, rs1, vd); } +void vlseg5e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80000007, vm, 0, rs1, vd); } +void vlseg6e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0000007, vm, 0, rs1, vd); } +void vlseg7e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0000007, vm, 0, rs1, vd); } +void vlseg8e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0000007, vm, 0, rs1, vd); } +void vle8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7, vm, 0, rs1, vd); } +void vlseg1e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1000007, vm, 0, rs1, vd); } +void vlseg2e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21000007, vm, 0, rs1, vd); } +void vlseg3e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41000007, vm, 0, rs1, vd); } +void vlseg4e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61000007, vm, 0, rs1, vd); } +void vlseg5e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81000007, 
vm, 0, rs1, vd); } +void vlseg6e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1000007, vm, 0, rs1, vd); } +void vlseg7e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1000007, vm, 0, rs1, vd); } +void vlseg8e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1000007, vm, 0, rs1, vd); } +void vle8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1000007, vm, 0, rs1, vd); } +void vlm_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2b00007, 0, 0, rs1, vd); } +void vloxei1024_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c007007, vm, vs2, rs1, vd); } +void vloxei128_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c000007, vm, vs2, rs1, vd); } +void vloxei16_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc005007, vm, vs2, rs1, vd); } +void vloxei256_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c005007, vm, vs2, rs1, vd); } +void vloxei32_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc006007, vm, vs2, rs1, vd); } +void vloxei512_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c006007, vm, vs2, rs1, vd); } +void vloxei64_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc007007, vm, vs2, rs1, vd); } +void vloxei8_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc000007, vm, vs2, rs1, vd); } +void vlsseg1e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18007007, vm, rs2, rs1, vd); } +void vlsseg2e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38007007, vm, rs2, rs1, vd); } +void vlsseg3e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM 
vm=VM::unmasked) { opVectorLoad(0x58007007, vm, rs2, rs1, vd); } +void vlsseg4e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78007007, vm, rs2, rs1, vd); } +void vlsseg5e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98007007, vm, rs2, rs1, vd); } +void vlsseg6e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8007007, vm, rs2, rs1, vd); } +void vlsseg7e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8007007, vm, rs2, rs1, vd); } +void vlsseg8e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8007007, vm, rs2, rs1, vd); } +void vlse1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18007007, vm, rs2, rs1, vd); } +void vlsseg1e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18000007, vm, rs2, rs1, vd); } +void vlsseg2e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38000007, vm, rs2, rs1, vd); } +void vlsseg3e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58000007, vm, rs2, rs1, vd); } +void vlsseg4e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78000007, vm, rs2, rs1, vd); } +void vlsseg5e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98000007, vm, rs2, rs1, vd); } +void vlsseg6e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8000007, vm, rs2, rs1, vd); } +void vlsseg7e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8000007, vm, rs2, rs1, vd); } +void vlsseg8e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8000007, vm, rs2, rs1, vd); } +void vlse128_v(const VReg& 
vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18000007, vm, rs2, rs1, vd); } +void vlsseg1e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8005007, vm, rs2, rs1, vd); } +void vlsseg2e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28005007, vm, rs2, rs1, vd); } +void vlsseg3e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48005007, vm, rs2, rs1, vd); } +void vlsseg4e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68005007, vm, rs2, rs1, vd); } +void vlsseg5e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88005007, vm, rs2, rs1, vd); } +void vlsseg6e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8005007, vm, rs2, rs1, vd); } +void vlsseg7e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8005007, vm, rs2, rs1, vd); } +void vlsseg8e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8005007, vm, rs2, rs1, vd); } +void vlse16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8005007, vm, rs2, rs1, vd); } +void vlsseg1e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18005007, vm, rs2, rs1, vd); } +void vlsseg2e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38005007, vm, rs2, rs1, vd); } +void vlsseg3e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58005007, vm, rs2, rs1, vd); } +void vlsseg4e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78005007, vm, rs2, rs1, vd); } +void vlsseg5e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98005007, vm, rs2, rs1, vd); } +void 
vlsseg6e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8005007, vm, rs2, rs1, vd); } +void vlsseg7e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8005007, vm, rs2, rs1, vd); } +void vlsseg8e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8005007, vm, rs2, rs1, vd); } +void vlse256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18005007, vm, rs2, rs1, vd); } +void vlsseg1e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8006007, vm, rs2, rs1, vd); } +void vlsseg2e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28006007, vm, rs2, rs1, vd); } +void vlsseg3e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48006007, vm, rs2, rs1, vd); } +void vlsseg4e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68006007, vm, rs2, rs1, vd); } +void vlsseg5e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88006007, vm, rs2, rs1, vd); } +void vlsseg6e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8006007, vm, rs2, rs1, vd); } +void vlsseg7e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8006007, vm, rs2, rs1, vd); } +void vlsseg8e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8006007, vm, rs2, rs1, vd); } +void vlse32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8006007, vm, rs2, rs1, vd); } +void vlsseg1e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18006007, vm, rs2, rs1, vd); } +void vlsseg2e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38006007, vm, 
rs2, rs1, vd); } +void vlsseg3e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58006007, vm, rs2, rs1, vd); } +void vlsseg4e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78006007, vm, rs2, rs1, vd); } +void vlsseg5e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98006007, vm, rs2, rs1, vd); } +void vlsseg6e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8006007, vm, rs2, rs1, vd); } +void vlsseg7e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8006007, vm, rs2, rs1, vd); } +void vlsseg8e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8006007, vm, rs2, rs1, vd); } +void vlse512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18006007, vm, rs2, rs1, vd); } +void vlsseg1e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8007007, vm, rs2, rs1, vd); } +void vlsseg2e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28007007, vm, rs2, rs1, vd); } +void vlsseg3e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48007007, vm, rs2, rs1, vd); } +void vlsseg4e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68007007, vm, rs2, rs1, vd); } +void vlsseg5e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88007007, vm, rs2, rs1, vd); } +void vlsseg6e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8007007, vm, rs2, rs1, vd); } +void vlsseg7e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8007007, vm, rs2, rs1, vd); } +void vlsseg8e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { 
opVectorLoad(0xe8007007, vm, rs2, rs1, vd); } +void vlse64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8007007, vm, rs2, rs1, vd); } +void vlsseg1e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8000007, vm, rs2, rs1, vd); } +void vlsseg2e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28000007, vm, rs2, rs1, vd); } +void vlsseg3e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48000007, vm, rs2, rs1, vd); } +void vlsseg4e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68000007, vm, rs2, rs1, vd); } +void vlsseg5e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88000007, vm, rs2, rs1, vd); } +void vlsseg6e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8000007, vm, rs2, rs1, vd); } +void vlsseg7e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8000007, vm, rs2, rs1, vd); } +void vlsseg8e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8000007, vm, rs2, rs1, vd); } +void vlse8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8000007, vm, rs2, rs1, vd); } +void vluxei1024_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14007007, vm, vs2, rs1, vd); } +void vluxei128_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14000007, vm, vs2, rs1, vd); } +void vluxei16_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4005007, vm, vs2, rs1, vd); } +void vluxei256_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14005007, vm, vs2, rs1, vd); } +void vluxei32_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM 
vm=VM::unmasked) { opVectorLoad(0x4006007, vm, vs2, rs1, vd); } +void vluxei512_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14006007, vm, vs2, rs1, vd); } +void vluxei64_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4007007, vm, vs2, rs1, vd); } +void vluxei8_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4000007, vm, vs2, rs1, vd); } +void vmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xb4002057, vm, vs2, vs1, vd); } +void vmacc_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xb4006057, vm, vs2, rs1, vd); } +void vmadc_vi(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x46003057, 0, vs2, simm5, vd); } +void vmadc_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x44003057, 0, vs2, simm5, vd); } +void vmadc_vv(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x46000057, 0, vs2, vs1, vd); } +void vmadc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x44000057, 0, vs2, vs1, vd); } +void vmadc_vx(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x46004057, 0, vs2, rs1, vd); } +void vmadc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x44004057, 0, vs2, rs1, vd); } +void vmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xa4002057, vm, vs2, vs1, vd); } +void vmadd_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xa4006057, vm, vs2, rs1, vd); } +void vmand_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x64002057, vm, vs2, vs1, vd); } +void vmandn_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x60002057, vm, vs2, vs1, vd); } +void vmax_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x1c000057, vm, vs2, vs1, vd); } +void vmax_vx(const 
VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x1c004057, vm, vs2, rs1, vd); } +void vmaxu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x18000057, vm, vs2, vs1, vd); } +void vmaxu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x18004057, vm, vs2, rs1, vd); } +void vmerge_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x5c003057, 0, vs2, simm5, vd); } +void vmerge_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x5c000057, 0, vs2, vs1, vd); } +void vmerge_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x5c004057, 0, vs2, rs1, vd); } +void vmfeq_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x60005057, vm, vs2, rs1, vd); } +void vmfeq_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x60001057, vm, vs2, vs1, vd); } +void vmfge_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x7c005057, vm, vs2, rs1, vd); } +void vmfgt_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x74005057, vm, vs2, rs1, vd); } +void vmfle_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x64005057, vm, vs2, rs1, vd); } +void vmfle_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x64001057, vm, vs2, vs1, vd); } +void vmflt_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x6c005057, vm, vs2, rs1, vd); } +void vmflt_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x6c001057, vm, vs2, vs1, vd); } +void vmfne_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x70005057, vm, vs2, rs1, vd); } +void vmfne_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x70001057, vm, vs2, vs1, vd); } +void vmin_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, 
VM vm=VM::unmasked) { opIVV(0x14000057, vm, vs2, vs1, vd); } +void vmin_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x14004057, vm, vs2, rs1, vd); } +void vminu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x10000057, vm, vs2, vs1, vd); } +void vminu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x10004057, vm, vs2, rs1, vd); } +void vmnand_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x74002057, vm, vs2, vs1, vd); } +void vmnor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x78002057, vm, vs2, vs1, vd); } +void vmor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x68002057, vm, vs2, vs1, vd); } +void vmorn_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x70002057, vm, vs2, vs1, vd); } +void vmsbc_vv(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x4e000057, 0, vs2, vs1, vd); } +void vmsbc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x4c000057, 0, vs2, vs1, vd); } +void vmsbc_vx(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x4e004057, 0, vs2, rs1, vd); } +void vmsbc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x4c004057, 0, vs2, rs1, vd); } +void vmsbf_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x5000a057, vm, vs2, 0, vd); } +void vmseq_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x60003057, vm, vs2, simm5, vd); } +void vmseq_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x60000057, vm, vs2, vs1, vd); } +void vmseq_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x60004057, vm, vs2, rs1, vd); } +void vmsgt_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x7c003057, vm, vs2, simm5, vd); } +void vmsgt_vx(const VReg& vd, 
const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x7c004057, vm, vs2, rs1, vd); } +void vmsgtu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x78003057, vm, vs2, simm5, vd); } +void vmsgtu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x78004057, vm, vs2, rs1, vd); } +void vmsif_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x5001a057, vm, vs2, 0, vd); } +void vmsle_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x74003057, vm, vs2, simm5, vd); } +void vmsle_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x74000057, vm, vs2, vs1, vd); } +void vmsle_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x74004057, vm, vs2, rs1, vd); } +void vmsleu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x70003057, vm, vs2, simm5, vd); } +void vmsleu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x70000057, vm, vs2, vs1, vd); } +void vmsleu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x70004057, vm, vs2, rs1, vd); } +void vmslt_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x6c000057, vm, vs2, vs1, vd); } +void vmslt_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x6c004057, vm, vs2, rs1, vd); } +void vmsltu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x68000057, vm, vs2, vs1, vd); } +void vmsltu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x68004057, vm, vs2, rs1, vd); } +void vmsne_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x64003057, vm, vs2, simm5, vd); } +void vmsne_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x64000057, vm, vs2, vs1, vd); } +void vmsne_vx(const VReg& vd, 
const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x64004057, vm, vs2, rs1, vd); } +void vmsof_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x50012057, vm, vs2, 0, vd); } +void vmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x94002057, vm, vs2, vs1, vd); } +void vmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x94006057, vm, vs2, rs1, vd); } +void vmulh_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x9c002057, vm, vs2, vs1, vd); } +void vmulh_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x9c006057, vm, vs2, rs1, vd); } +void vmulhsu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x98002057, vm, vs2, vs1, vd); } +void vmulhsu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x98006057, vm, vs2, rs1, vd); } +void vmulhu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x90002057, vm, vs2, vs1, vd); } +void vmulhu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x90006057, vm, vs2, rs1, vd); } +void vmv1r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e003057, 0, vs2, 0, vd); } +void vmv2r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e00b057, 0, vs2, 0, vd); } +void vmv4r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e01b057, 0, vs2, 0, vd); } +void vmv8r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e03b057, 0, vs2, 0, vd); } +void vmv_s_x(const VReg& vd, const Reg& rs1) { opMVX(0x42006057, 0, 0, rs1, vd); } +void vmv_v_i(const VReg& vd, int32_t simm5) { opIVI(0x5e003057, 0, 0, simm5, vd); } +void vmv_v_v(const VReg& vd, const VReg& vs1) { opIVV(0x5e000057, 0, 0, vs1, vd); } +void vmv_v_x(const VReg& vd, const Reg& rs1) { opIVX(0x5e004057, 0, 0, rs1, vd); } +void vmv_x_s(const Reg& rd, const VReg& vs2) { opMVV(0x42002057, 0, vs2, 0, rd); } +void vmxnor_mm(const VReg& 
vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x7c002057, vm, vs2, vs1, vd); } +void vmxor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x6c002057, vm, vs2, vs1, vd); } +void vnclip_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xbc003057, vm, vs2, simm5, vd); } +void vnclip_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xbc000057, vm, vs2, vs1, vd); } +void vnclip_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xbc004057, vm, vs2, rs1, vd); } +void vnclipu_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb8003057, vm, vs2, simm5, vd); } +void vnclipu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb8000057, vm, vs2, vs1, vd); } +void vnclipu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb8004057, vm, vs2, rs1, vd); } +void vnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xbc002057, vm, vs2, vs1, vd); } +void vnmsac_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xbc006057, vm, vs2, rs1, vd); } +void vnmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xac002057, vm, vs2, vs1, vd); } +void vnmsub_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xac006057, vm, vs2, rs1, vd); } +void vnsra_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb4003057, vm, vs2, simm5, vd); } +void vnsra_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb4000057, vm, vs2, vs1, vd); } +void vnsra_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb4004057, vm, vs2, rs1, vd); } +void vnsrl_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb0003057, vm, vs2, simm5, vd); } 
+void vnsrl_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb0000057, vm, vs2, vs1, vd); } +void vnsrl_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb0004057, vm, vs2, rs1, vd); } +void vor_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x28003057, vm, vs2, simm5, vd); } +void vor_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x28000057, vm, vs2, vs1, vd); } +void vor_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x28004057, vm, vs2, rs1, vd); } +void vredand_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x4002057, vm, vs2, vs1, vd); } +void vredmax_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x1c002057, vm, vs2, vs1, vd); } +void vredmaxu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x18002057, vm, vs2, vs1, vd); } +void vredmin_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x14002057, vm, vs2, vs1, vd); } +void vredminu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x10002057, vm, vs2, vs1, vd); } +void vredor_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x8002057, vm, vs2, vs1, vd); } +void vredsum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x2057, vm, vs2, vs1, vd); } +void vredxor_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc002057, vm, vs2, vs1, vd); } +void vrem_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x8c002057, vm, vs2, vs1, vd); } +void vrem_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x8c006057, vm, vs2, rs1, vd); } +void vremu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x88002057, vm, 
vs2, vs1, vd); } +void vremu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x88006057, vm, vs2, rs1, vd); } +void vrgather_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x30003057, vm, vs2, simm5, vd); } +void vrgather_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x30000057, vm, vs2, vs1, vd); } +void vrgather_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x30004057, vm, vs2, rs1, vd); } +void vrgatherei16_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x38000057, vm, vs2, vs1, vd); } +void vrsub_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xc003057, vm, vs2, simm5, vd); } +void vrsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xc004057, vm, vs2, rs1, vd); } +void vs1r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); } /* vs1r.v: whole-register store of 1 register; nf field (bits 31:29) = 0 */ +void vs2r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x22800027, 0, 0, rs1, vs3); } /* vs2r.v: 2 registers; nf = 1. Previously reused the vs1r.v encoding. */ +void vs4r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x62800027, 0, 0, rs1, vs3); } /* vs4r.v: 4 registers; nf = 3. Previously reused the vs1r.v encoding. */ +void vs8r_v(VReg vs3, const Reg& rs1) { opVectorStore(0xe2800027, 0, 0, rs1, vs3); } /* vs8r.v: 8 registers; nf = 7. Previously reused the vs1r.v encoding. */ +void vsadd_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x84003057, vm, vs2, simm5, vd); } +void vsadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x84000057, vm, vs2, vs1, vd); } +void vsadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x84004057, vm, vs2, rs1, vd); } +void vsaddu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x80003057, vm, vs2, simm5, vd); } +void vsaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x80000057, vm, vs2, vs1, vd); } +void vsaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x80004057, vm, vs2, rs1, 
vd); } +void vsbc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x48000057, 0, vs2, vs1, vd); } +void vsbc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x48004057, 0, vs2, rs1, vd); } +void vsseg1e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10007027, vm, 0, rs1, vs3); } +void vsseg2e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30007027, vm, 0, rs1, vs3); } +void vsseg3e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50007027, vm, 0, rs1, vs3); } +void vsseg4e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70007027, vm, 0, rs1, vs3); } +void vsseg5e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90007027, vm, 0, rs1, vs3); } +void vsseg6e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0007027, vm, 0, rs1, vs3); } +void vsseg7e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0007027, vm, 0, rs1, vs3); } +void vsseg8e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0007027, vm, 0, rs1, vs3); } +void vse1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10007027, vm, 0, rs1, vs3); } +void vsseg1e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10000027, vm, 0, rs1, vs3); } +void vsseg2e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30000027, vm, 0, rs1, vs3); } +void vsseg3e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50000027, vm, 0, rs1, vs3); } +void vsseg4e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70000027, vm, 0, rs1, vs3); } +void vsseg5e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90000027, vm, 0, rs1, vs3); } +void vsseg6e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0000027, vm, 0, rs1, vs3); } +void vsseg7e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { 
opVectorStore(0xd0000027, vm, 0, rs1, vs3); } +void vsseg8e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0000027, vm, 0, rs1, vs3); } +void vse128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10000027, vm, 0, rs1, vs3); } +void vsseg1e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x5027, vm, 0, rs1, vs3); } +void vsseg2e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20005027, vm, 0, rs1, vs3); } +void vsseg3e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40005027, vm, 0, rs1, vs3); } +void vsseg4e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60005027, vm, 0, rs1, vs3); } +void vsseg5e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80005027, vm, 0, rs1, vs3); } +void vsseg6e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0005027, vm, 0, rs1, vs3); } +void vsseg7e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0005027, vm, 0, rs1, vs3); } +void vsseg8e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0005027, vm, 0, rs1, vs3); } +void vse16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x5027, vm, 0, rs1, vs3); } +void vsseg1e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10005027, vm, 0, rs1, vs3); } +void vsseg2e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30005027, vm, 0, rs1, vs3); } +void vsseg3e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50005027, vm, 0, rs1, vs3); } +void vsseg4e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70005027, vm, 0, rs1, vs3); } +void vsseg5e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90005027, vm, 0, rs1, vs3); } +void vsseg6e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0005027, vm, 0, rs1, vs3); } +void vsseg7e256_v(VReg vs3, const Reg& rs1, VM 
vm=VM::unmasked) { opVectorStore(0xd0005027, vm, 0, rs1, vs3); } +void vsseg8e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0005027, vm, 0, rs1, vs3); } +void vse256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10005027, vm, 0, rs1, vs3); } +void vsseg1e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x6027, vm, 0, rs1, vs3); } +void vsseg2e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20006027, vm, 0, rs1, vs3); } +void vsseg3e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40006027, vm, 0, rs1, vs3); } +void vsseg4e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60006027, vm, 0, rs1, vs3); } +void vsseg5e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80006027, vm, 0, rs1, vs3); } +void vsseg6e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0006027, vm, 0, rs1, vs3); } +void vsseg7e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0006027, vm, 0, rs1, vs3); } +void vsseg8e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0006027, vm, 0, rs1, vs3); } +void vse32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x6027, vm, 0, rs1, vs3); } +void vsseg1e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10006027, vm, 0, rs1, vs3); } +void vsseg2e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30006027, vm, 0, rs1, vs3); } +void vsseg3e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50006027, vm, 0, rs1, vs3); } +void vsseg4e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70006027, vm, 0, rs1, vs3); } +void vsseg5e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90006027, vm, 0, rs1, vs3); } +void vsseg6e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0006027, vm, 0, rs1, vs3); } +void vsseg7e512_v(VReg vs3, const 
Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0006027, vm, 0, rs1, vs3); } +void vsseg8e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0006027, vm, 0, rs1, vs3); } +void vse512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10006027, vm, 0, rs1, vs3); } +void vsseg1e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x7027, vm, 0, rs1, vs3); } +void vsseg2e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20007027, vm, 0, rs1, vs3); } +void vsseg3e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40007027, vm, 0, rs1, vs3); } +void vsseg4e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60007027, vm, 0, rs1, vs3); } +void vsseg5e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80007027, vm, 0, rs1, vs3); } +void vsseg6e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0007027, vm, 0, rs1, vs3); } +void vsseg7e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0007027, vm, 0, rs1, vs3); } +void vsseg8e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0007027, vm, 0, rs1, vs3); } +void vse64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x7027, vm, 0, rs1, vs3); } +void vsseg1e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x27, vm, 0, rs1, vs3); } +void vsseg2e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20000027, vm, 0, rs1, vs3); } +void vsseg3e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40000027, vm, 0, rs1, vs3); } +void vsseg4e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60000027, vm, 0, rs1, vs3); } +void vsseg5e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80000027, vm, 0, rs1, vs3); } +void vsseg6e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0000027, vm, 0, rs1, vs3); } +void vsseg7e8_v(VReg vs3, const Reg& rs1, 
VM vm=VM::unmasked) { opVectorStore(0xc0000027, vm, 0, rs1, vs3); } +void vsseg8e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0000027, vm, 0, rs1, vs3); } +void vse8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x27, vm, 0, rs1, vs3); } +void vsext_vf2(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4803a057, vm, vs2, 0, vd); } +void vsext_vf4(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4802a057, vm, vs2, 0, vd); } +void vsext_vf8(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4801a057, vm, vs2, 0, vd); } +void vslide1down_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x3c006057, vm, vs2, rs1, vd); } +void vslide1up_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x38006057, vm, vs2, rs1, vd); } +void vslidedown_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x3c003057, vm, vs2, simm5, vd); } +void vslidedown_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x3c004057, vm, vs2, rs1, vd); } +void vslideup_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x38003057, vm, vs2, simm5, vd); } +void vslideup_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x38004057, vm, vs2, rs1, vd); } +void vsll_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x94003057, vm, vs2, simm5, vd); } +void vsll_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x94000057, vm, vs2, vs1, vd); } +void vsll_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x94004057, vm, vs2, rs1, vd); } +void vsm_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2b00027, 0, 0, rs1, vs3); } +void vsmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x9c000057, vm, vs2, vs1, vd); } +void vsmul_vx(const VReg& vd, 
const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x9c004057, vm, vs2, rs1, vd); } +void vsoxei1024_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c007027, vm, vs2, rs1, vs3); } +void vsoxei128_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c000027, vm, vs2, rs1, vs3); } +void vsoxei16_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc005027, vm, vs2, rs1, vs3); } +void vsoxei256_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c005027, vm, vs2, rs1, vs3); } +void vsoxei32_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc006027, vm, vs2, rs1, vs3); } +void vsoxei512_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c006027, vm, vs2, rs1, vs3); } +void vsoxei64_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc007027, vm, vs2, rs1, vs3); } +void vsoxei8_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc000027, vm, vs2, rs1, vs3); } +void vsra_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa4003057, vm, vs2, simm5, vd); } +void vsra_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa4000057, vm, vs2, vs1, vd); } +void vsra_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa4004057, vm, vs2, rs1, vd); } +void vsrl_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa0003057, vm, vs2, simm5, vd); } +void vsrl_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa0000057, vm, vs2, vs1, vd); } +void vsrl_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa0004057, vm, vs2, rs1, vd); } +void vssseg1e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18007027, vm, 
rs2, rs1, vs3); } +void vssseg2e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38007027, vm, rs2, rs1, vs3); } +void vssseg3e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58007027, vm, rs2, rs1, vs3); } +void vssseg4e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78007027, vm, rs2, rs1, vs3); } +void vssseg5e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98007027, vm, rs2, rs1, vs3); } +void vssseg6e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8007027, vm, rs2, rs1, vs3); } +void vssseg7e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8007027, vm, rs2, rs1, vs3); } +void vssseg8e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8007027, vm, rs2, rs1, vs3); } +void vsse1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18007027, vm, rs2, rs1, vs3); } +void vssseg1e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18000027, vm, rs2, rs1, vs3); } +void vssseg2e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38000027, vm, rs2, rs1, vs3); } +void vssseg3e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58000027, vm, rs2, rs1, vs3); } +void vssseg4e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78000027, vm, rs2, rs1, vs3); } +void vssseg5e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98000027, vm, rs2, rs1, vs3); } +void vssseg6e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8000027, vm, rs2, rs1, vs3); } +void vssseg7e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8000027, vm, rs2, rs1, vs3); 
} +void vssseg8e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8000027, vm, rs2, rs1, vs3); } +void vsse128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18000027, vm, rs2, rs1, vs3); } +void vssseg1e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8005027, vm, rs2, rs1, vs3); } +void vssseg2e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28005027, vm, rs2, rs1, vs3); } +void vssseg3e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48005027, vm, rs2, rs1, vs3); } +void vssseg4e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68005027, vm, rs2, rs1, vs3); } +void vssseg5e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88005027, vm, rs2, rs1, vs3); } +void vssseg6e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8005027, vm, rs2, rs1, vs3); } +void vssseg7e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8005027, vm, rs2, rs1, vs3); } +void vssseg8e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8005027, vm, rs2, rs1, vs3); } +void vsse16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8005027, vm, rs2, rs1, vs3); } +void vssseg1e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18005027, vm, rs2, rs1, vs3); } +void vssseg2e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38005027, vm, rs2, rs1, vs3); } +void vssseg3e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58005027, vm, rs2, rs1, vs3); } +void vssseg4e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78005027, vm, rs2, rs1, vs3); } +void vssseg5e256_v(VReg vs3, const 
Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98005027, vm, rs2, rs1, vs3); } +void vssseg6e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8005027, vm, rs2, rs1, vs3); } +void vssseg7e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8005027, vm, rs2, rs1, vs3); } +void vssseg8e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8005027, vm, rs2, rs1, vs3); } +void vsse256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18005027, vm, rs2, rs1, vs3); } +void vssseg1e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8006027, vm, rs2, rs1, vs3); } +void vssseg2e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28006027, vm, rs2, rs1, vs3); } +void vssseg3e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48006027, vm, rs2, rs1, vs3); } +void vssseg4e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68006027, vm, rs2, rs1, vs3); } +void vssseg5e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88006027, vm, rs2, rs1, vs3); } +void vssseg6e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8006027, vm, rs2, rs1, vs3); } +void vssseg7e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8006027, vm, rs2, rs1, vs3); } +void vssseg8e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8006027, vm, rs2, rs1, vs3); } +void vsse32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8006027, vm, rs2, rs1, vs3); } +void vssseg1e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18006027, vm, rs2, rs1, vs3); } +void vssseg2e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM 
vm=VM::unmasked) { opVectorStore(0x38006027, vm, rs2, rs1, vs3); } +void vssseg3e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58006027, vm, rs2, rs1, vs3); } +void vssseg4e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78006027, vm, rs2, rs1, vs3); } +void vssseg5e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98006027, vm, rs2, rs1, vs3); } +void vssseg6e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8006027, vm, rs2, rs1, vs3); } +void vssseg7e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8006027, vm, rs2, rs1, vs3); } +void vssseg8e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8006027, vm, rs2, rs1, vs3); } +void vsse512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18006027, vm, rs2, rs1, vs3); } +void vssseg1e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8007027, vm, rs2, rs1, vs3); } +void vssseg2e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28007027, vm, rs2, rs1, vs3); } +void vssseg3e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48007027, vm, rs2, rs1, vs3); } +void vssseg4e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68007027, vm, rs2, rs1, vs3); } +void vssseg5e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88007027, vm, rs2, rs1, vs3); } +void vssseg6e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8007027, vm, rs2, rs1, vs3); } +void vssseg7e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8007027, vm, rs2, rs1, vs3); } +void vssseg8e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { 
opVectorStore(0xe8007027, vm, rs2, rs1, vs3); } +void vsse64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8007027, vm, rs2, rs1, vs3); } +void vssseg1e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8000027, vm, rs2, rs1, vs3); } +void vssseg2e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28000027, vm, rs2, rs1, vs3); } +void vssseg3e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48000027, vm, rs2, rs1, vs3); } +void vssseg4e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68000027, vm, rs2, rs1, vs3); } +void vssseg5e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88000027, vm, rs2, rs1, vs3); } +void vssseg6e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8000027, vm, rs2, rs1, vs3); } +void vssseg7e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8000027, vm, rs2, rs1, vs3); } +void vssseg8e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8000027, vm, rs2, rs1, vs3); } +void vsse8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8000027, vm, rs2, rs1, vs3); } +void vssra_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xac003057, vm, vs2, simm5, vd); } +void vssra_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xac000057, vm, vs2, vs1, vd); } +void vssra_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xac004057, vm, vs2, rs1, vd); } +void vssrl_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa8003057, vm, vs2, simm5, vd); } +void vssrl_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa8000057, vm, vs2, vs1, vd); } +void vssrl_vx(const VReg& vd, 
const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa8004057, vm, vs2, rs1, vd); } +void vssub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x8c000057, vm, vs2, vs1, vd); } +void vssub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x8c004057, vm, vs2, rs1, vd); } +void vssubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x88000057, vm, vs2, vs1, vd); } +void vssubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x88004057, vm, vs2, rs1, vd); } +void vsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x8000057, vm, vs2, vs1, vd); } +void vsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x8004057, vm, vs2, rs1, vd); } +void vsuxei1024_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14007027, vm, vs2, rs1, vs3); } +void vsuxei128_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14000027, vm, vs2, rs1, vs3); } +void vsuxei16_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4005027, vm, vs2, rs1, vs3); } +void vsuxei256_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14005027, vm, vs2, rs1, vs3); } +void vsuxei32_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4006027, vm, vs2, rs1, vs3); } +void vsuxei512_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14006027, vm, vs2, rs1, vs3); } +void vsuxei64_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4007027, vm, vs2, rs1, vs3); } +void vsuxei8_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4000027, vm, vs2, rs1, vs3); } +void vwadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc4002057, vm, vs2, 
vs1, vd); } +void vwadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc4006057, vm, vs2, rs1, vd); } +void vwadd_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd4002057, vm, vs2, vs1, vd); } +void vwadd_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd4006057, vm, vs2, rs1, vd); } +void vwaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc0002057, vm, vs2, vs1, vd); } +void vwaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc0006057, vm, vs2, rs1, vd); } +void vwaddu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd0002057, vm, vs2, vs1, vd); } +void vwaddu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd0006057, vm, vs2, rs1, vd); } +void vwmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xf4002057, vm, vs2, vs1, vd); } +void vwmacc_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf4006057, vm, vs2, rs1, vd); } +void vwmaccsu_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xfc002057, vm, vs2, vs1, vd); } +void vwmaccsu_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xfc006057, vm, vs2, rs1, vd); } +void vwmaccu_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xf0002057, vm, vs2, vs1, vd); } +void vwmaccu_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf0006057, vm, vs2, rs1, vd); } +void vwmaccus_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf8006057, vm, vs2, rs1, vd); } +void vwmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xec002057, vm, vs2, vs1, vd); } +void vwmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { 
opMVX(0xec006057, vm, vs2, rs1, vd); } +void vwmulsu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xe8002057, vm, vs2, vs1, vd); } +void vwmulsu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xe8006057, vm, vs2, rs1, vd); } +void vwmulu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xe0002057, vm, vs2, vs1, vd); } +void vwmulu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xe0006057, vm, vs2, rs1, vd); } +void vwredsum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xc4000057, vm, vs2, vs1, vd); } +void vwredsumu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xc0000057, vm, vs2, vs1, vd); } +void vwsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xcc002057, vm, vs2, vs1, vd); } +void vwsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xcc006057, vm, vs2, rs1, vd); } +void vwsub_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xdc002057, vm, vs2, vs1, vd); } +void vwsub_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xdc006057, vm, vs2, rs1, vd); } +void vwsubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc8002057, vm, vs2, vs1, vd); } +void vwsubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc8006057, vm, vs2, rs1, vd); } +void vwsubu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd8002057, vm, vs2, vs1, vd); } +void vwsubu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd8006057, vm, vs2, rs1, vd); } +void vxor_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x2c003057, vm, vs2, simm5, vd); } +void vxor_vv(const VReg& vd, const VReg& vs2, const VReg& 
vs1, VM vm=VM::unmasked) { opIVV(0x2c000057, vm, vs2, vs1, vd); } +void vxor_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x2c004057, vm, vs2, rs1, vd); } +void vzext_vf2(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48032057, vm, vs2, 0, vd); } +void vzext_vf4(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48022057, vm, vs2, 0, vd); } +void vzext_vf8(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48012057, vm, vs2, 0, vd); } + +void vsetivli(const Reg& rd, uint32_t uimm, SEW sew, LMUL lmul=LMUL::m1, VTA vta=VTA::tu, VMA vma=VMA::mu) { + uint32_t zimm = (static_cast(vma)<<7) | + (static_cast(vta)<<6) | + (static_cast(sew)<<3) | + (static_cast(lmul)); + uint32_t v = (0x3<<30) | (zimm<<20) | (uimm<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57); + append4B(v); +} + +void vsetvli(const Reg& rd, const Reg& rs1, SEW sew, LMUL lmul=LMUL::m1, VTA vta=VTA::tu, VMA vma=VMA::mu) { + uint32_t zimm = (static_cast(vma)<<7) | + (static_cast(vta)<<6) | + (static_cast(sew)<<3) | + (static_cast(lmul)); + uint32_t v = (0x0<<31) | (zimm<<20) | (rs1.getIdx()<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57); + append4B(v); +} + +void vsetvl(const Reg& rd, const Reg& rs1, const Reg& rs2) { + uint32_t v = (0x40<<25) | (rs2.getIdx()<<20) | (rs1.getIdx()<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57); + append4B(v); +} + + +// Copy mask register +void vmmv_m(const VReg& vd, const VReg& vs) { vmand_mm(vd, vs, vs); } +// Clear mask register +void vmclr_m(const VReg& vd) { vmxor_mm(vd, vd, vd); } +// Set mask register +void vmset_m(const VReg& vd) { vmxnor_mm(vd, vd, vd); } +// Invert bits +void vmnot_m(const VReg& vd, const VReg& vs) { vmnand_mm(vd, vs, vs); } + + +// vector compare pseudoinstructions +void vmfgt_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { vmflt_vv(vd, vs2, vs1, vm); } +void vmfge_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { vmfle_vv(vd, 
vs2, vs1, vm); } + +// sign-related pseudoinstructions +void vfabs_v(const VReg& vd, const VReg& vs, VM vm=VM::unmasked) { vfsgnjx_vv(vd, vs, vs, vm); } +void vfneg_v(const VReg& vd, const VReg& vs, VM vm=VM::unmasked) { vfsgnjn_vv(vd, vs, vs, vm); } From 231fcc0f76097c11249e05eb7d3ee1c7c47e96a3 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Mon, 23 Mar 2026 19:29:28 +0000 Subject: [PATCH 03/13] [CPU][RV64] Edit:CMakeLists.txt Reason: fix jit_utils --- src/CMakeLists.txt | 2 +- src/cpu/CMakeLists.txt | 2 +- src/cpu/cpu_convolution_list.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e69a804d39a..a70b63dad37 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -77,7 +77,7 @@ if(DNNL_EXPERIMENTAL) endif() if(DNNL_EXPERIMENTAL_UKERNEL) - if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "RISCV64") + if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "RV64") message(STATUS "Experimental functionality for ukernels is enabled") else() message(FATAL_ERROR "ukernel API isn't supported for ${DNNL_TARGET_ARCH}.") diff --git a/src/cpu/CMakeLists.txt b/src/cpu/CMakeLists.txt index ab791ee7b2c..19923b7b12d 100644 --- a/src/cpu/CMakeLists.txt +++ b/src/cpu/CMakeLists.txt @@ -29,7 +29,7 @@ foreach(SOURCE_FILE ${SOURCES_EXTRA}) list(APPEND SOURCES "${SOURCE_FILE}") endforeach() -if((DNNL_TARGET_ARCH STREQUAL "X64") OR (DNNL_TARGET_ARCH STREQUAL "AARCH64")) +if((DNNL_TARGET_ARCH STREQUAL "X64") OR (DNNL_TARGET_ARCH STREQUAL "AARCH64") OR (DNNL_TARGET_ARCH STREQUAL "RV64")) file(GLOB_RECURSE SOURCES_JIT_UTILS ${CMAKE_CURRENT_SOURCE_DIR}/jit_utils/*.[ch] ${CMAKE_CURRENT_SOURCE_DIR}/jit_utils/*.[ch]pp diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index 350ac8e14e4..47b0fb1494b 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -180,7 
+180,7 @@ const std::map> &impl_list_map() CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_128) // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t) - //CPU_INSTANCE_RV64GCV(jit_rvv_1x1_convolution_fwd_t) + CPU_INSTANCE_RV64GCV(jit_rvv_1x1_convolution_fwd_t) CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t) From a51ec7d084b1f967288e01bf7d7e22a051a9deb3 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Wed, 22 Apr 2026 15:39:51 +0000 Subject: [PATCH 04/13] [CPU][RV64] Edit:CMake src/ --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a70b63dad37..08d882bfee0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -77,7 +77,7 @@ if(DNNL_EXPERIMENTAL) endif() if(DNNL_EXPERIMENTAL_UKERNEL) - if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "RV64") + if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64") message(STATUS "Experimental functionality for ukernels is enabled") else() message(FATAL_ERROR "ukernel API isn't supported for ${DNNL_TARGET_ARCH}.") From 3f044236329a3f848f25e707ad2c09319cd964b9 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Wed, 22 Apr 2026 15:47:06 +0000 Subject: [PATCH 05/13] [CPU][RV64] Edit: Return ref impl src/ --- src/cpu/cpu_convolution_list.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index 47b0fb1494b..b91ac65b790 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -184,9 +184,9 @@ const std::map> &impl_list_map() CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t) - // CPU_INSTANCE(gemm_convolution_fwd_t) - // CPU_INSTANCE(ref_convolution_fwd_t) - // CPU_INSTANCE(ref_fused_convolution_fwd_t) + CPU_INSTANCE(gemm_convolution_fwd_t) + CPU_INSTANCE(ref_convolution_fwd_t) + CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, }}, {{forward, f32, f16, 
f32}, { From 981f6827985614efd7bb6eb35a455c6e21627de2 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Wed, 22 Apr 2026 16:13:46 +0000 Subject: [PATCH 06/13] [CPU][RV64] optimize jit_rvv & im2col --- src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp | 34 +++++++++++++++++------- src/cpu/rv64/rvv_gemm_convolution.cpp | 13 ++++++--- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp index c63a375d13b..243b93ff36e 100644 --- a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp +++ b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp @@ -225,12 +225,17 @@ void jit_rvv_1x1_conv_kernel_t::balance(jit_1x1_conv_conf_t &jcp) { } void jit_rvv_1x1_conv_kernel_t::generate() { + static_assert(sizeof(size_t) == 8, "oneDNN RV64 requires 64-bit pointer arithmetic"); + preamble(); // Set initial VL to oc_block (4) - li(reg_tmp_imm, jcp.oc_block); - vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, - Xbyak_riscv::LMUL::m1); + if (jcp.oc_block <= 31) { + vsetivli(reg_tmp_imm, jcp.oc_block, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); + } else { + li(reg_tmp_imm, jcp.oc_block); + vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); + } // Load parameters ld(reg_bcast_data, reg_param, GET_OFF(bcast_data)); @@ -423,9 +428,14 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) { auto store = [=]() { mv(reg_tmp_addr, aux_reg_output_data); + + bool has_relu = false; + for (int i_ur = 0; i_ur < ur; ++i_ur) { for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + vse32_v(vreg_accum(i_load, i_ur), reg_tmp_addr); + if (i_load + 1 < load_loop_blk) addi(reg_tmp_addr, reg_tmp_addr, jcp.load_block * jcp.typesize_out); @@ -449,8 +459,8 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) { } if (i_ur + 1 < ur) { - size_t offset - = (size_t)(i_ur + 1) * jcp.bcast_loop_bcast_step; + ptrdiff_t offset + = (ptrdiff_t)(i_ur + 1) * 
jcp.bcast_loop_bcast_step; if (offset <= 2047) { flw(freg_bcast, aux_reg_bcast_data, offset); } else { @@ -470,12 +480,16 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) { // Prefetch weights for next iteration if (!last_block) { - for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; - ++i_unroll) { + for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) { for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - size_t weight_off - = (size_t)i_unroll * jcp.reduce_loop_load_step - + (size_t)i_load * jcp.load_loop_load_step; + ptrdiff_t weight_off = (ptrdiff_t)i_unroll * jcp.reduce_loop_load_step + + (ptrdiff_t)i_load * jcp.load_loop_load_step; + + li(reg_tmp_addr, weight_off + 256); + + add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr); + flw(x0, reg_tmp_addr, 0); + li(reg_tmp_addr, weight_off); add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr); vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr); diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp index fc20fb2fecf..ed75ccfc0ea 100644 --- a/src/cpu/rv64/rvv_gemm_convolution.cpp +++ b/src/cpu/rv64/rvv_gemm_convolution.cpp @@ -106,12 +106,17 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( // jit_gemm_convolution_utils::im2col_dt_3d() requires external // data initialization by zeroes + const size_t total_sz = jcp.im2col_sz; + const size_t vlmax = __riscv_vsetvlmax_e32m1(); + const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax); ptrdiff_t i = 0; - while (i < jcp.im2col_sz) { - size_t vl = __riscv_vsetvl_e32m1(jcp.im2col_sz - i); - vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vl); + + for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; i += (ptrdiff_t)vlmax) { + __riscv_vse32_v_f32m1(col + i, v_zero, vlmax); + } + if (i < (ptrdiff_t)total_sz) { + size_t vl = __riscv_vsetvl_e32m1(total_sz - i); __riscv_vse32_v_f32m1(col + i, v_zero, vl); - i += vl; } } From 
9b7837fc5bca28f171be3059fc87c717285fe1a7 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Wed, 22 Apr 2026 19:46:13 +0000 Subject: [PATCH 07/13] [CPU][RV64] bugfix --- src/cpu/cpu_convolution_list.cpp | 6 ++-- src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp | 46 ++++++++++++++++-------- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index b91ac65b790..47b0fb1494b 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -184,9 +184,9 @@ const std::map> &impl_list_map() CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t) - CPU_INSTANCE(gemm_convolution_fwd_t) - CPU_INSTANCE(ref_convolution_fwd_t) - CPU_INSTANCE(ref_fused_convolution_fwd_t) + // CPU_INSTANCE(gemm_convolution_fwd_t) + // CPU_INSTANCE(ref_convolution_fwd_t) + // CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, }}, {{forward, f32, f16, f32}, { diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp index 243b93ff36e..e88b18214b3 100644 --- a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp +++ b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp @@ -211,6 +211,10 @@ status_t jit_rvv_1x1_conv_kernel_t::init_conf(jit_1x1_conv_conf_t &jcp, = jcp.ic_without_padding * jcp.oc_block * jcp.typesize_in; jcp.load_loop_iter_step = jcp.oc_block; + if (jcp.reduce_loop_load_step > (1LL << 40) / jcp.reduce_loop_unroll) { + return status::unimplemented; + } + return status::success; } @@ -225,7 +229,6 @@ void jit_rvv_1x1_conv_kernel_t::balance(jit_1x1_conv_conf_t &jcp) { } void jit_rvv_1x1_conv_kernel_t::generate() { - static_assert(sizeof(size_t) == 8, "oneDNN RV64 requires 64-bit pointer arithmetic"); preamble(); @@ -286,13 +289,17 @@ void jit_rvv_1x1_conv_kernel_t::generate() { L(load_loop_tail); { - Label tail_loop; + Label tail_loop, tail_end; L(tail_loop); - blez(reg_load_loop_work, load_loop_end); + blez(reg_load_loop_work, tail_end); // Last block may be partial, use vsetvli to set 
VL dynamically - vsetvli(reg_tmp_imm, reg_load_loop_work, Xbyak_riscv::SEW::e32, - Xbyak_riscv::LMUL::m1); + if (jcp.oc_block <= 31) { + vsetivli(reg_tmp_imm, jcp.oc_block, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); + } else { + li(reg_tmp_imm, jcp.oc_block); + vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); + } bcast_loop(1); @@ -305,11 +312,12 @@ void jit_rvv_1x1_conv_kernel_t::generate() { } li(reg_tmp_imm, jcp.oc_block * jcp.typesize_out); add(reg_output_data, reg_output_data, reg_tmp_imm); - + li(reg_tmp_imm, jcp.oc_block); sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm); - + jal(x0, tail_loop); + L(tail_end); } L(load_loop_end); @@ -369,8 +377,17 @@ void jit_rvv_1x1_conv_kernel_t::bcast_loop(int load_loop_blk) { Label bcast_loop_tail_end; blez(reg_bcast_loop_iter, bcast_loop_tail_end); - reduce_loop(load_loop_blk, jcp.ur_tail); + auto restore_vl = [=]() { + if (jcp.oc_block <= 31) { + vsetivli(reg_tmp_imm, jcp.oc_block, SEW::e32, LMUL::m1); + } else { + li(reg_tmp_imm, jcp.oc_block); + vsetvli(reg_tmp_imm, reg_tmp_imm, SEW::e32, LMUL::m1); + } + }; + reduce_loop(load_loop_blk, jcp.ur_tail); + restore_vl(); L(bcast_loop_tail_end); } } @@ -429,8 +446,6 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) { auto store = [=]() { mv(reg_tmp_addr, aux_reg_output_data); - bool has_relu = false; - for (int i_ur = 0; i_ur < ur; ++i_ur) { for (int i_load = 0; i_load < load_loop_blk; ++i_load) { @@ -488,7 +503,10 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) { li(reg_tmp_addr, weight_off + 256); add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr); - flw(x0, reg_tmp_addr, 0); + #if defined(__riscv_zicbom) + // cbo.prefetch.i: 0b0000000_00010_00000_010_00000_0001111 + asm volatile(".word 0x0020000f" : : "r"(reg_tmp_addr)); + #endif li(reg_tmp_addr, weight_off); add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr); @@ -503,8 +521,8 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int 
load_loop_blk, int ur) { // Load first round of weights (IC=0..unroll-1) for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) { for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - size_t weight_off = (size_t)i_unroll * jcp.reduce_loop_load_step - + (size_t)i_load * jcp.load_loop_load_step; + ptrdiff_t weight_off = (ptrdiff_t)i_unroll * jcp.reduce_loop_load_step + + (ptrdiff_t)i_load * jcp.load_loop_load_step; if (weight_off == 0) { vle32_v(vreg_load(i_load, i_unroll), aux_reg_load_data); } else { @@ -547,7 +565,7 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) { L(tail_loop); { for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - size_t weight_off = (size_t)i_load * jcp.load_loop_load_step; + ptrdiff_t weight_off = (ptrdiff_t)i_load * jcp.load_loop_load_step; if (weight_off == 0) { vle32_v(vreg_load(i_load, 0), aux_reg_load_data); } else { From e55a495ceb45cdfb207275e42a1fe621ec2ab720 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Fri, 24 Apr 2026 17:19:49 +0000 Subject: [PATCH 08/13] Update branch --- src/cpu/rv64/rvv_postops.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cpu/rv64/rvv_postops.hpp b/src/cpu/rv64/rvv_postops.hpp index 28c54f2e77e..bb4171b8412 100644 --- a/src/cpu/rv64/rvv_postops.hpp +++ b/src/cpu/rv64/rvv_postops.hpp @@ -18,6 +18,7 @@ #include + namespace dnnl { namespace impl { namespace cpu { From 874b0f38d85d4c67236d18ffdc4864ab9e05f35d Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Fri, 24 Apr 2026 17:20:43 +0000 Subject: [PATCH 09/13] Reset --- src/cpu/rv64/rvv_postops.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cpu/rv64/rvv_postops.hpp b/src/cpu/rv64/rvv_postops.hpp index bb4171b8412..28c54f2e77e 100644 --- a/src/cpu/rv64/rvv_postops.hpp +++ b/src/cpu/rv64/rvv_postops.hpp @@ -18,7 +18,6 @@ #include - namespace dnnl { namespace impl { namespace cpu { From a9d51bfa917c98439857aeea8aa95f14589caac0 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Sun, 26 Apr 2026 
14:50:04 +0000 Subject: [PATCH 10/13] [CPU][RV64]Return ref & add debug comments --- src/cpu/cpu_convolution_list.cpp | 6 +++--- src/cpu/rv64/rvv_gemm_convolution.hpp | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index 2e6bb28fe58..2913a81313b 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -184,9 +184,9 @@ const std::map> &impl_list_map() CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t) - // CPU_INSTANCE(gemm_convolution_fwd_t) - // CPU_INSTANCE(ref_convolution_fwd_t) - // CPU_INSTANCE(ref_fused_convolution_fwd_t) + CPU_INSTANCE(gemm_convolution_fwd_t) + CPU_INSTANCE(ref_convolution_fwd_t) + CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, }}, {{forward, f32, f16, f32}, { diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp index 7bcda8e9462..e0f2afe3c07 100644 --- a/src/cpu/rv64/rvv_gemm_convolution.hpp +++ b/src/cpu/rv64/rvv_gemm_convolution.hpp @@ -68,6 +68,8 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { // TODO: make `init_conf` assign initialized object to `jcp_` jcp_ = conv_gemm_conf_t(); + + std::cout << "GEMM INIT CONSTRUCTION" << std::endl; return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_, dnnl_get_max_threads()); @@ -113,18 +115,24 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { : primitive_t(apd), post_ops_(nullptr) {} status_t init(engine_t *engine) override { + std::cout << "GEMM INIT" << std::endl; const auto &jcp = pd()->jcp_; if (jcp.with_eltwise || jcp.with_binary) { CHECK(safe_ptr_assign(post_ops_, new ref_post_ops_t(jcp.post_ops))); CHECK(post_ops_->init(pd()->dst_md())); } + + std::cout << "GEMM SUCCESS" << std::endl; return status::success; } using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { + fprintf(stderr, "[RVV 
EXECUTE] Layer executed!\n"); + fflush(stderr); + bool is_nspc = pd()->jcp_.is_nspc; return is_nspc ? execute_forward_nspc(ctx) : execute_forward_ncsp(ctx); } @@ -146,4 +154,4 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { } // namespace impl } // namespace dnnl -#endif +#endif \ No newline at end of file From 7c82da94cd9c864ff4285ba09282ce3e80e5e4b0 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Sun, 26 Apr 2026 16:01:13 +0000 Subject: [PATCH 11/13] [CPU][RV64] Optimize im2col, RVV(Bias + ReLU) --- src/cpu/cpu_convolution_list.cpp | 8 +- src/cpu/rv64/cpu_isa_traits.cpp | 44 - src/cpu/rv64/cpu_isa_traits.hpp | 107 -- src/cpu/rv64/jit_generator.hpp | 137 -- src/cpu/rv64/jit_primitive_conf.hpp | 97 -- src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp | 613 -------- src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp | 109 -- src/cpu/rv64/jit_rvv_1x1_convolution.cpp | 144 -- src/cpu/rv64/jit_rvv_1x1_convolution.hpp | 170 -- src/cpu/rv64/rvv_gemm_convolution.cpp | 266 ++-- src/cpu/rv64/rvv_gemm_convolution.hpp | 1 - third_party/xbyak_riscv/xbyak_riscv.hpp | 1383 ----------------- third_party/xbyak_riscv/xbyak_riscv_csr.hpp | 112 -- .../xbyak_riscv/xbyak_riscv_mnemonic.hpp | 231 --- third_party/xbyak_riscv/xbyak_riscv_util.hpp | 271 ---- third_party/xbyak_riscv/xbyak_riscv_v.hpp | 776 --------- 16 files changed, 175 insertions(+), 4294 deletions(-) delete mode 100644 src/cpu/rv64/cpu_isa_traits.cpp delete mode 100644 src/cpu/rv64/cpu_isa_traits.hpp delete mode 100644 src/cpu/rv64/jit_generator.hpp delete mode 100644 src/cpu/rv64/jit_primitive_conf.hpp delete mode 100644 src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp delete mode 100644 src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp delete mode 100644 src/cpu/rv64/jit_rvv_1x1_convolution.cpp delete mode 100644 src/cpu/rv64/jit_rvv_1x1_convolution.hpp delete mode 100644 third_party/xbyak_riscv/xbyak_riscv.hpp delete mode 100644 third_party/xbyak_riscv/xbyak_riscv_csr.hpp delete mode 100644 third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp 
delete mode 100644 third_party/xbyak_riscv/xbyak_riscv_util.hpp delete mode 100644 third_party/xbyak_riscv/xbyak_riscv_v.hpp diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index 2913a81313b..c8f41b8e947 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -78,7 +78,6 @@ using namespace dnnl::impl::cpu::aarch64; using namespace dnnl::impl::cpu::acl; #elif DNNL_RV64 #include "cpu/rv64/rvv_gemm_convolution.hpp" -#include "cpu/rv64/jit_rvv_1x1_convolution.hpp" using namespace dnnl::impl::cpu::rv64; #endif @@ -180,13 +179,12 @@ const std::map> &impl_list_map() CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_128) // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t) - CPU_INSTANCE_RV64GCV(jit_rvv_1x1_convolution_fwd_t) CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t) - CPU_INSTANCE(gemm_convolution_fwd_t) - CPU_INSTANCE(ref_convolution_fwd_t) - CPU_INSTANCE(ref_fused_convolution_fwd_t) + // CPU_INSTANCE(gemm_convolution_fwd_t) + // CPU_INSTANCE(ref_convolution_fwd_t) + // CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, }}, {{forward, f32, f16, f32}, { diff --git a/src/cpu/rv64/cpu_isa_traits.cpp b/src/cpu/rv64/cpu_isa_traits.cpp deleted file mode 100644 index b8c3fc658e0..00000000000 --- a/src/cpu/rv64/cpu_isa_traits.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* Copyright 2025 Institute of Software, Chinese Academy of Sciences -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "cpu/rv64/cpu_isa_traits.hpp" -#include "cpu/platform.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace rv64 { - -struct isa_info_t { - isa_info_t(cpu_isa_t aisa) : isa(aisa) {}; - cpu_isa_t isa; -}; - -static isa_info_t get_isa_info_t(void) { - if (mayiuse(zvfh)) return isa_info_t(zvfh); - if (mayiuse(v)) return isa_info_t(v); - return isa_info_t(isa_undef); -} - -cpu_isa_t get_max_cpu_isa() { - return get_isa_info_t().isa; -} - -} // namespace rv64 -} // namespace cpu -} // namespace impl -} // namespace dnnl diff --git a/src/cpu/rv64/cpu_isa_traits.hpp b/src/cpu/rv64/cpu_isa_traits.hpp deleted file mode 100644 index be5a4fc1d49..00000000000 --- a/src/cpu/rv64/cpu_isa_traits.hpp +++ /dev/null @@ -1,107 +0,0 @@ -/******************************************************************************* -* Copyright 2018 Intel Corporation -* Copyright 2025 Institute of Software, Chinese Academy of Sciences -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef CPU_RV64_CPU_ISA_TRAITS_HPP -#define CPU_RV64_CPU_ISA_TRAITS_HPP - -#include - -#include "common/type_helpers.hpp" -#include "common/utils.hpp" -#include "dnnl_types.h" - -#ifndef XBYAK_RISCV_V -#define XBYAK_RISCV_V 1 -#endif - -#include "xbyak_riscv/xbyak_riscv_util.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace rv64 { - -enum cpu_isa_bit_t : unsigned { - v_bit = 1u << 0, - zvfh_bit = 1u << 1, -}; - -enum cpu_isa_t : unsigned { - isa_undef = 0u, - v = v_bit, - zvfh = zvfh_bit | v, - isa_all = ~0u, -}; - -struct Riscv64Cpu { -public: - static Riscv64Cpu &getInstance() { - static Riscv64Cpu instance; - return instance; - } - - bool get_has_v() const { return has_v; } - bool get_has_zvfh() const { return has_zvfh; } - -private: - bool has_v = false; - bool has_zvfh = false; - - Riscv64Cpu() { - const auto &xbyak_cpu = Xbyak_riscv::CPU::getInstance(); - - has_v = xbyak_cpu.hasExtension(Xbyak_riscv::RISCVExtension::V); - - if (has_v) { - has_zvfh - = xbyak_cpu.hasExtension(Xbyak_riscv::RISCVExtension::Zvfh); - } else { - has_zvfh = false; - } - } -}; - -inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) { - MAYBE_UNUSED(soft); - const Riscv64Cpu &cpu = Riscv64Cpu::getInstance(); - - switch (cpu_isa) { - case v: return cpu.get_has_v(); - case zvfh: return cpu.get_has_v() && cpu.get_has_zvfh(); - case isa_undef: return true; - case isa_all: return false; - } - return false; -} - -cpu_isa_t get_max_cpu_isa(); - -#include "common/z_magic.hpp" -/* clang-format off */ -#define JIT_IMPL_NAME_HELPER(prefix, isa, suffix_if_any) \ - ((isa) == isa_undef ? prefix STRINGIFY(any) : \ - ((isa) == v ? prefix STRINGIFY(rvv) : \ - ((isa) == zvfh ? 
prefix STRINGIFY(rvv_zvfh) : \ - prefix suffix_if_any))) -/* clang-format on */ - -} // namespace rv64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif diff --git a/src/cpu/rv64/jit_generator.hpp b/src/cpu/rv64/jit_generator.hpp deleted file mode 100644 index c795aba8c61..00000000000 --- a/src/cpu/rv64/jit_generator.hpp +++ /dev/null @@ -1,137 +0,0 @@ -/******************************************************************************* -* Copyright 2025 ZTE Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef CPU_RV64_JIT_GENERATOR_HPP -#define CPU_RV64_JIT_GENERATOR_HPP - -#include -#include - -#include "common/c_types_map.hpp" -#include "common/type_helpers.hpp" -#include "common/utils.hpp" -#include "cpu/jit_utils/jit_utils.hpp" - -#include "cpu/rv64/cpu_isa_traits.hpp" -#include "xbyak_riscv/xbyak_riscv.hpp" - -#define DECLARE_CPU_JIT_AUX_FUNCTIONS(gen_name) \ - const char *name() const override { \ - return STRINGIFY(gen_name); \ - } \ - const char *source_file() const override { \ - return __FILE__; \ - } - -#define JIT_ASSERT(condition) \ - do { \ - assert(condition); \ - if (!(condition)) XBYAK_RISCV_THROW(Xbyak_riscv::ERR_INTERNAL); \ - } while (false) - -#define JIT_ASSERT_RET(condition, ret) \ - do { \ - assert(condition); \ - if (!(condition)) \ - XBYAK_RISCV_THROW_RET(Xbyak_riscv::ERR_INTERNAL, ret); \ - } while (false) - -namespace dnnl { -namespace impl { -namespace cpu { -namespace rv64 { - -// Simple helper to check subset relation between two ISA masks. -inline bool is_subset(cpu_isa_t isa, cpu_isa_t max_isa) { - using u_t = typename std::underlying_type::type; - return (static_cast(isa) & static_cast(max_isa)) - == static_cast(isa); -} - -// Minimal RV64 JIT generator base class. -class jit_generator_t : public Xbyak_riscv::CodeGenerator, public c_compatible { -public: - using c_compatible::operator new; - using c_compatible::operator new[]; - using c_compatible::operator delete; - using c_compatible::operator delete[]; - - // All JIT kernels must override these to provide a stable name used for - // debug/logging and jit code registration. 
- virtual const char *name() const = 0; - virtual const char *source_file() const = 0; - - explicit jit_generator_t(const char * /*unused_name*/, - cpu_isa_t max_cpu_isa = get_max_cpu_isa()) - : Xbyak_riscv::CodeGenerator(max_code_size) - , max_cpu_isa_(max_cpu_isa) {} - - ~jit_generator_t() override = default; - - const uint8_t *jit_ker() const { return jit_ker_; } - - template - void operator()(kernel_args_t... args) const { - using jit_kernel_func_t = void (*)(const kernel_args_t...); - // This const_cast is required for Clang. - // Clang rejects reinterpret_cast from const uint8_t* to function pointer. - auto *fptr = reinterpret_cast( - const_cast(jit_ker_)); - (*fptr)(std::forward(args)...); - } - - virtual status_t create_kernel() { - try { - generate(); - } catch (...) { return status::runtime_error; } - - this->ready(Xbyak_riscv::CodeArray::PROTECT_RWE); - - jit_ker_ = Xbyak_riscv::CodeGenerator::getCode(); - - if (jit_ker_) { - jit_utils::register_jit_code(jit_ker_, - Xbyak_riscv::CodeArray::getSize(), name(), source_file()); - return status::success; - } - - return status::runtime_error; - } - - inline cpu_isa_t max_cpu_isa() const noexcept { return max_cpu_isa_; } - - // Helper to check that a requested ISA is both within the per‑kernel limit - // and supported by the current CPU. 
- inline bool is_valid_isa(cpu_isa_t isa) const { - return is_subset(isa, max_cpu_isa_) && mayiuse(isa); - } - -protected: - virtual void generate() = 0; - -private: - static constexpr unsigned max_code_size = 256 * 1024; - - const cpu_isa_t max_cpu_isa_; - const uint8_t *jit_ker_ = nullptr; -}; - -} // namespace rv64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif diff --git a/src/cpu/rv64/jit_primitive_conf.hpp b/src/cpu/rv64/jit_primitive_conf.hpp deleted file mode 100644 index dde5afb8d32..00000000000 --- a/src/cpu/rv64/jit_primitive_conf.hpp +++ /dev/null @@ -1,97 +0,0 @@ -/******************************************************************************* -* Copyright 2025 ZTE Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef CPU_RV64_JIT_PRIMITIVE_CONF_HPP -#define CPU_RV64_JIT_PRIMITIVE_CONF_HPP - -#include "common/c_types_map.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace rv64 { - -struct jit_1x1_conv_conf_t { - prop_kind_t prop_kind; - int mb; - int ngroups, ic, oc, oc_without_padding, ic_without_padding; - int iw, ih, id; - int ow, oh, od; - int os, is; - int kw, kh, kd; - int stride_w, stride_h, stride_d; - int t_pad, l_pad, f_pad; - - int ic_block, oc_block; - int load_block, reduce_block; - int bcast_block; - - dim_t load_dim, bcast_dim, reduce_dim; - - int ur, ur_tail; - int load_loop_blk; - int reduce_loop_unroll; - int nthr; - int nb_bcast, nb_load, nb_reduce, load_grp_count; - int nb_load_blocking, nb_load_blocking_max; - int nb_bcast_blocking, nb_bcast_blocking_max; - int nb_reduce_blocking; - - dim_t reduce_loop_bcast_step; - int reduce_loop_load_step; - int bcast_loop_bcast_step; - int bcast_loop_output_step; - int load_loop_load_step; - int load_loop_iter_step; - - bool with_bias; - bool with_sum; - bool with_eltwise; - bool with_binary; - bool with_dw_conv; - - int typesize_in; - int typesize_out; - int typesize_bia; - int typesize_acc; - - format_tag_t src_tag, wei_tag, dst_tag; -}; - -struct jit_1x1_conv_args_t { - const void *bcast_data; - const void *load_data; - const void *output_data; - const void *bias_data; - - size_t load_dim; - size_t bcast_dim; - size_t reduce_dim; - - size_t first_last_flag; -}; - -enum { - FLAG_REDUCE_FIRST = 1 << 0, - FLAG_REDUCE_LAST = 1 << 1, -}; - -} // namespace rv64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp deleted file mode 100644 index e88b18214b3..00000000000 --- a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp +++ /dev/null @@ -1,613 +0,0 @@ 
-/******************************************************************************* -* Copyright 2025 ZTE Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include -#include "common/c_types_map.hpp" -#include "common/dnnl_thread.hpp" -#include "common/memory.hpp" -#include "common/utils.hpp" - -#include "cpu/rv64/jit_rvv_1x1_conv_kernel.hpp" - -#define GET_OFF(field) \ - static_cast(offsetof(jit_1x1_conv_args_t, field)) - -namespace dnnl { -namespace impl { -namespace cpu { -namespace rv64 { - -using namespace dnnl::impl::format_tag; -using namespace dnnl::impl::prop_kind; -using namespace dnnl::impl::utils; -using namespace Xbyak_riscv; - -jit_rvv_1x1_conv_kernel_t::jit_rvv_1x1_conv_kernel_t( - const jit_1x1_conv_conf_t &ajcp, const primitive_attr_t &attr, - const memory_desc_t &dst_md) - : jit_generator_t("jit_rvv_1x1_conv_kernel"), jcp(ajcp), attr_(attr) { - create_kernel(); -} - -status_t jit_rvv_1x1_conv_kernel_t::init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, int nthreads, bool reduce_src) { - - const int ndims = src_d.ndims(); - - jcp.prop_kind = cd.prop_kind; - jcp.nthr = nthreads; - - jcp.with_bias = cd.bias_desc.format_kind != format_kind::undef; - - // Initialize dimensions - jcp.mb = src_d.dims()[0]; - 
jcp.ngroups - = weights_d.ndims() == src_d.ndims() + 1 ? weights_d.dims()[0] : 1; - jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups; - jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups; - jcp.oc = jcp.oc_without_padding; - jcp.ic = jcp.ic_without_padding; - - // Targeting SEW=32 (float), LMUL=1, VLEN=128 -> simd_w = 4 - const int simd_w = 4; - - // OC is padded to match oc_block in weights format (Oihw4o) - // IC is not padded; kernel handles IC tail processing - jcp.oc = rnd_up(jcp.oc, simd_w); - - // 3D convolution support - jcp.id = (ndims == 5) ? src_d.dims()[2] : 1; - jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1; - - jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims - 2]; - jcp.iw = src_d.dims()[ndims - 1]; - jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[ndims - 2]; - jcp.ow = dst_d.dims()[ndims - 1]; - - // Spatial dimensions: D*H*W - jcp.os = jcp.od * jcp.oh * jcp.ow; - jcp.is = jcp.id * jcp.ih * jcp.iw; - - jcp.oc_block = simd_w; - jcp.ic_block = simd_w; - - // Dynamic parameter calculation - // Register constraint: (ur * load_loop_blk) + (unroll * load_loop_blk) + 1 <= 32 - jcp.reduce_loop_unroll = 4; - - const int SMALL_SPATIAL = 10; - const int BIG_SPATIAL = 65; - const int BIG_LOAD_DIM = (jcp.ic >= 512) ? 
256 : 512; - - // Initial load_loop_blk selection - if (jcp.oc % (2 * jcp.oc_block) == 0 && jcp.os >= 11) { - jcp.load_loop_blk = 2; - } else { - jcp.load_loop_blk = 1; - } - - // Dynamic ur selection algorithm - int max_regs, min_regs, size_threshold; - - const int spatial = jcp.od * jcp.oh; - - // Select register range based on batch size and thread count - if ((8 * jcp.mb) / jcp.nthr >= 1 || jcp.mb == 1) { - max_regs = 9; - min_regs = 6; - size_threshold = 14; - - // Special shape optimization - if (jcp.oc > 128 && jcp.oc < BIG_LOAD_DIM && spatial > SMALL_SPATIAL - && spatial < BIG_SPATIAL && jcp.ic < 256) { - max_regs = 6; - min_regs = 5; - } - } else { - max_regs = 30; - min_regs = 9; - size_threshold = 14; - } - - // Initial ur - jcp.ur = 1; - - // First pass: find largest ur that divides spatial evenly - for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) { - if ((spatial >= size_threshold && spatial % ur_w == 0) - || (spatial < size_threshold && jcp.os % ur_w == 0)) { - jcp.ur = ur_w; - break; - } - } - - // If first pass fails, use heuristic - if (jcp.ur == 1) { - jcp.ur = nstl::min(max_regs, jcp.os); - int os_tail = jcp.os % max_regs; - for (int i = max_regs; i >= min_regs; i--) { - int i_tail = jcp.os % i; - if (i_tail > os_tail || i_tail == 0) { - jcp.ur = i; - os_tail = i_tail; - if (i_tail == 0) break; - } - } - } - - // Adjust ur based on load_loop_blk (ensure register limit) - // Register constraint: ur * load_loop_blk + unroll * load_loop_blk + 1 <= 32 - int max_ur_for_blk = (32 - 1 - jcp.reduce_loop_unroll * jcp.load_loop_blk) - / jcp.load_loop_blk; - if (jcp.ur > max_ur_for_blk) { - jcp.ur = max_ur_for_blk; - if (jcp.ur < 1) jcp.ur = 1; - } - - jcp.load_block = jcp.oc_block; - jcp.reduce_block = jcp.ic_block; - - jcp.bcast_block = jcp.ur; - jcp.load_dim = jcp.oc_without_padding; - jcp.bcast_dim = jcp.os; - jcp.reduce_dim = jcp.ic_without_padding; - - jcp.ur_tail = jcp.bcast_dim % jcp.ur; - - jcp.nb_bcast = div_up(jcp.os, jcp.bcast_block); - 
jcp.nb_load = div_up(jcp.oc_without_padding, jcp.load_block); - jcp.nb_reduce = div_up(jcp.ic_without_padding, jcp.reduce_block); - jcp.load_grp_count = 1; - - // Blocking strategy for NHWC layout - jcp.nb_reduce_blocking = jcp.nb_reduce; - jcp.nb_load_blocking = jcp.nb_load; - jcp.nb_load_blocking_max = jcp.nb_load; - - // Spatial dimension blocking (in ur units) - int target_bcast_blocking = 735; - jcp.nb_bcast_blocking - = nstl::min(jcp.nb_bcast, div_up(target_bcast_blocking, jcp.ur)); - if (jcp.nb_bcast_blocking == 0) jcp.nb_bcast_blocking = 1; - jcp.nb_bcast_blocking_max = jcp.nb_bcast_blocking; - - // Optimize reduce_loop_unroll based on available registers - if (jcp.load_loop_blk == 2) { - jcp.reduce_loop_unroll = 4; - } else { - jcp.reduce_loop_unroll = 4; - } - - // Layout-dependent stride parameters (for NHWC) - jcp.typesize_in = sizeof(float); - jcp.typesize_out = sizeof(float); - - jcp.reduce_loop_bcast_step = jcp.typesize_in; - jcp.reduce_loop_load_step = jcp.oc_block * jcp.typesize_in; - - // Strides within bcast_loop (spatial dimensions) - jcp.bcast_loop_bcast_step - = jcp.ngroups * jcp.ic_without_padding * jcp.typesize_in; - jcp.bcast_loop_output_step - = jcp.ngroups * jcp.oc_without_padding * jcp.typesize_out; - - // Strides within load_loop (OC dimension) - jcp.load_loop_load_step - = jcp.ic_without_padding * jcp.oc_block * jcp.typesize_in; - jcp.load_loop_iter_step = jcp.oc_block; - - if (jcp.reduce_loop_load_step > (1LL << 40) / jcp.reduce_loop_unroll) { - return status::unimplemented; - } - - return status::success; -} - -void jit_rvv_1x1_conv_kernel_t::init_scratchpad( - memory_tracking::registrar_t &scratchpad, - const jit_1x1_conv_conf_t &jcp) { - // Not implemented -} - -void jit_rvv_1x1_conv_kernel_t::balance(jit_1x1_conv_conf_t &jcp) { - // Not implemented -} - -void jit_rvv_1x1_conv_kernel_t::generate() { - - preamble(); - - // Set initial VL to oc_block (4) - if (jcp.oc_block <= 31) { - vsetivli(reg_tmp_imm, jcp.oc_block, 
Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); - } else { - li(reg_tmp_imm, jcp.oc_block); - vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); - } - - // Load parameters - ld(reg_bcast_data, reg_param, GET_OFF(bcast_data)); - ld(reg_load_data, reg_param, GET_OFF(load_data)); - ld(reg_output_data, reg_param, GET_OFF(output_data)); - if (jcp.with_bias) ld(reg_bias_data, reg_param, GET_OFF(bias_data)); - - ld(reg_load_loop_work, reg_param, GET_OFF(load_dim)); - ld(reg_bcast_loop_work, reg_param, GET_OFF(bcast_dim)); - ld(reg_reduce_loop_work, reg_param, GET_OFF(reduce_dim)); - ld(reg_reduce_pos_flag, reg_param, GET_OFF(first_last_flag)); - - // Main loop generation - auto load_loop_body = [=](int load_loop_blk) { - bcast_loop(load_loop_blk); - - // Update pointers and work counters - li(reg_tmp_imm, load_loop_blk * jcp.load_loop_load_step); - add(reg_load_data, reg_load_data, reg_tmp_imm); - - if (jcp.with_bias) { - li(reg_tmp_imm, load_loop_blk * jcp.oc_block * jcp.typesize_out); - add(reg_bias_data, reg_bias_data, reg_tmp_imm); - } - - li(reg_tmp_imm, load_loop_blk * jcp.oc_block * jcp.typesize_out); - add(reg_output_data, reg_output_data, reg_tmp_imm); - - li(reg_tmp_imm, load_loop_blk * jcp.load_loop_iter_step); - sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm); - }; - - Label load_loop_label, load_loop_end, load_loop_tail; - - if (jcp.load_loop_blk > 1) { - L(load_loop_label); - li(reg_tmp_imm, jcp.load_loop_blk * jcp.oc_block); - blt(reg_load_loop_work, reg_tmp_imm, load_loop_tail); - - // Ensure VL is full - li(reg_tmp_imm, jcp.oc_block); - vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, - Xbyak_riscv::LMUL::m1); - - load_loop_body(jcp.load_loop_blk); - jal(x0, load_loop_label); - } - - L(load_loop_tail); - { - Label tail_loop, tail_end; - L(tail_loop); - blez(reg_load_loop_work, tail_end); - - // Last block may be partial, use vsetvli to set VL dynamically - if (jcp.oc_block <= 31) { - vsetivli(reg_tmp_imm, 
jcp.oc_block, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); - } else { - li(reg_tmp_imm, jcp.oc_block); - vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); - } - - bcast_loop(1); - - // Update pointers and work counters (tail loop) - li(reg_tmp_imm, jcp.load_loop_load_step); - add(reg_load_data, reg_load_data, reg_tmp_imm); - if (jcp.with_bias) { - li(reg_tmp_imm, jcp.oc_block * jcp.typesize_out); - add(reg_bias_data, reg_bias_data, reg_tmp_imm); - } - li(reg_tmp_imm, jcp.oc_block * jcp.typesize_out); - add(reg_output_data, reg_output_data, reg_tmp_imm); - - li(reg_tmp_imm, jcp.oc_block); - sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm); - - jal(x0, tail_loop); - L(tail_end); - } - L(load_loop_end); - - postamble(); -} - -void jit_rvv_1x1_conv_kernel_t::preamble() { - addi(sp, sp, -64); - sd(ra, sp, 56); - sd(s0, sp, 48); - sd(s1, sp, 40); - sd(s2, sp, 32); - sd(s3, sp, 24); - sd(s4, sp, 16); - sd(s5, sp, 8); -} - -void jit_rvv_1x1_conv_kernel_t::postamble() { - ld(ra, sp, 56); - ld(s0, sp, 48); - ld(s1, sp, 40); - ld(s2, sp, 32); - ld(s3, sp, 24); - ld(s4, sp, 16); - ld(s5, sp, 8); - addi(sp, sp, 64); - ret(); -} - -void jit_rvv_1x1_conv_kernel_t::bcast_loop(int load_loop_blk) { - mv(reg_bcast_loop_iter, reg_bcast_loop_work); - mv(aux1_reg_bcast_data, reg_bcast_data); - mv(aux_reg_output_data, reg_output_data); - - Label bcast_loop_label, bcast_loop_tail; - - li(reg_tmp_imm, jcp.ur); - blt(reg_bcast_loop_iter, reg_tmp_imm, bcast_loop_tail); - - L(bcast_loop_label); - { - reduce_loop(load_loop_blk, jcp.ur); - - li(reg_tmp_imm, jcp.ur * jcp.bcast_loop_bcast_step); - add(aux1_reg_bcast_data, aux1_reg_bcast_data, reg_tmp_imm); - - li(reg_tmp_imm, jcp.ur * jcp.bcast_loop_output_step); - add(aux_reg_output_data, aux_reg_output_data, reg_tmp_imm); - - addi(reg_bcast_loop_iter, reg_bcast_loop_iter, -jcp.ur); - li(reg_tmp_imm, jcp.ur); - bge(reg_bcast_loop_iter, reg_tmp_imm, bcast_loop_label); - } - - L(bcast_loop_tail); - if 
(jcp.ur_tail > 0) { - Label bcast_loop_tail_end; - blez(reg_bcast_loop_iter, bcast_loop_tail_end); - - auto restore_vl = [=]() { - if (jcp.oc_block <= 31) { - vsetivli(reg_tmp_imm, jcp.oc_block, SEW::e32, LMUL::m1); - } else { - li(reg_tmp_imm, jcp.oc_block); - vsetvli(reg_tmp_imm, reg_tmp_imm, SEW::e32, LMUL::m1); - } - }; - - reduce_loop(load_loop_blk, jcp.ur_tail); - restore_vl(); - L(bcast_loop_tail_end); - } -} - -void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) { - mv(aux_reg_load_data, reg_load_data); - mv(aux_reg_bcast_data, aux1_reg_bcast_data); - - auto init = [=]() { - Label init_zero, init_done; - andi(reg_tmp_imm, reg_reduce_pos_flag, FLAG_REDUCE_FIRST); - bnez(reg_tmp_imm, init_zero); - - // Load from dst for accumulation - mv(reg_tmp_addr, aux_reg_output_data); - for (int i_ur = 0; i_ur < ur; ++i_ur) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - vle32_v(vreg_accum(i_load, i_ur), reg_tmp_addr); - if (i_load + 1 < load_loop_blk) - addi(reg_tmp_addr, reg_tmp_addr, - jcp.load_block * jcp.typesize_out); - } - li(reg_tmp_imm, - jcp.bcast_loop_output_step - - (load_loop_blk - 1) * jcp.load_block - * jcp.typesize_out); - add(reg_tmp_addr, reg_tmp_addr, reg_tmp_imm); - } - jal(x0, init_done); - - L(init_zero); - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - if (jcp.with_bias) { - size_t bias_off - = (size_t)i_load * jcp.oc_block * jcp.typesize_out; - if (bias_off == 0) { - vle32_v(vreg_load(0), reg_bias_data); - } else { - li(reg_tmp_addr, bias_off); - add(reg_tmp_addr, reg_tmp_addr, reg_bias_data); - vle32_v(vreg_load(0), reg_tmp_addr); - } - } - for (int i_ur = 0; i_ur < ur; ++i_ur) { - if (jcp.with_bias) { - vmv_v_v(vreg_accum(i_load, i_ur), vreg_load(0)); - } else { - vxor_vv(vreg_accum(i_load, i_ur), vreg_accum(i_load, i_ur), - vreg_accum(i_load, i_ur)); - } - } - } - L(init_done); - }; - - auto store = [=]() { - mv(reg_tmp_addr, aux_reg_output_data); - - for (int i_ur = 0; i_ur < ur; ++i_ur) { - for 
(int i_load = 0; i_load < load_loop_blk; ++i_load) { - - vse32_v(vreg_accum(i_load, i_ur), reg_tmp_addr); - - if (i_load + 1 < load_loop_blk) - addi(reg_tmp_addr, reg_tmp_addr, - jcp.load_block * jcp.typesize_out); - } - li(reg_tmp_imm, - jcp.bcast_loop_output_step - - (load_loop_blk - 1) * jcp.load_block - * jcp.typesize_out); - add(reg_tmp_addr, reg_tmp_addr, reg_tmp_imm); - } - }; - - auto fma_block = [=](int current_unroll, bool last_block) { - for (int i_unroll = 0; i_unroll < current_unroll; ++i_unroll) { - flw(freg_bcast, aux_reg_bcast_data, 0); - - for (int i_ur = 0; i_ur < ur; ++i_ur) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - vfmacc_vf(vreg_accum(i_load, i_ur), freg_bcast, - vreg_load(i_load, i_unroll)); - } - - if (i_ur + 1 < ur) { - ptrdiff_t offset - = (ptrdiff_t)(i_ur + 1) * jcp.bcast_loop_bcast_step; - if (offset <= 2047) { - flw(freg_bcast, aux_reg_bcast_data, offset); - } else { - li(reg_tmp_addr, offset); - add(reg_tmp_addr, reg_tmp_addr, aux_reg_bcast_data); - flw(freg_bcast, reg_tmp_addr, 0); - } - } - } - addi(aux_reg_bcast_data, aux_reg_bcast_data, - jcp.reduce_loop_bcast_step); - } - - // Update weight pointer to next unroll block - li(reg_tmp_imm, jcp.reduce_loop_unroll * jcp.reduce_loop_load_step); - add(aux_reg_load_data, aux_reg_load_data, reg_tmp_imm); - - // Prefetch weights for next iteration - if (!last_block) { - for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - ptrdiff_t weight_off = (ptrdiff_t)i_unroll * jcp.reduce_loop_load_step - + (ptrdiff_t)i_load * jcp.load_loop_load_step; - - li(reg_tmp_addr, weight_off + 256); - - add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr); - #if defined(__riscv_zicbom) - // cbo.prefetch.i: 0b0000000_00010_00000_010_00000_0001111 - asm volatile(".word 0x0020000f" : : "r"(reg_tmp_addr)); - #endif - - li(reg_tmp_addr, weight_off); - add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr); - 
vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr); - } - } - } - }; - - init(); - - // Load first round of weights (IC=0..unroll-1) - for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - ptrdiff_t weight_off = (ptrdiff_t)i_unroll * jcp.reduce_loop_load_step - + (ptrdiff_t)i_load * jcp.load_loop_load_step; - if (weight_off == 0) { - vle32_v(vreg_load(i_load, i_unroll), aux_reg_load_data); - } else { - li(reg_tmp_addr, weight_off); - add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr); - vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr); - } - } - } - - mv(reduce_loop_iter, reg_reduce_loop_work); - Label reduce_loop_label, reduce_loop_tail; - - li(reg_tmp_imm, jcp.reduce_loop_unroll); - blt(reduce_loop_iter, reg_tmp_imm, reduce_loop_tail); - - L(reduce_loop_label); - { - li(reg_tmp_imm, jcp.reduce_loop_unroll); - sub(reg_tmp_imm, reduce_loop_iter, reg_tmp_imm); - li(reg_tmp_addr, jcp.reduce_loop_unroll); - Label is_last, do_fma; - blt(reg_tmp_imm, reg_tmp_addr, is_last); - fma_block(jcp.reduce_loop_unroll, false); - jal(x0, do_fma); - L(is_last); - fma_block(jcp.reduce_loop_unroll, true); - L(do_fma); - - addi(reduce_loop_iter, reduce_loop_iter, -jcp.reduce_loop_unroll); - li(reg_tmp_imm, jcp.reduce_loop_unroll); - bge(reduce_loop_iter, reg_tmp_imm, reduce_loop_label); - } - - L(reduce_loop_tail); - { - Label tail_done; - blez(reduce_loop_iter, tail_done); - Label tail_loop; - L(tail_loop); - { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - ptrdiff_t weight_off = (ptrdiff_t)i_load * jcp.load_loop_load_step; - if (weight_off == 0) { - vle32_v(vreg_load(i_load, 0), aux_reg_load_data); - } else { - li(reg_tmp_addr, weight_off); - add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr); - vle32_v(vreg_load(i_load, 0), reg_tmp_addr); - } - } - - flw(freg_bcast, aux_reg_bcast_data, 0); - for (int i_ur = 0; i_ur < ur; ++i_ur) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { 
- vfmacc_vf(vreg_accum(i_load, i_ur), freg_bcast, - vreg_load(i_load, 0)); - } - if (i_ur + 1 < ur) { - size_t offset - = (size_t)(i_ur + 1) * jcp.bcast_loop_bcast_step; - if (offset <= 2047) { - flw(freg_bcast, aux_reg_bcast_data, offset); - } else { - li(reg_tmp_addr, offset); - add(reg_tmp_addr, reg_tmp_addr, aux_reg_bcast_data); - flw(freg_bcast, reg_tmp_addr, 0); - } - } - } - - addi(aux_reg_bcast_data, aux_reg_bcast_data, - jcp.reduce_loop_bcast_step); - addi(aux_reg_load_data, aux_reg_load_data, - jcp.reduce_loop_load_step); - addi(reduce_loop_iter, reduce_loop_iter, -1); - bnez(reduce_loop_iter, tail_loop); - } - L(tail_done); - } - - store(); -} - -} // namespace rv64 -} // namespace cpu -} // namespace impl -} // namespace dnnl diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp deleted file mode 100644 index 0fcd9774aec..00000000000 --- a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp +++ /dev/null @@ -1,109 +0,0 @@ -/******************************************************************************* -* Copyright 2025 ZTE Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef CPU_RV64_JIT_RVV_1X1_CONV_KERNEL_HPP -#define CPU_RV64_JIT_RVV_1X1_CONV_KERNEL_HPP - -#include "common/c_types_map.hpp" -#include "common/memory_tracking.hpp" - -#include "cpu/rv64/jit_generator.hpp" -#include "cpu/rv64/jit_primitive_conf.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace rv64 { - -using namespace Xbyak_riscv; - -struct jit_rvv_1x1_conv_kernel_t : public jit_generator_t { - jit_rvv_1x1_conv_kernel_t(const jit_1x1_conv_conf_t &ajcp, - const primitive_attr_t &attr, const memory_desc_t &dst_md); - - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_rvv_1x1_conv_kernel) - - static status_t init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, - const memory_desc_wrapper &dst_d, const primitive_attr_t &attr, - int nthreads, bool reduce_src); - - static void init_scratchpad(memory_tracking::registrar_t &scratchpad, - const jit_1x1_conv_conf_t &jcp); - - static void balance(jit_1x1_conv_conf_t &jcp); - - jit_1x1_conv_conf_t jcp; - const primitive_attr_t &attr_; - -private: - using Reg = Xbyak_riscv::Reg; - using VReg = Xbyak_riscv::VReg; - using FReg = Xbyak_riscv::FReg; - - const Reg reg_param = a0; - const Reg reg_bcast_data = a1; - const Reg reg_load_data = a2; - const Reg reg_output_data = a3; - const Reg reg_bias_data = a4; - - const Reg reg_load_loop_work = t0; - const Reg reg_bcast_loop_work = t1; - const Reg reg_reduce_loop_work = t2; - - const Reg aux_reg_bcast_data = t3; - const Reg aux_reg_load_data = t4; - const Reg aux_reg_output_data = t5; - const Reg aux1_reg_bcast_data = t6; - - const Reg reduce_loop_iter = s0; - const Reg reg_bcast_loop_iter = s1; - const Reg reg_reduce_pos_flag = s2; - const Reg reg_output_stride = s3; - - const Reg reg_tmp_imm = s4; - const Reg reg_tmp_addr = s5; - - VReg vreg_accum(int i_load, int i_ur) { - // Avoid v0, start from v1 - 
return VReg(1 + i_ur * jcp.load_loop_blk + i_load); - } - - VReg vreg_load(int i_load, int i_unroll = 0) { - // Allocate after accum to avoid conflicts - // accum uses v1 to v(ur * load_loop_blk) - return VReg(1 + jcp.ur * jcp.load_loop_blk - + i_unroll * jcp.load_loop_blk + i_load); - } - - const FReg freg_bcast = fa0; - const FReg freg_load = fa1; - - void generate() override; - void preamble(); - void postamble(); - void bcast_loop(int load_loop_blk); - void reduce_loop(int load_loop_blk, int ur); - void fma_block(int load_loop_blk, int ur); -}; - -} // namespace rv64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif diff --git a/src/cpu/rv64/jit_rvv_1x1_convolution.cpp b/src/cpu/rv64/jit_rvv_1x1_convolution.cpp deleted file mode 100644 index f744419990a..00000000000 --- a/src/cpu/rv64/jit_rvv_1x1_convolution.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/******************************************************************************* -* Copyright 2025 ZTE Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "common/c_types_map.hpp" -#include "common/dnnl_thread.hpp" -#include "common/type_helpers.hpp" -#include "common/utils.hpp" - -#include "cpu/rv64/jit_rvv_1x1_convolution.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace rv64 { - -using namespace dnnl::impl::status; -using namespace dnnl::impl::utils; - -void jit_rvv_1x1_convolution_fwd_t::execute_forward( - const exec_ctx_t &ctx) const { - auto src = CTX_IN_MEM(const float *, DNNL_ARG_SRC); - auto weights = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS); - auto bias = CTX_IN_MEM(const float *, DNNL_ARG_BIAS); - auto dst = CTX_OUT_MEM(float *, DNNL_ARG_DST); - - const auto &scratchpad = ctx.get_scratchpad_grantor(); - - parallel(pd()->jcp_.nthr, [&](const int ithr, const int nthr) { - execute_forward_thr(ithr, nthr, src, weights, bias, dst, scratchpad); - }); -} - -void jit_rvv_1x1_convolution_fwd_t::execute_forward_thr(const int ithr, - const int nthr, const float *src, const float *weights, - const float *bias, float *dst, - const memory_tracking::grantor_t &scratchpad) const { - - const memory_desc_wrapper src_d(pd()->src_md()); - const memory_desc_wrapper dst_d(pd()->dst_md()); - const memory_desc_wrapper weights_d(pd()->weights_md(0)); - - const auto &jcp = pd()->jcp_; - - auto step = [](int default_step, int remaining, int tail_step) { - assert(default_step <= tail_step); - return remaining < tail_step ? remaining : default_step; - }; - - // RVV 1x1 convolution uses NHWC layout. - // Spatial dimensions are collapsed into 'os'. - // Threading is balanced over (MB * groups * nb_bcast) and (nb_load). 
- - const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; - int bcast_start {0}, bcast_end {0}, ocb_start {0}, ocb_end {0}; - - balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, jcp.nb_load, - ocb_start, ocb_end, jcp.load_grp_count); - - if (bcast_start >= bcast_end || ocb_start >= ocb_end) return; - - auto p = jit_1x1_conv_args_t(); - - auto ker_1x1 = [&](int ocb, int load_step, int icb, int n, int g, int osb, - int bcast_step) { - const int oc_off = g * jcp.oc_without_padding + ocb * jcp.oc_block; - const size_t dst_off - = (size_t)n * jcp.os * jcp.ngroups * jcp.oc_without_padding - + (size_t)osb * jcp.bcast_block * jcp.ngroups - * jcp.oc_without_padding - + oc_off; - - p.output_data = &dst[dst_off]; - p.bias_data = bias ? &bias[oc_off] : nullptr; - - const size_t wei_off = (size_t)g * jcp.oc * jcp.ic_without_padding - + (size_t)ocb * jcp.ic_without_padding * jcp.oc_block - + (size_t)icb * jcp.ic_block * jcp.oc_block; - p.load_data = &weights[wei_off]; - - const int ic_off = g * jcp.ic_without_padding + icb * jcp.ic_block; - const size_t src_off - = (size_t)n * jcp.is * jcp.ngroups * jcp.ic_without_padding - + (size_t)osb * jcp.bcast_block * jcp.ngroups - * jcp.ic_without_padding - + ic_off; - p.bcast_data = &src[src_off]; - - p.bcast_dim = this_block_size( - osb * jcp.bcast_block, jcp.os, bcast_step * jcp.bcast_block); - p.load_dim = this_block_size(ocb * jcp.oc_block, jcp.oc_without_padding, - load_step * jcp.oc_block); - p.reduce_dim = this_block_size(icb * jcp.ic_block, - jcp.ic_without_padding, jcp.nb_reduce_blocking * jcp.ic_block); - - p.first_last_flag = (icb == 0 ? FLAG_REDUCE_FIRST : 0) - | (icb + jcp.nb_reduce_blocking >= jcp.nb_reduce - ? FLAG_REDUCE_LAST - : 0); - - (*kernel_)(&p); - }; - - // Loop order: Load -> Bcast -> Reduce (LBR) - // This order keeps weights in registers/L1 while iterating over spatial. 
- for (int ocb = ocb_start; ocb < ocb_end;) { - int load_step = step( - jcp.nb_load_blocking, ocb_end - ocb, jcp.nb_load_blocking_max); - int iwork = bcast_start; - while (iwork < bcast_end) { - int n {0}, g {0}, osb {0}; - nd_iterator_init( - iwork, n, jcp.mb, g, jcp.ngroups, osb, jcp.nb_bcast); - - int bcast_step = step(jcp.nb_bcast_blocking, bcast_end - iwork, - jcp.nb_bcast_blocking_max); - bcast_step = nstl::min(bcast_step, jcp.nb_bcast - osb); - - for (int icb = 0; icb < jcp.nb_reduce; - icb += jcp.nb_reduce_blocking) { - ker_1x1(ocb, load_step, icb, n, g, osb, bcast_step); - } - iwork += bcast_step; - } - ocb += load_step; - } -} - -} // namespace rv64 -} // namespace cpu -} // namespace impl -} // namespace dnnl diff --git a/src/cpu/rv64/jit_rvv_1x1_convolution.hpp b/src/cpu/rv64/jit_rvv_1x1_convolution.hpp deleted file mode 100644 index 2d379cc6ec9..00000000000 --- a/src/cpu/rv64/jit_rvv_1x1_convolution.hpp +++ /dev/null @@ -1,170 +0,0 @@ -/******************************************************************************* -* Copyright 2025 ZTE Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef CPU_RV64_JIT_RVV_1X1_CONVOLUTION_HPP -#define CPU_RV64_JIT_RVV_1X1_CONVOLUTION_HPP - -#include "common/c_types_map.hpp" -#include "common/dnnl_thread.hpp" -#include "common/memory_tracking.hpp" -#include "common/primitive.hpp" -#include "common/utils.hpp" - -#include "cpu/cpu_convolution_pd.hpp" -#include "cpu/platform.hpp" - -#include "cpu/rv64/jit_rvv_1x1_conv_kernel.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace rv64 { - -struct jit_rvv_1x1_convolution_fwd_t : public primitive_t { - struct pd_t : public cpu_convolution_fwd_pd_t { - using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; - - DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", v, ""), - jit_rvv_1x1_convolution_fwd_t); - - status_t init(engine_t *engine) { - using namespace utils; - using namespace format_tag; - - const memory_desc_wrapper src_d(src_md()); - const memory_desc_wrapper weights_d(weights_md()); - const memory_desc_wrapper dst_d(dst_md()); - - VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND); - VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), - VERBOSE_BAD_ALGORITHM); - VDISPATCH_CONV( - expect_data_types(data_type::f32, data_type::f32, - data_type::f32, data_type::f32, data_type::undef), - VERBOSE_UNSUPPORTED_DT); - VDISPATCH_CONV(attr()->has_default_values( - primitive_attr_t::skip_mask_t::post_ops), - VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); - VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); - - // Only support: data = nwc/nhwc/ndhwc, weights = blocked formats (Oiw4o/gOiw4o/etc) - const int n = ndims(); - const bool g = with_groups(); - const auto dat_tag_nxc = utils::pick(n - 3, nwc, nhwc, ndhwc); - const auto wei_tag_blocked = utils::pick(2 * n - 6 + (g ? 
1 : 0), - Oiw4o, gOiw4o, Oihw4o, gOihw4o, Oidhw4o, gOidhw4o); - - // Check if src/dst match supported format (nxc) - // Only accept format_kind::any as a fallback, reject explicit - // unsupported formats - VDISPATCH_CONV(IMPLICATION(src_d.matches_one_of_tag(dat_tag_nxc) - != dat_tag_nxc, - src_d.format_kind() == format_kind::any), - VERBOSE_UNSUPPORTED_TAG); - VDISPATCH_CONV(IMPLICATION(dst_d.matches_one_of_tag(dat_tag_nxc) - != dat_tag_nxc, - dst_d.format_kind() == format_kind::any), - VERBOSE_UNSUPPORTED_TAG); - VDISPATCH_CONV( - IMPLICATION(weights_d.matches_one_of_tag(wei_tag_blocked) - != wei_tag_blocked, - weights_d.format_kind() == format_kind::any), - VERBOSE_UNSUPPORTED_TAG); - - // Set default formats if format_kind == any - VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); - - // ISA check - VDISPATCH_CONV(mayiuse(v), VERBOSE_UNSUPPORTED_ISA); - - // 1x1 convolution check - const int ndims = src_d.ndims(); - const int weights_ndims = weights_d.ndims(); - for (int i = 0; i < ndims - 2; ++i) { - VDISPATCH_CONV( - weights_d.dims()[weights_ndims - (ndims - 2) + i] == 1, - VERBOSE_UNSUPPORTED_FEATURE, - "only 1x1 convolution is supported"); - VDISPATCH_CONV(desc()->strides[i] == 1, - VERBOSE_UNSUPPORTED_FEATURE, - "only stride 1 is supported"); - VDISPATCH_CONV(desc()->padding[0][i] == 0, - VERBOSE_UNSUPPORTED_FEATURE, - "padding is not supported"); - } - - VDISPATCH_CONV_SC(jit_rvv_1x1_conv_kernel_t::init_conf(jcp_, - *desc(), src_d, weights_d, dst_d, *attr(), - dnnl_get_max_threads(), false), - VERBOSE_UNSUPPORTED_FEATURE, "init_conf failed"); - - auto scratchpad = scratchpad_registry().registrar(); - jit_rvv_1x1_conv_kernel_t::init_scratchpad(scratchpad, jcp_); - - return status::success; - } - - jit_1x1_conv_conf_t jcp_ = utils::zero(); - - protected: - bool post_ops_ok() const { - // TODO: Post-ops support is not implemented yet. 
- return attr()->post_ops_.len() == 0; - } - bool set_default_formats() { - using namespace format_tag; - const int n = ndims(); - const bool g = with_groups(); - const auto dat_tag = utils::pick(n - 3, nwc, nhwc, ndhwc); - const auto wei_tag = utils::pick(2 * n - 6 + (g ? 1 : 0), Oiw4o, - gOiw4o, Oihw4o, gOihw4o, Oidhw4o, gOidhw4o); - - return set_default_formats_common(dat_tag, wei_tag, dat_tag); - } - }; - - jit_rvv_1x1_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} - - status_t init(engine_t *engine) override { - CHECK(safe_ptr_assign(kernel_, - new jit_rvv_1x1_conv_kernel_t( - pd()->jcp_, *pd()->attr(), *pd()->dst_md()))); - return kernel_->create_kernel(); - } - - status_t execute(const exec_ctx_t &ctx) const override { - execute_forward(ctx); - return status::success; - } - -private: - void execute_forward(const exec_ctx_t &ctx) const; - void execute_forward_thr(const int ithr, const int nthr, const float *src, - const float *weights, const float *bias, float *dst, - const memory_tracking::grantor_t &scratchpad) const; - - const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } - - std::unique_ptr kernel_; -}; - -} // namespace rv64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp index ed75ccfc0ea..dfb271575d0 100644 --- a/src/cpu/rv64/rvv_gemm_convolution.cpp +++ b/src/cpu/rv64/rvv_gemm_convolution.cpp @@ -1,22 +1,17 @@ /******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. +Copyright 2016 Intel Corporation +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. *******************************************************************************/ - #include #include - #include "common/c_types_map.hpp" #include "common/dnnl_thread.hpp" #include "common/type_helpers.hpp" @@ -38,10 +33,102 @@ struct im_pos_t { dim_t n, g, od, sp, ic, oc; bool do_im2col(const im_pos_t &prev) const { return true - && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp - || ic != prev.ic); + && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp + || ic != prev.ic); } }; + +// Helper function to apply bias and eltwise using RVV in NSPC layout +// Using float explicitly as data_t is float in this specialization +static void apply_bias_eltwise_rvv_nspc( + const float *__restrict bia_arr, + float *__restrict dst_arr, + size_t start_oc, size_t end_oc, + bool with_bias, + bool with_eltwise, + const ref_post_ops_t *post_ops, + const exec_ctx_t &ctx, + const memory_desc_t *dst_md, // Changed to pointer to memory_desc_t + const conv_gemm_conf_t &jcp, + size_t g, size_t 
os_offset_factor) { + + size_t n_elems = end_oc - start_oc + 1; + if (n_elems == 0) return; + + size_t oc = 0; + const float *b_ptr = with_bias ? (bia_arr + start_oc) : nullptr; + float *d_ptr = dst_arr + start_oc; + + // Prepare eltwise params if needed + float eltwise_alpha = 0.0f; + float eltwise_scale = 1.0f; + bool is_fast_relu = false; + + if (with_eltwise && jcp.post_ops.len() == 1) { + const auto &eltwise = jcp.post_ops.entry_.back().eltwise; + if (eltwise.alg == alg_kind::eltwise_relu) { + eltwise_alpha = eltwise.alpha; + eltwise_scale = eltwise.scale; + is_fast_relu = true; + } + } + + while (oc < n_elems) { + size_t vl = __riscv_vsetvl_e32m1(n_elems - oc); + + vfloat32m1_t v_dst = __riscv_vle32_v_f32m1(d_ptr + oc, vl); + + // 1. Add Bias + if (with_bias) { + vfloat32m1_t v_bias = __riscv_vle32_v_f32m1(b_ptr + oc, vl); + v_dst = __riscv_vfadd_vv_f32m1(v_dst, v_bias, vl); + } + + // 2. Apply Eltwise (Fast ReLU path) + if (is_fast_relu) { + if (eltwise_alpha == 0.0f) { + // Standard ReLU + v_dst = __riscv_vfmax_vf_f32m1(v_dst, 0.0f, vl); + } else { + // Leaky ReLU-like + vbool32_t mask = __riscv_vmflt_vf_f32m1_b32(v_dst, 0.0f, vl); + v_dst = __riscv_vfmul_vf_f32m1_m(mask, v_dst, eltwise_alpha, vl); + } + + if (eltwise_scale != 1.0f) { + v_dst = __riscv_vfmul_vf_f32m1(v_dst, eltwise_scale, vl); + } + __riscv_vse32_v_f32m1(d_ptr + oc, v_dst, vl); + oc += vl; + } else { + // If not fast relu, break to handle scalarly or generic post-ops + break; + } + } + + // Handle remaining elements or generic post-ops scalarly + if (oc < n_elems || (!is_fast_relu && with_eltwise)) { + for (size_t i = oc; i < n_elems; ++i) { + size_t cur_oc = start_oc + i; + float *dst_val = dst_arr + cur_oc; + + if (with_bias) { + *dst_val += bia_arr[cur_oc]; + } + + if (with_eltwise || jcp.with_binary) { + ref_post_ops_t::args_t args; + args.ctx = &ctx; + args.dst_md = dst_md; // Use the passed pointer + // Calculate offset correctly + // Note: l_offset calculation might need adjustment 
based on exact memory layout expectations of post_ops + args.l_offset = (g * jcp.oc + cur_oc) * (jcp.os * jcp.od); + post_ops->execute(*dst_val, args); + } + } + } +} + } // namespace status_t riscv_gemm_convolution_fwd_t::execute_forward_nspc( @@ -50,7 +137,6 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_nspc( auto wei_base = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS); auto bia_base = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS); auto dst_base = CTX_OUT_MEM(data_t *, DNNL_ARG_DST); - auto scratchpad = ctx.get_scratchpad_grantor(); const conv_gemm_conf_t &jcp = pd()->jcp_; std::atomic st(status::success); @@ -58,7 +144,11 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_nspc( parallel(jcp.nthr, [&](const int ithr, const int nthr) { status_t st_thr = execute_forward_thr_nspc(ctx, ithr, nthr, src_base, wei_base, bia_base, dst_base, scratchpad); - if (st_thr != status::success) st = st_thr; + + if (st_thr != status::success) { + status_t expected = status::success; + st.compare_exchange_strong(expected, st_thr); + } }); return st; @@ -69,7 +159,6 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( const data_t *src_base, const data_t *wei_base, const data_t *bia_base, data_t *dst_base, const memory_tracking::grantor_t &scratchpad) const { const conv_gemm_conf_t &jcp = pd()->jcp_; - // Src Format: mb-spatial-groups-input_channels const dim_t src_mb_stride = jcp.id * jcp.ih * jcp.iw * jcp.ngroups * jcp.ic; const dim_t src_g_stride = jcp.ic; @@ -92,7 +181,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( assert(IMPLICATION(is_problem_3d, jcp.oh_block == jcp.oh && jcp.ow_block == jcp.ow - && jcp.ic_block == jcp.ic)); + && jcp.ic_block == jcp.ic)); assert(IMPLICATION(jcp.ow_block != jcp.ow, jcp.oh_block == 1)); const dim_t nb_oh = div_up(jcp.oh, jcp.oh_block); @@ -102,10 +191,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( balance211(work_amount, nthr, ithr, start, end); nd_iterator_init(start, 
n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow); + // Pre-zeroing for 3D problem if needed (outside loop) if (jcp.im2col_sz && is_problem_3d) { - // jit_gemm_convolution_utils::im2col_dt_3d() requires external - // data initialization by zeroes - const size_t total_sz = jcp.im2col_sz; const size_t vlmax = __riscv_vsetvlmax_e32m1(); const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax); @@ -120,6 +207,10 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( } } + // Cache post_ops pointer and dst_md + const ref_post_ops_t *post_ops_ptr = post_ops_.get(); + const memory_desc_t *dst_md_ptr = pd()->dst_md(); + for (dim_t iwork = start; iwork < end; ++iwork) { dim_t oh = ohb * jcp.oh_block; dim_t ow = owb * jcp.ow_block; @@ -129,14 +220,16 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( const int h_step = nstl::min(jcp.oh_block, jcp.oh - oh); const int w_step = nstl::min(jcp.ow_block, jcp.ow - ow); + if (jcp.im2col_sz && is_problem_3d) { - jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr); + jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr); } for (int od = 0; od < jcp.od; od++) { data_t *__restrict dst = dst_base + n * dst_mb_stride + g * dst_g_stride + ((od * jcp.oh + oh) * jcp.ow + ow) * dst_os_stride; + if (jcp.im2col_sz) { if (is_problem_3d) jit_gemm_convolution_utils::im2col_dt_3d( @@ -152,25 +245,27 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( const dim_t LDA = M * jcp.ngroups; const dim_t LDB = jcp.im2col_sz ? N : K * jcp.ngroups; const dim_t LDC = M * jcp.ngroups; - const char *BT = jcp.im2col_sz ? "T" : "N"; + const char *BT = jcp.im2col_sz ? "T " : "N "; const data_t onef = 1.f; const float beta = jcp.with_sum ? 1.0f : 0.0f; const data_t *__restrict src_od = src + od * jcp.oh * jcp.ow * jcp.ngroups * jcp.ic; - status_t st = extended_sgemm("N", BT, &M, &N, &K, &onef, wei, &LDA, + + status_t st = extended_sgemm("N ", BT, &M, &N, &K, &onef, wei, &LDA, jcp.im2col_sz ? 
col : (data_t *)src_od, &LDB, &beta, dst, &LDC); if (st != status::success) return st; if (jcp.with_bias || jcp.with_eltwise || jcp.with_binary) { - parallel(0, [&](int ithr, int nthr) { - dim_t start, end; - balance211(N * jcp.oc, nthr, ithr, start, end); + // NOTE: Keeping parallel(0, ...) as requested + parallel(0, [&](int ithr_inner, int nthr_inner) { + dim_t start_inner, end_inner; + balance211(N * jcp.oc, nthr_inner, ithr_inner, start_inner, end_inner); - const size_t first_oc = start % jcp.oc; - const size_t last_oc = (end - 1) % jcp.oc; - const size_t first_os = start / jcp.oc; - const size_t last_os = (end - 1) / jcp.oc; + const size_t first_oc = start_inner % jcp.oc; + const size_t last_oc = (end_inner - 1) % jcp.oc; + const size_t first_os = start_inner / jcp.oc; + const size_t last_os = (end_inner - 1) / jcp.oc; for (size_t os = first_os; os <= last_os; ++os) { const size_t start_oc = (os == first_os) ? first_oc : 0; @@ -181,60 +276,36 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( = bia_base ? 
bia_base + g * jcp.oc : nullptr; data_t *__restrict dst_arr = dst + os * dst_os_stride; - if (jcp.with_bias) { - size_t n_elems = end_oc - start_oc + 1; - if (n_elems > 0) { - size_t oc = 0; - const data_t *b_ptr = bia_arr + start_oc; - data_t *d_ptr = dst_arr + start_oc; - - while (oc < n_elems) { - size_t vl = __riscv_vsetvl_e32m1( - n_elems - oc); - vfloat32m1_t v_dst = __riscv_vle32_v_f32m1( - d_ptr + oc, vl); - vfloat32m1_t v_bias = __riscv_vle32_v_f32m1( - b_ptr + oc, vl); - v_dst = __riscv_vfadd_vv_f32m1( - v_dst, v_bias, vl); - __riscv_vse32_v_f32m1( - d_ptr + oc, v_dst, vl); - oc += vl; - } - } - } - - if (jcp.with_eltwise || jcp.with_binary) { - bool fast_relu_done = false; - if (jcp.with_eltwise && jcp.post_ops.len() == 1) { - // fast branch for ReLU case - const auto &eltwise - = jcp.post_ops.entry_.back().eltwise; - - if (eltwise.alg == alg_kind::eltwise_relu) { - const auto alpha = eltwise.alpha; - const auto scale = eltwise.scale; - PRAGMA_OMP_SIMD() - for (size_t oc = start_oc; oc <= end_oc; - oc++) { - if (dst_arr[oc] < 0) - dst_arr[oc] *= alpha; - dst_arr[oc] *= scale; + // Check if we can use optimized RVV path + bool has_binary = jcp.with_binary; + bool has_complex_eltwise = jcp.with_eltwise && !(jcp.post_ops.len() == 1 && jcp.post_ops.entry_.back().eltwise.alg == alg_kind::eltwise_relu); + + if (!has_binary && !has_complex_eltwise) { + apply_bias_eltwise_rvv_nspc( + (const float*)bia_arr, (float*)dst_arr, start_oc, end_oc, + jcp.with_bias, jcp.with_eltwise, + post_ops_ptr, ctx, dst_md_ptr, jcp, g, 0); + } else { + // Fallback to original scalar logic for complex cases + if (jcp.with_bias) { + size_t n_elems = end_oc - start_oc + 1; + if (n_elems > 0) { + // Scalar bias add + for(size_t k=0; kdst_md(); - + args.dst_md = dst_md_ptr; + for (size_t oc = start_oc; oc <= end_oc; oc++) { - // jcp.od is not part of jcp.os, so multiply - // jcp.od to get spatial offset. 
args.l_offset = (g * jcp.oc + oc) * (jcp.os * jcp.od); - post_ops_->execute(dst_arr[oc], args); + post_ops_ptr->execute(dst_arr[oc], args); } } } @@ -253,7 +324,6 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( auto weights = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS); auto bias = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS); auto dst = CTX_OUT_MEM(data_t *, DNNL_ARG_DST); - auto col = ctx.get_scratchpad_grantor().get(key_conv_gemm_col); const conv_gemm_conf_t &jcp = this->pd()->jcp_; @@ -278,7 +348,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( assert(IMPLICATION(is_problem_3d, jcp.os_block == jcp.os && jcp.ic_block == jcp.ic - && jcp.os_nb_block == 1)); + && jcp.os_nb_block == 1)); status_t st = status::success; parallel(jcp.nthr, [&](const int ithr, const int nthr) { @@ -288,9 +358,20 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( // external data initialization by zeroes const bool outer_padding = jcp.os_nb_block == 1; if (outer_padding && is_problem_3d) { - for (ptrdiff_t i = 0; i < jcp.im2col_sz; i++) - _col[i] = (data_t)0; + // OPTIMIZATION: Vectorized zeroing + const size_t total_sz = jcp.im2col_sz; + const size_t vlmax = __riscv_vsetvlmax_e32m1(); + const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax); + ptrdiff_t i = 0; + for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; i += (ptrdiff_t)vlmax) { + __riscv_vse32_v_f32m1(_col + i, v_zero, vlmax); + } + if (i < (ptrdiff_t)total_sz) { + size_t vl = __riscv_vsetvl_e32m1(total_sz - i); + __riscv_vse32_v_f32m1(_col + i, v_zero, vl); + } } + auto inner_ker = [&](int spatial, const im_pos_t &curr, im_pos_t &prev, im_pos_t &step, const im_pos_t &end) { const data_t *_src @@ -315,7 +396,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( const data_t one = 1.0; const dim_t M = jcp.os * jcp.od; - const dim_t m = step.sp; + const dim_t m = step.sp ; const dim_t LDA = jcp.im2col_sz ? 
m : M; data_t *_dst = dst + curr.n * dst_mb_stride + curr.g * dst_g_stride + curr.oc * M + curr.od * jcp.os + curr.sp; @@ -331,14 +412,11 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( const data_t *_weights = weights + curr.g * weights_g_size + curr.oc * weights_oc_size + curr.ic * jcp.ks; - status_t st = extended_sgemm("N", "N", &m, &N, &K, &one, _source, + status_t st = extended_sgemm("N ", "N ", &m, &N, &K, &one, _source, &LDA, _weights, &LDB, &beta, _dst, &M); if (st != status::success) return st; if (curr.ic == jcp.ic - step.ic) { - // TODO: for "outer threading" we have parallel section within - // outermost "parallel". It is not good. Consider to use - // "parallel" here with number of threads passed as parameter const int oc_start = curr.g * jcp.oc + curr.oc; if (jcp.with_eltwise || jcp.with_binary) { bool fast_relu_done = false; @@ -364,11 +442,11 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( v_d, b, vl); // Add bias v_d = __riscv_vfmax_vf_f32m1( - v_d, 0.0f, vl); + v_d, 0.0f, vl); if (eltwise.scale != 1.0f) { v_d = __riscv_vfmul_vf_f32m1( - v_d, eltwise.scale, vl); + v_d, eltwise.scale, vl); } __riscv_vse32_v_f32m1(d_ + oS, v_d, vl); @@ -385,10 +463,10 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( v_d = __riscv_vfadd_vf_f32m1( v_d, b, vl); // Add bias vbool32_t mask - = __riscv_vmflt_vf_f32m1_b32( + = __riscv_vmflt_vf_f32m1_b32( v_d, 0.0f, vl); v_d = __riscv_vfmul_vf_f32m1_m( - mask, v_d, eltwise.alpha, vl); + mask, v_d, eltwise.alpha, vl); v_d = __riscv_vfmul_vf_f32m1( v_d, eltwise.scale, vl); __riscv_vse32_v_f32m1(d_ + oS, v_d, vl); @@ -499,4 +577,4 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( } // namespace rv64 } // namespace cpu } // namespace impl -} // namespace dnnl +} // namespace dnnl \ No newline at end of file diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp index e0f2afe3c07..19f4289920c 100644 --- 
a/src/cpu/rv64/rvv_gemm_convolution.hpp +++ b/src/cpu/rv64/rvv_gemm_convolution.hpp @@ -69,7 +69,6 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { // TODO: make `init_conf` assign initialized object to `jcp_` jcp_ = conv_gemm_conf_t(); - std::cout << "GEMM INIT CONSTRUCTION" << std::endl; return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_, dnnl_get_max_threads()); diff --git a/third_party/xbyak_riscv/xbyak_riscv.hpp b/third_party/xbyak_riscv/xbyak_riscv.hpp deleted file mode 100644 index 249553a36f9..00000000000 --- a/third_party/xbyak_riscv/xbyak_riscv.hpp +++ /dev/null @@ -1,1383 +0,0 @@ -#pragma once -/*! - @file xbyak_riscv.hpp - @brief Xbyak_riscv ; JIT assembler for RISC-V - @author herumi - @url https://github.com/herumi/xbyak_riscv - @note modified new BSD license - http://opensource.org/licenses/BSD-3-Clause -*/ - -// Copyright (C), 2023, KNS Group LLC (YADRO) - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 - #ifndef WIN32_LEAN_AND_MEAN - #define WIN32_LEAN_AND_MEAN - #endif - #include - #include -#elif defined(__GNUC__) - #include - #include - #include -#endif -#if defined(__APPLE__) - #define XBYAK_RISCV_USE_MAP_JIT - #include - #ifndef MAP_JIT - #define MAP_JIT 0x800 - #endif -#endif - -#if defined(__GNUC__) && !defined(__MINGW32__) - #define XBYAK_RISCV_USE_MMAP_ALLOCATOR -#endif - -#ifdef NDEBUG - #define XBYAK_RISCV_ASSERT(x) -#else - #define XBYAK_RISCV_ASSERT(x) assert(x) -#endif - -// MFD_CLOEXEC defined only linux 3.17 or later. -// Android wraps the memfd_create syscall from API version 30. 
-#if !defined(MFD_CLOEXEC) || (defined(__ANDROID__) && __ANDROID_API__ < 30) - #undef XBYAK_RISCV_USE_MEMFD -#endif - -#if defined(_WIN64) || defined(__MINGW64__) || (defined(__CYGWIN__) && defined(__x86_64__)) - #define XBYAK_RISCV64_WIN -#elif defined(__x86_64__) - #define XBYAK_RISCV64_GCC -#endif -#if !defined(XBYAK_RISCV64) && !defined(XBYAK_RISCV32) - #if defined(XBYAK_RISCV64_GCC) || defined(XBYAK_RISCV64_WIN) - #define XBYAK_RISCV64 - #else - #define XBYAK_RISCV32 - #endif -#endif - -#ifdef _MSC_VER - #pragma warning(push) - #pragma warning(disable : 4514) /* remove inline function */ - #pragma warning(disable : 4786) /* identifier is too long */ - #pragma warning(disable : 4503) /* name is too long */ - #pragma warning(disable : 4127) /* constant expresison */ -#endif - -#include "xbyak_riscv_csr.hpp" - -#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910) - #define XBYAK_RISCV_CONSTEXPR constexpr -#else - #define XBYAK_RISCV_CONSTEXPR -#endif - -namespace Xbyak_riscv { - -enum { - DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x1010 /* 0xABCD = A.BC.D */ -}; - -inline uint32_t getVersion() { return VERSION; } - -enum { - ERR_NONE = 1, - ERR_OFFSET_IS_TOO_BIG, - ERR_CODE_IS_TOO_BIG, - ERR_IMM_IS_TOO_BIG, - ERR_INVALID_IMM_OF_JAL, - ERR_INVALID_IMM_OF_BTYPE, - ERR_LABEL_IS_NOT_FOUND, - ERR_LABEL_IS_REDEFINED, - ERR_LABEL_IS_TOO_FAR, - ERR_LABEL_IS_NOT_SET_BY_L, - ERR_LABEL_IS_ALREADY_SET_BY_L, - ERR_CANT_PROTECT, - ERR_CANT_ALLOC, - ERR_BAD_PARAMETER, - ERR_MUNMAP, - ERR_BAD_ALIGN, - ERR_INTERNAL // Put it at last. 
-}; - -inline const char *ConvertErrorToString(int err) -{ - static const char *errTbl[] = { - "none", - "offset is too big", - "code is too big", - "imm is too big", - "invalid imm of jal", - "invalid imm of Btype", - "label is not found", - "label is redefined", - "label is too far", - "label is not set by L", - "label is already set by L", - "can't protect", - "can't alloc", - "bad parameter", - "munmap", - "bad align", - "internal error" - }; - assert(ERR_INTERNAL == sizeof(errTbl) / sizeof(*errTbl)); - return err <= ERR_INTERNAL ? errTbl[err] : "unknown err"; -} - -#ifdef XBYAK_RISCV_NO_EXCEPTION -namespace local { - -inline int& GetErrorRef() { - static thread_local int err = 0; - return err; -} - -inline void SetError(int err) { - if (local::GetErrorRef()) return; // keep the first err code - local::GetErrorRef() = err; -} - -} // local - -inline void ClearError() { - local::GetErrorRef() = 0; -} -inline int GetError() { return Xbyak_riscv::local::GetErrorRef(); } - -#define XBYAK_RISCV_THROW(err) { Xbyak_riscv::local::SetError(err); return; } -#define XBYAK_RISCV_THROW_RET(err, r) { Xbyak_riscv::local::SetError(err); return r; } - -#else -class Error : public std::exception { - int err_; -public: - explicit Error(int err) : err_(err) - { - if (err_ < 0 || err_ > ERR_INTERNAL) { - err_ = ERR_INTERNAL; - } - } - operator int() const { return err_; } - const char *what() const noexcept override - { - return ConvertErrorToString(err_); - } -}; - -// dummy functions -inline void ClearError() { } -inline int GetError() { return 0; } - -inline const char *ConvertErrorToString(const Error& err) -{ - return err.what(); -} - -#define XBYAK_RISCV_THROW(err) { throw Error(err); } -#define XBYAK_RISCV_THROW_RET(err, r) { throw Error(err); } - -#endif - -inline void *AlignedMalloc(size_t size, size_t alignment) -{ -#ifdef __MINGW32__ - return __mingw_aligned_malloc(size, alignment); -#elif defined(_WIN32) - return _aligned_malloc(size, alignment); -#else - void *p; - int 
ret = posix_memalign(&p, alignment, size); - return (ret == 0) ? p : 0; -#endif -} - -inline void AlignedFree(void *p) -{ -#ifdef __MINGW32__ - __mingw_aligned_free(p); -#elif defined(_MSC_VER) - _aligned_free(p); -#else - free(p); -#endif -} - -namespace local { - -static const size_t ALIGN_PAGE_SIZE = 4096; - -inline XBYAK_RISCV_CONSTEXPR uint32_t mask(size_t n) -{ - XBYAK_RISCV_ASSERT(n <= 32); - return n == 32 ? 0xffffffff : (1u << n) - 1; -} -// is x <= mask(n) ? -inline XBYAK_RISCV_CONSTEXPR bool inBit(uint32_t x, size_t n) -{ - return x <= mask(n); -} - -// is x a signed n-bit integer? -inline XBYAK_RISCV_CONSTEXPR bool inSBit(int x, int n) -{ - return -(1 << (n-1)) <= x && x < (1 << (n-1)); -} - -// split x to hi20bits and low12bits -// return false if x in 12-bit signed integer -inline bool split32bit(int *pH, int* pL, int x) { - if (inSBit(x, 12)) return false; - int H = (x >> 12) & mask(20); - int L = x & mask(12); - if (x & (1 << 11)) { - H++; - L = L | (mask(20) << 12); - } - *pH = H; - *pL = L; - return true; -} - -// @@@ embedded by bit_pattern.py (DON'T DELETE THIS LINE) -inline size_t get20_10to1_11_19to12_z12(size_t v) { return ((v & (1<<20)) << 11)| ((v & (1023<<1)) << 20)| ((v & (1<<11)) << 9)| (v & (255<<12)); } -inline size_t get12_10to5_z13_4to1_11_z7(size_t v) { return ((v & (1<<12)) << 19)| ((v & (63<<5)) << 20)| ((v & (15<<1)) << 7)| ((v & (1<<11)) >> 4); } -inline size_t get5to4_9to6_2_3_z5(size_t v) { return ((v & (3<<4)) << 7)| ((v & (15<<6)) << 1)| ((v & (1<<2)) << 4)| ((v & (1<<3)) << 2); } -inline size_t get9_z5_4_6_8to7_5_z2(size_t v) { return ((v & (1<<9)) << 3)| ((v & (1<<4)) << 2)| ((v & (1<<6)) >> 1)| ((v & (3<<7)) >> 4)| ((v & (1<<5)) >> 3); } -inline size_t get5to3_z3_2_6_z5(size_t v) { return ((v & (7<<3)) << 7)| ((v & (1<<2)) << 4)| ((v & (1<<6)) >> 1); } -inline size_t get5to3_z3_7_6_z5(size_t v) { return ((v & (7<<3)) << 7)| ((v & (1<<7)) >> 1)| ((v & (1<<6)) >> 1); } -inline size_t get5_z5_4to0_z2(size_t v) { return ((v & 
(1<<5)) << 7)| ((v & 31) << 2); } -inline size_t get11_4_9to8_10_6_7_3to1_5_z2(size_t v) { return ((v & (1<<11)) << 1)| ((v & (1<<4)) << 7)| ((v & (3<<8)) << 1)| ((v & (1<<10)) >> 2)| ((v & (1<<6)) << 1)| ((v & (1<<7)) >> 1)| ((v & (7<<1)) << 2)| ((v & (1<<5)) >> 3); } -inline size_t get17_z5_16to12_z2(size_t v) { return ((v & (1<<17)) >> 5)| ((v & (31<<12)) >> 10); } -inline size_t get5_z5_4to2_7to6_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & (7<<2)) << 2)| ((v & (3<<6)) >> 4); } -inline size_t get5_z5_4to3_8to6_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & (3<<3)) << 2)| ((v & (7<<6)) >> 4); } -inline size_t get5to2_7to6_z7(size_t v) { return ((v & (15<<2)) << 7)| ((v & (3<<6)) << 1); } -inline size_t get5to3_8to6_z7(size_t v) { return ((v & (7<<3)) << 7)| ((v & (7<<6)) << 1); } -// @@@ embedded by bit_pattern.py (DON'T DELETE THIS LINE) - -} // local - -/* - custom allocator -*/ -struct Allocator { - explicit Allocator(const std::string& = "") {} // same interface with MmapAllocator - virtual uint8_t *alloc(size_t size) { return reinterpret_cast(AlignedMalloc(size, local::ALIGN_PAGE_SIZE)); } - virtual void free(uint8_t *p) { AlignedFree(p); } - virtual ~Allocator() {} - /* override to return false if you call protect() manually */ - virtual bool useProtect() const { return true; } -}; - -#ifdef XBYAK_RISCV_USE_MMAP_ALLOCATOR -#ifdef XBYAK_RISCV_USE_MAP_JIT -namespace local { - -inline int getMacOsVersionPure() -{ - char buf[64]; - size_t size = sizeof(buf); - int err = sysctlbyname("kern.osrelease", buf, &size, NULL, 0); - if (err != 0) return 0; - char *endp; - int major = strtol(buf, &endp, 10); - if (*endp != '.') return 0; - return major; -} - -inline int getMacOsVersion() -{ - static const int version = getMacOsVersionPure(); - return version; -} - -} // local -#endif -class MmapAllocator : public Allocator { - struct Allocation { - size_t size; -#if defined(XBYAK_RISCV_USE_MEMFD) - // fd_ is only used with XBYAK_RISCV_USE_MEMFD. 
We keep the file open - // during the lifetime of each allocation in order to support - // checkpoint/restore by unprivileged users. - int fd; -#endif - }; - const std::string name_; // only used with XBYAK_RISCV_USE_MEMFD - typedef std::unordered_map AllocationList; - AllocationList allocList_; -public: - explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {} - uint8_t *alloc(size_t size) override - { - const size_t alignedSizeM1 = local::ALIGN_PAGE_SIZE - 1; - size = (size + alignedSizeM1) & ~alignedSizeM1; -#if defined(MAP_ANONYMOUS) - int mode = MAP_PRIVATE | MAP_ANONYMOUS; -#elif defined(MAP_ANON) - int mode = MAP_PRIVATE | MAP_ANON; -#else - #error "not supported" -#endif -#if defined(XBYAK_RISCV_USE_MAP_JIT) - const int mojaveVersion = 18; - if (local::getMacOsVersion() >= mojaveVersion) mode |= MAP_JIT; -#endif - int fd = -1; -#if defined(XBYAK_RISCV_USE_MEMFD) - fd = memfd_create(name_.c_str(), MFD_CLOEXEC); - if (fd != -1) { - mode = MAP_SHARED; - if (ftruncate(fd, size) != 0) { - close(fd); - XBYAK_RISCV_THROW_RET(ERR_CANT_ALLOC, 0) - } - } -#endif - void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, fd, 0); - if (p == MAP_FAILED) { - if (fd != -1) close(fd); - XBYAK_RISCV_THROW_RET(ERR_CANT_ALLOC, 0) - } - assert(p); - Allocation &alloc = allocList_[(uintptr_t)p]; - alloc.size = size; -#if defined(XBYAK_RISCV_USE_MEMFD) - alloc.fd = fd; -#endif - return (uint8_t*)p; - } - void free(uint8_t *p) override - { - if (p == 0) return; - AllocationList::iterator i = allocList_.find((uintptr_t)p); - if (i == allocList_.end()) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER) - if (munmap((void*)i->first, i->second.size) < 0) XBYAK_RISCV_THROW(ERR_MUNMAP) -#if defined(XBYAK_RISCV_USE_MEMFD) - if (i->second.fd != -1) close(i->second.fd); -#endif - allocList_.erase(i); - } -}; -#endif - -namespace local { - -// Register Interface -class IReg { -public: - enum Kind { - GPR = 1, // General purpose register - FReg = 1 << 1, // Floating-point register - 
VECTOR = 1 << 2, // Vector register - }; -protected: - uint32_t idx_; - Kind kind_; -public: - XBYAK_RISCV_CONSTEXPR IReg(uint32_t idx = 0, Kind kind = GPR) - : idx_(idx), kind_(kind) - { - XBYAK_RISCV_ASSERT(local::inBit(idx, 5)); - } - XBYAK_RISCV_CONSTEXPR int getIdx() const { return idx_; } - const char *toString() const - { - if (kind_ == GPR) { - static const char tbl[][4] = { - "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", - "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", - "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", - "x24", "x25", "x26", "x27", "x28", "x29", "x30", "x31", - }; - return tbl[idx_]; - } else if (kind_ == FReg) { - static const char tbl[][4] = { - "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", - "f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15", - "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", - "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31", - }; - return tbl[idx_]; - } else if (kind_ == VECTOR) { - static const char tbl[][4] = { - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", - }; - return tbl[idx_]; - } - XBYAK_RISCV_THROW_RET(ERR_INTERNAL, 0); - } - bool operator==(const IReg& rhs) const - { - return idx_ == rhs.idx_ && kind_ == rhs.kind_; - } - bool operator!=(const IReg& rhs) const { return !operator==(rhs); } - -}; - -} // local - -// General Purpose Register -struct Reg : public local::IReg { - explicit XBYAK_RISCV_CONSTEXPR Reg(int idx = 0) : local::IReg(idx, IReg::Kind::GPR) { } -}; - -static XBYAK_RISCV_CONSTEXPR Reg x0(0), x1(1), x2(2), x3(3), x4(4), x5(5), x6(6), x7(7); -static XBYAK_RISCV_CONSTEXPR Reg x8(8), x9(9), x10(10), x11(11), x12(12), x13(13), x14(14), x15(15); -static XBYAK_RISCV_CONSTEXPR Reg x16(16), x17(17), x18(18), x19(19), x20(20), x21(21), x22(22), x23(23); -static XBYAK_RISCV_CONSTEXPR Reg x24(24), 
x25(25), x26(26), x27(27), x28(28), x29(29), x30(30), x31(31); - -static XBYAK_RISCV_CONSTEXPR Reg zero(x0); -static XBYAK_RISCV_CONSTEXPR Reg ra(x1); -static XBYAK_RISCV_CONSTEXPR Reg sp(x2); -static XBYAK_RISCV_CONSTEXPR Reg gp(x3); -static XBYAK_RISCV_CONSTEXPR Reg tp(x4); -static XBYAK_RISCV_CONSTEXPR Reg t0(x5); -static XBYAK_RISCV_CONSTEXPR Reg t1(x6); -static XBYAK_RISCV_CONSTEXPR Reg t2(x7); -static XBYAK_RISCV_CONSTEXPR Reg fp(x8); -static XBYAK_RISCV_CONSTEXPR Reg s0(x8); -static XBYAK_RISCV_CONSTEXPR Reg s1(x9); -static XBYAK_RISCV_CONSTEXPR Reg a0(x10), a1(x11), a2(x12), a3(x13), a4(x14), a5(x15), a6(x16), a7(x17); -static XBYAK_RISCV_CONSTEXPR Reg s2(x18), s3(x19), s4(x20), s5(x21), s6(x22), s7(x23), s8(x24), s9(x25); -static XBYAK_RISCV_CONSTEXPR Reg s10(x26), s11(x27); -static XBYAK_RISCV_CONSTEXPR Reg t3(x28), t4(x29), t5(x30), t6(x31); - -// Floating Point Register -struct FReg : public local::IReg { - explicit XBYAK_RISCV_CONSTEXPR FReg(int idx = 0) : local::IReg(idx, IReg::Kind::FReg) { } -}; - -static XBYAK_RISCV_CONSTEXPR FReg f0(0), f1(1), f2(2), f3(3), f4(4), f5(5), f6(6), f7(7); -static XBYAK_RISCV_CONSTEXPR FReg f8(8), f9(9), f10(10), f11(11), f12(12), f13(13), f14(14), f15(15); -static XBYAK_RISCV_CONSTEXPR FReg f16(16), f17(17), f18(18), f19(19), f20(20), f21(21), f22(22), f23(23); -static XBYAK_RISCV_CONSTEXPR FReg f24(24), f25(25), f26(26), f27(27), f28(28), f29(29), f30(30), f31(31); -// ABI name -static XBYAK_RISCV_CONSTEXPR FReg ft0(0), ft1(1), ft2(2), ft3(3), ft4(4), ft5(5), ft6(6), ft7(7); -static XBYAK_RISCV_CONSTEXPR FReg fs0(8), fs1(9), fa0(10), fa1(11), fa2(12), fa3(13), fa4(14), fa5(15), fa6(16), fa7(f17); -static XBYAK_RISCV_CONSTEXPR FReg fs2(18), fs3(19), fs4(20), fs5(21), fs6(22), fs7(23), fs8(24), fs9(25), fs10(26), fs11(27); -static XBYAK_RISCV_CONSTEXPR FReg ft8(28), ft9(29), ft10(30), ft11(31); - -#if defined(XBYAK_RISCV_V) && XBYAK_RISCV_V == 1 -// Vector Register -struct VReg : public local::IReg { - explicit 
XBYAK_RISCV_CONSTEXPR VReg(int idx = 0) : local::IReg(idx, IReg::Kind::VECTOR) { } -}; - -static XBYAK_RISCV_CONSTEXPR VReg v0(0), v1(1), v2(2), v3(3), v4(4), v5(5), v6(6), v7(7); -static XBYAK_RISCV_CONSTEXPR VReg v8(8), v9(9), v10(10), v11(11), v12(12), v13(13), v14(14), v15(15); -static XBYAK_RISCV_CONSTEXPR VReg v16(16), v17(17), v18(18), v19(19), v20(20), v21(21), v22(22), v23(23); -static XBYAK_RISCV_CONSTEXPR VReg v24(24), v25(25), v26(26), v27(27), v28(28), v29(29), v30(30), v31(31); -#endif - -// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc) -void *const DontSetProtectRWE = (void*)2; //-V566 - -class CodeArray { - enum Type { - USER_BUF = 1, // use userPtr(non alignment, non protect) - ALLOC_BUF // use new(alignment, protect) - }; - CodeArray(const CodeArray& rhs); - void operator=(const CodeArray&); - bool isAllocType() const { return type_ == ALLOC_BUF; } - const Type type_; -#ifdef XBYAK_RISCV_USE_MMAP_ALLOCATOR - MmapAllocator defaultAllocator_; -#else - Allocator defaultAllocator_; -#endif - Allocator *alloc_; -protected: - size_t maxSize_; - uint8_t *top_; - size_t size_; - - bool useProtect() const { return alloc_->useProtect(); } -public: - enum ProtectMode { - PROTECT_RW = 0, // read/write - PROTECT_RWE = 1, // read/write/exec - PROTECT_RE = 2 // read/exec - }; - explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0) - : type_((userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF) - , alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_) - , maxSize_(maxSize) - , top_(type_ == USER_BUF ? 
reinterpret_cast(userPtr) : alloc_->alloc((std::max)(maxSize, 1))) - , size_(0) - { - if (maxSize_ > 0 && top_ == 0) XBYAK_RISCV_THROW(ERR_CANT_ALLOC) - if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) { - alloc_->free(top_); - XBYAK_RISCV_THROW(ERR_CANT_PROTECT) - } - } - virtual ~CodeArray() - { - if (isAllocType()) { - if (useProtect()) setProtectModeRW(false); - alloc_->free(top_); - } - } - bool setProtectMode(ProtectMode mode, bool throwException = true) - { - bool isOK = protect(top_, maxSize_, mode); - if (isOK) return true; - if (throwException) XBYAK_RISCV_THROW_RET(ERR_CANT_PROTECT, false) - return false; - } - bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); } - bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); } - void resetSize() - { - size_ = 0; - } - void writeBytes(size_t offset, uint64_t v, size_t n) - { - if (n > 8) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER) - if (offset + n > maxSize_) XBYAK_RISCV_THROW(ERR_CODE_IS_TOO_BIG) - uint8_t *const p = top_ + offset; - for (size_t i = 0; i < n; i++) { - p[i] = static_cast(v >> (i * 8)); - } - } - void writeBytes(const uint8_t *addr, uint64_t v, size_t n) - { - writeBytes(addr - top_, v, n); - } - void appendBytes(uint64_t v, size_t n) - { - writeBytes(size_, v, n); - size_ += n; - } - void append4B(uint32_t code) { appendBytes(code, 4); } - void append2B(uint32_t code) { appendBytes(code, 2); } - void append1B(uint32_t code) { appendBytes(code, 1); } - void write4B(size_t offset, uint32_t v) { writeBytes(offset, v, 4); } - void dump(bool separate = false) const - { - const uint8_t *p = getCode(); - const size_t bufSize = getSize(); - if (separate) { - size_t pos = 0; - while (pos < bufSize) { - uint32_t v = p[pos]; - size_t n = (v & 3) == 3 ? 
4 : 2; - if (pos + n <= bufSize) { - for (size_t i = 0; i < n; i++) { - printf("%02x", p[pos + n - 1 - i]); - } - printf("\n"); - pos += n; - } else { - printf("%02x error\n", v); - return; - } - } - return; - } - size_t remain = bufSize; - for (int i = 0; i < 4; i++) { - size_t disp = 16; - if (remain < 16) { - disp = remain; - } - for (size_t j = 0; j < 16; j++) { - if (j < disp) { - printf("%02x", p[i * 16 + j]); - } - } - putchar('\n'); - remain -= disp; - if (remain == 0) { - break; - } - } - } - const uint8_t *getCode() const { return top_; } - template - const F getCode() const { return reinterpret_cast(top_); } - const uint8_t *getCurr() const { return &top_[size_]; } - template - const F getCurr() const { return reinterpret_cast(&top_[size_]); } - size_t getSize() const { return size_; } - void setSize(size_t size) - { - if (size > maxSize_) XBYAK_RISCV_THROW(ERR_OFFSET_IS_TOO_BIG) - size_ = size; - } - /** - change exec permission of memory - @param addr [in] buffer address - @param size [in] buffer size - @param protectMode [in] mode(RW/RWE/RE) - @return true(success), false(failure) - */ - static inline bool protect(const void *addr, size_t size, int protectMode) - { -#if defined(_WIN32) - const DWORD c_rw = PAGE_READWRITE; - const DWORD c_rwe = PAGE_EXECUTE_READWRITE; - const DWORD c_re = PAGE_EXECUTE_READ; - DWORD mode; -#else - const int c_rw = PROT_READ | PROT_WRITE; - const int c_rwe = PROT_READ | PROT_WRITE | PROT_EXEC; - const int c_re = PROT_READ | PROT_EXEC; - int mode; -#endif - switch (protectMode) { - case PROTECT_RW: mode = c_rw; break; - case PROTECT_RWE: mode = c_rwe; break; - case PROTECT_RE: mode = c_re; break; - default: - return false; - } -#if defined(_WIN32) - DWORD oldProtect; - return VirtualProtect(const_cast(addr), size, mode, &oldProtect) != 0; -#elif defined(__GNUC__) - size_t pageSize = sysconf(_SC_PAGESIZE); - size_t iaddr = reinterpret_cast(addr); - size_t roundAddr = iaddr & ~(pageSize - static_cast(1)); - return 
mprotect(reinterpret_cast(roundAddr), size + (iaddr - roundAddr), mode) == 0; -#else - return true; -#endif - } - /** - get aligned memory pointer - @param addr [in] address - @param alignedSize [in] power of two - @return aligned addr by alingedSize - */ - static inline uint8_t *getAlignedAddress(uint8_t *addr, size_t alignedSize = 16) - { - return reinterpret_cast((reinterpret_cast(addr) + alignedSize - 1) & ~(alignedSize - static_cast(1))); - } -}; - -struct Jmp { - enum Type { - tJal, - tBtype, - tRawAddress, - } type; - const uint8_t* from; /* address of the jmp mnemonic */ - uint32_t encoded; - size_t encSize() const - { - return (type == tRawAddress) ? sizeof(size_t) : 4; - } - // jal - Jmp(const uint8_t *from, uint32_t opcode, const Reg& rd) - : type(tJal) - , from(from) - , encoded((rd.getIdx() << 7) | opcode) - { - } - // B-type - Jmp(const uint8_t* from, uint32_t opcode, uint32_t funct3, const Reg& src1, const Reg& src2) - : type(tBtype) - , from(from) - , encoded((src2.getIdx() << 20) | (src1.getIdx() << 15) | (funct3 << 12) | opcode) - { - } - // raw address - explicit Jmp(const uint8_t* from) - : type(tRawAddress) - , from(from) - , encoded(0) - { - } - static inline bool isValidImm(size_t imm, size_t maskBit) - { - const size_t M = local::mask(maskBit); - return (imm < M || ~M <= imm) && (imm & 1) == 0; - } - size_t encode(const uint8_t* addr) const - { - if (addr == 0) return 0; - if (type == tRawAddress) return size_t(addr); - const size_t imm = addr - from; - if (type == tJal) { - if (!isValidImm(imm, 20)) XBYAK_RISCV_THROW(ERR_INVALID_IMM_OF_JAL) - return local::get20_10to1_11_19to12_z12(imm) | encoded; - } else { - if (!isValidImm(imm, 12)) XBYAK_RISCV_THROW(ERR_INVALID_IMM_OF_JAL) - return local::get12_10to5_z13_4to1_11_z7(imm) | encoded; - } - } - // update jmp address by base->getCurr() - void update(CodeArray *base) const - { - base->writeBytes(from, encode(base->getCurr()), encSize()); - } - // append jmp opcode with addr - void 
appendCode(CodeArray *base, const uint8_t *addr) const - { - base->appendBytes(encode(addr), encSize()); - } -}; - -class LabelManager; - -class Label { - mutable LabelManager *mgr; - mutable int id; - friend class LabelManager; -public: - Label() : mgr(0), id(0) {} - Label(const Label& rhs); - Label& operator=(const Label& rhs); - ~Label(); - void clear() { mgr = 0; id = 0; } - int getId() const { return id; } - const uint8_t *getAddress() const; -}; - -class LabelManager { - // for Label class - struct ClabelVal { - ClabelVal(const uint8_t* addr = 0) : addr(addr), refCount(1) {} - const uint8_t* addr; - int refCount; - }; - typedef std::unordered_map ClabelDefList; - typedef std::unordered_multimap ClabelUndefList; - typedef std::unordered_set LabelPtrList; - - CodeArray *base_; - mutable int labelId_; - ClabelDefList clabelDefList_; - ClabelUndefList clabelUndefList_; - LabelPtrList labelPtrList_; - - int getId(const Label& label) const - { - if (label.id == 0) label.id = labelId_++; - return label.id; - } - void define_inner(ClabelDefList& defList, ClabelUndefList& undefList, int labelId, const uint8_t* addr) - { - // add label - ClabelDefList::value_type item(labelId, addr); - std::pair ret = defList.insert(item); - if (!ret.second) XBYAK_RISCV_THROW(ERR_LABEL_IS_REDEFINED) - // search undefined label - for (;;) { - ClabelUndefList::iterator itr = undefList.find(labelId); - if (itr == undefList.end()) break; - const Jmp& jmp = itr->second; - jmp.update(base_); - undefList.erase(itr); - } - } - friend class Label; - void incRefCount(int id, Label *label) - { - clabelDefList_[id].refCount++; - labelPtrList_.insert(label); - } - void decRefCount(int id, Label *label) - { - labelPtrList_.erase(label); - ClabelDefList::iterator i = clabelDefList_.find(id); - if (i == clabelDefList_.end()) return; - if (i->second.refCount == 1) { - clabelDefList_.erase(id); - } else { - --i->second.refCount; - } - } - template - bool hasUndefinedLabel_inner(const T& list) const - { 
- return !list.empty(); - } - // detach all labels linked to LabelManager - void resetLabelPtrList() - { - for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) { - (*i)->clear(); - } - labelPtrList_.clear(); - } -public: - LabelManager() - { - reset(); - } - ~LabelManager() - { - resetLabelPtrList(); - } - void reset() - { - base_ = 0; - labelId_ = 1; - clabelDefList_.clear(); - clabelUndefList_.clear(); - resetLabelPtrList(); - } - void set(CodeArray *base) { base_ = base; } - void defineClabel(Label& label) - { - define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getCurr()); - label.mgr = this; - labelPtrList_.insert(&label); - } - void assign(Label& dst, const Label& src) - { - ClabelDefList::const_iterator i = clabelDefList_.find(src.id); - if (i == clabelDefList_.end()) XBYAK_RISCV_THROW(ERR_LABEL_IS_NOT_SET_BY_L) - define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.addr); - dst.mgr = this; - labelPtrList_.insert(&dst); - } - // return 0 unless label exists - const uint8_t* getAddr(const Label& label) const - { - ClabelDefList::const_iterator i = clabelDefList_.find(getId(label)); - if (i == clabelDefList_.end()) return 0; - return i->second.addr; - } - void addUndefinedLabel(const Label& label, const Jmp& jmp) - { - clabelUndefList_.insert(ClabelUndefList::value_type(label.id, jmp)); - } - bool hasUndefClabel() const { return hasUndefinedLabel_inner(clabelUndefList_); } - const uint8_t *getCode() const { return base_->getCode(); } -}; - -inline Label::Label(const Label& rhs) -{ - id = rhs.id; - mgr = rhs.mgr; - if (mgr) mgr->incRefCount(id, this); -} -inline Label& Label::operator=(const Label& rhs) -{ - if (id) XBYAK_RISCV_THROW_RET(ERR_LABEL_IS_ALREADY_SET_BY_L, *this) - id = rhs.id; - mgr = rhs.mgr; - if (mgr) mgr->incRefCount(id, this); - return *this; -} -inline Label::~Label() -{ - if (id && mgr) mgr->decRefCount(id, this); -} -inline const uint8_t* Label::getAddress() const -{ - 
if (mgr == 0) return 0; - return mgr->getAddr(*this); -} - -namespace local { - -template -struct Bit { - uint32_t v; - Bit(uint32_t v) - : v(v) - { - XBYAK_RISCV_ASSERT(inBit(v, n)); - } - Bit(const IReg& r) - : v(r.getIdx()) - { - } - Bit(VM vm) - : v(static_cast(vm)) - { - } - Bit(CSR csr) - : v(static_cast(csr)) - { - } - Bit(RM rm) - : v(static_cast(rm)) - { - } -}; - -} // local - -class CodeGenerator : public CodeArray { -public: - enum AqRlType { - T_aq = 2, - T_rl = 1, - T_aqrl = 3, - }; - typedef local::Bit<1> Bit1; - typedef local::Bit<2> Bit2; - typedef local::Bit<3> Bit3; - typedef local::Bit<5> Bit5; - typedef local::Bit<6> Bit6; - typedef local::Bit<7> Bit7; - typedef local::Bit<12> Bit12; - typedef local::Bit<32> Bit32; -private: - CodeGenerator operator=(const CodeGenerator&) = delete; - LabelManager labelMgr_; - int XLEN_; - bool isRV32_; - bool supportRVC_; - void opJmp(const Label& label, const Jmp& jmp) - { - const uint8_t* addr = labelMgr_.getAddr(label); - jmp.appendCode(this, addr); - if (addr) return; - labelMgr_.addUndefinedLabel(label, jmp); - } - uint32_t enc2(uint32_t a, uint32_t b) const { return (a<<7) | (b<<15); } - uint32_t enc3(uint32_t a, uint32_t b, uint32_t c) const { return enc2(a, b) | (c<<20); } - void Rtype(Bit7 opcode, Bit3 funct3, Bit7 funct7, Bit5 rd, Bit5 rs1, Bit5 rs2) - { - uint32_t v = (funct7.v<<25) | (funct3.v<<12) | opcode.v | enc3(rd.v, rs1.v, rs2.v); - append4B(v); - } - void Itype(Bit7 opcode, Bit3 funct3, Bit5 rd, Bit5 rs1, int imm) - { - if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) - uint32_t v = (imm<<20) | (funct3.v<<12) | opcode.v | enc2(rd.v, rs1.v); - append4B(v); - } - void Stype(Bit7 opcode, Bit3 funct3, Bit5 rs1, Bit5 rs2, int imm) - { - if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) - uint32_t v = ((imm>>5)<<25) | (funct3.v<<12) | opcode.v | enc3(imm & local::mask(5), rs1.v, rs2.v); - append4B(v); - } - void Utype(Bit7 opcode, Bit5 rd, uint32_t imm) - { - if 
(imm >= (1u << 20)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) - uint32_t v = (imm<<12) | opcode.v | (rd.v<<7); - append4B(v); - } - void opShift(Bit7 pre, Bit3 funct3, Bit7 opcode, Bit5 rd, Bit5 rs1, uint32_t shamt, int range = 0) - { - if (range == 0) range = isRV32_ ? 5 : 6; - if (shamt >= (1u << range)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) - uint32_t v = (pre.v<<25) | (funct3.v<<12) | opcode.v | enc3(rd.v, rs1.v, shamt); - append4B(v); - } - void opAtomic(Bit5 rd, Bit5 rs2, Bit5 addr, Bit5 funct5, Bit3 funct3, uint32_t flag) - { - assert(flag <= 3); - Rtype(0x2f, funct3.v, (funct5.v << 2) | flag, rd, addr, rs2); - } - void opIVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 vd) - { - /* - 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - func6 vm vs2 vs1 func3 vd opcode - - func6, func3, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (vd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opFVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 d) - { - /* - 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - func6 vm vs2 vs1 func3 vd/rd opcode - - func6, func3, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (d.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opMVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 d) - { - /* - 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - func6 vm vs2 vs1 func3 vd/rd opcode - - func6, func3, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (d.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opIVI(Bit32 baseValue, Bit1 vm, Bit5 vs2, uint32_t imm, Bit5 vd) - { - /* - 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 
0 - func6 vm vs2 imm func3 vd opcode - - func6, func3, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (vs2.v<<20) | ((imm & local::mask(5))<<15) | (vd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opIVX(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 vd) - { - /* - 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - func6 vm vs2 rs1 func3 vd opcode - - func6, func3, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (vd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opFVF(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 vd) - { - /* - 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - func6 vm vs2 rs1 func3 vd opcode - - func6, func3, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (vd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opMVX(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 d) - { - /* - 31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - func6 vm vs2 rs1 func3 vd/rd opcode - - func6, func3, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (d.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opVectorLoad(Bit32 baseValue, Bit1 vm, Bit5 rs2_vs2, Bit5 rs1, Bit5 vd) - { - /* - 31 .. 29 | 28 | 27 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - nf mew mop vm lumop/rs2/vs2 rs1 width vd opcode - - mew, mop, width, lumop, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (rs2_vs2.v<<20) | (rs1.v<<15) | (vd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opVectorStore(Bit32 baseValue, Bit1 vm, Bit5 rs2_vs2, Bit5 rs1, Bit5 vs3) - { - /* - 31 .. 29 | 28 | 27 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 
7 | 6 .. 0 - nf mew mop vm sumop/rs2/vs2 rs1 width vd opcode - - mew, mop, width, sumop, and opcode must be encoded in the baseValue - */ - uint32_t v = (vm.v<<25) | (rs2_vs2.v<<20) | (rs1.v<<15) | (vs3.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opCSR(Bit32 baseValue, Bit12 csr, Bit5 rs1_uimm, Bit5 rd) - { - /* - 31 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - csr rs1_uimm func3 rd opcode - - func3 and opcode must be encoded in the baseValue - */ - uint32_t v = (csr.v<<20) | (rs1_uimm.v<<15) | (rd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opLoadFP(Bit32 baseValue, int imm, Bit5 rs1, Bit5 rd) - { - if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) - /* - 31 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - imm[11:0] rs1 width rd opcode - - width and opcode must be encoded in the baseValue - */ - uint32_t v = (imm<<20) | (rs1.v<<15) | (rd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opStoreFP(Bit32 baseValue, int imm, Bit5 rs2, Bit5 rs1) - { - if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG) - /* - 31 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - imm[11:5] rs2 rs1 width imm[4:0] opcode - - width and opcode must be encoded in the baseValue - */ - uint32_t imm_11_5 = imm & (local::mask(7)<<5); - uint32_t imm_4_0 = imm & local::mask(5); - uint32_t v = (imm_11_5<<20) | (rs2.v<<20) | (rs1.v<<15) | (imm_4_0<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opFP(Bit32 baseValue, Bit5 rs2, Bit5 rs1, Bit3 rm, Bit5 rd) - { - /* - 31 .. 27 | 26 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 
0 - func5 fmt rs2 rs1 rm rd opcode - - func5, fmt, and opcode must be encoded in the baseValue - */ - uint32_t v = (rs2.v<<20) | (rs1.v<<15) | (rm.v<<12) | (rd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - void opR4(Bit32 baseValue, Bit5 rs3, Bit5 rs2, Bit5 rs1, Bit3 rm, Bit5 rd) - { - /* - 31 .. 27 | 26 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0 - rs3 fmt rs2 rs1 rm rd opcode - - fmt and opcode must be encoded in the baseValue - */ - uint32_t v = (rs3.v<<27) | (rs2.v<<20) | (rs1.v<<15) | (rm.v<<12) | (rd.v<<7); - v |= baseValue.v; // force-encode base value - append4B(v); - } - bool isValiCidx(uint32_t idx) const { return 8 <= idx && idx < 16; } - // c_addi, c_addiw - bool c_addi_inner(const Reg& rd, const Reg& rs, uint32_t imm, uint32_t funct3) - { - uint32_t dIdx = rd.getIdx(); - uint32_t sIdx = rs.getIdx(); - if (sIdx == 0 && c_li(rd, imm, 2, 1)) return true; - if (dIdx == 0 || dIdx != sIdx || !local::inSBit(imm, 6)) return false; - uint32_t v = (funct3<<13) | ((imm & (1<<5))<<7) | (dIdx<<7) | ((imm & 31)<<2)| 1; - append2B(v); - return true; - } - bool c_addi16sp(const Reg& rd, const Reg& rs, uint32_t imm) - { - if (rd != sp || rs != sp || (imm % 16) != 0 || (496 < imm && imm < ~512u) || imm == 0) return false; - uint32_t v = (3<<13) | (2<<7) | 1 | local::get9_z5_4_6_8to7_5_z2(imm); - append2B(v); - return true; - } - // c_li, c_slli - bool c_li(const Reg& rd, uint32_t imm, uint32_t funct3, uint32_t op) - { - if (rd == x0 || !local::inSBit(imm, 6)) return false; - uint32_t v = (funct3<<13) | (rd.getIdx() << 7) | op | local::get5_z5_4to0_z2(imm); - append2B(v); - return true; - } - bool c_lui(const Reg& rd, uint32_t imm) - { - if (rd == x0 || rd == x2 || imm == 0 || (32 <= imm && imm < (1<<20)-32)) return false; - uint32_t v = (3<<13) | (rd.getIdx()<<7) | 1 | local::get5_z5_4to0_z2(imm); - append2B(v); - return true; - } - bool c_addi(const Reg& rd, const Reg& rs, uint32_t imm) - { - uint32_t dIdx = rd.getIdx(); - if 
(imm == 0 && c_mv(rd, rs, 0)) return true; - if (c_addi_inner(rd, rs, imm, 0)) return true; - if (c_addi16sp(rd, rs, imm)) return true; - // c.addi4spn(rd, imm) = c.addi(rd, x2, imm) - if (rs != sp || !isValiCidx(dIdx) || imm == 0 || (imm % 4) != 0 || imm >= 1024) return false; - uint32_t v = ((dIdx-8)<<2) | local::get5to4_9to6_2_3_z5(imm); - append2B(v); - return true; - } - uint32_t creg2(uint32_t a, uint32_t b) { return ((a-8)<<7) | ((b-8)<<2); } - // c_lw, c_sw - bool c_lsw(const Reg& rd, const Reg& rs, int imm, uint32_t funct3) - { - uint32_t dIdx = rd.getIdx(); - uint32_t sIdx = rs.getIdx(); - if (!isValiCidx(dIdx) || !isValiCidx(sIdx) || (imm % 4) != 0 || imm < 0 || imm >= (1 << 7)) return false; - uint32_t v = (funct3<<13) | creg2(sIdx, dIdx) | local::get5to3_z3_2_6_z5(imm); - append2B(v); - return true; - } - // c_ld, c_sd - bool c_lsd(const Reg& rd, const Reg& rs, int imm, uint32_t funct3) - { - uint32_t dIdx = rd.getIdx(); - uint32_t sIdx = rs.getIdx(); - if (!isValiCidx(dIdx) || !isValiCidx(sIdx) || (imm % 8) != 0 || imm < 0 || imm >= (1 << 8)) return false; - uint32_t v = (funct3<<13) | creg2(sIdx, dIdx) | local::get5to3_z3_7_6_z5(imm); - append2B(v); - return true; - } - // c_srli, c_srai, c_andi - bool c_srli(const Reg& rd, const Reg& rs, int imm, uint32_t funct2, bool allowImm0 = false) - { - uint32_t dIdx = rd.getIdx(); - uint32_t sIdx = rs.getIdx(); - if (dIdx != sIdx || !isValiCidx(dIdx) || (!allowImm0 && imm == 0) || imm >= (1 << 6)) return false; - uint32_t v = (4<<13) | (funct2<<10) | ((dIdx-8)<<7) | local::get5_z5_4to0_z2(imm) | 1; - append2B(v); - return true; - } - // rd = rs1 - // c_sub, c_xor, c_or, c_and, c_subw - bool c_noimm(const Reg& rd, const Reg& rs1, const Reg& rs2, uint32_t funct3, uint32_t funct2) - { - uint32_t dIdx = rd.getIdx(); - uint32_t sIdx = rs2.getIdx(); - if (rd.getIdx() != rs1.getIdx() || !isValiCidx(dIdx) || !isValiCidx(sIdx)) return false; - uint32_t v = (funct3<<10) | ((dIdx-8)<<7) | (funct2<<5) | ((sIdx-8)<<2) | 
1; - append2B(v); - return true; - } - // c_lwsp, c_flwsp - bool c_lwsp(const Reg& rd, const Reg& addr, int imm, uint32_t funct3) - { - uint32_t idx = rd.getIdx(); - if (addr != sp || (imm % 4) != 0 || (imm >> 8)) return false; - uint32_t v = (funct3<<13) | (idx<<7) | local::get5_z5_4to2_7to6_z2(imm) | 2; - append2B(v); - return true; - } - // c_ldsp - bool c_ldsp(const Reg& rd, const Reg& addr, int imm, uint32_t funct3) - { - uint32_t idx = rd.getIdx(); - if (addr != sp || (imm % 8) != 0 || (imm >> 9)) return false; - uint32_t v = (funct3<<13) | (idx<<7) | local::get5_z5_4to3_8to6_z2(imm) | 2; - append2B(v); - return true; - } - // c.mv, c.add - bool c_mv(const Reg& rd, const Reg& rs, uint32_t funct1) - { - if (rd == x0 || rs == x0) return false; - uint32_t v = (4<<13) | (funct1<<12) | (rd.getIdx()<<7) | (rs.getIdx()<<2) | 2; - append2B(v); - return true; - } - bool c_swsp(const Reg& rs, const Reg& addr, int imm, uint32_t funct3) - { - if (addr != sp || (imm % 4) != 0 || (imm >> 8)) return false; - uint32_t v = (funct3<<13) | (rs.getIdx()<<2) | local::get5to2_7to6_z7(imm) | 2; - append2B(v); - return true; - } - bool c_sdsp(const Reg& rs, const Reg& addr, int imm, uint32_t funct3) - { - if (addr != sp || (imm % 8) != 0 || (imm >> 9)) return false; - uint32_t v = (funct3<<13) | (rs.getIdx()<<2) | local::get5to3_8to6_z7(imm) | 2; - append2B(v); - return true; - } -public: - void L(Label& label) { labelMgr_.defineClabel(label); } - Label L() { Label label; L(label); return label; } - /* - assign src to dst - require - dst : does not used by L() - src : used by L() - */ - void assignL(Label& dst, const Label& src) { labelMgr_.assign(dst, src); } - /* - put the absolute address of label to buffer - @note the put size is 4(32-bit), 8(64-bit) - */ - void putL(const Label &label) - { - Jmp jmp(getCurr()); - opJmp(label, jmp); - } - - // constructor - CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = DontSetProtectRWE, Allocator *allocator = 0) - : 
CodeArray(maxSize, userPtr, allocator) - , XLEN_(64) - , isRV32_(false) - , supportRVC_(false) - { - labelMgr_.set(this); - } - void reset() - { - ClearError(); - resetSize(); - labelMgr_.reset(); - labelMgr_.set(this); - XLEN_ = 64; - isRV32_ = false; - supportRVC_ = false; - } - void setRV32(bool on = true) - { - isRV32_ = on; - XLEN_ = on ? 32 : 64; - } - void supportRVC(bool on = true) - { - supportRVC_ = on; - } - bool hasUndefinedLabel() const { return labelMgr_.hasUndefClabel(); } - static inline void clearCache(void *p, size_t n) - { -#ifdef _WIN32 - FlushInstructionCache(GetCurrentProcess(), begin, n); -#elif defined(__APPLE__) - sys_icache_invalidate(begin, n); -#else - __builtin___clear_cache((char *)p, (char *)p + n); -#endif - } - /* - MUST call ready() to complete generating code if you use AutoGrow mode. - It is not necessary for the other mode if hasUndefinedLabel() is true. - */ - void ready(ProtectMode mode = PROTECT_RWE) - { - if (hasUndefinedLabel()) XBYAK_RISCV_THROW(ERR_LABEL_IS_NOT_FOUND) - if (useProtect()) setProtectMode(mode); - clearCache(top_, size_); - } - // set read/exec - void readyRE() { return ready(PROTECT_RE); } - - void align(size_t x) - { - if (x == 1) return; - if (x < 4 || (x & (x - 1))) XBYAK_RISCV_THROW(ERR_BAD_ALIGN) - size_t remain = size_t(getCurr()) % x; - if (remain % 4) XBYAK_RISCV_THROW(ERR_INTERNAL) - if (remain) { - for (size_t i = 0; i < (x - remain) / 4; i++) { - nop(); - } - } - } - -#include "xbyak_riscv_mnemonic.hpp" -#if defined(XBYAK_RISCV_V) && XBYAK_RISCV_V == 1 -#include "xbyak_riscv_v.hpp" -#endif -}; - -#ifdef _MSC_VER - #pragma warning(pop) -#endif -} // Xbyak_riscv - diff --git a/third_party/xbyak_riscv/xbyak_riscv_csr.hpp b/third_party/xbyak_riscv/xbyak_riscv_csr.hpp deleted file mode 100644 index 5f04ed441a1..00000000000 --- a/third_party/xbyak_riscv/xbyak_riscv_csr.hpp +++ /dev/null @@ -1,112 +0,0 @@ -/****************************************************************************** -* Copyright (C), 
2023, KNS Group LLC (YADRO) -* -* Licensed under the 3-Clause BSD License -* You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/ -*******************************************************************************/ - -#pragma once -namespace Xbyak_riscv { - -// Control and Status Register -enum class CSR : uint32_t { - // FP CSRs - fflags = 0x001, // Floating-Point Accrued Exceptions - frm = 0x002, // Floating-Point Dynamic Rounding Mode - fcsr = 0x003, // Floating-Point Control and Status register - // vector CSRs - vstart = 0x008, // Vector start position - vxsat = 0x009, // Fixed-Point Saturate Flag - vxrm = 0x00A, // Fixed-Point Rounding Mode - vcsr = 0x00F, // Vector control and status register - vl = 0xC20, // Vector length - vtype = 0xC21, // Vector data type register - vlenb = 0xC22, // VLEN/8 (vector register length in bytes) -}; - - -// Selected Element Width -enum class SEW : uint32_t { - e8 = 0x0, - e16 = 0x1, - e32 = 0x2, - e64 = 0x3 -}; - -// Vector Length Multiplier -enum class LMUL : uint32_t { - mf8 = 0x5, - mf4 = 0x6, - mf2 = 0x7, - m1 = 0x0, - m2 = 0x1, - m4 = 0x2, - m8 = 0x3 -}; - -// Vector Mask Agnostic -enum class VMA : uint32_t { - mu = 0, // undisturbed - ma = 1, // agnostic -}; - -// Vector Tail Agnostic -enum class VTA : uint32_t { - tu = 0, // undisturbed - ta = 1, // agnostic -}; - -enum class VectorAddressingMode : uint32_t { - unitStride = 0x0, - indexedUnordered = 0x1, - strided = 0x2, - indexedOrdered = 0x3 - // other encodings are reserved -}; - -enum class UnitStrideVectorAddressingModeLoad : uint32_t { - load = 0x0, // unit-stride load - wholeRegisterLoad = 0x8, // unit-stride, whole register load - maskLoad = 0xb, // unit-stride, mask load, EEW=8 - faultOnlyFirst = 0x10 // unit-stride fault-only-first - // other encodings are reserved -}; - -enum class UnitStrideVectorAddressingModeStore : uint32_t { - store = 0x0, // unit-stride store - wholeRegisterStore = 0x8, // unit-stride, whole register store 
- maskStore = 0xb // unit-stride, mask store, EEW=8 - // other encodings are reserved -}; - -enum class WidthEncoding : uint32_t { - e8 = 0x0, // Vector 8-bit element - e16 = 0x5, // Vector 16-bit element - e32 = 0x6, // Vector 32-bit element - e64 = 0x7, // Vector 64-bit element -}; - -enum class VM : uint32_t { - unmasked = 1, - masked = 0 -}; - -enum class RM : uint32_t { - rne = 0x0, // Round to Nearest, ties to Even - rtz = 0x1, // Round towards Zero - rdn = 0x2, // Round Down (towards -infinity) - rup = 0x3, // Round Up (towards + infinity) - rmm = 0x4, // Round to Nearest, ties to Max Magnitude - dyn = 0x7 // In instruction’s rm field, selects dynamic rounding mode; - // In Rounding Mode register, reserved. -}; - -enum class FFlags : uint32_t { - NV = 0x01, // Invalid Operation - DZ = 0x02, // Divide by Zero - OF = 0x04, // Overflow - UF = 0x08, // Underflow - NX = 0x10 // Inexact -}; - -} // Xbyak_riscv diff --git a/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp b/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp deleted file mode 100644 index b050d46cc75..00000000000 --- a/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp +++ /dev/null @@ -1,231 +0,0 @@ -const char *getVersionString() const { return "1.01"; } -void add(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && rd == rs1 && c_mv(rd, rs2, 1)) return; Rtype(0x33, 0, 0x0, rd, rs1, rs2); } -void sub(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 0)) return; Rtype(0x33, 0, 0x20, rd, rs1, rs2); } -void sll(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 1, 0x0, rd, rs1, rs2); } -void slt(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 2, 0x0, rd, rs1, rs2); } -void sltu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 3, 0x0, rd, rs1, rs2); } -void xor_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 1)) return; Rtype(0x33, 4, 0x0, rd, rs1, rs2); } -void 
srl(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x0, rd, rs1, rs2); } -void sra(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x20, rd, rs1, rs2); } -void or_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 2)) return; Rtype(0x33, 6, 0x0, rd, rs1, rs2); } -void and_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 3)) return; Rtype(0x33, 7, 0x0, rd, rs1, rs2); } -void addw(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x27, 1)) return; Rtype(0x3b, 0, 0x0, rd, rs1, rs2); } -void subw(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x27, 0)) return; Rtype(0x3b, 0, 0x20, rd, rs1, rs2); } -void sllw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 1, 0x0, rd, rs1, rs2); } -void srlw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 5, 0x0, rd, rs1, rs2); } -void sraw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 5, 0x20, rd, rs1, rs2); } -void mul(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 0, 0x1, rd, rs1, rs2); } -void mulh(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 1, 0x1, rd, rs1, rs2); } -void mulhsu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 2, 0x1, rd, rs1, rs2); } -void mulhu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 3, 0x1, rd, rs1, rs2); } -void div(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 4, 0x1, rd, rs1, rs2); } -void divu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x1, rd, rs1, rs2); } -void rem(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 6, 0x1, rd, rs1, rs2); } -void remu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 7, 0x1, rd, rs1, rs2); } -void mulw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 0, 0x1, rd, rs1, rs2); } -void divw(const Reg& 
rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 4, 0x1, rd, rs1, rs2); } -void remw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 6, 0x1, rd, rs1, rs2); } -void remuw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 7, 0x1, rd, rs1, rs2); } -void addi(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_addi(rd, rs1, imm)) return; Itype(0x13, 0, rd, rs1, imm); } -void slti(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 2, rd, rs1, imm); } -void sltiu(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 3, rd, rs1, imm); } -void xori(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 4, rd, rs1, imm); } -void ori(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 6, rd, rs1, imm); } -void andi(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_srli(rd, rs1, imm, 2, true)) return; Itype(0x13, 7, rd, rs1, imm); } -void addiw(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_addi_inner(rd, rs1, imm, 1)) return; Itype(0x1b, 0, rd, rs1, imm); } -// load-op rd, imm(addr); rd = addr[imm]; -void jalr(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x67, 0, rd, addr, imm); } -void lb(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 0, rd, addr, imm); } -void lh(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 1, rd, addr, imm); } -void lw(const Reg& rd, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_lwsp(rd, addr, imm, 2) || c_lsw(rd, addr, imm, 2))) return; Itype(0x3, 2, rd, addr, imm); } -void lbu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 4, rd, addr, imm); } -void lhu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 5, rd, addr, imm); } -void lwu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 6, rd, addr, imm); } -void ld(const Reg& rd, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_ldsp(rd, addr, imm, 3) || c_lsd(rd, addr, imm, 3))) return; Itype(0x3, 3, rd, addr, imm); } -void auipc(const Reg& rd, uint32_t imm) { 
Utype(0x17, rd, imm); } -void lui(const Reg& rd, uint32_t imm) { if (supportRVC_ && c_lui(rd, imm)) return; Utype(0x37, rd, imm); } -void slli(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && rd == rs1 && shamt != 0 && c_li(rd, shamt, 0, 2)) return; opShift(0x0, 1, 0x13, rd, rs1, shamt); } -void srli(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && c_srli(rd, rs1, shamt, 0)) return; opShift(0x0, 5, 0x13, rd, rs1, shamt); } -void srai(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && c_srli(rd, rs1, shamt, 1)) return; opShift(0x20, 5, 0x13, rd, rs1, shamt); } -void slliw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x0, 1, 0x1b, rd, rs1, shamt, 5); } -void srliw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x0, 5, 0x1b, rd, rs1, shamt, 5); } -void sraiw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x20, 5, 0x1b, rd, rs1, shamt, 5); } -void fence_rw_rw() { append4B(0x330000f); } -void fence_tso() { append4B(0x8330000f); } -void fence_rw_w() { append4B(0x310000f); } -void fence_r_rw() { append4B(0x230000f); } -void fence_r_r() { append4B(0x220000f); } -void fence_w_w() { append4B(0x110000f); } -void fence_i() { append4B(0x100f); } -void ecall() { append4B(0x73); } -void ebreak() { if (supportRVC_) append2B(0x9002); else append4B(0x00100073); } -// store-op rs, imm(addr) ; addr[imm] = rs; -void sb(const Reg& rs, const Reg& addr, int imm = 0) { Stype(0x23, 0, addr, rs, imm); } -void sh(const Reg& rs, const Reg& addr, int imm = 0) { Stype(0x23, 1, addr, rs, imm); } -void sw(const Reg& rs, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_swsp(rs, addr, imm, 6) || c_lsw(rs, addr, imm, 6))) return; Stype(0x23, 2, addr, rs, imm); } -void sd(const Reg& rs, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_sdsp(rs, addr, imm, 7) || c_lsd(rs, addr, imm, 7))) return; Stype(0x23, 3, addr, rs, imm); } -void beq(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp 
jmp(getCurr(), 0x63, 0, rs1, rs2); opJmp(label, jmp); } -void bne(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 1, rs1, rs2); opJmp(label, jmp); } -void blt(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 4, rs1, rs2); opJmp(label, jmp); } -void bge(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 5, rs1, rs2); opJmp(label, jmp); } -void bltu(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 6, rs1, rs2); opJmp(label, jmp); } -void bgeu(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 7, rs1, rs2); opJmp(label, jmp); } -void beqz(const Reg& rs, const Label& label) { beq(rs, x0, label); } -void bnez(const Reg& rs, const Label& label) { bne(rs, x0, label); } -void blez(const Reg& rs, const Label& label) { bge(x0, rs, label); } -void bgez(const Reg& rs, const Label& label) { bge(rs, x0, label); } -void bltz(const Reg& rs, const Label& label) { blt(rs, x0, label); } -void bgtz(const Reg& rs, const Label& label) { blt(x0, rs, label); } -void bgt(const Reg& rs, const Reg& rt, const Label& label) { blt(rt, rs, label); } -void ble(const Reg& rs, const Reg& rt, const Label& label) { bge(rt, rs, label); } -void bgtu(const Reg& rs, const Reg& rt, const Label& label) { bltu(rt, rs, label); } -void bleu(const Reg& rs, const Reg& rt, const Label& label) { bgeu(rt, rs, label); } -// amos**, rd, rs2, (addr) -void sc_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x3, 2, flag); } -void sc_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x3, 3, flag); } -void amoswap_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1, 2, flag); } -void amoswap_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1, 3, flag); } -void amoadd_w(const Reg& rd, const 
Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x0, 2, flag); } -void amoadd_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x0, 3, flag); } -void amoxor_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x4, 2, flag); } -void amoxor_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x4, 3, flag); } -void amoand_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0xc, 2, flag); } -void amoand_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0xc, 3, flag); } -void amoor_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x8, 2, flag); } -void amoor_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x8, 3, flag); } -void amomin_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x10, 2, flag); } -void amomin_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x10, 3, flag); } -void amomax_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x14, 2, flag); } -void amomax_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x14, 3, flag); } -void amominu_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x18, 2, flag); } -void amominu_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x18, 3, flag); } -void amomaxu_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1c, 2, flag); } -void amomaxu_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1c, 3, flag); } -void csrrw(const Reg& 
rd, CSR csr, const Reg& rs1) { opCSR(0x1073, csr, rs1, rd); } -void csrrs(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x2073, csr, rs1, rd); } -void csrrc(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x3073, csr, rs1, rd); } -void csrrwi(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x5073, csr, imm, rd); } -void csrrsi(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x6073, csr, imm, rd); } -void csrrci(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x7073, csr, imm, rd); } -void fadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x53, rs2, rs1, rm, rd); } -void fclass_s(const Reg& rd, const FReg& rs1) { opFP(0xe0001053, 0, rs1, 0, rd); } -void fcvt_s_w(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0000053, 0, rs1, rm, rd); } -void fcvt_s_wu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0100053, 0, rs1, rm, rd); } -void fcvt_w_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0000053, 0, rs1, rm, rd); } -void fcvt_wu_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0100053, 0, rs1, rm, rd); } -void fdiv_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x18000053, rs2, rs1, rm, rd); } -void feq_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0002053, rs2, rs1, 0, rd); } -void fle_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0000053, rs2, rs1, 0, rd); } -void flt_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0001053, rs2, rs1, 0, rd); } -void fmax_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x28001053, rs2, rs1, 0, rd); } -void fmin_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x28000053, rs2, rs1, 0, rd); } -void fmul_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x10000053, rs2, rs1, rm, rd); } -void fmv_w_x(const FReg& rd, const Reg& rs1) { opFP(0xf0000053, 0, rs1, 0, rd); } -void fmv_x_w(const Reg& rd, const FReg& rs1) { opFP(0xe0000053, 0, rs1, 0, rd); } -void 
fsgnj_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20000053, rs2, rs1, 0, rd); } -void fsgnjn_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20001053, rs2, rs1, 0, rd); } -void fsgnjx_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20002053, rs2, rs1, 0, rd); } -void fsqrt_s(const FReg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0x58000053, 0, rs1, rm, rd); } -void fsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x8000053, rs2, rs1, rm, rd); } -void fcvt_l_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0200053, 0, rs1, rm, rd); } -void fcvt_lu_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0300053, 0, rs1, rm, rd); } -void fcvt_s_l(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0200053, 0, rs1, rm, rd); } -void fcvt_s_lu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0300053, 0, rs1, rm, rd); } -void fadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x4000053, rs2, rs1, rm, rd); } -void fclass_h(const Reg& rd, const FReg& rs1) { opFP(0xe4001053, 0, rs1, 0, rd); } -void fcvt_h_s(const Reg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0x44000053, 0, rs1, rm, rd); } -void fcvt_h_w(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4000053, 0, rs1, rm, rd); } -void fcvt_h_wu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4100053, 0, rs1, rm, rd); } -void fcvt_s_h(const Reg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0x40200053, 0, rs1, rm, rd); } -void fcvt_w_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4000053, 0, rs1, rm, rd); } -void fcvt_wu_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4100053, 0, rs1, rm, rd); } -void fdiv_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x1c000053, rs2, rs1, rm, rd); } -void feq_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4002053, rs2, rs1, 0, rd); } -void fle_h(const Reg& rd, const FReg& rs1, const FReg& 
rs2) { opFP(0xa4000053, rs2, rs1, 0, rd); } -void flt_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4001053, rs2, rs1, 0, rd); } -void fmax_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x2c001053, rs2, rs1, 0, rd); } -void fmin_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x2c000053, rs2, rs1, 0, rd); } -void fmul_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x14000053, rs2, rs1, rm, rd); } -void fmv_h_x(const FReg& rd, const Reg& rs1) { opFP(0xf4000053, 0, rs1, 0, rd); } -void fmv_x_h(const Reg& rd, const FReg& rs1) { opFP(0xe4000053, 0, rs1, 0, rd); } -void fsgnj_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24000053, rs2, rs1, 0, rd); } -void fsgnjn_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24001053, rs2, rs1, 0, rd); } -void fsgnjx_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24002053, rs2, rs1, 0, rd); } -void fsqrt_h(const FReg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0x5c000053, 0, rs1, rm, rd); } -void fsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0xc000053, rs2, rs1, rm, rd); } -void fcvt_h_l(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4200053, 0, rs1, rm, rd); } -void fcvt_h_lu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4300053, 0, rs1, rm, rd); } -void fcvt_l_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4200053, 0, rs1, rm, rd); } -void fcvt_lu_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4300053, 0, rs1, rm, rd); } - -void fmadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x43, rs3, rs2, rs1, rm, rd); } -void fmsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x47, rs3, rs2, rs1, rm, rd); } -void fnmsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4b, rs3, rs2, rs1, rm, rd); } -void fnmadd_s(const FReg& 
rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4f, rs3, rs2, rs1, rm, rd); } - -void fmadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4000043, rs3, rs2, rs1, rm, rd); } -void fmsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4000047, rs3, rs2, rs1, rm, rd); } -void fnmsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x400004b, rs3, rs2, rs1, rm, rd); } -void fnmadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x400004f, rs3, rs2, rs1, rm, rd); } - - -void flq(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x4007, imm12, rs1, rd); } -void fsq(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x4027, imm12, rs2, rs1); } -void fld(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x3007, imm12, rs1, rd); } -void fsd(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x3027, imm12, rs2, rs1); } -void flw(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x2007, imm12, rs1, rd); } -void fsw(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x2027, imm12, rs2, rs1); } -void flh(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x1007, imm12, rs1, rd); } -void fsh(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x1027, imm12, rs2, rs1); } - - -void nop() { if (supportRVC_) { append2B(0x0001); return; } addi(x0, x0, 0); } -void li(const Reg& rd, uint32_t imm) -{ - if (imm && (imm & local::mask(12)) == 0) { // lower 12 bits of imm are zero - lui(rd, uint32_t(imm >> 12)); - return; - } - int H, L; - if (!local::split32bit(&H, &L, imm)) { - addi(rd, zero, imm); - return; - } - lui(rd, H); - if (isRV32_) { - addi(rd, rd, L); - } else { - addiw(rd, rd, L); - } -} -void mv(const Reg& rd, const Reg& rs) { addi(rd, rs, 0); } -void not_(const Reg& rd, const 
Reg& rs) { xori(rd, rs, -1); } -void neg(const Reg& rd, const Reg& rs) { sub(rd, x0, rs); } -void negw(const Reg& rd, const Reg& rs) { subw(rd, x0, rs); } -void sext_b(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 8); srai(rd, rd, XLEN_ - 8); } -void sext_h(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 16); srai(rd, rd, XLEN_ - 16); } -void sext_w(const Reg& rd, const Reg& rs) { addiw(rd, rs, 0); } -void zext_b(const Reg& rd, const Reg& rs) { andi(rd, rs, 255); } -void zext_h(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 16); srli(rd, rd, XLEN_ - 16); } -void zext_w(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 32); srli(rd, rd, XLEN_ - 32); } -void seqz(const Reg& rd, const Reg& rs) { sltiu(rd, rs, 1); } -void snez(const Reg& rd, const Reg& rs) { sltu(rd, x0, rs); } -void sltz(const Reg& rd, const Reg& rs) { slt(rd, rs, x0); } -void sgtz(const Reg& rd, const Reg& rs) { slt(rd, x0, rs); } -void fence() { append4B(0x0ff0000f); } -void j_(const Label& label) { jal(x0, label); } -void jal(const Reg& rd, const Label& label) { Jmp jmp(getCurr(), 0x6f, rd); opJmp(label, jmp); } -void jr(const Reg& rs) { jalr(x0, rs, 0); } -void jalr(const Reg& rs) { jalr(x1, rs, 0); } -void ret() { jalr(x0, x1); } -// lr rd, (addr) -void lr_w(const Reg& rd, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, 0, addr, 2, 2, flag); } -void lr_d(const Reg& rd, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, 0, addr, 2, 3, flag); } -void csrr(const Reg& rd, CSR csr) { csrrs(rd, csr, x0); } -void csrw(CSR csr, const Reg& rs) { csrrw(x0, csr, rs); } -void csrs(CSR csr, const Reg& rs) { csrrs(x0, csr, rs); } -void csrc(CSR csr, const Reg& rs) { csrrc(x0, csr, rs); } -void csrwi(CSR csr, uint32_t imm) { csrrwi(x0, csr, imm); } -void csrsi(CSR csr, uint32_t imm) { csrrsi(x0, csr, imm); } -void csrci(CSR csr, uint32_t imm) { csrrci(x0, csr, imm); } - diff --git a/third_party/xbyak_riscv/xbyak_riscv_util.hpp b/third_party/xbyak_riscv/xbyak_riscv_util.hpp deleted file 
mode 100644 index 6fdeab13b0e..00000000000 --- a/third_party/xbyak_riscv/xbyak_riscv_util.hpp +++ /dev/null @@ -1,271 +0,0 @@ -/****************************************************************************** -* Copyright (C), 2023, KNS Group LLC (YADRO) -* -* Licensed under the 3-Clause BSD License -* You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/ -*******************************************************************************/ - -#pragma once - -#include -#include -#include -#include "xbyak_riscv_csr.hpp" -#include "xbyak_riscv.hpp" - -#if defined(__linux__) && defined(__riscv) -#include -#include -#include -#include -#include -#include -#endif - -namespace Xbyak_riscv { - -// Legacy HWCAP constants -#ifndef COMPAT_HWCAP_ISA_I -#define COMPAT_HWCAP_ISA_I (1U << ('I' - 'A')) -#endif - -#ifndef COMPAT_HWCAP_ISA_M -#define COMPAT_HWCAP_ISA_M (1U << ('M' - 'A')) -#endif - -#ifndef COMPAT_HWCAP_ISA_A -#define COMPAT_HWCAP_ISA_A (1U << ('A' - 'A')) -#endif - -#ifndef COMPAT_HWCAP_ISA_F -#define COMPAT_HWCAP_ISA_F (1U << ('F' - 'A')) -#endif - -#ifndef COMPAT_HWCAP_ISA_D -#define COMPAT_HWCAP_ISA_D (1U << ('D' - 'A')) -#endif - -#ifndef COMPAT_HWCAP_ISA_C -#define COMPAT_HWCAP_ISA_C (1U << ('C' - 'A')) -#endif - -#ifndef COMPAT_HWCAP_ISA_V -#define COMPAT_HWCAP_ISA_V (1U << ('V' - 'A')) -#endif - -#if defined(__linux__) && defined(__riscv) -// Definitions for riscv_hwprobe (Linux 6.4+) -#ifndef __NR_riscv_hwprobe -#define __NR_riscv_hwprobe 258 -#endif - -#ifndef RISCV_HWPROBE_KEY_IMA_EXT_0 -#define RISCV_HWPROBE_KEY_IMA_EXT_0 4 -#endif - -#ifndef RISCV_HWPROBE_IMA_V -#define RISCV_HWPROBE_IMA_V (1ULL << 2) -#endif - -#ifndef RISCV_HWPROBE_EXT_ZVBB -#define RISCV_HWPROBE_EXT_ZVBB (1ULL << 17) -#endif - -#ifndef RISCV_HWPROBE_EXT_ZVBC -#define RISCV_HWPROBE_EXT_ZVBC (1ULL << 18) -#endif - -#ifndef RISCV_HWPROBE_EXT_ZVKG -#define RISCV_HWPROBE_EXT_ZVKG (1ULL << 20) -#endif - -#ifndef RISCV_HWPROBE_EXT_ZVFH -#define 
RISCV_HWPROBE_EXT_ZVFH (1ULL << 30) -#endif - -struct riscv_hwprobe { - int64_t key; - uint64_t value; -}; -#endif - -enum class RISCVExtension : uint64_t { - // 0-25: Legacy single-letter map (matches HWCAP for convenience) - I = COMPAT_HWCAP_ISA_I, - M = COMPAT_HWCAP_ISA_M, - A = COMPAT_HWCAP_ISA_A, - F = COMPAT_HWCAP_ISA_F, - D = COMPAT_HWCAP_ISA_D, - C = COMPAT_HWCAP_ISA_C, - V = COMPAT_HWCAP_ISA_V, - - // 26+: Extended Z-extensions - // Adding new extensions here is safe and conflict-free - Zvfh = 1ULL << 26, - Zvbb = 1ULL << 27, - Zvbc = 1ULL << 28, - Zvkg = 1ULL << 29 -}; - -template -struct CSRReader : public CodeGenerator { - // Buffer capacity exactly for 2 instructions. - static constexpr size_t capacity = 8; - - CSRReader() : CodeGenerator(capacity) { - csrrs(a0, csr, x0); - ret(); - } -}; - -/** - * Class that detects information about a RISC-V CPU. - */ -class CPU final { -public: - static const CPU& getInstance() { - static const CPU cpu; - return cpu; - } - - CPU() { - hwcapFeatures = 0; - xlen = sizeof(void*) * 8; // Fallback if sysconf fails - -#if defined(__linux__) && defined(__riscv) - // Set hwcapFeatures with AT_HWCAP value from - // the Linux auxiliary vector to check for base extensions support. 
- hwcapFeatures = getauxval(AT_HWCAP) & ( - COMPAT_HWCAP_ISA_I | - COMPAT_HWCAP_ISA_M | - COMPAT_HWCAP_ISA_A | - COMPAT_HWCAP_ISA_F | - COMPAT_HWCAP_ISA_D | - COMPAT_HWCAP_ISA_C | - COMPAT_HWCAP_ISA_V - ); - - // Try to use riscv_hwprobe to detect Z-extensions - struct riscv_hwprobe requests[] = { - {RISCV_HWPROBE_KEY_IMA_EXT_0, 0} - }; - - int ret = syscall(__NR_riscv_hwprobe, &requests, sizeof(requests) / sizeof(requests[0]), 0, NULL, 0); - - if (ret == 0) { - uint64_t v = requests[0].value; - // Update V support from hwprobe if present - if (v & RISCV_HWPROBE_IMA_V) hwcapFeatures |= static_cast(RISCVExtension::V); - - // Detect Z-extensions using the table - const struct { - RISCVExtension id; - uint64_t hwprobe_bit; // Bit in RISCV_HWPROBE_KEY_IMA_EXT_0 - } table[] = { - { RISCVExtension::Zvfh, RISCV_HWPROBE_EXT_ZVFH }, - { RISCVExtension::Zvbb, RISCV_HWPROBE_EXT_ZVBB }, - { RISCVExtension::Zvbc, RISCV_HWPROBE_EXT_ZVBC }, - { RISCVExtension::Zvkg, RISCV_HWPROBE_EXT_ZVKG } - }; - for (const auto& entry : table) { - if (v & entry.hwprobe_bit) { - hwcapFeatures |= static_cast(entry.id); - } - } - } - - // Set xlen, number of cores, cache info - xlen = sysconf(_SC_LONG_BIT); - numCores = sysconf(_SC_NPROCESSORS_ONLN); - - dataCacheSize_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE); - dataCacheSize_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE); - dataCacheSize_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE); - dataCacheSize_[3] = sysconf(_SC_LEVEL4_CACHE_SIZE); - - dataCacheLineSize_[0] = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); - dataCacheLineSize_[1] = sysconf(_SC_LEVEL2_CACHE_LINESIZE); - dataCacheLineSize_[2] = sysconf(_SC_LEVEL3_CACHE_LINESIZE); - dataCacheLineSize_[3] = sysconf(_SC_LEVEL4_CACHE_LINESIZE); -#endif - - // Set vlen - if(hasExtension(RISCVExtension::V)) { - CSRReader csrReaderGenerator; - csrReaderGenerator.ready(); - const auto csrReader = csrReaderGenerator.getCode(); - vlen = csrReader() * 8 /* bit */; - } - - // Set flen (bit) - if (hasExtension(RISCVExtension::D)) { - flen 
= 64; - } else if (hasExtension(RISCVExtension::F)) { - flen = 32; - } - } - - /** - * Checks if a particular RISC-V extension is available. - * - * @param extension The extension to check. - */ - bool hasExtension(RISCVExtension extension) const { - return (hwcapFeatures & static_cast(extension)) != 0; - } - - /** - * Get vector register width in bits - */ - uint32_t getVlen() const { - return vlen; - } - - /** - * Get general purpose register width in bits - */ - uint32_t getXlen() const { - return xlen; - }; - - /** - * Get floating-point register width in bits - */ - uint32_t getFlen() const { - return flen; - } - - uint32_t getNumCores() const { - return numCores; - } - - /** - * Get data cache size in bytes - * @param lvl Cache level 1..4 - */ - uint32_t getDataCacheSize(uint32_t lvl) const { - if (lvl == 0 || lvl > maxNumberCacheLevels) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER); - return dataCacheSize_[lvl - 1]; - } - - /** - * Get data cache line size in bytes - * @param lvl Cache level 1..4 - */ - uint32_t getDataCacheLineSize(uint32_t lvl) const { - if (lvl == 0 || lvl > maxNumberCacheLevels) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER); - return dataCacheLineSize_[lvl - 1]; - } - -private: - uint64_t hwcapFeatures = 0; - static constexpr size_t maxNumberCacheLevels = 4; - uint32_t dataCacheSize_[maxNumberCacheLevels] = {0, 0, 0, 0}; - uint32_t dataCacheLineSize_[maxNumberCacheLevels] = {0, 0, 0, 0}; - uint32_t numCores = 0; - uint32_t xlen = 0; - uint32_t vlen = 0; - uint32_t flen = 0; -}; - -} // Xbyak_riscv diff --git a/third_party/xbyak_riscv/xbyak_riscv_v.hpp b/third_party/xbyak_riscv/xbyak_riscv_v.hpp deleted file mode 100644 index 7bff4daf391..00000000000 --- a/third_party/xbyak_riscv/xbyak_riscv_v.hpp +++ /dev/null @@ -1,776 +0,0 @@ -/* - Copyright (C), 2023, MITSUNARI Shigeo - Copyright (C), 2023, KNS Group LLC (YADRO) - Licensed under the 3-Clause BSD License - You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/ -*/ -void 
vaadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x24002057, vm, vs2, vs1, vd); } -void vaadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x24006057, vm, vs2, rs1, vd); } -void vaaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x20002057, vm, vs2, vs1, vd); } -void vaaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x20006057, vm, vs2, rs1, vd); } -void vadc_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x40003057, 0, vs2, simm5, vd); } -void vadc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x40000057, 0, vs2, vs1, vd); } -void vadc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x40004057, 0, vs2, rs1, vd); } -void vadd_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x3057, vm, vs2, simm5, vd); } -void vadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x57, vm, vs2, vs1, vd); } -void vadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x4057, vm, vs2, rs1, vd); } -void vand_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x24003057, vm, vs2, simm5, vd); } -void vand_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x24000057, vm, vs2, vs1, vd); } -void vand_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x24004057, vm, vs2, rs1, vd); } -void vasub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x2c002057, vm, vs2, vs1, vd); } -void vasub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x2c006057, vm, vs2, rs1, vd); } -void vasubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x28002057, vm, vs2, vs1, vd); } -void vasubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM 
vm=VM::unmasked) { opMVX(0x28006057, vm, vs2, rs1, vd); } -void vcompress_vm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opMVV(0x5e002057, 0, vs2, vs1, vd); } -void vcpop_m(const Reg& rd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x40082057, vm, vs2, 0, rd); } -void vdiv_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x84002057, vm, vs2, vs1, vd); } -void vdiv_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x84006057, vm, vs2, rs1, vd); } -void vdivu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x80002057, vm, vs2, vs1, vd); } -void vdivu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x80006057, vm, vs2, rs1, vd); } -void vfadd_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x5057, vm, vs2, rs1, vd); } -void vfadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x1057, vm, vs2, vs1, vd); } -void vfclass_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c081057, vm, vs2, 0, vd); } -void vfcvt_f_x_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48019057, vm, vs2, 0, vd); } -void vfcvt_f_xu_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48011057, vm, vs2, 0, vd); } -void vfcvt_rtz_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48039057, vm, vs2, 0, vd); } -void vfcvt_rtz_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48031057, vm, vs2, 0, vd); } -void vfcvt_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48009057, vm, vs2, 0, vd); } -void vfcvt_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48001057, vm, vs2, 0, vd); } -void vfdiv_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x80005057, vm, vs2, rs1, vd); } -void vfdiv_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM 
vm=VM::unmasked) { opFVV(0x80001057, vm, vs2, vs1, vd); } -void vfirst_m(const Reg& rd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4008a057, vm, vs2, 0, rd); } -void vfmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb0005057, vm, vs2, rs1, vd); } -void vfmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb0001057, vm, vs2, vs1, vd); } -void vfmadd_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa0005057, vm, vs2, rs1, vd); } -void vfmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa0001057, vm, vs2, vs1, vd); } -void vfmax_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x18005057, vm, vs2, rs1, vd); } -void vfmax_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x18001057, vm, vs2, vs1, vd); } -void vfmerge_vfm(const VReg& vd, const VReg& vs2, const FReg& rs1) { opFVF(0x5c005057, 0, vs2, rs1, vd); } -void vfmin_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x10005057, vm, vs2, rs1, vd); } -void vfmin_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x10001057, vm, vs2, vs1, vd); } -void vfmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb8005057, vm, vs2, rs1, vd); } -void vfmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb8001057, vm, vs2, vs1, vd); } -void vfmsub_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa8005057, vm, vs2, rs1, vd); } -void vfmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa8001057, vm, vs2, vs1, vd); } -void vfmul_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x90005057, vm, vs2, rs1, vd); } -void vfmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM 
vm=VM::unmasked) { opFVV(0x90001057, vm, vs2, vs1, vd); } -void vfmv_f_s(const FReg& rd, const VReg& vs2) { opFVV(0x42001057, 0, vs2, 0, rd); } -void vfmv_s_f(const VReg& vd, const FReg& rs1) { opFVF(0x42005057, 0, 0, rs1, vd); } -void vfmv_v_f(const VReg& vd, const FReg& rs1) { opFVF(0x5e005057, 0, 0, rs1, vd); } -void vfncvt_f_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480a1057, vm, vs2, 0, vd); } -void vfncvt_f_x_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48099057, vm, vs2, 0, vd); } -void vfncvt_f_xu_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48091057, vm, vs2, 0, vd); } -void vfncvt_rod_f_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480a9057, vm, vs2, 0, vd); } -void vfncvt_rtz_x_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480b9057, vm, vs2, 0, vd); } -void vfncvt_rtz_xu_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480b1057, vm, vs2, 0, vd); } -void vfncvt_x_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48089057, vm, vs2, 0, vd); } -void vfncvt_xu_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48081057, vm, vs2, 0, vd); } -void vfnmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb4005057, vm, vs2, rs1, vd); } -void vfnmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb4001057, vm, vs2, vs1, vd); } -void vfnmadd_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa4005057, vm, vs2, rs1, vd); } -void vfnmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa4001057, vm, vs2, vs1, vd); } -void vfnmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xbc005057, vm, vs2, rs1, vd); } -void vfnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xbc001057, vm, vs2, vs1, vd); } -void 
vfnmsub_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xac005057, vm, vs2, rs1, vd); } -void vfnmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xac001057, vm, vs2, vs1, vd); } -void vfrdiv_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x84005057, vm, vs2, rs1, vd); } -void vfrec7_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c029057, vm, vs2, 0, vd); } -void vfredmax_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x1c001057, vm, vs2, vs1, vd); } -void vfredmin_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x14001057, vm, vs2, vs1, vd); } -void vfredosum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc001057, vm, vs2, vs1, vd); } -void vfredusum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x4001057, vm, vs2, vs1, vd); } -void vfrsqrt7_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c021057, vm, vs2, 0, vd); } -void vfrsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x9c005057, vm, vs2, rs1, vd); } -void vfsgnj_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x20005057, vm, vs2, rs1, vd); } -void vfsgnj_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x20001057, vm, vs2, vs1, vd); } -void vfsgnjn_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x24005057, vm, vs2, rs1, vd); } -void vfsgnjn_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x24001057, vm, vs2, vs1, vd); } -void vfsgnjx_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x28005057, vm, vs2, rs1, vd); } -void vfsgnjx_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x28001057, vm, vs2, vs1, vd); 
} -void vfslide1down_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x3c005057, vm, vs2, rs1, vd); } -void vfslide1up_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x38005057, vm, vs2, rs1, vd); } -void vfsqrt_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c001057, vm, vs2, 0, vd); } -void vfsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x8005057, vm, vs2, rs1, vd); } -void vfsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x8001057, vm, vs2, vs1, vd); } -void vfwadd_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xc0005057, vm, vs2, rs1, vd); } -void vfwadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc0001057, vm, vs2, vs1, vd); } -void vfwadd_wf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xd0005057, vm, vs2, rs1, vd); } -void vfwadd_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xd0001057, vm, vs2, vs1, vd); } -void vfwcvt_f_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48061057, vm, vs2, 0, vd); } -void vfwcvt_f_x_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48059057, vm, vs2, 0, vd); } -void vfwcvt_f_xu_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48051057, vm, vs2, 0, vd); } -void vfwcvt_rtz_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48079057, vm, vs2, 0, vd); } -void vfwcvt_rtz_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48071057, vm, vs2, 0, vd); } -void vfwcvt_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48049057, vm, vs2, 0, vd); } -void vfwcvt_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48041057, vm, vs2, 0, vd); } -void vfwmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM 
vm=VM::unmasked) { opFVF(0xf0005057, vm, vs2, rs1, vd); } -void vfwmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf0001057, vm, vs2, vs1, vd); } -void vfwmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xf8005057, vm, vs2, rs1, vd); } -void vfwmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf8001057, vm, vs2, vs1, vd); } -void vfwmul_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xe0005057, vm, vs2, rs1, vd); } -void vfwmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xe0001057, vm, vs2, vs1, vd); } -void vfwnmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xf4005057, vm, vs2, rs1, vd); } -void vfwnmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf4001057, vm, vs2, vs1, vd); } -void vfwnmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xfc005057, vm, vs2, rs1, vd); } -void vfwnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xfc001057, vm, vs2, vs1, vd); } -void vfwredosum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xcc001057, vm, vs2, vs1, vd); } -void vfwredusum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc4001057, vm, vs2, vs1, vd); } -void vfwsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xc8005057, vm, vs2, rs1, vd); } -void vfwsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc8001057, vm, vs2, vs1, vd); } -void vfwsub_wf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xd8005057, vm, vs2, rs1, vd); } -void vfwsub_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xd8001057, vm, vs2, vs1, vd); } -void vid_v(const 
VReg& vd, VM vm=VM::unmasked) { opMVV(0x5008a057, vm, 0, 0, vd); } -void viota_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x50082057, vm, vs2, 0, vd); } -void vl1re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); } -void vl1re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); } -void vl1re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); } -void vl1re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); } -void vl2re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); } -void vl2re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); } -void vl2re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); } -void vl2re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); } -void vl4re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); } -void vl4re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); } -void vl4re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); } -void vl4re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); } -void vl8re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); } -void vl8re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); } -void vl8re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); } -void vl8re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); } -void vlseg1e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10007007, vm, 0, rs1, vd); } -void vlseg2e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30007007, vm, 0, rs1, vd); } -void vlseg3e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50007007, vm, 0, rs1, vd); } 
-void vlseg4e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70007007, vm, 0, rs1, vd); } -void vlseg5e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90007007, vm, 0, rs1, vd); } -void vlseg6e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0007007, vm, 0, rs1, vd); } -void vlseg7e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0007007, vm, 0, rs1, vd); } -void vlseg8e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0007007, vm, 0, rs1, vd); } -void vle1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10007007, vm, 0, rs1, vd); } -void vlseg1e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11007007, vm, 0, rs1, vd); } -void vlseg2e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31007007, vm, 0, rs1, vd); } -void vlseg3e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51007007, vm, 0, rs1, vd); } -void vlseg4e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71007007, vm, 0, rs1, vd); } -void vlseg5e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91007007, vm, 0, rs1, vd); } -void vlseg6e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1007007, vm, 0, rs1, vd); } -void vlseg7e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1007007, vm, 0, rs1, vd); } -void vlseg8e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1007007, vm, 0, rs1, vd); } -void vle1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11007007, vm, 0, rs1, vd); } -void vlseg1e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10000007, vm, 0, rs1, vd); } -void vlseg2e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30000007, vm, 
0, rs1, vd); } -void vlseg3e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50000007, vm, 0, rs1, vd); } -void vlseg4e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70000007, vm, 0, rs1, vd); } -void vlseg5e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90000007, vm, 0, rs1, vd); } -void vlseg6e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0000007, vm, 0, rs1, vd); } -void vlseg7e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0000007, vm, 0, rs1, vd); } -void vlseg8e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0000007, vm, 0, rs1, vd); } -void vle128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10000007, vm, 0, rs1, vd); } -void vlseg1e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11000007, vm, 0, rs1, vd); } -void vlseg2e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31000007, vm, 0, rs1, vd); } -void vlseg3e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51000007, vm, 0, rs1, vd); } -void vlseg4e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71000007, vm, 0, rs1, vd); } -void vlseg5e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91000007, vm, 0, rs1, vd); } -void vlseg6e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1000007, vm, 0, rs1, vd); } -void vlseg7e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1000007, vm, 0, rs1, vd); } -void vlseg8e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1000007, vm, 0, rs1, vd); } -void vle128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11000007, vm, 0, rs1, vd); } -void vlseg1e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x5007, vm, 0, 
rs1, vd); } -void vlseg2e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20005007, vm, 0, rs1, vd); } -void vlseg3e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40005007, vm, 0, rs1, vd); } -void vlseg4e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60005007, vm, 0, rs1, vd); } -void vlseg5e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80005007, vm, 0, rs1, vd); } -void vlseg6e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0005007, vm, 0, rs1, vd); } -void vlseg7e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0005007, vm, 0, rs1, vd); } -void vlseg8e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0005007, vm, 0, rs1, vd); } -void vle16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x5007, vm, 0, rs1, vd); } -void vlseg1e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1005007, vm, 0, rs1, vd); } -void vlseg2e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21005007, vm, 0, rs1, vd); } -void vlseg3e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41005007, vm, 0, rs1, vd); } -void vlseg4e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61005007, vm, 0, rs1, vd); } -void vlseg5e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81005007, vm, 0, rs1, vd); } -void vlseg6e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1005007, vm, 0, rs1, vd); } -void vlseg7e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1005007, vm, 0, rs1, vd); } -void vlseg8e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1005007, vm, 0, rs1, vd); } -void vle16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1005007, vm, 0, rs1, vd); } -void 
vlseg1e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10005007, vm, 0, rs1, vd); } -void vlseg2e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30005007, vm, 0, rs1, vd); } -void vlseg3e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50005007, vm, 0, rs1, vd); } -void vlseg4e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70005007, vm, 0, rs1, vd); } -void vlseg5e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90005007, vm, 0, rs1, vd); } -void vlseg6e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0005007, vm, 0, rs1, vd); } -void vlseg7e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0005007, vm, 0, rs1, vd); } -void vlseg8e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0005007, vm, 0, rs1, vd); } -void vle256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10005007, vm, 0, rs1, vd); } -void vlseg1e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11005007, vm, 0, rs1, vd); } -void vlseg2e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31005007, vm, 0, rs1, vd); } -void vlseg3e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51005007, vm, 0, rs1, vd); } -void vlseg4e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71005007, vm, 0, rs1, vd); } -void vlseg5e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91005007, vm, 0, rs1, vd); } -void vlseg6e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1005007, vm, 0, rs1, vd); } -void vlseg7e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1005007, vm, 0, rs1, vd); } -void vlseg8e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1005007, vm, 0, rs1, vd); } 
-void vle256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11005007, vm, 0, rs1, vd); } -void vlseg1e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x6007, vm, 0, rs1, vd); } -void vlseg2e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20006007, vm, 0, rs1, vd); } -void vlseg3e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40006007, vm, 0, rs1, vd); } -void vlseg4e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60006007, vm, 0, rs1, vd); } -void vlseg5e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80006007, vm, 0, rs1, vd); } -void vlseg6e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0006007, vm, 0, rs1, vd); } -void vlseg7e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0006007, vm, 0, rs1, vd); } -void vlseg8e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0006007, vm, 0, rs1, vd); } -void vle32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x6007, vm, 0, rs1, vd); } -void vlseg1e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1006007, vm, 0, rs1, vd); } -void vlseg2e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21006007, vm, 0, rs1, vd); } -void vlseg3e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41006007, vm, 0, rs1, vd); } -void vlseg4e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61006007, vm, 0, rs1, vd); } -void vlseg5e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81006007, vm, 0, rs1, vd); } -void vlseg6e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1006007, vm, 0, rs1, vd); } -void vlseg7e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1006007, vm, 0, rs1, vd); } -void vlseg8e32ff_v(const 
VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1006007, vm, 0, rs1, vd); } -void vle32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1006007, vm, 0, rs1, vd); } -void vlseg1e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10006007, vm, 0, rs1, vd); } -void vlseg2e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30006007, vm, 0, rs1, vd); } -void vlseg3e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50006007, vm, 0, rs1, vd); } -void vlseg4e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70006007, vm, 0, rs1, vd); } -void vlseg5e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90006007, vm, 0, rs1, vd); } -void vlseg6e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0006007, vm, 0, rs1, vd); } -void vlseg7e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0006007, vm, 0, rs1, vd); } -void vlseg8e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0006007, vm, 0, rs1, vd); } -void vle512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10006007, vm, 0, rs1, vd); } -void vlseg1e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11006007, vm, 0, rs1, vd); } -void vlseg2e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31006007, vm, 0, rs1, vd); } -void vlseg3e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51006007, vm, 0, rs1, vd); } -void vlseg4e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71006007, vm, 0, rs1, vd); } -void vlseg5e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91006007, vm, 0, rs1, vd); } -void vlseg6e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1006007, vm, 0, rs1, vd); } -void vlseg7e512ff_v(const 
VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1006007, vm, 0, rs1, vd); } -void vlseg8e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1006007, vm, 0, rs1, vd); } -void vle512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11006007, vm, 0, rs1, vd); } -void vlseg1e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7007, vm, 0, rs1, vd); } -void vlseg2e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20007007, vm, 0, rs1, vd); } -void vlseg3e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40007007, vm, 0, rs1, vd); } -void vlseg4e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60007007, vm, 0, rs1, vd); } -void vlseg5e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80007007, vm, 0, rs1, vd); } -void vlseg6e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0007007, vm, 0, rs1, vd); } -void vlseg7e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0007007, vm, 0, rs1, vd); } -void vlseg8e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0007007, vm, 0, rs1, vd); } -void vle64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7007, vm, 0, rs1, vd); } -void vlseg1e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1007007, vm, 0, rs1, vd); } -void vlseg2e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21007007, vm, 0, rs1, vd); } -void vlseg3e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41007007, vm, 0, rs1, vd); } -void vlseg4e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61007007, vm, 0, rs1, vd); } -void vlseg5e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81007007, vm, 0, rs1, vd); } -void vlseg6e64ff_v(const VReg& vd, const Reg& rs1, 
VM vm=VM::unmasked) { opVectorLoad(0xa1007007, vm, 0, rs1, vd); } -void vlseg7e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1007007, vm, 0, rs1, vd); } -void vlseg8e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1007007, vm, 0, rs1, vd); } -void vle64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1007007, vm, 0, rs1, vd); } -void vlseg1e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7, vm, 0, rs1, vd); } -void vlseg2e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20000007, vm, 0, rs1, vd); } -void vlseg3e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40000007, vm, 0, rs1, vd); } -void vlseg4e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60000007, vm, 0, rs1, vd); } -void vlseg5e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80000007, vm, 0, rs1, vd); } -void vlseg6e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0000007, vm, 0, rs1, vd); } -void vlseg7e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0000007, vm, 0, rs1, vd); } -void vlseg8e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0000007, vm, 0, rs1, vd); } -void vle8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7, vm, 0, rs1, vd); } -void vlseg1e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1000007, vm, 0, rs1, vd); } -void vlseg2e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21000007, vm, 0, rs1, vd); } -void vlseg3e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41000007, vm, 0, rs1, vd); } -void vlseg4e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61000007, vm, 0, rs1, vd); } -void vlseg5e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81000007, 
vm, 0, rs1, vd); } -void vlseg6e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1000007, vm, 0, rs1, vd); } -void vlseg7e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1000007, vm, 0, rs1, vd); } -void vlseg8e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1000007, vm, 0, rs1, vd); } -void vle8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1000007, vm, 0, rs1, vd); } -void vlm_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2b00007, 0, 0, rs1, vd); } -void vloxei1024_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c007007, vm, vs2, rs1, vd); } -void vloxei128_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c000007, vm, vs2, rs1, vd); } -void vloxei16_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc005007, vm, vs2, rs1, vd); } -void vloxei256_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c005007, vm, vs2, rs1, vd); } -void vloxei32_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc006007, vm, vs2, rs1, vd); } -void vloxei512_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c006007, vm, vs2, rs1, vd); } -void vloxei64_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc007007, vm, vs2, rs1, vd); } -void vloxei8_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc000007, vm, vs2, rs1, vd); } -void vlsseg1e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18007007, vm, rs2, rs1, vd); } -void vlsseg2e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38007007, vm, rs2, rs1, vd); } -void vlsseg3e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM 
vm=VM::unmasked) { opVectorLoad(0x58007007, vm, rs2, rs1, vd); } -void vlsseg4e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78007007, vm, rs2, rs1, vd); } -void vlsseg5e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98007007, vm, rs2, rs1, vd); } -void vlsseg6e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8007007, vm, rs2, rs1, vd); } -void vlsseg7e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8007007, vm, rs2, rs1, vd); } -void vlsseg8e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8007007, vm, rs2, rs1, vd); } -void vlse1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18007007, vm, rs2, rs1, vd); } -void vlsseg1e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18000007, vm, rs2, rs1, vd); } -void vlsseg2e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38000007, vm, rs2, rs1, vd); } -void vlsseg3e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58000007, vm, rs2, rs1, vd); } -void vlsseg4e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78000007, vm, rs2, rs1, vd); } -void vlsseg5e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98000007, vm, rs2, rs1, vd); } -void vlsseg6e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8000007, vm, rs2, rs1, vd); } -void vlsseg7e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8000007, vm, rs2, rs1, vd); } -void vlsseg8e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8000007, vm, rs2, rs1, vd); } -void vlse128_v(const VReg& 
vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18000007, vm, rs2, rs1, vd); } -void vlsseg1e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8005007, vm, rs2, rs1, vd); } -void vlsseg2e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28005007, vm, rs2, rs1, vd); } -void vlsseg3e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48005007, vm, rs2, rs1, vd); } -void vlsseg4e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68005007, vm, rs2, rs1, vd); } -void vlsseg5e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88005007, vm, rs2, rs1, vd); } -void vlsseg6e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8005007, vm, rs2, rs1, vd); } -void vlsseg7e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8005007, vm, rs2, rs1, vd); } -void vlsseg8e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8005007, vm, rs2, rs1, vd); } -void vlse16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8005007, vm, rs2, rs1, vd); } -void vlsseg1e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18005007, vm, rs2, rs1, vd); } -void vlsseg2e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38005007, vm, rs2, rs1, vd); } -void vlsseg3e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58005007, vm, rs2, rs1, vd); } -void vlsseg4e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78005007, vm, rs2, rs1, vd); } -void vlsseg5e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98005007, vm, rs2, rs1, vd); } -void 
vlsseg6e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8005007, vm, rs2, rs1, vd); } -void vlsseg7e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8005007, vm, rs2, rs1, vd); } -void vlsseg8e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8005007, vm, rs2, rs1, vd); } -void vlse256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18005007, vm, rs2, rs1, vd); } -void vlsseg1e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8006007, vm, rs2, rs1, vd); } -void vlsseg2e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28006007, vm, rs2, rs1, vd); } -void vlsseg3e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48006007, vm, rs2, rs1, vd); } -void vlsseg4e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68006007, vm, rs2, rs1, vd); } -void vlsseg5e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88006007, vm, rs2, rs1, vd); } -void vlsseg6e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8006007, vm, rs2, rs1, vd); } -void vlsseg7e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8006007, vm, rs2, rs1, vd); } -void vlsseg8e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8006007, vm, rs2, rs1, vd); } -void vlse32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8006007, vm, rs2, rs1, vd); } -void vlsseg1e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18006007, vm, rs2, rs1, vd); } -void vlsseg2e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38006007, vm, 
rs2, rs1, vd); } -void vlsseg3e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58006007, vm, rs2, rs1, vd); } -void vlsseg4e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78006007, vm, rs2, rs1, vd); } -void vlsseg5e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98006007, vm, rs2, rs1, vd); } -void vlsseg6e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8006007, vm, rs2, rs1, vd); } -void vlsseg7e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8006007, vm, rs2, rs1, vd); } -void vlsseg8e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8006007, vm, rs2, rs1, vd); } -void vlse512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18006007, vm, rs2, rs1, vd); } -void vlsseg1e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8007007, vm, rs2, rs1, vd); } -void vlsseg2e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28007007, vm, rs2, rs1, vd); } -void vlsseg3e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48007007, vm, rs2, rs1, vd); } -void vlsseg4e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68007007, vm, rs2, rs1, vd); } -void vlsseg5e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88007007, vm, rs2, rs1, vd); } -void vlsseg6e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8007007, vm, rs2, rs1, vd); } -void vlsseg7e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8007007, vm, rs2, rs1, vd); } -void vlsseg8e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { 
opVectorLoad(0xe8007007, vm, rs2, rs1, vd); } -void vlse64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8007007, vm, rs2, rs1, vd); } -void vlsseg1e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8000007, vm, rs2, rs1, vd); } -void vlsseg2e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28000007, vm, rs2, rs1, vd); } -void vlsseg3e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48000007, vm, rs2, rs1, vd); } -void vlsseg4e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68000007, vm, rs2, rs1, vd); } -void vlsseg5e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88000007, vm, rs2, rs1, vd); } -void vlsseg6e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8000007, vm, rs2, rs1, vd); } -void vlsseg7e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8000007, vm, rs2, rs1, vd); } -void vlsseg8e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8000007, vm, rs2, rs1, vd); } -void vlse8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8000007, vm, rs2, rs1, vd); } -void vluxei1024_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14007007, vm, vs2, rs1, vd); } -void vluxei128_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14000007, vm, vs2, rs1, vd); } -void vluxei16_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4005007, vm, vs2, rs1, vd); } -void vluxei256_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14005007, vm, vs2, rs1, vd); } -void vluxei32_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM 
vm=VM::unmasked) { opVectorLoad(0x4006007, vm, vs2, rs1, vd); } -void vluxei512_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14006007, vm, vs2, rs1, vd); } -void vluxei64_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4007007, vm, vs2, rs1, vd); } -void vluxei8_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4000007, vm, vs2, rs1, vd); } -void vmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xb4002057, vm, vs2, vs1, vd); } -void vmacc_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xb4006057, vm, vs2, rs1, vd); } -void vmadc_vi(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x46003057, 0, vs2, simm5, vd); } -void vmadc_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x44003057, 0, vs2, simm5, vd); } -void vmadc_vv(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x46000057, 0, vs2, vs1, vd); } -void vmadc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x44000057, 0, vs2, vs1, vd); } -void vmadc_vx(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x46004057, 0, vs2, rs1, vd); } -void vmadc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x44004057, 0, vs2, rs1, vd); } -void vmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xa4002057, vm, vs2, vs1, vd); } -void vmadd_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xa4006057, vm, vs2, rs1, vd); } -void vmand_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x64002057, vm, vs2, vs1, vd); } -void vmandn_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x60002057, vm, vs2, vs1, vd); } -void vmax_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x1c000057, vm, vs2, vs1, vd); } -void vmax_vx(const 
VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x1c004057, vm, vs2, rs1, vd); } -void vmaxu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x18000057, vm, vs2, vs1, vd); } -void vmaxu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x18004057, vm, vs2, rs1, vd); } -void vmerge_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x5c003057, 0, vs2, simm5, vd); } -void vmerge_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x5c000057, 0, vs2, vs1, vd); } -void vmerge_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x5c004057, 0, vs2, rs1, vd); } -void vmfeq_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x60005057, vm, vs2, rs1, vd); } -void vmfeq_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x60001057, vm, vs2, vs1, vd); } -void vmfge_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x7c005057, vm, vs2, rs1, vd); } -void vmfgt_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x74005057, vm, vs2, rs1, vd); } -void vmfle_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x64005057, vm, vs2, rs1, vd); } -void vmfle_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x64001057, vm, vs2, vs1, vd); } -void vmflt_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x6c005057, vm, vs2, rs1, vd); } -void vmflt_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x6c001057, vm, vs2, vs1, vd); } -void vmfne_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x70005057, vm, vs2, rs1, vd); } -void vmfne_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x70001057, vm, vs2, vs1, vd); } -void vmin_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, 
VM vm=VM::unmasked) { opIVV(0x14000057, vm, vs2, vs1, vd); } -void vmin_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x14004057, vm, vs2, rs1, vd); } -void vminu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x10000057, vm, vs2, vs1, vd); } -void vminu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x10004057, vm, vs2, rs1, vd); } -void vmnand_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x74002057, vm, vs2, vs1, vd); } -void vmnor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x78002057, vm, vs2, vs1, vd); } -void vmor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x68002057, vm, vs2, vs1, vd); } -void vmorn_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x70002057, vm, vs2, vs1, vd); } -void vmsbc_vv(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x4e000057, 0, vs2, vs1, vd); } -void vmsbc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x4c000057, 0, vs2, vs1, vd); } -void vmsbc_vx(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x4e004057, 0, vs2, rs1, vd); } -void vmsbc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x4c004057, 0, vs2, rs1, vd); } -void vmsbf_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x5000a057, vm, vs2, 0, vd); } -void vmseq_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x60003057, vm, vs2, simm5, vd); } -void vmseq_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x60000057, vm, vs2, vs1, vd); } -void vmseq_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x60004057, vm, vs2, rs1, vd); } -void vmsgt_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x7c003057, vm, vs2, simm5, vd); } -void vmsgt_vx(const VReg& vd, 
const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x7c004057, vm, vs2, rs1, vd); } -void vmsgtu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x78003057, vm, vs2, simm5, vd); } -void vmsgtu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x78004057, vm, vs2, rs1, vd); } -void vmsif_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x5001a057, vm, vs2, 0, vd); } -void vmsle_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x74003057, vm, vs2, simm5, vd); } -void vmsle_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x74000057, vm, vs2, vs1, vd); } -void vmsle_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x74004057, vm, vs2, rs1, vd); } -void vmsleu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x70003057, vm, vs2, simm5, vd); } -void vmsleu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x70000057, vm, vs2, vs1, vd); } -void vmsleu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x70004057, vm, vs2, rs1, vd); } -void vmslt_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x6c000057, vm, vs2, vs1, vd); } -void vmslt_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x6c004057, vm, vs2, rs1, vd); } -void vmsltu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x68000057, vm, vs2, vs1, vd); } -void vmsltu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x68004057, vm, vs2, rs1, vd); } -void vmsne_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x64003057, vm, vs2, simm5, vd); } -void vmsne_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x64000057, vm, vs2, vs1, vd); } -void vmsne_vx(const VReg& vd, 
const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x64004057, vm, vs2, rs1, vd); } -void vmsof_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x50012057, vm, vs2, 0, vd); } -void vmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x94002057, vm, vs2, vs1, vd); } -void vmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x94006057, vm, vs2, rs1, vd); } -void vmulh_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x9c002057, vm, vs2, vs1, vd); } -void vmulh_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x9c006057, vm, vs2, rs1, vd); } -void vmulhsu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x98002057, vm, vs2, vs1, vd); } -void vmulhsu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x98006057, vm, vs2, rs1, vd); } -void vmulhu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x90002057, vm, vs2, vs1, vd); } -void vmulhu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x90006057, vm, vs2, rs1, vd); } -void vmv1r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e003057, 0, vs2, 0, vd); } -void vmv2r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e00b057, 0, vs2, 0, vd); } -void vmv4r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e01b057, 0, vs2, 0, vd); } -void vmv8r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e03b057, 0, vs2, 0, vd); } -void vmv_s_x(const VReg& vd, const Reg& rs1) { opMVX(0x42006057, 0, 0, rs1, vd); } -void vmv_v_i(const VReg& vd, int32_t simm5) { opIVI(0x5e003057, 0, 0, simm5, vd); } -void vmv_v_v(const VReg& vd, const VReg& vs1) { opIVV(0x5e000057, 0, 0, vs1, vd); } -void vmv_v_x(const VReg& vd, const Reg& rs1) { opIVX(0x5e004057, 0, 0, rs1, vd); } -void vmv_x_s(const Reg& rd, const VReg& vs2) { opMVV(0x42002057, 0, vs2, 0, rd); } -void vmxnor_mm(const VReg& 
vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x7c002057, vm, vs2, vs1, vd); } -void vmxor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x6c002057, vm, vs2, vs1, vd); } -void vnclip_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xbc003057, vm, vs2, simm5, vd); } -void vnclip_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xbc000057, vm, vs2, vs1, vd); } -void vnclip_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xbc004057, vm, vs2, rs1, vd); } -void vnclipu_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb8003057, vm, vs2, simm5, vd); } -void vnclipu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb8000057, vm, vs2, vs1, vd); } -void vnclipu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb8004057, vm, vs2, rs1, vd); } -void vnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xbc002057, vm, vs2, vs1, vd); } -void vnmsac_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xbc006057, vm, vs2, rs1, vd); } -void vnmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xac002057, vm, vs2, vs1, vd); } -void vnmsub_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xac006057, vm, vs2, rs1, vd); } -void vnsra_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb4003057, vm, vs2, simm5, vd); } -void vnsra_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb4000057, vm, vs2, vs1, vd); } -void vnsra_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb4004057, vm, vs2, rs1, vd); } -void vnsrl_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb0003057, vm, vs2, simm5, vd); } 
-void vnsrl_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb0000057, vm, vs2, vs1, vd); } -void vnsrl_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb0004057, vm, vs2, rs1, vd); } -void vor_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x28003057, vm, vs2, simm5, vd); } -void vor_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x28000057, vm, vs2, vs1, vd); } -void vor_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x28004057, vm, vs2, rs1, vd); } -void vredand_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x4002057, vm, vs2, vs1, vd); } -void vredmax_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x1c002057, vm, vs2, vs1, vd); } -void vredmaxu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x18002057, vm, vs2, vs1, vd); } -void vredmin_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x14002057, vm, vs2, vs1, vd); } -void vredminu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x10002057, vm, vs2, vs1, vd); } -void vredor_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x8002057, vm, vs2, vs1, vd); } -void vredsum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x2057, vm, vs2, vs1, vd); } -void vredxor_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc002057, vm, vs2, vs1, vd); } -void vrem_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x8c002057, vm, vs2, vs1, vd); } -void vrem_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x8c006057, vm, vs2, rs1, vd); } -void vremu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x88002057, vm, 
vs2, vs1, vd); } -void vremu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x88006057, vm, vs2, rs1, vd); } -void vrgather_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x30003057, vm, vs2, simm5, vd); } -void vrgather_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x30000057, vm, vs2, vs1, vd); } -void vrgather_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x30004057, vm, vs2, rs1, vd); } -void vrgatherei16_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x38000057, vm, vs2, vs1, vd); } -void vrsub_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xc003057, vm, vs2, simm5, vd); } -void vrsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xc004057, vm, vs2, rs1, vd); } -void vs1r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); } -void vs2r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); } -void vs4r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); } -void vs8r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); } -void vsadd_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x84003057, vm, vs2, simm5, vd); } -void vsadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x84000057, vm, vs2, vs1, vd); } -void vsadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x84004057, vm, vs2, rs1, vd); } -void vsaddu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x80003057, vm, vs2, simm5, vd); } -void vsaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x80000057, vm, vs2, vs1, vd); } -void vsaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x80004057, vm, vs2, rs1, 
vd); } -void vsbc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x48000057, 0, vs2, vs1, vd); } -void vsbc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x48004057, 0, vs2, rs1, vd); } -void vsseg1e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10007027, vm, 0, rs1, vs3); } -void vsseg2e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30007027, vm, 0, rs1, vs3); } -void vsseg3e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50007027, vm, 0, rs1, vs3); } -void vsseg4e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70007027, vm, 0, rs1, vs3); } -void vsseg5e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90007027, vm, 0, rs1, vs3); } -void vsseg6e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0007027, vm, 0, rs1, vs3); } -void vsseg7e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0007027, vm, 0, rs1, vs3); } -void vsseg8e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0007027, vm, 0, rs1, vs3); } -void vse1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10007027, vm, 0, rs1, vs3); } -void vsseg1e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10000027, vm, 0, rs1, vs3); } -void vsseg2e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30000027, vm, 0, rs1, vs3); } -void vsseg3e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50000027, vm, 0, rs1, vs3); } -void vsseg4e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70000027, vm, 0, rs1, vs3); } -void vsseg5e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90000027, vm, 0, rs1, vs3); } -void vsseg6e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0000027, vm, 0, rs1, vs3); } -void vsseg7e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { 
opVectorStore(0xd0000027, vm, 0, rs1, vs3); } -void vsseg8e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0000027, vm, 0, rs1, vs3); } -void vse128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10000027, vm, 0, rs1, vs3); } -void vsseg1e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x5027, vm, 0, rs1, vs3); } -void vsseg2e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20005027, vm, 0, rs1, vs3); } -void vsseg3e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40005027, vm, 0, rs1, vs3); } -void vsseg4e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60005027, vm, 0, rs1, vs3); } -void vsseg5e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80005027, vm, 0, rs1, vs3); } -void vsseg6e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0005027, vm, 0, rs1, vs3); } -void vsseg7e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0005027, vm, 0, rs1, vs3); } -void vsseg8e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0005027, vm, 0, rs1, vs3); } -void vse16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x5027, vm, 0, rs1, vs3); } -void vsseg1e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10005027, vm, 0, rs1, vs3); } -void vsseg2e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30005027, vm, 0, rs1, vs3); } -void vsseg3e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50005027, vm, 0, rs1, vs3); } -void vsseg4e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70005027, vm, 0, rs1, vs3); } -void vsseg5e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90005027, vm, 0, rs1, vs3); } -void vsseg6e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0005027, vm, 0, rs1, vs3); } -void vsseg7e256_v(VReg vs3, const Reg& rs1, VM 
vm=VM::unmasked) { opVectorStore(0xd0005027, vm, 0, rs1, vs3); } -void vsseg8e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0005027, vm, 0, rs1, vs3); } -void vse256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10005027, vm, 0, rs1, vs3); } -void vsseg1e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x6027, vm, 0, rs1, vs3); } -void vsseg2e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20006027, vm, 0, rs1, vs3); } -void vsseg3e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40006027, vm, 0, rs1, vs3); } -void vsseg4e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60006027, vm, 0, rs1, vs3); } -void vsseg5e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80006027, vm, 0, rs1, vs3); } -void vsseg6e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0006027, vm, 0, rs1, vs3); } -void vsseg7e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0006027, vm, 0, rs1, vs3); } -void vsseg8e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0006027, vm, 0, rs1, vs3); } -void vse32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x6027, vm, 0, rs1, vs3); } -void vsseg1e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10006027, vm, 0, rs1, vs3); } -void vsseg2e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30006027, vm, 0, rs1, vs3); } -void vsseg3e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50006027, vm, 0, rs1, vs3); } -void vsseg4e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70006027, vm, 0, rs1, vs3); } -void vsseg5e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90006027, vm, 0, rs1, vs3); } -void vsseg6e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0006027, vm, 0, rs1, vs3); } -void vsseg7e512_v(VReg vs3, const 
Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0006027, vm, 0, rs1, vs3); } -void vsseg8e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0006027, vm, 0, rs1, vs3); } -void vse512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10006027, vm, 0, rs1, vs3); } -void vsseg1e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x7027, vm, 0, rs1, vs3); } -void vsseg2e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20007027, vm, 0, rs1, vs3); } -void vsseg3e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40007027, vm, 0, rs1, vs3); } -void vsseg4e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60007027, vm, 0, rs1, vs3); } -void vsseg5e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80007027, vm, 0, rs1, vs3); } -void vsseg6e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0007027, vm, 0, rs1, vs3); } -void vsseg7e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0007027, vm, 0, rs1, vs3); } -void vsseg8e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0007027, vm, 0, rs1, vs3); } -void vse64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x7027, vm, 0, rs1, vs3); } -void vsseg1e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x27, vm, 0, rs1, vs3); } -void vsseg2e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20000027, vm, 0, rs1, vs3); } -void vsseg3e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40000027, vm, 0, rs1, vs3); } -void vsseg4e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60000027, vm, 0, rs1, vs3); } -void vsseg5e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80000027, vm, 0, rs1, vs3); } -void vsseg6e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0000027, vm, 0, rs1, vs3); } -void vsseg7e8_v(VReg vs3, const Reg& rs1, 
VM vm=VM::unmasked) { opVectorStore(0xc0000027, vm, 0, rs1, vs3); } -void vsseg8e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0000027, vm, 0, rs1, vs3); } -void vse8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x27, vm, 0, rs1, vs3); } -void vsext_vf2(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4803a057, vm, vs2, 0, vd); } -void vsext_vf4(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4802a057, vm, vs2, 0, vd); } -void vsext_vf8(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4801a057, vm, vs2, 0, vd); } -void vslide1down_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x3c006057, vm, vs2, rs1, vd); } -void vslide1up_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x38006057, vm, vs2, rs1, vd); } -void vslidedown_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x3c003057, vm, vs2, simm5, vd); } -void vslidedown_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x3c004057, vm, vs2, rs1, vd); } -void vslideup_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x38003057, vm, vs2, simm5, vd); } -void vslideup_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x38004057, vm, vs2, rs1, vd); } -void vsll_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x94003057, vm, vs2, simm5, vd); } -void vsll_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x94000057, vm, vs2, vs1, vd); } -void vsll_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x94004057, vm, vs2, rs1, vd); } -void vsm_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2b00027, 0, 0, rs1, vs3); } -void vsmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x9c000057, vm, vs2, vs1, vd); } -void vsmul_vx(const VReg& vd, 
const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x9c004057, vm, vs2, rs1, vd); } -void vsoxei1024_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c007027, vm, vs2, rs1, vs3); } -void vsoxei128_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c000027, vm, vs2, rs1, vs3); } -void vsoxei16_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc005027, vm, vs2, rs1, vs3); } -void vsoxei256_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c005027, vm, vs2, rs1, vs3); } -void vsoxei32_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc006027, vm, vs2, rs1, vs3); } -void vsoxei512_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c006027, vm, vs2, rs1, vs3); } -void vsoxei64_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc007027, vm, vs2, rs1, vs3); } -void vsoxei8_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc000027, vm, vs2, rs1, vs3); } -void vsra_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa4003057, vm, vs2, simm5, vd); } -void vsra_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa4000057, vm, vs2, vs1, vd); } -void vsra_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa4004057, vm, vs2, rs1, vd); } -void vsrl_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa0003057, vm, vs2, simm5, vd); } -void vsrl_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa0000057, vm, vs2, vs1, vd); } -void vsrl_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa0004057, vm, vs2, rs1, vd); } -void vssseg1e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18007027, vm, 
rs2, rs1, vs3); } -void vssseg2e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38007027, vm, rs2, rs1, vs3); } -void vssseg3e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58007027, vm, rs2, rs1, vs3); } -void vssseg4e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78007027, vm, rs2, rs1, vs3); } -void vssseg5e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98007027, vm, rs2, rs1, vs3); } -void vssseg6e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8007027, vm, rs2, rs1, vs3); } -void vssseg7e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8007027, vm, rs2, rs1, vs3); } -void vssseg8e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8007027, vm, rs2, rs1, vs3); } -void vsse1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18007027, vm, rs2, rs1, vs3); } -void vssseg1e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18000027, vm, rs2, rs1, vs3); } -void vssseg2e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38000027, vm, rs2, rs1, vs3); } -void vssseg3e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58000027, vm, rs2, rs1, vs3); } -void vssseg4e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78000027, vm, rs2, rs1, vs3); } -void vssseg5e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98000027, vm, rs2, rs1, vs3); } -void vssseg6e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8000027, vm, rs2, rs1, vs3); } -void vssseg7e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8000027, vm, rs2, rs1, vs3); 
} -void vssseg8e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8000027, vm, rs2, rs1, vs3); } -void vsse128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18000027, vm, rs2, rs1, vs3); } -void vssseg1e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8005027, vm, rs2, rs1, vs3); } -void vssseg2e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28005027, vm, rs2, rs1, vs3); } -void vssseg3e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48005027, vm, rs2, rs1, vs3); } -void vssseg4e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68005027, vm, rs2, rs1, vs3); } -void vssseg5e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88005027, vm, rs2, rs1, vs3); } -void vssseg6e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8005027, vm, rs2, rs1, vs3); } -void vssseg7e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8005027, vm, rs2, rs1, vs3); } -void vssseg8e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8005027, vm, rs2, rs1, vs3); } -void vsse16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8005027, vm, rs2, rs1, vs3); } -void vssseg1e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18005027, vm, rs2, rs1, vs3); } -void vssseg2e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38005027, vm, rs2, rs1, vs3); } -void vssseg3e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58005027, vm, rs2, rs1, vs3); } -void vssseg4e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78005027, vm, rs2, rs1, vs3); } -void vssseg5e256_v(VReg vs3, const 
Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98005027, vm, rs2, rs1, vs3); } -void vssseg6e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8005027, vm, rs2, rs1, vs3); } -void vssseg7e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8005027, vm, rs2, rs1, vs3); } -void vssseg8e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8005027, vm, rs2, rs1, vs3); } -void vsse256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18005027, vm, rs2, rs1, vs3); } -void vssseg1e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8006027, vm, rs2, rs1, vs3); } -void vssseg2e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28006027, vm, rs2, rs1, vs3); } -void vssseg3e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48006027, vm, rs2, rs1, vs3); } -void vssseg4e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68006027, vm, rs2, rs1, vs3); } -void vssseg5e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88006027, vm, rs2, rs1, vs3); } -void vssseg6e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8006027, vm, rs2, rs1, vs3); } -void vssseg7e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8006027, vm, rs2, rs1, vs3); } -void vssseg8e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8006027, vm, rs2, rs1, vs3); } -void vsse32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8006027, vm, rs2, rs1, vs3); } -void vssseg1e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18006027, vm, rs2, rs1, vs3); } -void vssseg2e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM 
vm=VM::unmasked) { opVectorStore(0x38006027, vm, rs2, rs1, vs3); } -void vssseg3e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58006027, vm, rs2, rs1, vs3); } -void vssseg4e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78006027, vm, rs2, rs1, vs3); } -void vssseg5e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98006027, vm, rs2, rs1, vs3); } -void vssseg6e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8006027, vm, rs2, rs1, vs3); } -void vssseg7e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8006027, vm, rs2, rs1, vs3); } -void vssseg8e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8006027, vm, rs2, rs1, vs3); } -void vsse512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18006027, vm, rs2, rs1, vs3); } -void vssseg1e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8007027, vm, rs2, rs1, vs3); } -void vssseg2e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28007027, vm, rs2, rs1, vs3); } -void vssseg3e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48007027, vm, rs2, rs1, vs3); } -void vssseg4e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68007027, vm, rs2, rs1, vs3); } -void vssseg5e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88007027, vm, rs2, rs1, vs3); } -void vssseg6e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8007027, vm, rs2, rs1, vs3); } -void vssseg7e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8007027, vm, rs2, rs1, vs3); } -void vssseg8e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { 
opVectorStore(0xe8007027, vm, rs2, rs1, vs3); } -void vsse64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8007027, vm, rs2, rs1, vs3); } -void vssseg1e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8000027, vm, rs2, rs1, vs3); } -void vssseg2e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28000027, vm, rs2, rs1, vs3); } -void vssseg3e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48000027, vm, rs2, rs1, vs3); } -void vssseg4e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68000027, vm, rs2, rs1, vs3); } -void vssseg5e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88000027, vm, rs2, rs1, vs3); } -void vssseg6e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8000027, vm, rs2, rs1, vs3); } -void vssseg7e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8000027, vm, rs2, rs1, vs3); } -void vssseg8e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8000027, vm, rs2, rs1, vs3); } -void vsse8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8000027, vm, rs2, rs1, vs3); } -void vssra_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xac003057, vm, vs2, simm5, vd); } -void vssra_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xac000057, vm, vs2, vs1, vd); } -void vssra_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xac004057, vm, vs2, rs1, vd); } -void vssrl_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa8003057, vm, vs2, simm5, vd); } -void vssrl_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa8000057, vm, vs2, vs1, vd); } -void vssrl_vx(const VReg& vd, 
const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa8004057, vm, vs2, rs1, vd); } -void vssub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x8c000057, vm, vs2, vs1, vd); } -void vssub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x8c004057, vm, vs2, rs1, vd); } -void vssubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x88000057, vm, vs2, vs1, vd); } -void vssubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x88004057, vm, vs2, rs1, vd); } -void vsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x8000057, vm, vs2, vs1, vd); } -void vsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x8004057, vm, vs2, rs1, vd); } -void vsuxei1024_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14007027, vm, vs2, rs1, vs3); } -void vsuxei128_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14000027, vm, vs2, rs1, vs3); } -void vsuxei16_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4005027, vm, vs2, rs1, vs3); } -void vsuxei256_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14005027, vm, vs2, rs1, vs3); } -void vsuxei32_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4006027, vm, vs2, rs1, vs3); } -void vsuxei512_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14006027, vm, vs2, rs1, vs3); } -void vsuxei64_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4007027, vm, vs2, rs1, vs3); } -void vsuxei8_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4000027, vm, vs2, rs1, vs3); } -void vwadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc4002057, vm, vs2, 
vs1, vd); } -void vwadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc4006057, vm, vs2, rs1, vd); } -void vwadd_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd4002057, vm, vs2, vs1, vd); } -void vwadd_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd4006057, vm, vs2, rs1, vd); } -void vwaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc0002057, vm, vs2, vs1, vd); } -void vwaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc0006057, vm, vs2, rs1, vd); } -void vwaddu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd0002057, vm, vs2, vs1, vd); } -void vwaddu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd0006057, vm, vs2, rs1, vd); } -void vwmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xf4002057, vm, vs2, vs1, vd); } -void vwmacc_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf4006057, vm, vs2, rs1, vd); } -void vwmaccsu_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xfc002057, vm, vs2, vs1, vd); } -void vwmaccsu_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xfc006057, vm, vs2, rs1, vd); } -void vwmaccu_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xf0002057, vm, vs2, vs1, vd); } -void vwmaccu_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf0006057, vm, vs2, rs1, vd); } -void vwmaccus_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf8006057, vm, vs2, rs1, vd); } -void vwmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xec002057, vm, vs2, vs1, vd); } -void vwmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { 
opMVX(0xec006057, vm, vs2, rs1, vd); } -void vwmulsu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xe8002057, vm, vs2, vs1, vd); } -void vwmulsu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xe8006057, vm, vs2, rs1, vd); } -void vwmulu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xe0002057, vm, vs2, vs1, vd); } -void vwmulu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xe0006057, vm, vs2, rs1, vd); } -void vwredsum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xc4000057, vm, vs2, vs1, vd); } -void vwredsumu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xc0000057, vm, vs2, vs1, vd); } -void vwsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xcc002057, vm, vs2, vs1, vd); } -void vwsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xcc006057, vm, vs2, rs1, vd); } -void vwsub_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xdc002057, vm, vs2, vs1, vd); } -void vwsub_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xdc006057, vm, vs2, rs1, vd); } -void vwsubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc8002057, vm, vs2, vs1, vd); } -void vwsubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc8006057, vm, vs2, rs1, vd); } -void vwsubu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd8002057, vm, vs2, vs1, vd); } -void vwsubu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd8006057, vm, vs2, rs1, vd); } -void vxor_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x2c003057, vm, vs2, simm5, vd); } -void vxor_vv(const VReg& vd, const VReg& vs2, const VReg& 
vs1, VM vm=VM::unmasked) { opIVV(0x2c000057, vm, vs2, vs1, vd); } -void vxor_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x2c004057, vm, vs2, rs1, vd); } -void vzext_vf2(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48032057, vm, vs2, 0, vd); } -void vzext_vf4(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48022057, vm, vs2, 0, vd); } -void vzext_vf8(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48012057, vm, vs2, 0, vd); } - -void vsetivli(const Reg& rd, uint32_t uimm, SEW sew, LMUL lmul=LMUL::m1, VTA vta=VTA::tu, VMA vma=VMA::mu) { - uint32_t zimm = (static_cast(vma)<<7) | - (static_cast(vta)<<6) | - (static_cast(sew)<<3) | - (static_cast(lmul)); - uint32_t v = (0x3<<30) | (zimm<<20) | (uimm<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57); - append4B(v); -} - -void vsetvli(const Reg& rd, const Reg& rs1, SEW sew, LMUL lmul=LMUL::m1, VTA vta=VTA::tu, VMA vma=VMA::mu) { - uint32_t zimm = (static_cast(vma)<<7) | - (static_cast(vta)<<6) | - (static_cast(sew)<<3) | - (static_cast(lmul)); - uint32_t v = (0x0<<31) | (zimm<<20) | (rs1.getIdx()<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57); - append4B(v); -} - -void vsetvl(const Reg& rd, const Reg& rs1, const Reg& rs2) { - uint32_t v = (0x40<<25) | (rs2.getIdx()<<20) | (rs1.getIdx()<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57); - append4B(v); -} - - -// Copy mask register -void vmmv_m(const VReg& vd, const VReg& vs) { vmand_mm(vd, vs, vs); } -// Clear mask register -void vmclr_m(const VReg& vd) { vmxor_mm(vd, vd, vd); } -// Set mask register -void vmset_m(const VReg& vd) { vmxnor_mm(vd, vd, vd); } -// Invert bits -void vmnot_m(const VReg& vd, const VReg& vs) { vmnand_mm(vd, vs, vs); } - - -// vector compare pseudoinstructions -void vmfgt_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { vmflt_vv(vd, vs2, vs1, vm); } -void vmfge_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { vmfle_vv(vd, 
vs2, vs1, vm); } - -// sign-related pseudoinstructions -void vfabs_v(const VReg& vd, const VReg& vs, VM vm=VM::unmasked) { vfsgnjx_vv(vd, vs, vs, vm); } -void vfneg_v(const VReg& vd, const VReg& vs, VM vm=VM::unmasked) { vfsgnjn_vv(vd, vs, vs, vm); } From 9b4e111e3b0f26e84fa2d4b0a3adb5cad60fcbbe Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Tue, 28 Apr 2026 21:29:22 +0000 Subject: [PATCH 12/13] [CPU][RV64] Return ref, clean comments and edit choose impl --- src/cpu/cpu_convolution_list.cpp | 6 +- src/cpu/rv64/rvv_gemm_convolution.cpp | 16 +- src/cpu/rv64/rvv_gemm_convolution.hpp | 10 +- src/cpu/rv64/rvv_gemm_convolution_utils.cpp | 462 ++++++++++---------- 4 files changed, 251 insertions(+), 243 deletions(-) diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index c8f41b8e947..6a0756b4e95 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -182,9 +182,9 @@ const std::map> &impl_list_map() CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t) - // CPU_INSTANCE(gemm_convolution_fwd_t) - // CPU_INSTANCE(ref_convolution_fwd_t) - // CPU_INSTANCE(ref_fused_convolution_fwd_t) + CPU_INSTANCE(gemm_convolution_fwd_t) + CPU_INSTANCE(ref_convolution_fwd_t) + CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, }}, {{forward, f32, f16, f32}, { diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp index dfb271575d0..f4ea4196da9 100644 --- a/src/cpu/rv64/rvv_gemm_convolution.cpp +++ b/src/cpu/rv64/rvv_gemm_convolution.cpp @@ -48,7 +48,7 @@ static void apply_bias_eltwise_rvv_nspc( bool with_eltwise, const ref_post_ops_t *post_ops, const exec_ctx_t &ctx, - const memory_desc_t *dst_md, // Changed to pointer to memory_desc_t + const memory_desc_t *dst_md, const conv_gemm_conf_t &jcp, size_t g, size_t os_offset_factor) { @@ -350,7 +350,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( jcp.os_block == jcp.os && jcp.ic_block == jcp.ic && jcp.os_nb_block == 1)); - status_t 
st = status::success; + std::atomic st(status::success); parallel(jcp.nthr, [&](const int ithr, const int nthr) { data_t *_col = col + (ptrdiff_t)ithr * jcp.im2col_sz; @@ -547,7 +547,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( status_t st_thr = inner_ker(spatial, curr, prev, step, end); if (st_thr != status::success) { - st = st_thr; + status_t expected = status::success; + st.compare_exchange_strong(expected, st_thr); return; } } @@ -562,13 +563,16 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( status_t st_thr = inner_ker(spatial, curr, prev, step, end); if (st_thr != status::success) { - st = st_thr; + status_t expected = status::success; + st.compare_exchange_strong(expected, st_thr); return; } } } - else - st = status::unimplemented; + else { + status_t expected = status::success; + st.compare_exchange_strong(expected, status::unimplemented); + } }); return st; diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp index 19f4289920c..1545afb6912 100644 --- a/src/cpu/rv64/rvv_gemm_convolution.hpp +++ b/src/cpu/rv64/rvv_gemm_convolution.hpp @@ -38,7 +38,7 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { struct pd_t : public cpu_convolution_fwd_pd_t { using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; - DECLARE_COMMON_PD_T(GEMM_IMPL_STR, riscv_gemm_convolution_fwd_t, + DECLARE_COMMON_PD_T("gemm:any", riscv_gemm_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD); status_t init(engine_t *engine) { @@ -68,8 +68,7 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { // TODO: make `init_conf` assign initialized object to `jcp_` jcp_ = conv_gemm_conf_t(); - - return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, + return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_, dnnl_get_max_threads()); } @@ -114,7 +113,6 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { : primitive_t(apd), 
post_ops_(nullptr) {} status_t init(engine_t *engine) override { - std::cout << "GEMM INIT" << std::endl; const auto &jcp = pd()->jcp_; if (jcp.with_eltwise || jcp.with_binary) { @@ -122,16 +120,12 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { CHECK(post_ops_->init(pd()->dst_md())); } - std::cout << "GEMM SUCCESS" << std::endl; return status::success; } using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { - fprintf(stderr, "[RVV EXECUTE] Layer executed!\n"); - fflush(stderr); - bool is_nspc = pd()->jcp_.is_nspc; return is_nspc ? execute_forward_nspc(ctx) : execute_forward_ncsp(ctx); } diff --git a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp index 2ce81d0a738..bcfb62b2990 100644 --- a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp +++ b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*******************************************************************************/ - #include "cpu/rv64/rvv_gemm_convolution_utils.hpp" #include "common/bfloat16.hpp" #include "common/c_types_map.hpp" @@ -21,14 +20,16 @@ #include "common/type_helpers.hpp" #include "common/utils.hpp" #include "cpu/scale_utils.hpp" - #include "cpu/platform.hpp" +#ifdef DNNL_RISCV_USE_RVV_INTRINSICS +#include +#endif + namespace dnnl { namespace impl { namespace cpu { namespace rv64 { - using namespace dnnl::impl::status; using namespace dnnl::impl::utils; using namespace prop_kind; @@ -48,13 +49,12 @@ namespace jit_gemm_convolution_utils { template void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, data_type_t *col, dim_t od, int spatial_step, int spatial_block) { - using data_t = - typename conditional::data_type == bf16, - uint16_t, data_type_t>::type; + using data_t = typename conditional::data_type + == bf16, + uint16_t, data_type_t>::type; const data_t *__restrict _im = reinterpret_cast(im); data_t *__restrict _col = reinterpret_cast(col); - const size_t OHW = spatial_block; const size_t im_step = jcp.ih * jcp.iw * jcp.id; const size_t col_step = jcp.ks * OHW; @@ -97,7 +97,8 @@ void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, col_ += jcp.kw * OHW; } } else { - const data_t *__restrict im_ = im_loc + id * jcp.ih * jcp.iw; + const data_t *__restrict im_ + = im_loc + id * jcp.ih * jcp.iw; dim_t ih_ = -jcp.t_pad; for (dim_t kh = 0; kh < jcp.kh; ++kh) { dim_t ih = ih_; @@ -211,17 +212,18 @@ void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, // zero padding is handled outside im2col const bool outer_padding = jcp.os_nb_block == 1; - if (outer_padding) + if (outer_padding) { parallel_nd(jcp.ic, compute_im2col_outer_padding); - else + } else { parallel_nd(jcp.ic, compute_im2col_padding); + } } -template void im2col_3d(const conv_gemm_conf_t &jcp, const float *im, +template void im2col_3d(const conv_gemm_conf_t &jcp, const float *im, float 
*col, dim_t od, int spatial_step, int spatial_block); - -template void im2col_3d(const conv_gemm_conf_t &jcp, const bfloat16_t *im, - bfloat16_t *col, dim_t od, int spatial_step, int spatial_block); +template void im2col_3d(const conv_gemm_conf_t &jcp, + const bfloat16_t *im, bfloat16_t *col, dim_t od, int spatial_step, + int spatial_block); /* imtr[ic][od][oh][ow] <-- im[id][ih][iw][ic]*/ template @@ -231,7 +233,8 @@ void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im, const dim_t ic_stride = jcp.id * jcp.ih * jcp.iw; const dim_t IC = jcp.ngroups * jcp.ic; const dim_t IHW = jcp.ih * jcp.iw; - constexpr dim_t ic_block = platform::get_cache_line_size(); + const dim_t ic_block = nstl::max( + 1, platform::get_cache_line_size() / sizeof(T)); const dim_t nb_ic = jcp.ic / ic_block; const dim_t ic_blocked = nb_ic * ic_block; parallel_nd(jcp.id, jcp.ih, [&](dim_t id, dim_t ih) { @@ -255,15 +258,15 @@ void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im, }); } -template void transpose_dt(const conv_gemm_conf_t &jcp, +template void transpose_dt(const conv_gemm_conf_t &jcp, const int8_t *__restrict im, int8_t *__restrict imtr); -template void transpose_dt(const conv_gemm_conf_t &jcp, +template void transpose_dt(const conv_gemm_conf_t &jcp, const uint8_t *__restrict im, uint8_t *__restrict imtr); -template void transpose_dt(const conv_gemm_conf_t &jcp, +template void transpose_dt(const conv_gemm_conf_t &jcp, const char *__restrict im, char *__restrict imtr); -template void transpose_dt(const conv_gemm_conf_t &jcp, +template void transpose_dt(const conv_gemm_conf_t &jcp, const float *__restrict im, float *__restrict imtr); -template void transpose_dt(const conv_gemm_conf_t &jcp, +template void transpose_dt(const conv_gemm_conf_t &jcp, const bfloat16_t *__restrict im, bfloat16_t *__restrict imtr); /* col[kd][kh][kw][g][ic][od][oh][ow] <-- im2col_dt_3d(im[id][ih][iw][g][ic]) */ @@ -282,7 +285,6 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, 
const void *__restrict _imtr, const im_dt *__restrict imtr = reinterpret_cast(_imtr); col_dt *__restrict col = reinterpret_cast(_col); - col_dt shift = static_cast(jcp.signed_input ? 128 : 0); const dim_t dd = 1 + jcp.dilate_d; const dim_t dh = 1 + jcp.dilate_h; @@ -303,89 +305,100 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, if (sd == 1 && sh == 1 && sw == 1 && dd == 1 && dh == 1 && dw == 1) parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { - col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s - + kw * col_kw_s + ic * col_ic_s; - const dim_t id = od - fp + kd; - if (id < 0 || id >= jcp.id) { - for (ptrdiff_t i = 0; i < OHW; i++) - col_loc[i] = shift; - return; - } - const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW; - const dim_t oh_start = saturate(dim_t(0), jcp.oh, tp - kh); - const dim_t oh_end = saturate(dim_t(0), jcp.oh, jcp.ih + tp - kh); - const dim_t ow_start = saturate(dim_t(0), jcp.ow, lp - kw); - const dim_t ow_end = saturate(dim_t(0), jcp.ow, jcp.iw + lp - kw); - for (dim_t oh = oh_start, ih = oh_start - tp + kh; oh < oh_end; - oh++, ih++) { - col_dt *__restrict col_h = col_loc + oh * jcp.ow; - const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; - for (dim_t ow = ow_start, iw = ow_start - lp + kw; ow < ow_end; - ow++, iw++) { - col_h[ow] = imtr_h[iw]; - } - } - }); + col_dt *__restrict col_loc = col + kd * col_kd_s + + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s; + const dim_t id = od - fp + kd; + if (id < 0 || id >= jcp.id) { + for (ptrdiff_t i = 0; i < OHW; i++) + col_loc[i] = shift; + return; + } + const im_dt *__restrict imtr_loc + = imtr + (ic * jcp.id + id) * IHW; + const dim_t oh_start + = saturate(dim_t(0), jcp.oh, tp - kh); + const dim_t oh_end + = saturate(dim_t(0), jcp.oh, jcp.ih + tp - kh); + const dim_t ow_start + = saturate(dim_t(0), jcp.ow, lp - kw); + const dim_t ow_end + = saturate(dim_t(0), jcp.ow, jcp.iw + lp - kw); + for 
(dim_t oh = oh_start, ih = oh_start - tp + kh; + oh < oh_end; oh++, ih++) { + col_dt *__restrict col_h = col_loc + oh * jcp.ow; + const im_dt *__restrict imtr_h + = imtr_loc + ih * jcp.iw; + for (dim_t ow = ow_start, iw = ow_start - lp + kw; + ow < ow_end; ow++, iw++) { + col_h[ow] = imtr_h[iw]; + } + } + }); else if (sd == 2 && sh == 2 && sw == 2 && dd == 1 && dh == 1 && dw == 1) parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { - col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s - + kw * col_kw_s + ic * col_ic_s; - const dim_t id = od * 2 - fp + kd; - if (id < 0 || id >= jcp.id) { - for (ptrdiff_t i = 0; i < OHW; i++) - col_loc[i] = shift; - return; - } - const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW; - const dim_t oh_start - = saturate(dim_t(0), jcp.oh, div_up(tp - kh, 2)); - const dim_t oh_end - = saturate(dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh, 2)); - const dim_t ow_start - = saturate(dim_t(0), jcp.ow, div_up(lp - kw, 2)); - const dim_t ow_end - = saturate(dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw, 2)); - for (dim_t oh = oh_start, ih = oh_start * 2 - tp + kh; oh < oh_end; - ++oh, ih += 2) { - col_dt *__restrict col_h = col_loc + oh * jcp.ow; - const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; - for (dim_t ow = ow_start, iw = ow_start * 2 - lp + kw; - ow < ow_end; ++ow, iw += 2) { - col_h[ow] = imtr_h[iw]; - } - } - }); + col_dt *__restrict col_loc = col + kd * col_kd_s + + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s; + const dim_t id = od * 2 - fp + kd; + if (id < 0 || id >= jcp.id) { + for (ptrdiff_t i = 0; i < OHW; i++) + col_loc[i] = shift; + return; + } + const im_dt *__restrict imtr_loc + = imtr + (ic * jcp.id + id) * IHW; + const dim_t oh_start + = saturate(dim_t(0), jcp.oh, div_up(tp - kh, 2)); + const dim_t oh_end = saturate( + dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh, 2)); + const dim_t ow_start + = saturate(dim_t(0), jcp.ow, div_up(lp - kw, 2)); + const dim_t 
ow_end = saturate( + dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw, 2)); + for (dim_t oh = oh_start, ih = oh_start * 2 - tp + kh; + oh < oh_end; ++oh, ih += 2) { + col_dt *__restrict col_h = col_loc + oh * jcp.ow; + const im_dt *__restrict imtr_h + = imtr_loc + ih * jcp.iw; + for (dim_t ow = ow_start, iw = ow_start * 2 - lp + kw; + ow < ow_end; ++ow, iw += 2) { + col_h[ow] = imtr_h[iw]; + } + } + }); else parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { - col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s - + kw * col_kw_s + ic * col_ic_s; - const dim_t id = od * sd - fp + kd * dd; - if (id < 0 || id >= jcp.id) { - for (ptrdiff_t i = 0; i < OHW; i++) - col_loc[i] = shift; - return; - } - const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW; - const dim_t oh_start - = saturate(dim_t(0), jcp.oh, div_up(tp - kh * dh, sh)); - const dim_t oh_end = saturate( - dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh * dh, sh)); - const dim_t ow_start - = saturate(dim_t(0), jcp.ow, div_up(lp - kw * dw, sw)); - const dim_t ow_end = saturate( - dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw * dw, sw)); - for (dim_t oh = oh_start, ih = oh_start * sh - tp + kh * dh; - oh < oh_end; ++oh, ih += sh) { - col_dt *__restrict col_h = col_loc + oh * jcp.ow; - const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; - for (dim_t ow = ow_start, iw = ow_start * sw - lp + kw * dw; - ow < ow_end; ++ow, iw += sw) { - col_h[ow] = imtr_h[iw]; - } - } - }); + col_dt *__restrict col_loc = col + kd * col_kd_s + + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s; + const dim_t id = od * sd - fp + kd * dd; + if (id < 0 || id >= jcp.id) { + for (ptrdiff_t i = 0; i < OHW; i++) + col_loc[i] = shift; + return; + } + const im_dt *__restrict imtr_loc + = imtr + (ic * jcp.id + id) * IHW; + const dim_t oh_start = saturate( + dim_t(0), jcp.oh, div_up(tp - kh * dh, sh)); + const dim_t oh_end = saturate(dim_t(0), jcp.oh, + div_up(jcp.ih + tp - kh * dh, sh)); + const 
dim_t ow_start = saturate( + dim_t(0), jcp.ow, div_up(lp - kw * dw, sw)); + const dim_t ow_end = saturate(dim_t(0), jcp.ow, + div_up(jcp.iw + lp - kw * dw, sw)); + for (dim_t oh = oh_start, ih = oh_start * sh - tp + kh * dh; + oh < oh_end; ++oh, ih += sh) { + col_dt *__restrict col_h = col_loc + oh * jcp.ow; + const im_dt *__restrict imtr_h + = imtr_loc + ih * jcp.iw; + for (dim_t ow = ow_start, + iw = ow_start * sw - lp + kw * dw; + ow < ow_end; ++ow, iw += sw) { + col_h[ow] = imtr_h[iw]; + } + } + }); } template void im2col_dt_3d(const conv_gemm_conf_t &jcp, @@ -500,58 +513,62 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, if (sw == 1) parallel_nd(cb, jcp.kh, jcp.kw, oh_range, [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) { - const dim_t oh = ohr + oh_begin; - const dim_t ih = oh * sh - tp + kh * dh; - const dim_t ow_start = (oh == first_oh) ? first_ow : 0; - const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow; - data_t *__restrict col_oh = _col + ic * col_step - + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss; - const data_t *__restrict im_ - = _im + (ic + cs) * im_step + ih * jcp.iw; - const dim_t iw_shift = kw * dw - lp; - if (ih < 0 || ih >= jcp.ih) - for (dim_t ow = ow_start; ow < ow_end; ow++) - col_oh[ow] = zero_val; - else - for (dim_t ow = ow_start; ow < ow_end; ow++) { - const dim_t iw = ow + iw_shift; - if (iw < 0 || iw >= jcp.iw) - col_oh[ow] = zero_val; + const dim_t oh = ohr + oh_begin; + const dim_t ih = oh * sh - tp + kh * dh; + const dim_t ow_start + = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end + = (oh == last_oh) ? 
(last_ow + 1) : jcp.ow; + data_t *__restrict col_oh = _col + ic * col_step + + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss; + const data_t *__restrict im_ = _im + (ic + cs) * im_step + + ih * jcp.iw; + const dim_t iw_shift = kw * dw - lp; + if (ih < 0 || ih >= jcp.ih) + for (dim_t ow = ow_start; ow < ow_end; ow++) + col_oh[ow] = zero_val; else - col_oh[ow] = im_[iw]; - } - }); + for (dim_t ow = ow_start; ow < ow_end; ow++) { + const dim_t iw = ow + iw_shift; + if (iw < 0 || iw >= jcp.iw) + col_oh[ow] = zero_val; + else + col_oh[ow] = im_[iw]; + } + }); else parallel_nd(cb, jcp.kh, jcp.kw, oh_range, [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) { - const dim_t oh = ohr + oh_begin; - const dim_t ih = oh * sh - tp + kh * dh; - const dim_t ow_start = (oh == first_oh) ? first_ow : 0; - const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow; - data_t *__restrict col_oh = _col + ic * col_step - + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss; - const data_t *__restrict im_ = _im + (ic + cs) * im_step; - if (ih < 0 || ih >= jcp.ih) - for (dim_t ow = ow_start; ow < ow_end; ow++) - col_oh[ow] = zero_val; - else - for (dim_t ow = ow_start; ow < ow_end; ow++) { - const dim_t iw = ow * sw - lp + kw * dw; - if (iw < 0 || iw >= jcp.iw) - col_oh[ow] = zero_val; - else { - const ptrdiff_t im_idx = ih * jcp.iw + iw; - col_oh[ow] = im_[im_idx]; - } - } - }); + const dim_t oh = ohr + oh_begin; + const dim_t ih = oh * sh - tp + kh * dh; + const dim_t ow_start + = (oh == first_oh) ? first_ow : 0; + const dim_t ow_end + = (oh == last_oh) ? 
(last_ow + 1) : jcp.ow; + data_t *__restrict col_oh = _col + ic * col_step + + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss; + const data_t *__restrict im_ = _im + (ic + cs) * im_step; + if (ih < 0 || ih >= jcp.ih) + for (dim_t ow = ow_start; ow < ow_end; ow++) + col_oh[ow] = zero_val; + else + for (dim_t ow = ow_start; ow < ow_end; ow++) { + const dim_t iw = ow * sw - lp + kw * dw; + if (iw < 0 || iw >= jcp.iw) + col_oh[ow] = zero_val; + else { + const ptrdiff_t im_idx = ih * jcp.iw + iw; + col_oh[ow] = im_[im_idx]; + } + } + }); } } -template void im2col(const conv_gemm_conf_t &jcp, const float *__restrict im, - float *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); - -template void im2col(const conv_gemm_conf_t &jcp, +template void im2col(const conv_gemm_conf_t &jcp, + const float *__restrict im, float *__restrict col, dim_t hs, + dim_t hb, dim_t ws, dim_t wb); +template void im2col(const conv_gemm_conf_t &jcp, const bfloat16_t *__restrict im, bfloat16_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); @@ -560,19 +577,16 @@ template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, void *__restrict _imtr, orig_col_dt *__restrict _col, dim_t hs, dim_t hb, dim_t ws, dim_t wb) { - // For performance reasons, use uint16_t as a proxy for bfloat16_t - using im_dt = - typename utils::conditional::data_type - == bf16, - uint16_t, orig_im_dt>::type; - using col_dt = - typename utils::conditional::data_type - == bf16, - uint16_t, orig_col_dt>::type; - const im_dt *__restrict im = reinterpret_cast(_im); + using im_dt = typename utils::conditional< + data_traits_t::data_type == bf16, uint16_t, + orig_im_dt>::type; + using col_dt = typename utils::conditional< + data_traits_t::data_type == bf16, uint16_t, + orig_col_dt>::type; + const im_dt *__restrict im + = reinterpret_cast(_im); im_dt *__restrict imtr = reinterpret_cast(_imtr); col_dt *__restrict col = reinterpret_cast(_col); - col_dt shift = static_cast(jcp.signed_input ? 
128 : 0); const dim_t dh = 1 + jcp.dilate_h; const dim_t dw = 1 + jcp.dilate_w; @@ -655,32 +669,34 @@ void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, } else { parallel_nd(jcp.kh, jcp.kw, jcp.ic, hb, [&](dim_t kh, dim_t kw, dim_t ic, dim_t oh) { - const dim_t hp = tp - kh * dh; - const dim_t ih = (oh + hs) * sh - hp; - const ptrdiff_t col_idx_base - = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh) * wb; - if (ih < 0 || ih >= jcp.ih) - for (dim_t ow = 0; ow < wb; ow++) - col[col_idx_base + ow] = shift; - else { - const dim_t wp = lp - kw * dw; - const dim_t ow_start - = saturate(dim_t(0), wb, div_up(wp, sw) - ws); - const dim_t ow_end - = saturate(dim_t(0), wb, div_up(jcp.iw + wp, sw) - ws); - for (dim_t ow = 0; ow < ow_start; ow++) - col[col_idx_base + ow] = shift; - const dim_t iw_base = ws * sw - wp; - const ptrdiff_t im_idx_base = ih * im_ih_stride + ic; - for (dim_t ow = ow_start; ow < ow_end; ow++) { - const dim_t iw = iw_base + ow * sw; - const ptrdiff_t im_idx = im_idx_base + iw * im_iw_stride; - col[col_idx_base + ow] = im[im_idx] + shift; - } - for (dim_t ow = ow_end; ow < wb; ow++) - col[col_idx_base + ow] = shift; - } - }); + const dim_t hp = tp - kh * dh; + const dim_t ih = (oh + hs) * sh - hp; + const ptrdiff_t col_idx_base + = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh) + * wb; + if (ih < 0 || ih >= jcp.ih) + for (dim_t ow = 0; ow < wb; ow++) + col[col_idx_base + ow] = shift; + else { + const dim_t wp = lp - kw * dw; + const dim_t ow_start = saturate( + dim_t(0), wb, div_up(wp, sw) - ws); + const dim_t ow_end = saturate(dim_t(0), wb, + div_up(jcp.iw + wp, sw) - ws); + for (dim_t ow = 0; ow < ow_start; ow++) + col[col_idx_base + ow] = shift; + const dim_t iw_base = ws * sw - wp; + const ptrdiff_t im_idx_base = ih * im_ih_stride + ic; + for (dim_t ow = ow_start; ow < ow_end; ow++) { + const dim_t iw = iw_base + ow * sw; + const ptrdiff_t im_idx + = im_idx_base + iw * im_iw_stride; + col[col_idx_base + ow] = im[im_idx] + shift; 
+ } + for (dim_t ow = ow_end; ow < wb; ow++) + col[col_idx_base + ow] = shift; + } + }); } } @@ -693,7 +709,6 @@ template void im2col_dt(const conv_gemm_conf_t &jcp, template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im, void *__restrict imtr, float *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); - template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im, void *__restrict imtr, bfloat16_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); @@ -1116,10 +1131,12 @@ status_t init_conf(conv_gemm_conf_t &jcp, const bool is_bwd_w = jcp.prop_kind == backward_weights; const bool is_fwd = !is_bwd_d && !is_bwd_w; - const auto dst_max_size - = static_cast(jcp.iw) * jcp.ih * jcp.id * jcp.ic * 4; - const auto src_max_size - = static_cast(jcp.ow) * jcp.oh * jcp.od * jcp.oc * 4; + const auto dst_max_size = static_cast(jcp.iw) + * static_cast(jcp.ih) * static_cast(jcp.id) + * static_cast(jcp.ic) * 4; + const auto src_max_size = static_cast(jcp.ow) + * static_cast(jcp.oh) * static_cast(jcp.od) + * static_cast(jcp.oc) * 4; VDISPATCH_CONV_IC(dst_max_size <= INT_MAX && src_max_size <= INT_MAX, VERBOSE_UNSUPPORTED_FEATURE, "dst/scr size > INT_MAX is not supported"); @@ -1195,7 +1212,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, // to the number of threads and multiplied by a heuristic coefficient (15) const size_t zp_src_pad_comp_size = zp_src_with_padding ? (jcp.oc * jcp.ngroups * jcp.zp.src_pad_comp.d - * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w) + * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w) : 0u; const size_t zp_src_comp_size = jcp.zp.src_is_common ? 
utils::rnd_up(jcp.oc * jcp.ngroups, @@ -1247,6 +1264,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, // memory for transposition row_size += ic * iw; + if (row_size == 0) row_size = 1; h_block = nstl::max( dim_t(1), nstl::min(oh, div_up(dim_t(L2), row_size))); if (h_block == 1) { @@ -1489,8 +1507,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, // 64K - this is heuristic gemm size per thread threshold. constexpr size_t gemm_thrld = 64 * 1024; if (!jcp.outer_threading && !is_3d) { - bool is_depthwise - = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; + bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; const size_t outer_work = jcp.ngroups * jcp.mb; const float outer_thr_eff = (float)outer_work / rnd_up(outer_work, max_threads); @@ -1498,9 +1515,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w); const float inner_thr_eff = (float)inner_work / rnd_up(inner_work, max_threads); - jcp.outer_threading - = (is_depthwise - || (jcp.is / max_threads < 64 && jcp.mb != 1)) + jcp.outer_threading = (is_depthwise + || (jcp.is / max_threads < 64 && jcp.mb != 1)) && (outer_thr_eff / inner_thr_eff >= 1.f || (static_cast(jcp.os) * jcp.ic * jcp.oc) @@ -1524,7 +1540,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, gemm_col_datatype_size); if (is_bf16_conv && jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) { + cd.bias_desc.data_type)) { scratchpad.book( key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc); } @@ -1540,17 +1556,17 @@ status_t init_conf(conv_gemm_conf_t &jcp, // gemm implementation which we cannot control bool is_blocking_applicable = true && !is_3d && (!jcp.im2col_sz - // spatial is small - || spatial >= max_threads * simd_w - // inner threading work is greater then outer - // threading work - || jcp.os < jcp.mb * jcp.ngroups * jcp.od - // im2col is big - || (sw == 1 && K <= 0.05 * jcp.oc)) + // spatial is small + || spatial >= max_threads * simd_w + // inner 
threading work is greater then outer + // threading work + || jcp.os < jcp.mb * jcp.ngroups * jcp.od + // im2col is big + || (sw == 1 && K <= 0.05 * jcp.oc)) // heuristic condition && (jcp.im2col_sz - || (jcp.ic / jcp.oc < 42 - && jcp.ic * jcp.oc * jcp.is < 1024)); + || (jcp.ic / jcp.oc < 42 + && jcp.ic * jcp.oc * jcp.is < 1024)); if (is_blocking_applicable) { const dim_t min_oc_block = 8; @@ -1565,9 +1581,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, + ic_disb_k + reg_osb_disb_k + thr_mem_eff_k + gemm_eff_k + gemm_calc_eff_k; - auto calc_max_icb - = [=](dim_t nthr_oc, dim_t ocb, dim_t osb, - dim_t oc_per_thr, dim_t os_per_thr) { + auto calc_max_icb = [=](dim_t nthr_oc, dim_t ocb, dim_t osb, + dim_t oc_per_thr, dim_t os_per_thr) { const dim_t block_out_size = ocb * osb; // TODO: need more precise calculation if stride more than // kernel size @@ -1906,11 +1921,10 @@ status_t init_conf(conv_gemm_conf_t &jcp, if (jcp.im2col_sz) jcp.im2col_sz = (ptrdiff_t)jcp.ic_block * jcp.ks * jcp.os_block; } else if (jcp.is_nspc && is_bwd_d) { - jcp.im2col_sz - = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, - jcp.od == jcp.id, jcp.stride_w == 1, - jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, - !jcp.signed_input) + jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, + !jcp.signed_input) ? 
(ptrdiff_t)jcp.ic * jcp.ks * jcp.os * jcp.od : 0; @@ -1924,7 +1938,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, = (float)inner_work / rnd_up(inner_work, max_threads); jcp.outer_threading = !is_3d && (is_depthwise - || (jcp.is / max_threads < 64 && jcp.mb != 1)) + || (jcp.is / max_threads < 64 && jcp.mb != 1)) && (outer_thr_eff / inner_thr_eff >= 1.f || (static_cast(jcp.is) * jcp.ic * jcp.oc) / max_threads @@ -1950,11 +1964,10 @@ status_t init_conf(conv_gemm_conf_t &jcp, || (jcp.is * jcp.ic * jcp.oc) / max_threads < gemm_thrld); } else if (jcp.is_nspc && is_bwd_w) { - jcp.im2col_sz - = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, - jcp.od == jcp.id, jcp.stride_w == 1, - jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, - !jcp.signed_input) + jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, + !jcp.signed_input) ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os : 0; const size_t gemm_col_datatype_size @@ -1970,7 +1983,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, thr_mem_estimate += sizeof(float) * weights_d.size(); if (jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) + cd.bias_desc.data_type)) thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc; } const bool outer_threading_mem_ok @@ -1997,7 +2010,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, } if ((is_bf16_conv) && jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) + cd.bias_desc.data_type)) scratchpad.book( key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc); } else if (!jcp.is_nspc && is_bwd_w) { @@ -2009,7 +2022,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, thr_mem_estimate += sizeof(float) * weights_d.size(); if (jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) + cd.bias_desc.data_type)) thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc; } 
const size_t gemm_col_datatype_size @@ -2018,8 +2031,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, thr_mem_estimate += gemm_col_datatype_size * max_threads * jcp.ic * jcp.ks * simd_w; - const bool outer_threading_mem_ok - = thr_mem_estimate < scratchpad_limit; + const bool outer_threading_mem_ok = thr_mem_estimate < scratchpad_limit; jcp.outer_threading = outer_threading_mem_ok && jcp.os / max_threads < 256 && (jcp.mb != 1 || jcp.ngroups > 2); @@ -2050,7 +2062,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, key_conv_int_dat_in_acc_dt, conv_acc_buffer_size); if ((is_fwd || is_bwd_w) && jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) + cd.bias_desc.data_type)) scratchpad.book(key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc); } @@ -2065,8 +2077,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, VDISPATCH_CONV_IC(scratchpad_limit >= scratchpad.size(), VERBOSE_SCRATCHPAD_LIMIT); - const size_t available_mem - = scratchpad_limit - scratchpad.size(); + const size_t available_mem = scratchpad_limit - scratchpad.size(); if (available_mem < gemm_col_memory_sz * gemm_col_datatype_size) { // Required memory in this scenario overflows the @@ -2107,8 +2118,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, if (size) scratchpad.book(key_conv_gemm_zp_src_comp, size); } - VDISPATCH_CONV_IC( - scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT); + VDISPATCH_CONV_IC(scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT); return status::success; } @@ -2182,4 +2192,4 @@ bool padding_exists(const conv_gemm_conf_t &jcp) noexcept { } // namespace rv64 } // namespace cpu } // namespace impl -} // namespace dnnl +} // namespace dnnl \ No newline at end of file From 6791f0ea3e785912e79a31546133afaddfa018f7 Mon Sep 17 00:00:00 2001 From: StrelkovKM Date: Tue, 28 Apr 2026 21:43:45 +0000 Subject: [PATCH 13/13] [CPU][RV64] Clang-formated --- src/cpu/rv64/rvv_gemm_convolution.cpp | 115 +++++++-------- 
src/cpu/rv64/rvv_gemm_convolution.hpp | 2 +- src/cpu/rv64/rvv_gemm_convolution_utils.cpp | 146 ++++++++++---------- 3 files changed, 133 insertions(+), 130 deletions(-) diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp index f4ea4196da9..eda2217cd77 100644 --- a/src/cpu/rv64/rvv_gemm_convolution.cpp +++ b/src/cpu/rv64/rvv_gemm_convolution.cpp @@ -11,12 +11,12 @@ See the License for the specific language governing permissions and limitations under the License. *******************************************************************************/ #include -#include #include "common/c_types_map.hpp" #include "common/dnnl_thread.hpp" #include "common/type_helpers.hpp" #include "common/utils.hpp" #include "cpu/rv64/rvv_gemm_convolution.hpp" +#include namespace dnnl { namespace impl { @@ -33,25 +33,19 @@ struct im_pos_t { dim_t n, g, od, sp, ic, oc; bool do_im2col(const im_pos_t &prev) const { return true - && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp - || ic != prev.ic); + && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp + || ic != prev.ic); } }; // Helper function to apply bias and eltwise using RVV in NSPC layout // Using float explicitly as data_t is float in this specialization -static void apply_bias_eltwise_rvv_nspc( - const float *__restrict bia_arr, - float *__restrict dst_arr, - size_t start_oc, size_t end_oc, - bool with_bias, - bool with_eltwise, - const ref_post_ops_t *post_ops, - const exec_ctx_t &ctx, - const memory_desc_t *dst_md, - const conv_gemm_conf_t &jcp, - size_t g, size_t os_offset_factor) { - +static void apply_bias_eltwise_rvv_nspc(const float *__restrict bia_arr, + float *__restrict dst_arr, size_t start_oc, size_t end_oc, + bool with_bias, bool with_eltwise, const ref_post_ops_t *post_ops, + const exec_ctx_t &ctx, const memory_desc_t *dst_md, + const conv_gemm_conf_t &jcp, size_t g, size_t os_offset_factor) { + size_t n_elems = end_oc - start_oc + 1; if (n_elems == 0) 
return; @@ -63,7 +57,7 @@ static void apply_bias_eltwise_rvv_nspc( float eltwise_alpha = 0.0f; float eltwise_scale = 1.0f; bool is_fast_relu = false; - + if (with_eltwise && jcp.post_ops.len() == 1) { const auto &eltwise = jcp.post_ops.entry_.back().eltwise; if (eltwise.alg == alg_kind::eltwise_relu) { @@ -75,7 +69,7 @@ static void apply_bias_eltwise_rvv_nspc( while (oc < n_elems) { size_t vl = __riscv_vsetvl_e32m1(n_elems - oc); - + vfloat32m1_t v_dst = __riscv_vle32_v_f32m1(d_ptr + oc, vl); // 1. Add Bias @@ -92,9 +86,10 @@ static void apply_bias_eltwise_rvv_nspc( } else { // Leaky ReLU-like vbool32_t mask = __riscv_vmflt_vf_f32m1_b32(v_dst, 0.0f, vl); - v_dst = __riscv_vfmul_vf_f32m1_m(mask, v_dst, eltwise_alpha, vl); + v_dst = __riscv_vfmul_vf_f32m1_m( + mask, v_dst, eltwise_alpha, vl); } - + if (eltwise_scale != 1.0f) { v_dst = __riscv_vfmul_vf_f32m1(v_dst, eltwise_scale, vl); } @@ -102,7 +97,7 @@ static void apply_bias_eltwise_rvv_nspc( oc += vl; } else { // If not fast relu, break to handle scalarly or generic post-ops - break; + break; } } @@ -111,19 +106,17 @@ static void apply_bias_eltwise_rvv_nspc( for (size_t i = oc; i < n_elems; ++i) { size_t cur_oc = start_oc + i; float *dst_val = dst_arr + cur_oc; - - if (with_bias) { - *dst_val += bia_arr[cur_oc]; - } - + + if (with_bias) { *dst_val += bia_arr[cur_oc]; } + if (with_eltwise || jcp.with_binary) { - ref_post_ops_t::args_t args; - args.ctx = &ctx; - args.dst_md = dst_md; // Use the passed pointer - // Calculate offset correctly - // Note: l_offset calculation might need adjustment based on exact memory layout expectations of post_ops - args.l_offset = (g * jcp.oc + cur_oc) * (jcp.os * jcp.od); - post_ops->execute(*dst_val, args); + ref_post_ops_t::args_t args; + args.ctx = &ctx; + args.dst_md = dst_md; // Use the passed pointer + // Calculate offset correctly + // Note: l_offset calculation might need adjustment based on exact memory layout expectations of post_ops + args.l_offset = (g * jcp.oc + 
cur_oc) * (jcp.os * jcp.od); + post_ops->execute(*dst_val, args); } } } @@ -181,7 +174,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( assert(IMPLICATION(is_problem_3d, jcp.oh_block == jcp.oh && jcp.ow_block == jcp.ow - && jcp.ic_block == jcp.ic)); + && jcp.ic_block == jcp.ic)); assert(IMPLICATION(jcp.ow_block != jcp.ow, jcp.oh_block == 1)); const dim_t nb_oh = div_up(jcp.oh, jcp.oh_block); @@ -198,7 +191,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax); ptrdiff_t i = 0; - for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; i += (ptrdiff_t)vlmax) { + for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; + i += (ptrdiff_t)vlmax) { __riscv_vse32_v_f32m1(col + i, v_zero, vlmax); } if (i < (ptrdiff_t)total_sz) { @@ -220,16 +214,16 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( const int h_step = nstl::min(jcp.oh_block, jcp.oh - oh); const int w_step = nstl::min(jcp.ow_block, jcp.ow - ow); - + if (jcp.im2col_sz && is_problem_3d) { - jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr); + jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr); } for (int od = 0; od < jcp.od; od++) { data_t *__restrict dst = dst_base + n * dst_mb_stride + g * dst_g_stride + ((od * jcp.oh + oh) * jcp.ow + ow) * dst_os_stride; - + if (jcp.im2col_sz) { if (is_problem_3d) jit_gemm_convolution_utils::im2col_dt_3d( @@ -250,7 +244,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( const float beta = jcp.with_sum ? 1.0f : 0.0f; const data_t *__restrict src_od = src + od * jcp.oh * jcp.ow * jcp.ngroups * jcp.ic; - + status_t st = extended_sgemm("N ", BT, &M, &N, &K, &onef, wei, &LDA, jcp.im2col_sz ? col : (data_t *)src_od, &LDB, &beta, dst, &LDC); @@ -260,7 +254,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( // NOTE: Keeping parallel(0, ...) 
as requested parallel(0, [&](int ithr_inner, int nthr_inner) { dim_t start_inner, end_inner; - balance211(N * jcp.oc, nthr_inner, ithr_inner, start_inner, end_inner); + balance211(N * jcp.oc, nthr_inner, ithr_inner, start_inner, + end_inner); const size_t first_oc = start_inner % jcp.oc; const size_t last_oc = (end_inner - 1) % jcp.oc; @@ -278,30 +273,35 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc( // Check if we can use optimized RVV path bool has_binary = jcp.with_binary; - bool has_complex_eltwise = jcp.with_eltwise && !(jcp.post_ops.len() == 1 && jcp.post_ops.entry_.back().eltwise.alg == alg_kind::eltwise_relu); - + bool has_complex_eltwise = jcp.with_eltwise + && !(jcp.post_ops.len() == 1 + && jcp.post_ops.entry_.back() + .eltwise.alg + == alg_kind::eltwise_relu); + if (!has_binary && !has_complex_eltwise) { - apply_bias_eltwise_rvv_nspc( - (const float*)bia_arr, (float*)dst_arr, start_oc, end_oc, - jcp.with_bias, jcp.with_eltwise, - post_ops_ptr, ctx, dst_md_ptr, jcp, g, 0); + apply_bias_eltwise_rvv_nspc((const float *)bia_arr, + (float *)dst_arr, start_oc, end_oc, + jcp.with_bias, jcp.with_eltwise, + post_ops_ptr, ctx, dst_md_ptr, jcp, g, 0); } else { // Fallback to original scalar logic for complex cases if (jcp.with_bias) { size_t n_elems = end_oc - start_oc + 1; if (n_elems > 0) { // Scalar bias add - for(size_t k=0; k st(status::success); parallel(jcp.nthr, [&](const int ithr, const int nthr) { @@ -363,7 +363,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( const size_t vlmax = __riscv_vsetvlmax_e32m1(); const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax); ptrdiff_t i = 0; - for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; i += (ptrdiff_t)vlmax) { + for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; + i += (ptrdiff_t)vlmax) { __riscv_vse32_v_f32m1(_col + i, v_zero, vlmax); } if (i < (ptrdiff_t)total_sz) { @@ -371,7 +372,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( 
__riscv_vse32_v_f32m1(_col + i, v_zero, vl); } } - + auto inner_ker = [&](int spatial, const im_pos_t &curr, im_pos_t &prev, im_pos_t &step, const im_pos_t &end) { const data_t *_src @@ -396,7 +397,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( const data_t one = 1.0; const dim_t M = jcp.os * jcp.od; - const dim_t m = step.sp ; + const dim_t m = step.sp; const dim_t LDA = jcp.im2col_sz ? m : M; data_t *_dst = dst + curr.n * dst_mb_stride + curr.g * dst_g_stride + curr.oc * M + curr.od * jcp.os + curr.sp; @@ -442,11 +443,11 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( v_d, b, vl); // Add bias v_d = __riscv_vfmax_vf_f32m1( - v_d, 0.0f, vl); + v_d, 0.0f, vl); if (eltwise.scale != 1.0f) { v_d = __riscv_vfmul_vf_f32m1( - v_d, eltwise.scale, vl); + v_d, eltwise.scale, vl); } __riscv_vse32_v_f32m1(d_ + oS, v_d, vl); @@ -463,10 +464,10 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp( v_d = __riscv_vfadd_vf_f32m1( v_d, b, vl); // Add bias vbool32_t mask - = __riscv_vmflt_vf_f32m1_b32( + = __riscv_vmflt_vf_f32m1_b32( v_d, 0.0f, vl); v_d = __riscv_vfmul_vf_f32m1_m( - mask, v_d, eltwise.alpha, vl); + mask, v_d, eltwise.alpha, vl); v_d = __riscv_vfmul_vf_f32m1( v_d, eltwise.scale, vl); __riscv_vse32_v_f32m1(d_ + oS, v_d, vl); diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp index 1545afb6912..310bdcd72b1 100644 --- a/src/cpu/rv64/rvv_gemm_convolution.hpp +++ b/src/cpu/rv64/rvv_gemm_convolution.hpp @@ -68,7 +68,7 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t { // TODO: make `init_conf` assign initialized object to `jcp_` jcp_ = conv_gemm_conf_t(); - return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, + return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_, dnnl_get_max_threads()); } diff --git a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp index 
bcfb62b2990..615c7051ce7 100644 --- a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp +++ b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp @@ -19,8 +19,8 @@ #include "common/dnnl_thread.hpp" #include "common/type_helpers.hpp" #include "common/utils.hpp" -#include "cpu/scale_utils.hpp" #include "cpu/platform.hpp" +#include "cpu/scale_utils.hpp" #ifdef DNNL_RISCV_USE_RVV_INTRINSICS #include @@ -49,9 +49,9 @@ namespace jit_gemm_convolution_utils { template void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, data_type_t *col, dim_t od, int spatial_step, int spatial_block) { - using data_t = typename conditional::data_type - == bf16, - uint16_t, data_type_t>::type; + using data_t = + typename conditional::data_type == bf16, + uint16_t, data_type_t>::type; const data_t *__restrict _im = reinterpret_cast(im); data_t *__restrict _col = reinterpret_cast(col); @@ -97,8 +97,7 @@ void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, col_ += jcp.kw * OHW; } } else { - const data_t *__restrict im_ - = im_loc + id * jcp.ih * jcp.iw; + const data_t *__restrict im_ = im_loc + id * jcp.ih * jcp.iw; dim_t ih_ = -jcp.t_pad; for (dim_t kh = 0; kh < jcp.kh; ++kh) { dim_t ih = ih_; @@ -233,8 +232,8 @@ void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im, const dim_t ic_stride = jcp.id * jcp.ih * jcp.iw; const dim_t IC = jcp.ngroups * jcp.ic; const dim_t IHW = jcp.ih * jcp.iw; - const dim_t ic_block = nstl::max( - 1, platform::get_cache_line_size() / sizeof(T)); + const dim_t ic_block + = nstl::max(1, platform::get_cache_line_size() / sizeof(T)); const dim_t nb_ic = jcp.ic / ic_block; const dim_t ic_blocked = nb_ic * ic_block; parallel_nd(jcp.id, jcp.ih, [&](dim_t id, dim_t ih) { @@ -315,19 +314,16 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, } const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW; - const dim_t oh_start - = saturate(dim_t(0), jcp.oh, tp - kh); + const dim_t oh_start = saturate(dim_t(0), 
jcp.oh, tp - kh); const dim_t oh_end = saturate(dim_t(0), jcp.oh, jcp.ih + tp - kh); - const dim_t ow_start - = saturate(dim_t(0), jcp.ow, lp - kw); + const dim_t ow_start = saturate(dim_t(0), jcp.ow, lp - kw); const dim_t ow_end = saturate(dim_t(0), jcp.ow, jcp.iw + lp - kw); for (dim_t oh = oh_start, ih = oh_start - tp + kh; oh < oh_end; oh++, ih++) { col_dt *__restrict col_h = col_loc + oh * jcp.ow; - const im_dt *__restrict imtr_h - = imtr_loc + ih * jcp.iw; + const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; for (dim_t ow = ow_start, iw = ow_start - lp + kw; ow < ow_end; ow++, iw++) { col_h[ow] = imtr_h[iw]; @@ -358,8 +354,7 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, for (dim_t oh = oh_start, ih = oh_start * 2 - tp + kh; oh < oh_end; ++oh, ih += 2) { col_dt *__restrict col_h = col_loc + oh * jcp.ow; - const im_dt *__restrict imtr_h - = imtr_loc + ih * jcp.iw; + const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; for (dim_t ow = ow_start, iw = ow_start * 2 - lp + kw; ow < ow_end; ++ow, iw += 2) { col_h[ow] = imtr_h[iw]; @@ -390,10 +385,9 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, for (dim_t oh = oh_start, ih = oh_start * sh - tp + kh * dh; oh < oh_end; ++oh, ih += sh) { col_dt *__restrict col_h = col_loc + oh * jcp.ow; - const im_dt *__restrict imtr_h - = imtr_loc + ih * jcp.iw; + const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw; for (dim_t ow = ow_start, - iw = ow_start * sw - lp + kw * dw; + iw = ow_start * sw - lp + kw * dw; ow < ow_end; ++ow, iw += sw) { col_h[ow] = imtr_h[iw]; } @@ -515,14 +509,13 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) { const dim_t oh = ohr + oh_begin; const dim_t ih = oh * sh - tp + kh * dh; - const dim_t ow_start - = (oh == first_oh) ? first_ow : 0; + const dim_t ow_start = (oh == first_oh) ? first_ow : 0; const dim_t ow_end = (oh == last_oh) ? 
(last_ow + 1) : jcp.ow; data_t *__restrict col_oh = _col + ic * col_step + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss; - const data_t *__restrict im_ = _im + (ic + cs) * im_step - + ih * jcp.iw; + const data_t *__restrict im_ + = _im + (ic + cs) * im_step + ih * jcp.iw; const dim_t iw_shift = kw * dw - lp; if (ih < 0 || ih >= jcp.ih) for (dim_t ow = ow_start; ow < ow_end; ow++) @@ -541,13 +534,13 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) { const dim_t oh = ohr + oh_begin; const dim_t ih = oh * sh - tp + kh * dh; - const dim_t ow_start - = (oh == first_oh) ? first_ow : 0; + const dim_t ow_start = (oh == first_oh) ? first_ow : 0; const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow; data_t *__restrict col_oh = _col + ic * col_step + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss; - const data_t *__restrict im_ = _im + (ic + cs) * im_step; + const data_t *__restrict im_ + = _im + (ic + cs) * im_step; if (ih < 0 || ih >= jcp.ih) for (dim_t ow = ow_start; ow < ow_end; ow++) col_oh[ow] = zero_val; @@ -566,8 +559,8 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, } template void im2col(const conv_gemm_conf_t &jcp, - const float *__restrict im, float *__restrict col, dim_t hs, - dim_t hb, dim_t ws, dim_t wb); + const float *__restrict im, float *__restrict col, dim_t hs, dim_t hb, + dim_t ws, dim_t wb); template void im2col(const conv_gemm_conf_t &jcp, const bfloat16_t *__restrict im, bfloat16_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); @@ -577,14 +570,15 @@ template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, void *__restrict _imtr, orig_col_dt *__restrict _col, dim_t hs, dim_t hb, dim_t ws, dim_t wb) { - using im_dt = typename utils::conditional< - data_traits_t::data_type == bf16, uint16_t, - orig_im_dt>::type; - using col_dt = typename utils::conditional< - data_traits_t::data_type == bf16, uint16_t, - 
orig_col_dt>::type; - const im_dt *__restrict im - = reinterpret_cast(_im); + using im_dt = + typename utils::conditional::data_type + == bf16, + uint16_t, orig_im_dt>::type; + using col_dt = + typename utils::conditional::data_type + == bf16, + uint16_t, orig_col_dt>::type; + const im_dt *__restrict im = reinterpret_cast(_im); im_dt *__restrict imtr = reinterpret_cast(_imtr); col_dt *__restrict col = reinterpret_cast(_col); col_dt shift = static_cast(jcp.signed_input ? 128 : 0); @@ -673,16 +667,16 @@ void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, const dim_t ih = (oh + hs) * sh - hp; const ptrdiff_t col_idx_base = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh) - * wb; + * wb; if (ih < 0 || ih >= jcp.ih) for (dim_t ow = 0; ow < wb; ow++) col[col_idx_base + ow] = shift; else { const dim_t wp = lp - kw * dw; - const dim_t ow_start = saturate( - dim_t(0), wb, div_up(wp, sw) - ws); - const dim_t ow_end = saturate(dim_t(0), wb, - div_up(jcp.iw + wp, sw) - ws); + const dim_t ow_start + = saturate(dim_t(0), wb, div_up(wp, sw) - ws); + const dim_t ow_end = saturate( + dim_t(0), wb, div_up(jcp.iw + wp, sw) - ws); for (dim_t ow = 0; ow < ow_start; ow++) col[col_idx_base + ow] = shift; const dim_t iw_base = ws * sw - wp; @@ -1212,7 +1206,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, // to the number of threads and multiplied by a heuristic coefficient (15) const size_t zp_src_pad_comp_size = zp_src_with_padding ? (jcp.oc * jcp.ngroups * jcp.zp.src_pad_comp.d - * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w) + * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w) : 0u; const size_t zp_src_comp_size = jcp.zp.src_is_common ? utils::rnd_up(jcp.oc * jcp.ngroups, @@ -1507,7 +1501,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, // 64K - this is heuristic gemm size per thread threshold. 
constexpr size_t gemm_thrld = 64 * 1024; if (!jcp.outer_threading && !is_3d) { - bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; + bool is_depthwise + = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1; const size_t outer_work = jcp.ngroups * jcp.mb; const float outer_thr_eff = (float)outer_work / rnd_up(outer_work, max_threads); @@ -1515,8 +1510,9 @@ status_t init_conf(conv_gemm_conf_t &jcp, = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w); const float inner_thr_eff = (float)inner_work / rnd_up(inner_work, max_threads); - jcp.outer_threading = (is_depthwise - || (jcp.is / max_threads < 64 && jcp.mb != 1)) + jcp.outer_threading + = (is_depthwise + || (jcp.is / max_threads < 64 && jcp.mb != 1)) && (outer_thr_eff / inner_thr_eff >= 1.f || (static_cast(jcp.os) * jcp.ic * jcp.oc) @@ -1540,7 +1536,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, gemm_col_datatype_size); if (is_bf16_conv && jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) { + cd.bias_desc.data_type)) { scratchpad.book( key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc); } @@ -1556,17 +1552,17 @@ status_t init_conf(conv_gemm_conf_t &jcp, // gemm implementation which we cannot control bool is_blocking_applicable = true && !is_3d && (!jcp.im2col_sz - // spatial is small - || spatial >= max_threads * simd_w - // inner threading work is greater then outer - // threading work - || jcp.os < jcp.mb * jcp.ngroups * jcp.od - // im2col is big - || (sw == 1 && K <= 0.05 * jcp.oc)) + // spatial is small + || spatial >= max_threads * simd_w + // inner threading work is greater then outer + // threading work + || jcp.os < jcp.mb * jcp.ngroups * jcp.od + // im2col is big + || (sw == 1 && K <= 0.05 * jcp.oc)) // heuristic condition && (jcp.im2col_sz - || (jcp.ic / jcp.oc < 42 - && jcp.ic * jcp.oc * jcp.is < 1024)); + || (jcp.ic / jcp.oc < 42 + && jcp.ic * jcp.oc * jcp.is < 1024)); if (is_blocking_applicable) { const dim_t min_oc_block = 8; @@ -1582,7 
+1578,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, + gemm_eff_k + gemm_calc_eff_k; auto calc_max_icb = [=](dim_t nthr_oc, dim_t ocb, dim_t osb, - dim_t oc_per_thr, dim_t os_per_thr) { + dim_t oc_per_thr, + dim_t os_per_thr) { const dim_t block_out_size = ocb * osb; // TODO: need more precise calculation if stride more than // kernel size @@ -1921,10 +1918,11 @@ status_t init_conf(conv_gemm_conf_t &jcp, if (jcp.im2col_sz) jcp.im2col_sz = (ptrdiff_t)jcp.ic_block * jcp.ks * jcp.os_block; } else if (jcp.is_nspc && is_bwd_d) { - jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, - jcp.od == jcp.id, jcp.stride_w == 1, - jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, - !jcp.signed_input) + jcp.im2col_sz + = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, + !jcp.signed_input) ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os * jcp.od : 0; @@ -1938,7 +1936,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, = (float)inner_work / rnd_up(inner_work, max_threads); jcp.outer_threading = !is_3d && (is_depthwise - || (jcp.is / max_threads < 64 && jcp.mb != 1)) + || (jcp.is / max_threads < 64 && jcp.mb != 1)) && (outer_thr_eff / inner_thr_eff >= 1.f || (static_cast(jcp.is) * jcp.ic * jcp.oc) / max_threads @@ -1964,10 +1962,11 @@ status_t init_conf(conv_gemm_conf_t &jcp, || (jcp.is * jcp.ic * jcp.oc) / max_threads < gemm_thrld); } else if (jcp.is_nspc && is_bwd_w) { - jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, - jcp.od == jcp.id, jcp.stride_w == 1, - jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, - !jcp.signed_input) + jcp.im2col_sz + = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih, + jcp.od == jcp.id, jcp.stride_w == 1, + jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1, + !jcp.signed_input) ? 
(ptrdiff_t)jcp.ic * jcp.ks * jcp.os : 0; const size_t gemm_col_datatype_size @@ -1983,7 +1982,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, thr_mem_estimate += sizeof(float) * weights_d.size(); if (jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) + cd.bias_desc.data_type)) thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc; } const bool outer_threading_mem_ok @@ -2010,7 +2009,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, } if ((is_bf16_conv) && jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) + cd.bias_desc.data_type)) scratchpad.book( key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc); } else if (!jcp.is_nspc && is_bwd_w) { @@ -2022,7 +2021,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, thr_mem_estimate += sizeof(float) * weights_d.size(); if (jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) + cd.bias_desc.data_type)) thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc; } const size_t gemm_col_datatype_size @@ -2031,7 +2030,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, thr_mem_estimate += gemm_col_datatype_size * max_threads * jcp.ic * jcp.ks * simd_w; - const bool outer_threading_mem_ok = thr_mem_estimate < scratchpad_limit; + const bool outer_threading_mem_ok + = thr_mem_estimate < scratchpad_limit; jcp.outer_threading = outer_threading_mem_ok && jcp.os / max_threads < 256 && (jcp.mb != 1 || jcp.ngroups > 2); @@ -2062,7 +2062,7 @@ status_t init_conf(conv_gemm_conf_t &jcp, key_conv_int_dat_in_acc_dt, conv_acc_buffer_size); if ((is_fwd || is_bwd_w) && jcp.with_bias && one_of(data_type::bf16, cd.diff_bias_desc.data_type, - cd.bias_desc.data_type)) + cd.bias_desc.data_type)) scratchpad.book(key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc); } @@ -2077,7 +2077,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, VDISPATCH_CONV_IC(scratchpad_limit >= scratchpad.size(), VERBOSE_SCRATCHPAD_LIMIT); - const size_t 
available_mem = scratchpad_limit - scratchpad.size(); + const size_t available_mem + = scratchpad_limit - scratchpad.size(); if (available_mem < gemm_col_memory_sz * gemm_col_datatype_size) { // Required memory in this scenario overflows the @@ -2118,7 +2119,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, if (size) scratchpad.book(key_conv_gemm_zp_src_comp, size); } - VDISPATCH_CONV_IC(scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT); + VDISPATCH_CONV_IC( + scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT); return status::success; }