From 461218e8140747fe84426088ce764ff835a610a8 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Fri, 13 Mar 2026 15:04:54 +0300
Subject: [PATCH 01/13] [CPU] rv64: Add rvv_gemm_convolution.cpp

---
 src/cpu/rv64/rvv_gemm_convolution.cpp       |  497 +++++
 src/cpu/rv64/rvv_gemm_convolution.hpp       |  149 ++
 src/cpu/rv64/rvv_gemm_convolution_utils.cpp | 2185 +++++++++++++++++++
 src/cpu/rv64/rvv_gemm_convolution_utils.hpp |  142 ++
 4 files changed, 2973 insertions(+)
 create mode 100644 src/cpu/rv64/rvv_gemm_convolution.cpp
 create mode 100644 src/cpu/rv64/rvv_gemm_convolution.hpp
 create mode 100644 src/cpu/rv64/rvv_gemm_convolution_utils.cpp
 create mode 100644 src/cpu/rv64/rvv_gemm_convolution_utils.hpp
diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp
new file mode 100644
index 00000000000..fc20fb2fecf
--- /dev/null
+++ b/src/cpu/rv64/rvv_gemm_convolution.cpp
@@ -0,0 +1,497 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <atomic>
+#include <riscv_vector.h>
+
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+#include "cpu/rv64/rvv_gemm_convolution.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+using namespace dnnl::impl::status;
+using namespace dnnl::impl::memory_tracking::names;
+using namespace dnnl::impl::utils;
+
+namespace {
+struct im_pos_t {
+    dim_t n, g, od, sp, ic, oc;
+    im_pos_t() : n {0}, g {0}, od {0}, sp {0}, ic {0}, oc {0} {}
+    // True when any coordinate other than `oc` differs from `prev`.
+    bool do_im2col(const im_pos_t &prev) const {
+        return !(n == prev.n && g == prev.g && od == prev.od
+                && sp == prev.sp && ic == prev.ic);
+    }
+};
+} // namespace
+
+// NSPC forward entry point: resolves the user buffers once, then lets every
+// thread run execute_forward_thr_nspc(); a non-success status is returned.
+status_t riscv_gemm_convolution_fwd_t::execute_forward_nspc(
+        const exec_ctx_t &ctx) const {
+    const data_t *const src = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC);
+    const data_t *const wei = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
+    const data_t *const bia = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS);
+    data_t *const dst = CTX_OUT_MEM(data_t *, DNNL_ARG_DST);
+    const auto grantor = ctx.get_scratchpad_grantor();
+
+    // Shared across workers, hence atomic; the last failure written wins.
+    std::atomic<status_t> result(status::success);
+    parallel(pd()->jcp_.nthr, [&](const int ithr, const int nthr) {
+        const status_t thr_status = execute_forward_thr_nspc(
+                ctx, ithr, nthr, src, wei, bia, dst, grantor);
+        if (thr_status != status::success) result = thr_status;
+    });
+    return result;
+}
+
+status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
+        const exec_ctx_t &ctx, const int ithr, const int nthr,
+        const data_t *src_base, const data_t *wei_base, const data_t *bia_base,
+        data_t *dst_base, const memory_tracking::grantor_t &scratchpad) const {
+    const conv_gemm_conf_t &jcp = pd()->jcp_;
+    // Per-thread NSPC worker: handles this thread's [start, end) work items.
+    // Src Format: mb-spatial-groups-input_channels
+    const dim_t src_mb_stride = jcp.id * jcp.ih * jcp.iw * jcp.ngroups * jcp.ic;
+    const dim_t src_g_stride = jcp.ic;
+    // Wei Format: spatial-input_channels-groups-output_channels
+    const dim_t wei_g_stride = pd()->with_groups() ? jcp.oc : 0;
+
+    // Dst Format: mb-spatial-groups-output_channels
+    const dim_t dst_mb_stride = jcp.od * jcp.oh * jcp.ow * jcp.ngroups * jcp.oc;
+    const dim_t dst_g_stride = jcp.oc;
+    const dim_t dst_os_stride = jcp.ngroups * jcp.oc;
+    // Each thread uses its own slice of the col and imtr scratchpad buffers.
+    data_t *__restrict col = scratchpad.get<data_t>(key_conv_gemm_col)
+            + (ptrdiff_t)ithr * jcp.im2col_sz;
+    data_t *__restrict imtr = scratchpad.get<data_t>(key_conv_gemm_imtr)
+            + (ptrdiff_t)ithr * jcp.is * jcp.ic;
+
+    dim_t g {0}, n {0}, ohb {0}, owb {0};
+    dim_t start = 0, end = 0;
+    const bool is_problem_3d = pd()->ndims() == 5;
+
+    assert(IMPLICATION(is_problem_3d,
+            jcp.oh_block == jcp.oh && jcp.ow_block == jcp.ow
+                    && jcp.ic_block == jcp.ic));
+    assert(IMPLICATION(jcp.ow_block != jcp.ow, jcp.oh_block == 1));
+
+    const dim_t nb_oh = div_up(jcp.oh, jcp.oh_block);
+    const dim_t nb_ow = div_up(jcp.ow, jcp.ow_block);
+    // threads share work across mini-batch, groups, and blocked width/height
+    const dim_t work_amount = jcp.mb * jcp.ngroups * nb_oh * nb_ow;
+    balance211(work_amount, nthr, ithr, start, end);
+    nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow);
+
+    if (jcp.im2col_sz && is_problem_3d) {
+        // jit_gemm_convolution_utils::im2col_dt_3d() requires external
+        // data initialization by zeroes
+        // Zero-fill col; vsetvl returns the element count of each store.
+        ptrdiff_t i = 0;
+        while (i < jcp.im2col_sz) {
+            size_t vl = __riscv_vsetvl_e32m1(jcp.im2col_sz - i);
+            vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+            __riscv_vse32_v_f32m1(col + i, v_zero, vl);
+            i += vl;
+        }
+    }
+
+    for (dim_t iwork = start; iwork < end; ++iwork) {
+        dim_t oh = ohb * jcp.oh_block;
+        dim_t ow = owb * jcp.ow_block;
+        const data_t *__restrict src
+                = src_base + n * src_mb_stride + g * src_g_stride;
+        const data_t *__restrict wei = wei_base + g * wei_g_stride;
+        // Clamp the block extents so the last block stays inside oh / ow.
+        const int h_step = nstl::min(jcp.oh_block, jcp.oh - oh);
+        const int w_step = nstl::min(jcp.ow_block, jcp.ow - ow);
+        if (jcp.im2col_sz && is_problem_3d) {
+            jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr);
+        }
+
+        for (int od = 0; od < jcp.od; od++) {
+            data_t *__restrict dst = dst_base + n * dst_mb_stride
+                    + g * dst_g_stride
+                    + ((od * jcp.oh + oh) * jcp.ow + ow) * dst_os_stride;
+            if (jcp.im2col_sz) {
+                if (is_problem_3d)
+                    jit_gemm_convolution_utils::im2col_dt_3d<data_t, data_t>(
+                            jcp, imtr, col, od);
+                else
+                    jit_gemm_convolution_utils::im2col_dt<data_t, data_t>(
+                            jcp, src, imtr, col, oh, h_step, ow, w_step);
+            }
+            // GEMM: wei goes with LDA, col (or src_od) with LDB, dst with LDC.
+            const dim_t M = jcp.oc;
+            const dim_t K = jcp.ks * jcp.ic;
+            const dim_t N = h_step * w_step;
+            const dim_t LDA = M * jcp.ngroups;
+            const dim_t LDB = jcp.im2col_sz ? N : K * jcp.ngroups;
+            const dim_t LDC = M * jcp.ngroups;
+            const char *BT = jcp.im2col_sz ? "T" : "N";
+            const data_t onef = 1.f;
+            const float beta = jcp.with_sum ? 1.0f : 0.0f;
+            const data_t *__restrict src_od
+                    = src + od * jcp.oh * jcp.ow * jcp.ngroups * jcp.ic;
+            status_t st = extended_sgemm("N", BT, &M, &N, &K, &onef, wei, &LDA,
+                    jcp.im2col_sz ? col : (data_t *)src_od, &LDB, &beta, dst,
+                    &LDC);
+            if (st != status::success) return st;
+            // Bias and post-ops are applied to dst in place after the GEMM.
+            if (jcp.with_bias || jcp.with_eltwise || jcp.with_binary) {
+                parallel(0, [&](int ithr, int nthr) {
+                    dim_t start, end;
+                    balance211(N * jcp.oc, nthr, ithr, start, end);
+
+                    const size_t first_oc = start % jcp.oc;
+                    const size_t last_oc = (end - 1) % jcp.oc;
+                    const size_t first_os = start / jcp.oc;
+                    const size_t last_os = (end - 1) / jcp.oc;
+
+                    for (size_t os = first_os; os <= last_os; ++os) {
+                        const size_t start_oc = (os == first_os) ? first_oc : 0;
+                        const size_t end_oc
+                                = (os == last_os) ? last_oc : jcp.oc - 1;
+
+                        const data_t *__restrict bia_arr
+                                = bia_base ? bia_base + g * jcp.oc : nullptr;
+                        data_t *__restrict dst_arr = dst + os * dst_os_stride;
+
+                        if (jcp.with_bias) {
+                            size_t n_elems = end_oc - start_oc + 1;
+                            if (n_elems > 0) {
+                                size_t oc = 0;
+                                const data_t *b_ptr = bia_arr + start_oc;
+                                data_t *d_ptr = dst_arr + start_oc;
+                                // dst[oc] += bias[oc], vl elements at a time
+                                while (oc < n_elems) {
+                                    size_t vl = __riscv_vsetvl_e32m1(
+                                            n_elems - oc);
+                                    vfloat32m1_t v_dst = __riscv_vle32_v_f32m1(
+                                            d_ptr + oc, vl);
+                                    vfloat32m1_t v_bias = __riscv_vle32_v_f32m1(
+                                            b_ptr + oc, vl);
+                                    v_dst = __riscv_vfadd_vv_f32m1(
+                                            v_dst, v_bias, vl);
+                                    __riscv_vse32_v_f32m1(
+                                            d_ptr + oc, v_dst, vl);
+                                    oc += vl;
+                                }
+                            }
+                        }
+
+                        if (jcp.with_eltwise || jcp.with_binary) {
+                            bool fast_relu_done = false;
+                            if (jcp.with_eltwise && jcp.post_ops.len() == 1) {
+                                // fast branch for ReLU case
+                                const auto &eltwise
+                                        = jcp.post_ops.entry_.back().eltwise;
+
+                                if (eltwise.alg == alg_kind::eltwise_relu) {
+                                    const auto alpha = eltwise.alpha;
+                                    const auto scale = eltwise.scale;
+                                    PRAGMA_OMP_SIMD()
+                                    for (size_t oc = start_oc; oc <= end_oc;
+                                            oc++) {
+                                        if (dst_arr[oc] < 0)
+                                            dst_arr[oc] *= alpha;
+                                        dst_arr[oc] *= scale;
+                                    }
+                                    fast_relu_done = true;
+                                }
+                            }
+                            if (!fast_relu_done) {
+                                ref_post_ops_t::args_t args;
+                                args.ctx = &ctx;
+                                args.dst_md = pd()->dst_md();
+
+                                for (size_t oc = start_oc; oc <= end_oc; oc++) {
+                                    // jcp.od is not part of jcp.os, so multiply
+                                    // jcp.od to get spatial offset.
+                                    args.l_offset = (g * jcp.oc + oc)
+                                            * (jcp.os * jcp.od);
+                                    post_ops_->execute(dst_arr[oc], args);
+                                }
+                            }
+                        }
+                    }
+                });
+            }
+        }
+        nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow);
+    }
+    return status::success;
+}
+
+// NCSP forward path: each thread takes a (spatial, oc) share via balance2D,
+// runs im2col + extended_sgemm per block, then applies bias and post-ops.
+status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
+        const exec_ctx_t &ctx) const {
+    auto src = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC);
+    auto weights = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
+    auto bias = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS);
+    auto dst = CTX_OUT_MEM(data_t *, DNNL_ARG_DST);
+
+    auto col = ctx.get_scratchpad_grantor().get<data_t>(key_conv_gemm_col);
+    const conv_gemm_conf_t &jcp = this->pd()->jcp_;
+    const memory_desc_wrapper src_d(pd()->src_md());
+    const memory_desc_wrapper dst_d(pd()->dst_md());
+
+    // The second arg in template means sub_offset0 = true
+    // See `blk_off` method definition.
+    const size_t src_mb_stride = src_d.blk_off<false, true>(1);
+    const size_t src_g_stride = src_d.blk_off<false, true>(0, 1) * jcp.ic;
+
+    const size_t dst_mb_stride = dst_d.blk_off<false, true>(1);
+    const size_t dst_g_stride = dst_d.blk_off<false, true>(0, 1) * jcp.oc;
+
+    const size_t weights_oc_size = jcp.ic * jcp.ks;
+    const size_t weights_g_size = weights_oc_size * jcp.oc;
+    const bool is_problem_3d = pd()->ndims() == 5;
+
+    src += src_d.off_l(0);
+    dst += dst_d.off_l(0);
+
+    assert(IMPLICATION(is_problem_3d,
+            jcp.os_block == jcp.os && jcp.ic_block == jcp.ic
+                    && jcp.os_nb_block == 1));
+    // Written by any failing worker thread, so it has to be atomic.
+    std::atomic<status_t> st(status::success);
+    parallel(jcp.nthr, [&](const int ithr, const int nthr) {
+        data_t *_col = col + (ptrdiff_t)ithr * jcp.im2col_sz;
+
+        // non-blocked jit_gemm_convolution_utils::im2col_3d() requires
+        // external data initialization by zeroes
+        const bool outer_padding = jcp.os_nb_block == 1;
+        if (outer_padding && is_problem_3d) {
+            for (ptrdiff_t i = 0; i < jcp.im2col_sz; i++)
+                _col[i] = (data_t)0;
+        }
+        auto inner_ker = [&](int spatial, const im_pos_t &curr, im_pos_t &prev,
+                                 im_pos_t &step, const im_pos_t &end) {
+            const data_t *_src
+                    = src + curr.n * src_mb_stride + curr.g * src_g_stride;
+            step.oc = nstl::min(
+                    jcp.oc_block, nstl::min(jcp.oc, end.oc) - curr.oc);
+            step.sp = nstl::min(jcp.os_block,
+                    nstl::min(jcp.os - curr.sp, end.sp - spatial));
+            step.ic = nstl::min(
+                    jcp.ic_block, nstl::min(jcp.ic, end.ic) - curr.ic);
+            bool do_im2col = curr.do_im2col(prev);
+            prev = curr;
+
+            if (jcp.im2col_sz && do_im2col) {
+                if (!is_problem_3d)
+                    jit_gemm_convolution_utils::im2col<float>(jcp, _src, _col,
+                            curr.sp, step.sp, curr.ic, step.ic);
+                else
+                    jit_gemm_convolution_utils::im2col_3d<float>(
+                            jcp, _src, _col, curr.od, 0, jcp.os);
+            }
+            const data_t one = 1.0;
+
+            const dim_t M = jcp.os * jcp.od;
+            const dim_t m = step.sp;
+            const dim_t LDA = jcp.im2col_sz ? m : M;
+            data_t *_dst = dst + curr.n * dst_mb_stride + curr.g * dst_g_stride
+                    + curr.oc * M + curr.od * jcp.os + curr.sp;
+            const dim_t K = step.ic * jcp.ks;
+            const dim_t LDB = jcp.ic * jcp.ks;
+            const dim_t N = step.oc;
+            // First ic block honours with_sum; later ic blocks accumulate.
+            const float beta
+                    = (curr.ic == 0) ? (jcp.with_sum ? 1.0f : 0.0f) : one;
+            const float *_source = jcp.im2col_sz
+                    ? _col
+                    : _src + curr.ic * M + curr.od * jcp.os + curr.sp;
+            const data_t *_weights = weights + curr.g * weights_g_size
+                    + curr.oc * weights_oc_size + curr.ic * jcp.ks;
+
+            status_t st = extended_sgemm("N", "N", &m, &N, &K, &one, _source,
+                    &LDA, _weights, &LDB, &beta, _dst, &M);
+            if (st != status::success) return st;
+            // Bias and post-ops run once, after the last ic block was added.
+            if (curr.ic == jcp.ic - step.ic) {
+                // TODO: for "outer threading" we have parallel section within
+                // outermost "parallel". It is not good. Consider to use
+                // "parallel" here with number of threads passed as parameter
+                const int oc_start = curr.g * jcp.oc + curr.oc;
+                if (jcp.with_eltwise || jcp.with_binary) {
+                    bool fast_relu_done = false;
+                    if (jcp.with_eltwise && jcp.post_ops.len() == 1) {
+                        // fast branch for ReLU case
+                        const auto &eltwise
+                                = jcp.post_ops.entry_.back().eltwise;
+                        if (eltwise.alg == alg_kind::eltwise_relu) {
+                            parallel_nd(step.oc, [&](dim_t oc) {
+                                data_t b = jcp.with_bias ? bias[oc_start + oc]
+                                                         : 0;
+                                data_t *d_ = _dst + oc * M;
+                                // alpha == 0: max(x + b, 0), scaled if needed
+                                if (eltwise.alpha == 0.0f) {
+                                    int oS = 0;
+                                    while (oS < m) {
+                                        size_t vl
+                                                = __riscv_vsetvl_e32m1(m - oS);
+                                        vfloat32m1_t v_d
+                                                = __riscv_vle32_v_f32m1(
+                                                        d_ + oS, vl);
+                                        v_d = __riscv_vfadd_vf_f32m1(
+                                                v_d, b, vl); // Add bias
+                                        v_d = __riscv_vfmax_vf_f32m1(
+                                                v_d, 0.0f, vl);
+                                        if (eltwise.scale != 1.0f) {
+                                            v_d = __riscv_vfmul_vf_f32m1(
+                                                    v_d, eltwise.scale, vl);
+                                        }
+
+                                        __riscv_vse32_v_f32m1(d_ + oS, v_d, vl);
+                                        oS += vl;
+                                    }
+                                } else {
+                                    int oS = 0;
+                                    while (oS < m) {
+                                        size_t vl
+                                                = __riscv_vsetvl_e32m1(m - oS);
+                                        vfloat32m1_t v_d
+                                                = __riscv_vle32_v_f32m1(
+                                                        d_ + oS, vl);
+                                        v_d = __riscv_vfadd_vf_f32m1(
+                                                v_d, b, vl); // Add bias
+                                        vbool32_t mask
+                                                = __riscv_vmflt_vf_f32m1_b32(
+                                                        v_d, 0.0f, vl);
+                                        // _mu keeps d >= 0 lanes intact; plain
+                                        // _m would leave them unspecified.
+                                        v_d = __riscv_vfmul_vf_f32m1_mu(mask,
+                                                v_d, v_d, eltwise.alpha, vl);
+                                        v_d = __riscv_vfmul_vf_f32m1(
+                                                v_d, eltwise.scale, vl);
+                                        __riscv_vse32_v_f32m1(d_ + oS, v_d, vl);
+                                        oS += vl;
+                                    }
+                                }
+                            });
+                            fast_relu_done = true;
+                        }
+                    }
+                    if (!fast_relu_done) {
+                        parallel_nd(step.oc, [&](dim_t oc) {
+                            data_t b = jcp.with_bias ? bias[oc_start + oc] : 0;
+                            data_t *d_ = _dst + oc * M;
+
+                            ref_post_ops_t::args_t args;
+                            args.ctx = &ctx;
+                            args.dst_md = pd()->dst_md();
+                            args.l_offset = d_ - dst;
+
+                            for (int oS = 0; oS < m; ++oS) {
+                                d_[oS] += b;
+                                post_ops_->execute(d_[oS], args);
+                                args.l_offset++;
+                            }
+                        });
+                    }
+
+                } else if (jcp.with_bias) {
+                    parallel_nd(step.oc, [&](dim_t oc) {
+                        data_t b = bias[oc_start + oc];
+                        data_t *d_ = _dst + oc * M;
+
+                        int oS = 0;
+                        while (oS < m) {
+                            size_t vl = __riscv_vsetvl_e32m1(m - oS);
+                            vfloat32m1_t v_d
+                                    = __riscv_vle32_v_f32m1(d_ + oS, vl);
+                            v_d = __riscv_vfadd_vf_f32m1(v_d, b, vl);
+                            __riscv_vse32_v_f32m1(d_ + oS, v_d, vl);
+                            oS += vl;
+                        }
+                    });
+                }
+            }
+
+            return status::success;
+        };
+        im_pos_t start, end;
+        end.ic = jcp.ic;
+        // balance2D hands this thread its sp and oc ranges.
+        if (!is_problem_3d) {
+            dim_t sp_work = jcp.mb * jcp.ngroups * jcp.od * jcp.os;
+            balance2D(nthr, ithr, sp_work, start.sp, end.sp, jcp.oc, start.oc,
+                    end.oc, dim_t(jcp.nthr_oc));
+        } else {
+            dim_t sp_work = jcp.mb * jcp.ngroups * jcp.od;
+            balance2D(nthr, ithr, sp_work, start.sp, end.sp, jcp.oc, start.oc,
+                    end.oc, dim_t(jcp.nthr_oc));
+            start.sp *= jcp.os;
+            end.sp *= jcp.os;
+        }
+
+        im_pos_t curr, prev, step;
+        prev.n = prev.g = prev.od = prev.sp = prev.ic = -1;
+        step.oc = jcp.oc_block;
+        step.sp = jcp.os_block;
+        step.ic = jcp.ic_block;
+        // Only the rlb and lrb loop orders are handled; others are rejected.
+        if (jcp.loop_order == gemm_loop_rlb)
+            for (curr.ic = 0; curr.ic < jcp.ic; curr.ic += step.ic)
+                for (int spatial = start.sp; spatial < end.sp;
+                        spatial += step.sp) {
+                    nd_iterator_init(spatial, curr.n, jcp.mb, curr.g,
+                            jcp.ngroups, curr.od, jcp.od, curr.sp, jcp.os);
+                    for (curr.oc = start.oc; curr.oc < end.oc;
+                            curr.oc += step.oc) {
+                        status_t st_thr
+                                = inner_ker(spatial, curr, prev, step, end);
+                        if (st_thr != status::success) {
+                            st = st_thr;
+                            return;
+                        }
+                    }
+                }
+        else if (jcp.loop_order == gemm_loop_lrb)
+            for (int spatial = start.sp; spatial < end.sp; spatial += step.sp) {
+                nd_iterator_init(spatial, curr.n, jcp.mb, curr.g, jcp.ngroups,
+                        curr.od, jcp.od, curr.sp, jcp.os);
+                for (curr.ic = 0; curr.ic < jcp.ic; curr.ic += step.ic)
+                    for (curr.oc = start.oc; curr.oc < end.oc;
+                            curr.oc += step.oc) {
+                        status_t st_thr
+                                = inner_ker(spatial, curr, prev, step, end);
+                        if (st_thr != status::success) {
+                            st = st_thr;
+                            return;
+                        }
+                    }
+            }
+        else
+            st = status::unimplemented;
+    });
+
+    return st;
+}
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp
new file mode 100644
index 00000000000..7bcda8e9462
--- /dev/null
+++ b/src/cpu/rv64/rvv_gemm_convolution.hpp
@@ -0,0 +1,149 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_RVV_GEMM_CONVOLUTION_HPP
+#define CPU_RV64_RVV_GEMM_CONVOLUTION_HPP
+
+#include "common/broadcast_strategy.hpp"
+#include "common/c_types_map.hpp"
+#include "common/memory_tracking.hpp"
+#include "common/primitive.hpp"
+#include "common/utils.hpp"
+
+#include "cpu/binary_injector_utils.hpp"
+#include "cpu/cpu_convolution_pd.hpp"
+#include "cpu/gemm/gemm.hpp"
+#include "cpu/primitive_attr_postops.hpp"
+#include "cpu/rv64/rvv_gemm_convolution_utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+struct riscv_gemm_convolution_fwd_t : public primitive_t { // f32 forward conv
+    struct pd_t : public cpu_convolution_fwd_pd_t {
+        using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t;
+
+        DECLARE_COMMON_PD_T(GEMM_IMPL_STR, riscv_gemm_convolution_fwd_t,
+                USE_GLOBAL_SCRATCHPAD);
+
+        status_t init(engine_t *engine) {
+            using namespace data_type;
+            // Only forward propagation with all-f32 tensors is accepted.
+            VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND);
+
+            if (with_bias()) {
+                VDISPATCH_CONV(expect_data_types(f32, f32, f32, f32, f32),
+                        VERBOSE_UNSUPPORTED_DT_CFG);
+            } else {
+                VDISPATCH_CONV(
+                        expect_data_types(f32, f32, data_type::undef, f32, f32),
+                        VERBOSE_UNSUPPORTED_DT_CFG);
+            }
+
+            VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct),
+                    VERBOSE_BAD_ALGORITHM);
+            VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, "");
+            VDISPATCH_CONV(
+                    attr()->has_default_values(
+                            primitive_attr_t::skip_mask_t::post_ops, f32),
+                    VERBOSE_UNSUPPORTED_ATTR);
+            VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP);
+
+            auto scratchpad = scratchpad_registry().registrar();
+
+            // TODO: make `init_conf` assign initialized object to `jcp_`
+            jcp_ = conv_gemm_conf_t();
+            return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad,
+                    *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_,
+                    dnnl_get_max_threads());
+        }
+
+        conv_gemm_conf_t jcp_ = utils::zero<decltype(jcp_)>();
+
+    protected:
+        bool post_ops_ok() const {
+            auto const &po = attr()->post_ops_;
+            auto is_sum_ok = [&](int idx) {
+                return IMPLICATION(po.entry_[idx].kind == primitive_kind::sum,
+                        idx == 0 && po.entry_[idx].is_sum());
+            };
+            auto is_binary
+                    = [&](int idx) { return po.entry_[idx].is_binary(); };
+            auto is_prelu = [&](int idx) { return po.entry_[idx].is_prelu(); };
+            auto is_binary_or_prelu_supported = [&](int idx) {
+                bool ok = dnnl::impl::get_rhs_arg_broadcasting_strategy(
+                                  binary_injector_utils::get_src1_desc(
+                                          po.entry_[idx], dst_md_),
+                                  dst_md_,
+                                  {broadcasting_strategy_t::scalar,
+                                          broadcasting_strategy_t::per_oc})
+                        != broadcasting_strategy_t::unsupported;
+                return ok;
+            };
+            // Bail out if the reference post-ops check rejects the chain.
+            if (!ref_post_ops_t::post_ops_ok(attr()->post_ops_)) return false;
+            // sum must be entry 0; binary/prelu src1 must be scalar or per_oc.
+            for (int idx = 0; idx < po.len(); idx++) {
+                bool ok = is_sum_ok(idx)
+                        && IMPLICATION(is_binary(idx) || is_prelu(idx),
+                                is_binary_or_prelu_supported(idx));
+                if (!ok) return false;
+            }
+
+            return true;
+        }
+    };
+
+    riscv_gemm_convolution_fwd_t(const pd_t *apd)
+        : primitive_t(apd), post_ops_(nullptr) {}
+    // Builds the reference post-ops executor only for eltwise/binary chains.
+    status_t init(engine_t *engine) override {
+        const auto &jcp = pd()->jcp_;
+
+        if (jcp.with_eltwise || jcp.with_binary) {
+            CHECK(safe_ptr_assign(post_ops_, new ref_post_ops_t(jcp.post_ops)));
+            CHECK(post_ops_->init(pd()->dst_md()));
+        }
+        return status::success;
+    }
+
+    using data_t = typename prec_traits_t<data_type::f32>::type;
+    // Selects the NSPC or NCSP kernel according to jcp_.is_nspc.
+    status_t execute(const exec_ctx_t &ctx) const override {
+        bool is_nspc = pd()->jcp_.is_nspc;
+        return is_nspc ? execute_forward_nspc(ctx) : execute_forward_ncsp(ctx);
+    }
+
+private:
+    status_t execute_forward_ncsp(const exec_ctx_t &ctx) const;
+    status_t execute_forward_nspc(const exec_ctx_t &ctx) const;
+    status_t execute_forward_thr_nspc(const exec_ctx_t &ctx, const int ithr,
+            const int nthr, const data_t *src_base, const data_t *wei_base,
+            const data_t *bia_base, data_t *dst_base,
+            const memory_tracking::grantor_t &scratchpad) const;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    // Stays null unless init() found eltwise or binary post-ops.
+    std::unique_ptr<ref_post_ops_t> post_ops_;
+};
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp
new file mode 100644
index 00000000000..2ce81d0a738
--- /dev/null
+++ b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp
@@ -0,0 +1,2185 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/rv64/rvv_gemm_convolution_utils.hpp"
+#include "common/bfloat16.hpp"
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+#include "cpu/scale_utils.hpp"
+
+#include "cpu/platform.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+using namespace dnnl::impl::status;
+using namespace dnnl::impl::utils;
+using namespace prop_kind;
+using namespace data_type;
+
+single_gemm_conv_chunk_desc_t::single_gemm_conv_chunk_desc_t(dim_t d_off,
+        dim_t d_size, dim_t h_off, dim_t h_size, dim_t w_off, dim_t w_size)
+    : d_off_(d_off) // each argument is copied verbatim into its member
+    , d_size_(d_size)
+    , h_off_(h_off)
+    , h_size_(h_size)
+    , w_off_(w_off)
+    , w_size_(w_size) {} // empty body: no validation of offsets or sizes
+
+namespace jit_gemm_convolution_utils {
+
+template <typename data_type_t> // im2col for a single output depth `od`
+void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im,
+        data_type_t *col, dim_t od, int spatial_step, int spatial_block) {
+    using data_t = // bf16 data is moved as raw uint16_t
+            typename conditional<data_traits_t<data_type_t>::data_type == bf16,
+                    uint16_t, data_type_t>::type;
+    const data_t *__restrict _im
+            = reinterpret_cast<const data_t *__restrict>(im);
+    data_t *__restrict _col = reinterpret_cast<data_t *__restrict>(col);
+
+    const size_t OHW = spatial_block; // col row length for this call
+    const size_t im_step = jcp.ih * jcp.iw * jcp.id; // im step per ic
+    const size_t col_step = jcp.ks * OHW; // col step per ic
+
+    auto compute_im2col_outer_padding = [&](dim_t ic) { // walks all oh, ow
+        const data_t *__restrict im_loc = _im + ic * im_step;
+        data_t *__restrict col_loc = _col + ic * col_step;
+        dim_t id = od * jcp.stride_d - jcp.f_pad; // depth at kd == 0
+        for (dim_t kd = 0; kd < jcp.kd; ++kd) {
+            data_t *__restrict col_ = col_loc + kd * jcp.kh * jcp.kw * OHW;
+            if (id < 0 || id >= jcp.id) { // depth tap is in padding
+                dim_t ih_ = -jcp.t_pad;
+                for (dim_t kh = 0; kh < jcp.kh; ++kh) {
+                    dim_t ih = ih_;
+                    for (dim_t oh = 0; oh < jcp.oh; ++oh) {
+                        if (ih < 0 || ih >= jcp.ih) { // skipped, not written
+                            ih += jcp.stride_h;
+                            continue;
+                        }
+                        dim_t iw_ = -jcp.l_pad;
+                        for (dim_t kw = 0; kw < jcp.kw; ++kw) {
+                            dim_t iw = iw_;
+                            for (dim_t ow = 0; ow < jcp.ow; ++ow) {
+                                if (iw < 0 || iw >= jcp.iw) { // not written
+                                    iw += jcp.stride_w;
+                                    continue;
+                                }
+
+                                const size_t col_idx
+                                        = kw * OHW + oh * jcp.ow + ow;
+
+                                col_[col_idx] = 0;
+                                iw += jcp.stride_w;
+                            }
+                            iw_ += (1 + jcp.dilate_w);
+                        }
+                        ih += jcp.stride_h;
+                    }
+                    ih_ += (1 + jcp.dilate_h);
+                    col_ += jcp.kw * OHW; // advance to next kh
+                }
+            } else { // depth tap is inside the input
+                const data_t *__restrict im_ = im_loc + id * jcp.ih * jcp.iw;
+                dim_t ih_ = -jcp.t_pad;
+                for (dim_t kh = 0; kh < jcp.kh; ++kh) {
+                    dim_t ih = ih_;
+                    for (dim_t oh = 0; oh < jcp.oh; ++oh) {
+                        if (ih < 0 || ih >= jcp.ih) { // skipped, not written
+                            ih += jcp.stride_h;
+                            continue;
+                        }
+                        dim_t iw_ = -jcp.l_pad;
+                        for (dim_t kw = 0; kw < jcp.kw; ++kw) {
+                            dim_t iw = iw_;
+                            for (dim_t ow = 0; ow < jcp.ow; ++ow) {
+                                if (iw < 0 || iw >= jcp.iw) { // not written
+                                    iw += jcp.stride_w;
+                                    continue;
+                                }
+
+                                const size_t col_idx
+                                        = kw * OHW + oh * jcp.ow + ow;
+                                const size_t im_idx = ih * jcp.iw + iw;
+
+                                col_[col_idx] = im_[im_idx];
+                                iw += jcp.stride_w;
+                            }
+                            iw_ += (1 + jcp.dilate_w);
+                        }
+                        ih += jcp.stride_h;
+                    }
+                    ih_ += (1 + jcp.dilate_h);
+                    col_ += jcp.kw * OHW; // advance to next kh
+                }
+            }
+            id += (1 + jcp.dilate_d); // next depth tap
+        }
+    };
+    auto compute_im2col_padding = [&](dim_t ic) { // spatial sub-block
+        const dim_t first_oh = spatial_step / jcp.ow; // first row of block
+        const dim_t last_oh = (spatial_step + spatial_block - 1) / jcp.ow;
+        const dim_t oh_begin = first_oh;
+        const dim_t oh_end = last_oh + 1;
+        const dim_t first_ow = spatial_step % jcp.ow; // first col of block
+        const dim_t last_ow = (spatial_step + spatial_block - 1) % jcp.ow;
+
+        const data_t *__restrict im_loc = _im + ic * im_step;
+        data_t *__restrict col_loc = _col + ic * col_step;
+        dim_t id = od * jcp.stride_d - jcp.f_pad; // depth at kd == 0
+        for (dim_t kd = 0; kd < jcp.kd; ++kd) {
+            data_t *__restrict col_ = col_loc + kd * jcp.kh * jcp.kw * OHW;
+            if (id < 0 || id >= jcp.id) { // depth padding: zero whole block
+                for (dim_t kh = 0; kh < jcp.kh; ++kh) {
+                    for (dim_t oh = oh_begin; oh < oh_end; ++oh) {
+                        const dim_t ow_begin = (oh == first_oh) ? first_ow : 0;
+                        const dim_t ow_end
+                                = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                        for (dim_t kw = 0; kw < jcp.kw; ++kw) {
+                            for (dim_t ow = ow_begin; ow < ow_end; ++ow) {
+                                const size_t col_idx = kw * OHW + oh * jcp.ow
+                                        + ow - spatial_step; // block-local
+                                col_[col_idx] = 0;
+                            }
+                        }
+                    }
+                    col_ += jcp.kw * OHW;
+                }
+            } else {
+                const data_t *__restrict im_ = im_loc + id * jcp.ih * jcp.iw;
+                dim_t ih_ = oh_begin * jcp.stride_h - jcp.t_pad;
+                for (dim_t kh = 0; kh < jcp.kh; ++kh) {
+                    dim_t ih = ih_;
+                    for (dim_t oh = oh_begin; oh < oh_end; ++oh) {
+                        const dim_t ow_begin = (oh == first_oh) ? first_ow : 0;
+                        const dim_t ow_end
+                                = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                        if (ih < 0 || ih >= jcp.ih) { // row is padding: zero it
+                            for (dim_t kw = 0; kw < jcp.kw; ++kw) {
+                                for (dim_t ow = ow_begin; ow < ow_end; ++ow) {
+                                    const size_t col_idx = kw * OHW
+                                            + oh * jcp.ow + ow - spatial_step;
+                                    col_[col_idx] = 0;
+                                }
+                            }
+                            ih += jcp.stride_h;
+                            continue;
+                        }
+                        dim_t iw_ = ow_begin * jcp.stride_w - jcp.l_pad;
+                        for (dim_t kw = 0; kw < jcp.kw; ++kw) {
+                            dim_t iw = iw_;
+                            for (dim_t ow = ow_begin; ow < ow_end; ++ow) {
+                                const size_t col_idx = kw * OHW + oh * jcp.ow
+                                        + ow - spatial_step;
+                                if (iw < 0 || iw >= jcp.iw) { // padding
+                                    col_[col_idx] = 0;
+                                    iw += jcp.stride_w;
+                                    continue;
+                                }
+                                const size_t im_idx = ih * jcp.iw + iw;
+                                col_[col_idx] = im_[im_idx];
+                                iw += jcp.stride_w;
+                            }
+                            iw_ += (1 + jcp.dilate_w);
+                        }
+                        ih += jcp.stride_h;
+                    }
+                    ih_ += (1 + jcp.dilate_h);
+                    col_ += jcp.kw * OHW;
+                }
+            }
+            id += (1 + jcp.dilate_d); // next depth tap
+        }
+    };
+
+    // zero padding is handled outside im2col
+    const bool outer_padding = jcp.os_nb_block == 1; // selects the variant
+    if (outer_padding)
+        parallel_nd(jcp.ic, compute_im2col_outer_padding); // one task per ic
+    else
+        parallel_nd(jcp.ic, compute_im2col_padding); // one task per ic
+}
+
+template void im2col_3d(const conv_gemm_conf_t &jcp, const float *im,
+        float *col, dim_t od, int spatial_step, int spatial_block);
+
+template void im2col_3d(const conv_gemm_conf_t &jcp, const bfloat16_t *im,
+        bfloat16_t *col, dim_t od, int spatial_step, int spatial_block);
+
+/* imtr[ic][id][ih][iw] <-- im[id][ih][iw][ic] (plus 128 if signed_input) */
+template <typename T>
+void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im,
+        T *__restrict imtr) {
+    const uint8_t shift = jcp.signed_input ? 128 : 0;
+    const dim_t plane = jcp.ih * jcp.iw; // elements in one (ih, iw) plane
+    const dim_t chan_stride = jcp.id * jcp.ih * jcp.iw; // imtr step per channel
+    const dim_t pixel_stride = jcp.ngroups * jcp.ic; // im step per pixel
+    constexpr dim_t blk = platform::get_cache_line_size(); // SIMD block size
+    const dim_t n_blk = jcp.ic / blk;
+    const dim_t tail_begin = n_blk * blk;
+    // One task per (id, ih) row; channels are scattered with chan_stride.
+    parallel_nd(jcp.id, jcp.ih, [&](dim_t id, dim_t ih) {
+        const T *__restrict src_row
+                = im + id * plane * pixel_stride + ih * jcp.iw * pixel_stride;
+        T *__restrict dst_row = imtr + id * plane + ih * jcp.iw;
+        for (dim_t iw = 0; iw < jcp.iw; ++iw) {
+            const T *__restrict src_px = src_row + iw * pixel_stride;
+            T *__restrict dst_px = dst_row + iw;
+            for (dim_t b = 0; b < n_blk; ++b) {
+                const T *__restrict src_blk = src_px + b * blk;
+                T *__restrict dst_blk = dst_px + b * blk * chan_stride;
+                PRAGMA_OMP_SIMD()
+                for (dim_t c = 0; c < blk; ++c)
+                    dst_blk[c * chan_stride] = src_blk[c] + shift;
+            }
+            for (dim_t c = tail_begin; c < jcp.ic; ++c) // leftover channels
+                dst_px[c * chan_stride] = src_px[c] + shift;
+        }
+    });
+}
+
+template void transpose_dt(const conv_gemm_conf_t &jcp,
+        const int8_t *__restrict im, int8_t *__restrict imtr);
+template void transpose_dt(const conv_gemm_conf_t &jcp,
+        const uint8_t *__restrict im, uint8_t *__restrict imtr);
+template void transpose_dt(const conv_gemm_conf_t &jcp,
+        const char *__restrict im, char *__restrict imtr);
+template void transpose_dt(const conv_gemm_conf_t &jcp,
+        const float *__restrict im, float *__restrict imtr);
+template void transpose_dt(const conv_gemm_conf_t &jcp,
+        const bfloat16_t *__restrict im, bfloat16_t *__restrict imtr);
+
+/* col[kd][kh][kw][g][ic][od][oh][ow] <-- im2col_dt_3d(im[id][ih][iw][g][ic]) */
+template <typename orig_im_dt, typename orig_col_dt> // fills col for one od
+void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
+        orig_col_dt *__restrict _col, dim_t od) {
+    // For performance reasons, use uint16_t as a proxy for bfloat16_t
+    using im_dt =
+            typename utils::conditional<data_traits_t<orig_im_dt>::data_type
+                            == bf16,
+                    uint16_t, orig_im_dt>::type;
+    using col_dt =
+            typename utils::conditional<data_traits_t<orig_col_dt>::data_type
+                            == bf16,
+                    uint16_t, orig_col_dt>::type;
+    const im_dt *__restrict imtr
+            = reinterpret_cast<const im_dt *__restrict>(_imtr);
+    col_dt *__restrict col = reinterpret_cast<col_dt *__restrict>(_col);
+
+    col_dt shift = static_cast<col_dt>(jcp.signed_input ? 128 : 0); // pad value
+    const dim_t dd = 1 + jcp.dilate_d; // step between kernel taps in depth
+    const dim_t dh = 1 + jcp.dilate_h; // step between kernel taps in height
+    const dim_t dw = 1 + jcp.dilate_w; // step between kernel taps in width
+    const dim_t sd = jcp.stride_d;
+    const dim_t sh = jcp.stride_h;
+    const dim_t sw = jcp.stride_w;
+    const dim_t fp = jcp.f_pad;
+    const dim_t tp = jcp.t_pad;
+    const dim_t lp = jcp.l_pad;
+    const dim_t col_ic_s = jcp.oh * jcp.ow; // col strides, [kd][kh][kw][ic]
+    const dim_t col_kw_s = jcp.ic * col_ic_s;
+    const dim_t col_kh_s = jcp.kw * col_kw_s;
+    const dim_t col_kd_s = jcp.kh * col_kh_s;
+    const dim_t IHW = jcp.ih * jcp.iw;
+    const dim_t OHW = jcp.oh * jcp.ow;
+
+    if (sd == 1 && sh == 1 && sw == 1 && dd == 1 && dh == 1 && dw == 1)
+        parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
+                [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
+            col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s
+                    + kw * col_kw_s + ic * col_ic_s;
+            const dim_t id = od - fp + kd; // unit stride, no dilation
+            if (id < 0 || id >= jcp.id) { // whole plane is depth padding
+                for (ptrdiff_t i = 0; i < OHW; i++)
+                    col_loc[i] = shift;
+                return;
+            }
+            const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW;
+            const dim_t oh_start = saturate(dim_t(0), jcp.oh, tp - kh);
+            const dim_t oh_end = saturate(dim_t(0), jcp.oh, jcp.ih + tp - kh);
+            const dim_t ow_start = saturate(dim_t(0), jcp.ow, lp - kw);
+            const dim_t ow_end = saturate(dim_t(0), jcp.ow, jcp.iw + lp - kw);
+            for (dim_t oh = oh_start, ih = oh_start - tp + kh; oh < oh_end;
+                    oh++, ih++) {
+                col_dt *__restrict col_h = col_loc + oh * jcp.ow;
+                const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
+                for (dim_t ow = ow_start, iw = ow_start - lp + kw; ow < ow_end;
+                        ow++, iw++) {
+                    col_h[ow] = imtr_h[iw]; // padded oh/ow are not written
+                }
+            }
+        });
+    else if (sd == 2 && sh == 2 && sw == 2 && dd == 1 && dh == 1 && dw == 1)
+        parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
+                [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
+            col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s
+                    + kw * col_kw_s + ic * col_ic_s;
+            const dim_t id = od * 2 - fp + kd; // stride 2, no dilation
+            if (id < 0 || id >= jcp.id) { // whole plane is depth padding
+                for (ptrdiff_t i = 0; i < OHW; i++)
+                    col_loc[i] = shift;
+                return;
+            }
+            const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW;
+            const dim_t oh_start
+                    = saturate(dim_t(0), jcp.oh, div_up(tp - kh, 2));
+            const dim_t oh_end
+                    = saturate(dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh, 2));
+            const dim_t ow_start
+                    = saturate(dim_t(0), jcp.ow, div_up(lp - kw, 2));
+            const dim_t ow_end
+                    = saturate(dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw, 2));
+            for (dim_t oh = oh_start, ih = oh_start * 2 - tp + kh; oh < oh_end;
+                    ++oh, ih += 2) {
+                col_dt *__restrict col_h = col_loc + oh * jcp.ow;
+                const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
+                for (dim_t ow = ow_start, iw = ow_start * 2 - lp + kw;
+                        ow < ow_end; ++ow, iw += 2) {
+                    col_h[ow] = imtr_h[iw]; // padded oh/ow are not written
+                }
+            }
+        });
+    else
+        parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
+                [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
+            col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s
+                    + kw * col_kw_s + ic * col_ic_s;
+            const dim_t id = od * sd - fp + kd * dd; // generic case
+            if (id < 0 || id >= jcp.id) { // whole plane is depth padding
+                for (ptrdiff_t i = 0; i < OHW; i++)
+                    col_loc[i] = shift;
+                return;
+            }
+            const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW;
+            const dim_t oh_start
+                    = saturate(dim_t(0), jcp.oh, div_up(tp - kh * dh, sh));
+            const dim_t oh_end = saturate(
+                    dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh * dh, sh));
+            const dim_t ow_start
+                    = saturate(dim_t(0), jcp.ow, div_up(lp - kw * dw, sw));
+            const dim_t ow_end = saturate(
+                    dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw * dw, sw));
+            for (dim_t oh = oh_start, ih = oh_start * sh - tp + kh * dh;
+                    oh < oh_end; ++oh, ih += sh) {
+                col_dt *__restrict col_h = col_loc + oh * jcp.ow;
+                const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
+                for (dim_t ow = ow_start, iw = ow_start * sw - lp + kw * dw;
+                        ow < ow_end; ++ow, iw += sw) {
+                    col_h[ow] = imtr_h[iw]; // padded oh/ow are not written
+                }
+            }
+        });
+}
+
+template void im2col_dt_3d<int8_t, uint8_t>(const conv_gemm_conf_t &jcp,
+        const void *__restrict im, uint8_t *__restrict col, dim_t od);
+template void im2col_dt_3d<uint8_t, uint8_t>(const conv_gemm_conf_t &jcp,
+        const void *__restrict im, uint8_t *__restrict col, dim_t od);
+template void im2col_dt_3d<float, float>(const conv_gemm_conf_t &jcp,
+        const void *__restrict im, float *__restrict col, dim_t od);
+template void im2col_dt_3d<bfloat16_t, bfloat16_t>(const conv_gemm_conf_t &jcp,
+        const void *__restrict im, bfloat16_t *__restrict col, dim_t od);
+
+/* col[ic][kh][kw][oh][ow] <-- im2col(im[ic][ih][iw]) */
+template <typename data_type_t> // 2D im2col of one spatial/channel block
+void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im,
+        data_type_t *__restrict col, dim_t ss, dim_t sb, dim_t cs, dim_t cb) {
+
+    using data_t = // bf16 data is moved as raw uint16_t
+            typename utils::conditional<data_traits_t<data_type_t>::data_type
+                            == bf16,
+                    uint16_t, data_type_t>::type;
+    const data_t *__restrict _im
+            = reinterpret_cast<const data_t *__restrict>(im);
+    data_t *__restrict _col = reinterpret_cast<data_t *__restrict>(col);
+
+    const size_t im_step = jcp.is; // im step per channel
+    const size_t col_step = jcp.ks * sb; // col step per channel
+    const dim_t dh = 1 + jcp.dilate_h;
+    const dim_t dw = 1 + jcp.dilate_w;
+    const dim_t sh = jcp.stride_h;
+    const dim_t sw = jcp.stride_w;
+    const dim_t tp = jcp.t_pad;
+    const dim_t lp = jcp.l_pad;
+    const dim_t first_oh = ss / jcp.ow; // ss: first output pixel index
+    const dim_t last_oh = (ss + sb - 1) / jcp.ow; // sb: pixel count
+    const dim_t oh_begin = first_oh;
+    const dim_t oh_end = last_oh + 1;
+    const dim_t first_ow = ss % jcp.ow;
+    const dim_t last_ow = (ss + sb - 1) % jcp.ow;
+
+    const data_t zero_val = 0;
+
+    if (jcp.outer_threading) { // plain serial loops inside this call
+        if (sw == 1) {
+            // Generated code is more optimized for stride_w == 1
+            // because innermost loop is by width
+            for (dim_t ic = 0; ic < cb; ic++) { // cs: first channel, cb: count
+                const data_t *__restrict im_ic = _im + (ic + cs) * im_step;
+                for_(dim_t kh = 0; kh < jcp.kh; kh++)
+                for (dim_t kw = 0; kw < jcp.kw; kw++) {
+                    data_t *__restrict col_k
+                            = _col + ic * col_step + (kh * jcp.kw + kw) * sb;
+                    for (dim_t oh = oh_begin; oh < oh_end; oh++) {
+                        const dim_t ih = oh * sh - tp + kh * dh;
+                        const data_t *__restrict im_
+                                = im_ic + ih * jcp.iw - lp + kw * dw;
+                        const dim_t ow_begin = (oh == first_oh) ? first_ow : 0;
+                        const dim_t ow_end
+                                = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                        data_t *__restrict col_ = col_k + oh * jcp.ow - ss;
+                        if (ih < 0 || ih >= jcp.ih)
+                            for (dim_t ow = ow_begin; ow < ow_end; ow++)
+                                col_[ow] = zero_val; // padded row
+                        else {
+                            for (dim_t ow = ow_begin; ow < ow_end; ++ow) {
+                                const dim_t iw = ow; // im_ is pre-shifted
+                                if (iw < lp - kw * dw
+                                        || iw >= jcp.iw + lp - kw * dw)
+                                    col_[ow] = zero_val; // padded column
+                                else
+                                    col_[ow] = im_[iw];
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            for (dim_t ic = 0; ic < cb; ic++) {
+                const data_t *__restrict im_ = _im + (ic + cs) * im_step;
+                for_(dim_t kh = 0; kh < jcp.kh; kh++)
+                for (dim_t kw = 0; kw < jcp.kw; kw++) {
+                    data_t *__restrict col_k
+                            = _col + ic * col_step + (kh * jcp.kw + kw) * sb;
+                    for (dim_t oh = oh_begin; oh < oh_end; oh++) {
+                        const dim_t ih = oh * sh - tp + kh * dh;
+                        const dim_t ow_begin = (oh == first_oh) ? first_ow : 0;
+                        const dim_t ow_end
+                                = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                        data_t *__restrict col_oh = col_k + oh * jcp.ow - ss;
+                        if (ih < 0 || ih >= jcp.ih)
+                            for (dim_t ow = ow_begin; ow < ow_end; ow++)
+                                col_oh[ow] = zero_val; // padded row
+                        else
+                            for (dim_t ow = ow_begin; ow < ow_end; ow++) {
+                                const dim_t iw = ow * sw - lp + kw * dw;
+                                if (iw < 0 || iw >= jcp.iw)
+                                    col_oh[ow] = zero_val; // padded column
+                                else {
+                                    const ptrdiff_t im_idx = ih * jcp.iw + iw;
+                                    col_oh[ow] = im_[im_idx];
+                                }
+                            }
+                    }
+                }
+            }
+        }
+    } else { // work is split with parallel_nd over (ic, kh, kw, oh)
+        // TODO: optimize threading if jcp.ic*jcp.kh*jcp.kw*oh_range is small
+        // comparing to number of threads
+        const dim_t oh_range = oh_end - oh_begin;
+        // Generated code is more optimized for stride_w == 1
+        // because innermost loop is by width
+        if (sw == 1)
+            parallel_nd(cb, jcp.kh, jcp.kw, oh_range,
+                    [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) {
+                const dim_t oh = ohr + oh_begin;
+                const dim_t ih = oh * sh - tp + kh * dh;
+                const dim_t ow_start = (oh == first_oh) ? first_ow : 0;
+                const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                data_t *__restrict col_oh = _col + ic * col_step
+                        + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss;
+                const data_t *__restrict im_
+                        = _im + (ic + cs) * im_step + ih * jcp.iw;
+                const dim_t iw_shift = kw * dw - lp;
+                if (ih < 0 || ih >= jcp.ih)
+                    for (dim_t ow = ow_start; ow < ow_end; ow++)
+                        col_oh[ow] = zero_val; // padded row
+                else
+                    for (dim_t ow = ow_start; ow < ow_end; ow++) {
+                        const dim_t iw = ow + iw_shift;
+                        if (iw < 0 || iw >= jcp.iw)
+                            col_oh[ow] = zero_val; // padded column
+                        else
+                            col_oh[ow] = im_[iw];
+                    }
+            });
+        else
+            parallel_nd(cb, jcp.kh, jcp.kw, oh_range,
+                    [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) {
+                const dim_t oh = ohr + oh_begin;
+                const dim_t ih = oh * sh - tp + kh * dh;
+                const dim_t ow_start = (oh == first_oh) ? first_ow : 0;
+                const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                data_t *__restrict col_oh = _col + ic * col_step
+                        + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss;
+                const data_t *__restrict im_ = _im + (ic + cs) * im_step;
+                if (ih < 0 || ih >= jcp.ih)
+                    for (dim_t ow = ow_start; ow < ow_end; ow++)
+                        col_oh[ow] = zero_val; // padded row
+                else
+                    for (dim_t ow = ow_start; ow < ow_end; ow++) {
+                        const dim_t iw = ow * sw - lp + kw * dw;
+                        if (iw < 0 || iw >= jcp.iw)
+                            col_oh[ow] = zero_val; // padded column
+                        else {
+                            const ptrdiff_t im_idx = ih * jcp.iw + iw;
+                            col_oh[ow] = im_[im_idx];
+                        }
+                    }
+            });
+    }
+}
+
+template void im2col(const conv_gemm_conf_t &jcp, const float *__restrict im,
+        float *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb);
+
+template void im2col(const conv_gemm_conf_t &jcp, // hs/hb/ws/wb: ss/sb/cs/cb
+        const bfloat16_t *__restrict im, bfloat16_t *__restrict col, dim_t hs,
+        dim_t hb, dim_t ws, dim_t wb);
+
+/* col[kh][kw][ic][oh][ow] <-- im2col_dt(im[ih][iw][ic]) */
+template <typename orig_im_dt, typename orig_col_dt>
+void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im,
+        void *__restrict _imtr, orig_col_dt *__restrict _col, dim_t hs,
+        dim_t hb, dim_t ws, dim_t wb) {
+    // For performance reasons, use uint16_t as a proxy for bfloat16_t
+    using im_dt =
+            typename utils::conditional<data_traits_t<orig_im_dt>::data_type
+                            == bf16,
+                    uint16_t, orig_im_dt>::type;
+    using col_dt =
+            typename utils::conditional<data_traits_t<orig_col_dt>::data_type
+                            == bf16,
+                    uint16_t, orig_col_dt>::type;
+    const im_dt *__restrict im = reinterpret_cast<const im_dt *__restrict>(_im);
+    im_dt *__restrict imtr = reinterpret_cast<im_dt *__restrict>(_imtr);
+    col_dt *__restrict col = reinterpret_cast<col_dt *__restrict>(_col);
+
+    col_dt shift = static_cast<col_dt>(jcp.signed_input ? 128 : 0);
+    const dim_t dh = 1 + jcp.dilate_h;
+    const dim_t dw = 1 + jcp.dilate_w;
+    const dim_t sh = jcp.stride_h;
+    const dim_t sw = jcp.stride_w;
+    const dim_t im_iw_stride = jcp.ic * jcp.ngroups;
+    const dim_t im_ih_stride = jcp.iw * im_iw_stride;
+    const dim_t tp = jcp.t_pad;
+    const dim_t lp = jcp.l_pad;
+
+    if (jcp.outer_threading && sh == 1 && sw == 1 && dh == 1 && dw == 1) {
+        /* im[ih][iw][ic] --> imtr[ic][ih][iw] --> col[kh][kw][ic][oh][ow] */
+        const dim_t hp = hs - tp;
+        const dim_t wp = ws - lp;
+        const dim_t ih_start = saturate(dim_t(0), jcp.ih, hp);
+        const dim_t ih_end = saturate(dim_t(0), jcp.ih, hp + hb + jcp.kh);
+        const dim_t iw_start = saturate(dim_t(0), jcp.iw, wp);
+        const dim_t iw_end = saturate(dim_t(0), jcp.iw, wp + wb + jcp.kw);
+
+        const dim_t ihb = ih_end - ih_start;
+        const dim_t iwb = iw_end - iw_start;
+
+        const dim_t imtr_ic_stride = ihb * iwb;
+        const ptrdiff_t imtr_idx_shift = ih_start * iwb + iw_start;
+        for (dim_t ic = 0; ic < jcp.ic; ic++) {
+            const ptrdiff_t imtr_idx_ic = ic * imtr_ic_stride - imtr_idx_shift;
+            for (dim_t ih = ih_start; ih < ih_end; ih++) {
+                const ptrdiff_t im_idx_ih = ic + ih * im_ih_stride;
+                const ptrdiff_t imtr_idx_ih = imtr_idx_ic + ih * iwb;
+                for (dim_t iw = iw_start; iw < iw_end; iw++)
+                    imtr[imtr_idx_ih + iw] = im[im_idx_ih + iw * im_iw_stride];
+            }
+        }
+
+        const dim_t col_ic_str = hb * wb;
+        const dim_t col_kw_stride = jcp.ic * col_ic_str;
+        const dim_t col_kh_stride = jcp.kw * col_kw_stride;
+
+        const dim_t oh_init = ih_start - hp;
+        const dim_t ow_init = iw_start - wp;
+        for (dim_t kh = 0; kh < jcp.kh; kh++) {
+            const ptrdiff_t col_idx_kh = kh * col_kh_stride;
+            const dim_t oh_kh = oh_init - kh;
+            const dim_t oh_start = saturate(dim_t(0), hb, oh_kh);
+            const dim_t oh_end = saturate(dim_t(0), hb, oh_kh + ihb);
+            for (dim_t kw = 0; kw < jcp.kw; kw++) {
+                const ptrdiff_t col_idx_kw
+                        = col_idx_kh + kw * jcp.ic * col_ic_str;
+                const dim_t ow_kw = ow_init - kw;
+                const dim_t imtr_shift = oh_kh * iwb + ow_kw;
+                const dim_t ow_start = saturate(dim_t(0), wb, ow_kw);
+                const dim_t ow_end = saturate(dim_t(0), wb, ow_kw + iwb);
+                for (dim_t ic = 0; ic < jcp.ic; ic++) {
+                    const ptrdiff_t col_idx_ic = col_idx_kw + ic * col_ic_str;
+                    const dim_t imtr_idx_ic = ic * imtr_ic_stride - imtr_shift;
+                    for (dim_t oh = 0; oh < oh_start; oh++) {
+                        const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb;
+                        for (dim_t ow = 0; ow < wb; ++ow)
+                            col[col_idx_oh + ow] = shift;
+                    }
+                    for (dim_t oh = oh_start; oh < oh_end; oh++) {
+                        const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb;
+                        const ptrdiff_t imtr_idx_oh = imtr_idx_ic + oh * iwb;
+                        for (dim_t ow = 0; ow < ow_start; ++ow)
+                            col[col_idx_oh + ow] = shift;
+                        for (dim_t ow = ow_start; ow < ow_end; ++ow)
+                            col[col_idx_oh + ow]
+                                    = imtr[imtr_idx_oh + ow] + shift;
+                        for (dim_t ow = ow_end; ow < wb; ++ow)
+                            col[col_idx_oh + ow] = shift;
+                    }
+                    for (dim_t oh = oh_end; oh < hb; oh++) {
+                        const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb;
+                        for (dim_t ow = 0; ow < wb; ++ow)
+                            col[col_idx_oh + ow] = shift;
+                    }
+                }
+            }
+        }
+    } else {
+        parallel_nd(jcp.kh, jcp.kw, jcp.ic, hb,
+                [&](dim_t kh, dim_t kw, dim_t ic, dim_t oh) {
+            const dim_t hp = tp - kh * dh;
+            const dim_t ih = (oh + hs) * sh - hp;
+            const ptrdiff_t col_idx_base
+                    = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh) * wb;
+            if (ih < 0 || ih >= jcp.ih)
+                for (dim_t ow = 0; ow < wb; ow++)
+                    col[col_idx_base + ow] = shift;
+            else {
+                const dim_t wp = lp - kw * dw;
+                const dim_t ow_start
+                        = saturate(dim_t(0), wb, div_up(wp, sw) - ws);
+                const dim_t ow_end
+                        = saturate(dim_t(0), wb, div_up(jcp.iw + wp, sw) - ws);
+                for (dim_t ow = 0; ow < ow_start; ow++)
+                    col[col_idx_base + ow] = shift;
+                const dim_t iw_base = ws * sw - wp;
+                const ptrdiff_t im_idx_base = ih * im_ih_stride + ic;
+                for (dim_t ow = ow_start; ow < ow_end; ow++) {
+                    const dim_t iw = iw_base + ow * sw;
+                    const ptrdiff_t im_idx = im_idx_base + iw * im_iw_stride;
+                    col[col_idx_base + ow] = im[im_idx] + shift;
+                }
+                for (dim_t ow = ow_end; ow < wb; ow++)
+                    col[col_idx_base + ow] = shift;
+            }
+        });
+    }
+}
+// Explicit instantiations of im2col_dt (second argument: element type of col).
+template void im2col_dt<int8_t, uint8_t>(const conv_gemm_conf_t &jcp,
+        const void *__restrict im, void *__restrict imtr,
+        uint8_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb);
+template void im2col_dt<uint8_t, uint8_t>(const conv_gemm_conf_t &jcp,
+        const void *__restrict im, void *__restrict imtr,
+        uint8_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb);
+template void im2col_dt<float, float>(const conv_gemm_conf_t &jcp,
+        const void *__restrict im, void *__restrict imtr, float *__restrict col,
+        dim_t hs, dim_t hb, dim_t ws, dim_t wb);
+
+template void im2col_dt<bfloat16_t, bfloat16_t>(const conv_gemm_conf_t &jcp,
+        const void *__restrict im, void *__restrict imtr,
+        bfloat16_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb);
+
+/* im[id][ih][iw][ic] <-- col2im_dt(col[od][oh][ow][kd][kh][kw][ic]) */
+template <typename orig_T>
+void col2im_dt(const conv_gemm_conf_t &jcp, const orig_T *__restrict _col,
+        orig_T *__restrict _im) {
+    // Do NOT proxy bfloat16_t with uint16_t here: the accumulation below uses
+    // +=, and adding raw bf16 bit patterns as integers gives wrong values
+    // (e.g. 0x3F80 + 0x3F80 = 0x7F00, not 2.0). Work on orig_T directly.
+    const orig_T *__restrict col = _col;
+    orig_T *__restrict im = _im;
+    // Each thread handles the im box [d_s,d_e) x [h_s,h_e) x [w_s,w_e).
+    parallel(0, [&](const int ithr, const int nthr) {
+        dim_t d_nthr = nstl::min(jcp.id, dim_t(nthr));
+        dim_t h_nthr = nstl::min(jcp.ih, dim_t(nthr) / d_nthr);
+        dim_t w_nthr = nstl::min(jcp.iw, dim_t(nthr) / (d_nthr * h_nthr));
+        dim_t d_ithr = 1, d_s = 0, d_e = 0, h_ithr = 1, h_s = 0, h_e = 0,
+              w_ithr = 1, w_s = 0, w_e = 0;
+        if (ithr < d_nthr * h_nthr * w_nthr) {
+            d_ithr = ithr / (h_nthr * w_nthr);
+            h_ithr = (ithr % (h_nthr * w_nthr)) / w_nthr;
+            w_ithr = (ithr % (h_nthr * w_nthr)) % w_nthr;
+            balance211(jcp.id, d_nthr, d_ithr, d_s, d_e);
+            balance211(jcp.ih, h_nthr, h_ithr, h_s, h_e);
+            balance211(jcp.iw, w_nthr, w_ithr, w_s, w_e);
+        } else { // surplus thread: empty ranges, loops below are no-ops
+            d_ithr = h_ithr = w_ithr = -ithr;
+            d_s = d_e = h_s = h_e = w_s = w_e = -1;
+        }
+        // Zero the part of im owned by this thread before accumulating.
+        for_(dim_t id = d_s; id < d_e; ++id)
+        for_(dim_t ih = h_s; ih < h_e; ++ih)
+        for (dim_t iw = w_s; iw < w_e; ++iw) {
+            PRAGMA_OMP_SIMD()
+            for (dim_t ic = 0; ic < jcp.ic; ++ic) {
+                im[((id * jcp.ih + ih) * jcp.iw + iw) * jcp.ic + ic] = 0;
+            }
+        }
+        // Add each col entry whose target lies in this thread's box.
+        // TODO: reduce region: [0.. oh] --> [h_s * sh .. h_e * sh]
+        for_(dim_t od = 0; od < jcp.od; ++od)
+        for_(dim_t oh = 0; oh < jcp.oh; ++oh)
+        for_(dim_t ow = 0; ow < jcp.ow; ++ow)
+        for (dim_t kd = 0; kd < jcp.kd; ++kd) {
+            const dim_t id
+                    = od * jcp.stride_d - jcp.f_pad + kd * (1 + jcp.dilate_d);
+            if (id < d_s || id >= d_e) continue;
+
+            for (dim_t kh = 0; kh < jcp.kh; ++kh) {
+                const dim_t ih = oh * jcp.stride_h - jcp.t_pad
+                        + kh * (1 + jcp.dilate_h);
+                if (ih < h_s || ih >= h_e) continue;
+
+                for (dim_t kw = 0; kw < jcp.kw; ++kw) {
+                    const dim_t iw = ow * jcp.stride_w - jcp.l_pad
+                            + kw * (1 + jcp.dilate_w);
+                    if (iw < w_s || iw >= w_e) continue;
+
+                    const size_t col_idx
+                            = (((((od * jcp.oh + oh) * jcp.ow + ow) * jcp.kd
+                                        + kd) * jcp.kh
+                                       + kh) * jcp.kw
+                                      + kw)
+                            * jcp.ic;
+                    const size_t im_idx
+                            = ((id * jcp.ih + ih) * jcp.iw + iw) * jcp.ic;
+                    PRAGMA_OMP_SIMD()
+                    for (dim_t ic = 0; ic < jcp.ic; ++ic) {
+                        im[im_idx + ic] += col[col_idx + ic];
+                    }
+                }
+            }
+        }
+    });
+}
+
+template void col2im_dt<int32_t>(const conv_gemm_conf_t &jcp,
+        const int32_t *__restrict col, int32_t *__restrict im);
+
+template void col2im_dt<float>(const conv_gemm_conf_t &jcp,
+        const float *__restrict col, float *__restrict im);
+// bfloat16_t accumulates through its own operators (no integer proxy).
+template void col2im_dt<bfloat16_t>(const conv_gemm_conf_t &jcp,
+        const bfloat16_t *__restrict col, bfloat16_t *__restrict im);
+
+void col2im_3d(const conv_gemm_conf_t &jcp, const float *col, float *im,
+        dim_t od, int spatial_step, int spatial_block) {
+    // Adds the col data of output depth od into im; im is not zeroed here.
+    auto sp_blocked_ker = [&](dim_t ic) {
+        const size_t col_step = jcp.ks * spatial_block;
+        const float *__restrict col_ = col + ic * col_step;
+        float *__restrict im_ic = im + ic * jcp.ih * jcp.iw * jcp.id;
+        // Rows/cols covered by [spatial_step, spatial_step + spatial_block).
+        const dim_t first_oh = spatial_step / jcp.ow;
+        const dim_t last_oh = (spatial_step + spatial_block - 1) / jcp.ow;
+        const dim_t oh_begin = first_oh;
+        const dim_t oh_end = last_oh + 1;
+        const dim_t first_ow = spatial_step % jcp.ow;
+        const dim_t last_ow = (spatial_step + spatial_block - 1) % jcp.ow;
+        const dim_t wei_stride
+                = nstl::min(jcp.ow * jcp.oh, dim_t(spatial_block));
+        // Input depth for kd = 0; stepped by (1 + dilate_d) per kd.
+        dim_t id = od * jcp.stride_d - jcp.f_pad;
+        for (dim_t kd = 0; kd < jcp.kd; ++kd) {
+            if (id < 0 || id >= jcp.id) {
+                col_ += jcp.kh * jcp.kw * wei_stride;
+                id += (1 + jcp.dilate_d);
+                continue;
+            }
+
+            float *__restrict im_ = im_ic + (size_t)id * jcp.ih * jcp.iw;
+            for_(dim_t kh = 0; kh < jcp.kh; ++kh)
+            for_(dim_t kw = 0; kw < jcp.kw; ++kw)
+            for (dim_t oh = oh_begin, col_off = 0; oh < oh_end; ++oh) {
+                // Only the first and last row of the block can be partial.
+                const dim_t ow_begin = (oh == first_oh) ? first_ow : 0;
+                const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                const dim_t ow_work = ow_end - ow_begin;
+
+                const dim_t ih = oh * jcp.stride_h - jcp.t_pad
+                        + kh * (1 + jcp.dilate_h);
+                if (ih < 0 || ih >= jcp.ih) {
+                    col_off += ow_work;
+                    continue;
+                }
+                // col_off advances for skipped positions too (see ++col_off).
+                for (dim_t ow = ow_begin; ow < ow_end; ++ow, ++col_off) {
+                    const dim_t iw = ow * jcp.stride_w - jcp.l_pad
+                            + kw * (1 + jcp.dilate_w);
+                    if (iw < 0 || iw >= jcp.iw) { continue; }
+
+                    const size_t col_idx
+                            = (kh * jcp.kw + kw) * wei_stride + col_off;
+                    const size_t im_idx = ih * jcp.iw + iw;
+                    im_[im_idx] += col_[col_idx];
+                }
+            }
+            col_ += jcp.kh * jcp.kw * wei_stride;
+            id += (1 + jcp.dilate_d);
+        }
+    };
+    // Variant for a col buffer laid out over the whole oh x ow output plane.
+    auto ker = [&](dim_t ic) {
+        const float *__restrict col_ = col + (size_t)ic * jcp.ks * jcp.os;
+        float *__restrict im_ic = im + (size_t)ic * jcp.ih * jcp.iw * jcp.id;
+
+        dim_t id = od * jcp.stride_d - jcp.f_pad;
+        for (dim_t kd = 0; kd < jcp.kd; ++kd) {
+            if (id < 0 || id >= jcp.id) {
+                col_ += jcp.kh * jcp.kw * jcp.os;
+                id += (1 + jcp.dilate_d);
+                continue;
+            }
+
+            float *__restrict im_ = im_ic + (size_t)id * jcp.ih * jcp.iw;
+
+            for_(dim_t oh = 0; oh < jcp.oh; ++oh)
+            for (dim_t kh = 0; kh < jcp.kh; ++kh) {
+                const dim_t ih = oh * jcp.stride_h - jcp.t_pad
+                        + kh * (1 + jcp.dilate_h);
+                if (ih < 0 || ih >= jcp.ih) continue;
+
+                for_(dim_t ow = 0; ow < jcp.ow; ++ow)
+                for (dim_t kw = 0; kw < jcp.kw; ++kw) {
+                    const dim_t iw = ow * jcp.stride_w - jcp.l_pad
+                            + kw * (1 + jcp.dilate_w);
+                    if (iw < 0 || iw >= jcp.iw) continue;
+
+                    const size_t col_idx
+                            = ((kh * jcp.kw + kw) * jcp.oh + oh) * jcp.ow + ow;
+                    const size_t im_idx = ih * jcp.iw + iw;
+                    im_[im_idx] += col_[col_idx];
+                }
+            }
+
+            col_ += jcp.kh * jcp.kw * jcp.os;
+            id += (1 + jcp.dilate_d);
+        }
+    };
+    // Kernels run over ic in [0, jcp.ic); blocked one iff os_nb_block > 1.
+    const bool blocked_kernel = jcp.os_nb_block > 1;
+    if (blocked_kernel)
+        parallel_nd(jcp.ic, sp_blocked_ker);
+    else
+        parallel_nd(jcp.ic, ker);
+}
+
+void col2im(const conv_gemm_conf_t &jcp, const float *col, float *im,
+        int spatial_step, int spatial_block) {
+    const size_t col_step = jcp.ks * spatial_block;
+    const size_t im_step = jcp.ih * jcp.iw;
+    const dim_t iS = jcp.ih * jcp.iw;
+    // Adds col ([ic][kh][kw][spatial]) into im ([ic][ih][iw]).
+    auto sp_blocked_ker = [&](dim_t ic) {
+        const dim_t wei_stride
+                = nstl::min(jcp.ow * jcp.oh, dim_t(spatial_block));
+        const dim_t first_oh = spatial_step / jcp.ow;
+        const dim_t last_oh = (spatial_step + spatial_block - 1) / jcp.ow;
+        const dim_t oh_begin = first_oh;
+        const dim_t oh_end = last_oh + 1;
+        const dim_t first_ow = spatial_step % jcp.ow;
+        const dim_t last_ow = (spatial_step + spatial_block - 1) % jcp.ow;
+
+        float *__restrict img_ithr = im + ic * im_step;
+        const float *__restrict col_icb = col + ic * col_step;
+        // The image plane is cleared only when spatial_step == 0.
+        if (spatial_step == 0) {
+            PRAGMA_OMP_SIMD()
+            for (dim_t is = 0; is < iS; ++is)
+                img_ithr[is] = 0.;
+        }
+        // Accumulate into the image using the final input coordinates
+        // (ih_, iw_), which are range-checked before use. Indexing from the
+        // plane base this way avoids converting a negative (ih, iw) offset
+        // to size_t and stepping base pointers past the plane, which relied
+        // on pointer wrap-around (undefined behaviour).
+        for (dim_t kh = 0; kh < jcp.kh; ++kh) {
+            for (dim_t kw = 0; kw < jcp.kw; ++kw) {
+                const float *__restrict col_ = col_icb;
+                for (dim_t oh = oh_begin; oh < oh_end; ++oh) {
+                    const dim_t ow_begin = (oh == first_oh) ? first_ow : 0;
+                    const dim_t ow_end
+                            = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                    const dim_t ow_work = ow_end - ow_begin;
+
+                    const dim_t ih = oh * jcp.stride_h - jcp.t_pad;
+                    const dim_t ih_ = ih + kh * (1 + jcp.dilate_h);
+                    if (ih_ < 0 || ih_ >= jcp.ih) {
+                        col_ += ow_work;
+                        continue;
+                    }
+                    for (dim_t ow = ow_begin; ow < ow_end; ++ow, ++col_) {
+                        const dim_t iw = ow * jcp.stride_w - jcp.l_pad;
+                        const dim_t iw_ = iw + kw * (1 + jcp.dilate_w);
+                        if (iw_ < 0 || iw_ >= jcp.iw) continue;
+
+                        const dim_t im_idx = ih_ * jcp.iw + iw_;
+                        img_ithr[im_idx] += *col_;
+                    }
+                }
+                col_icb += wei_stride;
+            }
+        }
+    };
+    // Non-blocked variant: clears the plane, then adds the whole col buffer.
+    auto ker = [&](dim_t ic) {
+        float *__restrict im_ = im + ic * im_step;
+        const float *__restrict col_ = col + ic * col_step;
+        PRAGMA_OMP_SIMD()
+        for (dim_t is = 0; is < iS; ++is)
+            im_[is] = 0.;
+
+        for_(dim_t kh = 0; kh < jcp.kh; ++kh)
+        for (dim_t oh = 0; oh < jcp.oh; ++oh) {
+            const dim_t ih
+                    = oh * jcp.stride_h - jcp.t_pad + kh * (1 + jcp.dilate_h);
+            if (ih < 0 || ih >= jcp.ih) continue;
+
+            for_(dim_t kw = 0; kw < jcp.kw; ++kw)
+            for (dim_t ow = 0; ow < jcp.ow; ++ow) {
+                const dim_t iw = ow * jcp.stride_w - jcp.l_pad
+                        + kw * (1 + jcp.dilate_w);
+                if (iw < 0 || iw >= jcp.iw) continue;
+
+                const size_t col_idx
+                        = ((kh * jcp.kw + kw) * jcp.oh + oh) * jcp.ow + ow;
+                const size_t im_idx = ih * jcp.iw + iw;
+                im_[im_idx] += col_[col_idx];
+            }
+        }
+    };
+
+    const bool blocked_kernel = jcp.os_nb_block > 1;
+    if (blocked_kernel)
+        parallel_nd(jcp.ic, sp_blocked_ker);
+    else
+        parallel_nd(jcp.ic, ker);
+}
+
+status_t init_conf(conv_gemm_conf_t &jcp,
+        memory_tracking::registrar_t &scratchpad, const convolution_desc_t &cd,
+        memory_desc_t &src_md, memory_desc_t &weights_md, memory_desc_t &dst_md,
+        memory_desc_t &bias_md, primitive_attr_t &attr, int max_threads,
+        bool check_postops) {
+    const memory_desc_wrapper src_d(&src_md);
+    const memory_desc_wrapper weights_d(&weights_md);
+    const memory_desc_wrapper dst_d(&dst_md);
+
+    const bool with_groups = weights_d.ndims() == src_d.ndims() + 1;
+    const int ndims = src_d.ndims();
+    const int is_1d = ndims == 3;
+    const int is_3d = ndims == 5;
+
+    jcp.prop_kind = cd.prop_kind;
+
+    jcp.ngroups = with_groups ? weights_d.dims()[0] : 1;
+    jcp.mb = src_d.dims()[0];
+
+    jcp.oc = dst_d.dims()[1] / jcp.ngroups;
+    jcp.ic = src_d.dims()[1] / jcp.ngroups;
+    jcp.id = is_3d ? src_d.dims()[2] : 1;
+    jcp.ih = is_1d ? 1 : src_d.dims()[ndims - 2];
+    jcp.iw = src_d.dims()[ndims - 1];
+    jcp.od = is_3d ? dst_d.dims()[2] : 1;
+    jcp.oh = is_1d ? 1 : dst_d.dims()[ndims - 2];
+    jcp.ow = dst_d.dims()[ndims - 1];
+
+    jcp.kd = is_3d ? weights_d.dims()[with_groups + 2] : 1;
+    jcp.kh = is_1d ? 1 : weights_d.dims()[with_groups + ndims - 2];
+    jcp.kw = weights_d.dims()[with_groups + ndims - 1];
+
+    jcp.f_pad = is_3d ? cd.padding[0][0] : 0;
+    jcp.t_pad = is_1d ? 0 : cd.padding[0][ndims - 4];
+    jcp.l_pad = cd.padding[0][ndims - 3];
+
+    jcp.stride_d = is_3d ? cd.strides[0] : 1;
+    jcp.stride_h = is_1d ? 1 : cd.strides[ndims - 4];
+    jcp.stride_w = cd.strides[ndims - 3];
+
+    jcp.dilate_d = is_3d ? cd.dilates[0] : 0;
+    jcp.dilate_h = is_1d ? 0 : cd.dilates[ndims - 4];
+    jcp.dilate_w = cd.dilates[ndims - 3];
+
+    jcp.with_bias = cd.bias_desc.format_kind != format_kind::undef
+            || cd.diff_bias_desc.format_kind != format_kind::undef;
+
+    jcp.is = jcp.ih * jcp.iw;
+    jcp.os = jcp.oh * jcp.ow;
+    jcp.ks = jcp.kh * jcp.kw * jcp.kd;
+
+    jcp.signed_input = src_d.data_type() == data_type::s8;
+
+    jcp.outer_threading = false;
+
+    jcp.zp = zero_point_config_t(attr);
+    jcp.b_pad = nstl::max((jcp.oh - 1) * jcp.stride_h
+                    + (jcp.kh - 1) * (jcp.dilate_h + 1)
+                    - (jcp.ih + jcp.t_pad - 1),
+            dim_t(0));
+    jcp.r_pad = nstl::max((jcp.ow - 1) * jcp.stride_w
+                    + (jcp.kw - 1) * (jcp.dilate_w + 1)
+                    - (jcp.iw + jcp.l_pad - 1),
+            dim_t(0));
+    jcp.e_pad = nstl::max((jcp.od - 1) * jcp.stride_d
+                    + (jcp.kd - 1) * (jcp.dilate_d + 1)
+                    - (jcp.id + jcp.f_pad - 1),
+            dim_t(0));
+
+    const bool zp_src_with_padding = jcp.zp.src_exists && padding_exists(jcp);
+
+    if (zp_src_with_padding) {
+        jcp.zp.src_pad_comp = zero_point_pad_comp_config_t(jcp.f_pad, jcp.e_pad,
+                jcp.t_pad, jcp.b_pad, jcp.l_pad, jcp.r_pad, jcp.stride_d,
+                jcp.stride_h, jcp.stride_w, jcp.od, jcp.oh, jcp.ow);
+    }
+
+    const auto set_or_check_tags
+            = [&](format_tag_t desired_src_tag, format_tag_t desired_dst_tag,
+                      bool is_src_s8) -> status_t {
+        using namespace format_tag;
+        auto src_tag = any, dst_tag = any;
+
+        if (src_d.format_kind() == format_kind::any) {
+            CHECK(memory_desc_init_by_tag(src_md, desired_src_tag));
+            src_tag = desired_src_tag;
+        } else {
+            src_tag = src_d.mb_stride_relaxed_match(
+                    nwc, nhwc, ndhwc, ncw, nchw, ncdhw);
+        }
+
+        if (dst_d.format_kind() == format_kind::any) {
+            CHECK(memory_desc_init_by_tag(dst_md, desired_dst_tag));
+            dst_tag = desired_dst_tag;
+        } else {
+            dst_tag = dst_d.mb_stride_relaxed_match(
+                    nwc, nhwc, ndhwc, ncw, nchw, ncdhw);
+        }
+
+        if (src_tag == format_tag::undef || dst_tag == format_tag::undef)
+            return status::unimplemented;
+        if (src_tag != dst_tag) return status::unimplemented;
+
+        if (jcp.with_bias && bias_md.format_kind == format_kind::any)
+            CHECK(memory_desc_init_by_tag(bias_md, x));
+
+        const bool is_nspc = utils::one_of(src_tag, nwc, nhwc, ndhwc);
+        jcp.is_nspc = is_nspc;
+
+        memory_desc_t want_wei_md = weights_md;
+        auto wei_tag = is_nspc
+                ? (with_groups ? utils::pick(ndims - 3, wigo, hwigo, dhwigo)
+                               : utils::pick(ndims - 3, wio, hwio, dhwio))
+                : (with_groups ? utils::pick(ndims - 3, goiw, goihw, goidhw)
+                               : utils::pick(ndims - 3, oiw, oihw, oidhw));
+        CHECK(memory_desc_init_by_tag(want_wei_md, wei_tag));
+
+        if (is_src_s8) {
+            want_wei_md.extra.flags = 0
+                    | memory_extra_flags::compensation_conv_s8s8
+                    | memory_extra_flags::scale_adjust;
+            want_wei_md.extra.compensation_mask
+                    = (1 << 0) + (with_groups ? (1 << 1) : 0);
+            want_wei_md.extra.scale_adjust
+                    = platform::s8s8_weights_scale_factor();
+        }
+
+        if (jcp.zp.src_exists) set_zp_src_comp_flags(want_wei_md, with_groups);
+
+        if (weights_md.format_kind == format_kind::any) {
+            weights_md = want_wei_md;
+            return status::success;
+        }
+        return (want_wei_md == weights_md) ? status::success
+                                           : status::unimplemented;
+    };
+
+    const bool is_bwd_d = jcp.prop_kind == backward_data;
+    const bool is_bwd_w = jcp.prop_kind == backward_weights;
+    const bool is_fwd = !is_bwd_d && !is_bwd_w;
+
+    const auto dst_max_size
+            = static_cast<size_t>(jcp.iw) * jcp.ih * jcp.id * jcp.ic * 4;
+    const auto src_max_size
+            = static_cast<size_t>(jcp.ow) * jcp.oh * jcp.od * jcp.oc * 4;
+    VDISPATCH_CONV_IC(dst_max_size <= INT_MAX && src_max_size <= INT_MAX,
+            VERBOSE_UNSUPPORTED_FEATURE,
+            "dst/scr size > INT_MAX is not supported");
+
+    bool is_int8_conv = (is_fwd ? utils::one_of(src_d.data_type(), s8, u8)
+                                : utils::one_of(dst_d.data_type(), s8, u8))
+            && weights_d.data_type() == s8;
+
+    auto default_dat_tag = is_int8_conv
+            ? utils::pick(ndims - 3, format_tag::nwc, format_tag::nhwc,
+                      format_tag::ndhwc)
+            : utils::pick(ndims - 3, format_tag::ncw, format_tag::nchw,
+                      format_tag::ncdhw);
+    const status_t check_tag_status = set_or_check_tags(default_dat_tag,
+            default_dat_tag, src_md.data_type == data_type::s8);
+    VDISPATCH_CONV_IC(check_tag_status == status::success,
+            VERBOSE_UNSUPPORTED_TAG_S, "src");
+
+    // Does int8 conv ever need to support ncsp input format
+    VDISPATCH_CONV_IC(
+            !(is_int8_conv && !src_d.matches_one_of_tag(default_dat_tag)),
+            VERBOSE_UNSUPPORTED_DT);
+
+    CHECK(attr.set_default_formats(&dst_md));
+
+    jcp.post_ops = attr.post_ops_;
+
+    const int eltwise_ind = jcp.post_ops.find(primitive_kind::eltwise);
+    jcp.with_eltwise = eltwise_ind != -1;
+    const int binary_ind = jcp.post_ops.find(primitive_kind::binary);
+    const int prelu_ind = jcp.post_ops.find(primitive_kind::prelu);
+    jcp.with_binary = !everyone_is(-1, binary_ind, prelu_ind);
+    const int sum_ind = jcp.post_ops.find(primitive_kind::sum);
+    jcp.with_sum = sum_ind != -1;
+
+    bool is_bf16_conv = false
+            || (is_fwd
+                    && utils::everyone_is(
+                            bf16, src_d.data_type(), weights_d.data_type()))
+            || (is_bwd_d
+                    && utils::everyone_is(
+                            bf16, dst_d.data_type(), weights_d.data_type()))
+            || (is_bwd_w
+                    && utils::everyone_is(
+                            bf16, src_d.data_type(), dst_d.data_type()));
+    VDISPATCH_CONV_IC(!(is_bf16_conv && !platform::has_data_type_support(bf16)),
+            VERBOSE_UNSUPPORTED_DT);
+
+    const int vlen = std::max(platform::get_vector_register_size(), 4);
+    const int data_size = (is_int8_conv ? 1 : (is_bf16_conv ? 2 : 4));
+    const int simd_w = vlen / data_size;
+
+    jcp.os_block = jcp.os;
+    jcp.os_nb_block = 1;
+    jcp.oc_block = jcp.oc;
+    jcp.ic_block = jcp.ic;
+    jcp.loop_order = gemm_loop_rlb;
+    jcp.nthr_oc = 1;
+
+    jcp.oh_block = is_fwd ? jcp.oh : jcp.ih;
+    jcp.ow_block = is_fwd ? jcp.ow : jcp.iw;
+
+    using namespace memory_tracking::names;
+    bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
+
+    // TODO: maybe mitigate blocking restriction
+    const auto L2 = platform::get_per_core_cache_size(2) / data_size;
+    const int gemm_thrld = 64 * 1024;
+
+    // Heuristic threshold for requested scratchpad memory to avoid
+    // possible crash on memory allocation:
+    // 1Gb or size of the buffers already used for this convolution proportional
+    // to the number of threads and multiplied by a heuristic coefficient (15)
+    const size_t zp_src_pad_comp_size = zp_src_with_padding
+            ? (jcp.oc * jcp.ngroups * jcp.zp.src_pad_comp.d
+                      * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w)
+            : 0u;
+    const size_t zp_src_comp_size = jcp.zp.src_is_common
+            ? utils::rnd_up(jcp.oc * jcp.ngroups,
+                      platform::get_cache_line_size() / sizeof(int))
+            : 0u;
+
+    const size_t weights_size = weights_d.size()
+            + (zp_src_comp_size + zp_src_pad_comp_size) * sizeof(int32_t);
+
+    static constexpr size_t scratchpad_limit_by_absolute_value = (size_t)1
+            << 30; // 1Gb
+    const size_t scratchpad_limit_by_tensor_sizes
+            = 15 * max_threads * (src_d.size() + weights_size + dst_d.size());
+    const size_t scratchpad_limit
+            = nstl::min(scratchpad_limit_by_absolute_value,
+                    scratchpad_limit_by_tensor_sizes);
+
+    if (is_int8_conv) {
+        if (is_fwd) {
+            jcp.im2col_sz
+                    = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                              jcp.od == jcp.id, jcp.stride_w == 1,
+                              jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
+                              !jcp.signed_input)
+                    ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os
+                    : 0;
+
+            dim_t wei_size = jcp.oc * jcp.ic * jcp.kh * jcp.kw;
+            bool is_blocking_applicable = true && is_fwd && jcp.im2col_sz
+                    && !is_3d && jcp.dilate_h == 0 && jcp.dilate_w == 0
+                    && !is_depthwise && wei_size < L2 / 2;
+            if (is_blocking_applicable) {
+                // looking for oh and ow blocking
+                dim_t h_block {jcp.oh_block}, w_block {jcp.ow_block};
+                dim_t ic = jcp.ic;
+                dim_t oc = jcp.oc;
+                dim_t iw = jcp.iw;
+                dim_t ow = jcp.ow;
+                dim_t oh = jcp.oh;
+                dim_t os = oh * ow;
+
+                // 1. cache requirement
+                dim_t row_size = ic * ow * jcp.ks + 2 * (ic * iw + oc * ow);
+                // Heuristic rule: gemm needed a lot of memory for internal
+                // usage
+                row_size *= 5;
+                // memory for accumulators
+                row_size += oc * ow * sizeof(uint32_t);
+                // memory for transposition
+                row_size += ic * iw;
+
+                h_block = nstl::max(
+                        dim_t(1), nstl::min(oh, div_up(dim_t(L2), row_size)));
+                if (h_block == 1) {
+                    dim_t col_size = ic * jcp.ks + 2 * (ic + oc);
+                    if (is_int8_conv) {
+                        col_size *= 5;
+                        col_size += oc * sizeof(uint32_t);
+                        col_size += ic;
+                    }
+                    w_block = nstl::max(dim_t(1),
+                            nstl::min(ow, div_up(dim_t(L2), col_size)));
+                }
+
+                // 2. threading requirement
+                if (h_block != oh)
+                    h_block = nstl::max(dim_t(1), rnd_dn(h_block, dim_t(4)));
+                if (w_block != ow)
+                    w_block = nstl::max(dim_t(1), rnd_dn(w_block, simd_w));
+
+                float thr_eff = 0.f;
+                float thr_eff_treshold = 0.9f;
+                if (w_block == ow) {
+                    do {
+                        dim_t nb_h = div_up(oh, h_block);
+                        dim_t work = jcp.ngroups * jcp.mb * jcp.od * nb_h;
+                        float disb = (float)oh / rnd_up(oh, h_block);
+                        thr_eff = (float)work / rnd_up(work, max_threads);
+                        thr_eff = (thr_eff + disb) / 2.f;
+                        if (thr_eff >= thr_eff_treshold) break;
+                        h_block = rnd_dn(h_block - 4, 4);
+                    } while (h_block > 0);
+                }
+                if (thr_eff
+                        < thr_eff_treshold) // we didn't find suitable h_block
+                {
+                    h_block = 1;
+                    int nb_h = oh;
+                    do {
+                        dim_t nb_w = div_up(ow, w_block);
+                        dim_t work_amount = jcp.ngroups * jcp.mb * nb_h * nb_w;
+                        float disb = (float)ow / rnd_up(ow, w_block);
+                        thr_eff = (float)work_amount
+                                / rnd_up(work_amount, max_threads);
+                        thr_eff = (thr_eff + disb) / 2.f;
+                        if (thr_eff > thr_eff_treshold) break;
+                        w_block = rnd_dn(w_block - simd_w, simd_w);
+                    } while (w_block > 0);
+                }
+                h_block = nstl::max(dim_t(1), h_block);
+                w_block = nstl::max(dim_t(1), w_block);
+                dim_t inner_work = div_up(os, simd_w) * div_up(oc, simd_w);
+                const float inner_thr_eff
+                        = (float)inner_work / rnd_up(inner_work, max_threads);
+                if (thr_eff >= inner_thr_eff / 2 && h_block > 0
+                        && w_block > 0) {
+                    jcp.oh_block = h_block;
+                    jcp.ow_block = w_block;
+                    jcp.outer_threading = true;
+                }
+                // updating jcp.im2col_sz
+                if (jcp.oh_block != 1) jcp.ow_block = ow;
+                jcp.im2col_sz
+                        = (ptrdiff_t)ic * jcp.ks * jcp.oh_block * jcp.ow_block;
+            }
+            //  For threading selection in bwd_d we do:
+            //  1. Rough estimation of efficiency for inner and outer threading.
+            //  2. Gemm size estimation in assumption that it does not work
+            //  so effectively for small sizes.
+            //  64K - this is heuristic gemm size per thread threshold.
+            const int gemm_thrld = 64 * 1024;
+            if (!jcp.outer_threading && !is_3d) {
+                bool is_depthwise
+                        = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
+                const dim_t outer_work = jcp.ngroups * jcp.mb;
+                const float outer_thr_eff
+                        = (float)outer_work / rnd_up(outer_work, max_threads);
+                const size_t inner_work
+                        = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w);
+                const float inner_thr_eff
+                        = (float)inner_work / rnd_up(inner_work, max_threads);
+                jcp.outer_threading
+                        = (is_depthwise
+                                  || (jcp.is / max_threads < 64 && jcp.mb != 1))
+                        && (outer_thr_eff / inner_thr_eff >= 1.f
+                                || (jcp.os * jcp.ic * jcp.oc) / max_threads
+                                        < gemm_thrld);
+            }
+            jcp.nthr = jcp.outer_threading ? max_threads : 1;
+            scratchpad.book<int8_t>(
+                    key_conv_gemm_col, jcp.nthr * jcp.im2col_sz);
+            scratchpad.book<int32_t>(key_conv_int_dat_in_acc_dt,
+                    jcp.nthr * jcp.oh_block * jcp.ow_block * jcp.oc);
+            scratchpad.book<int8_t>(
+                    key_conv_gemm_imtr, jcp.nthr * jcp.id * jcp.is * jcp.ic);
+        } else if (is_bwd_d) {
+            jcp.im2col_sz
+                    = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                              jcp.od == jcp.id, jcp.stride_w == 1,
+                              jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
+                              !jcp.signed_input)
+                    ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os * jcp.od
+                    : 0;
+
+            bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
+            const size_t outer_work = jcp.ngroups * jcp.mb;
+            const float outer_thr_eff
+                    = (float)outer_work / rnd_up(outer_work, max_threads);
+            const size_t inner_work
+                    = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w);
+            const float inner_thr_eff
+                    = (float)inner_work / rnd_up(inner_work, max_threads);
+            jcp.outer_threading = !is_3d
+                    && (is_depthwise
+                            || (jcp.is / max_threads < 64 && jcp.mb != 1))
+                    && (outer_thr_eff / inner_thr_eff >= 1.f
+                            || (jcp.is * jcp.ic * jcp.oc) / max_threads
+                                    < gemm_thrld);
+
+            jcp.nthr = jcp.outer_threading ? max_threads : 1;
+            scratchpad.book<int32_t>(
+                    key_conv_gemm_col, jcp.nthr * jcp.im2col_sz);
+            scratchpad.book<int32_t>(key_conv_int_dat_in_acc_dt,
+                    jcp.nthr * jcp.is * jcp.id * jcp.ic);
+        } else if (is_bwd_w) {
+            assert(!"unimplemented prop_kind");
+            return status::unimplemented;
+        }
+    } else {
+        jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                                jcp.od == jcp.id, jcp.stride_w == 1,
+                                jcp.stride_h == 1, jcp.stride_d == 1,
+                                jcp.ks == 1, !jcp.signed_input)
+                ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os
+                : 0;
+        if (jcp.is_nspc && is_fwd) {
+            const size_t wei_size
+                    = static_cast<size_t>(jcp.oc) * jcp.ic * jcp.kh * jcp.kw;
+            bool is_blocking_applicable = true && is_fwd && jcp.im2col_sz
+                    && !is_3d && jcp.dilate_h == 0 && jcp.dilate_w == 0
+                    && !is_depthwise && wei_size < static_cast<size_t>(L2) / 2;
+            // Logic for blocking for f32_nspc gemm convolution follows that of
+            // int8_nspc gemm convolution. Currently, not optimized for f32
+            // data type.
+            if (is_blocking_applicable) {
+                // looking for oh and ow blocking
+                size_t h_block = jcp.oh_block;
+                size_t w_block = jcp.ow_block;
+
+                const size_t ic = jcp.ic;
+                const size_t oc = jcp.oc;
+                const size_t iw = jcp.iw;
+                const size_t ow = jcp.ow;
+                const size_t oh = jcp.oh;
+                const size_t os = oh * ow;
+
+                // 1. cache requirement
+                size_t row_size = ic * ow * jcp.ks * data_size
+                        + 2 * (ic * iw + oc * ow) * data_size;
+                // Heuristic rule: gemm needs a lot of memory for internal
+                // usage
+                row_size *= 5;
+                // memory for accumulators
+                row_size += oc * ow * data_size;
+                // memory for transposition
+                row_size += ic * iw * data_size;
+
+                const size_t L2_rows = div_up(L2, row_size);
+                h_block = saturate(size_t {1}, L2_rows, oh);
+                if (h_block == 1) {
+                    size_t col_size = ic * jcp.ks * data_size
+                            + 2 * (ic + oc) * data_size;
+                    const size_t L2_cols = div_up(L2, col_size);
+                    w_block = saturate(size_t {1}, L2_cols, ow);
+                }
+
+                // 2. threading requirement
+                if (h_block != oh)
+                    h_block = nstl::max(size_t {1}, rnd_dn(h_block, 4));
+                if (w_block != ow)
+                    w_block = nstl::max(size_t {1}, rnd_dn(w_block, simd_w));
+
+                float thr_eff = 0.f;
+                float thr_eff_treshold = 0.9f;
+                if (w_block == ow) {
+                    do {
+                        size_t nb_h = div_up(oh, h_block);
+                        size_t work = jcp.ngroups * jcp.mb * jcp.od * nb_h;
+                        float disb = (float)oh / rnd_up(oh, h_block);
+                        thr_eff = (float)work / rnd_up(work, max_threads);
+                        thr_eff = (thr_eff + disb) / 2.f;
+                        if (thr_eff >= thr_eff_treshold) break;
+
+                        if (h_block < 4)
+                            h_block = 0;
+                        else
+                            h_block = rnd_dn(h_block - 4, 4);
+                    } while (h_block > 0);
+                }
+                if (thr_eff
+                        < thr_eff_treshold) // we didn't find suitable h_block
+                {
+                    h_block = 1;
+                    size_t nb_h = oh;
+                    do {
+                        size_t nb_w = div_up(ow, w_block);
+                        size_t work_amount = jcp.ngroups * jcp.mb * nb_h * nb_w;
+                        float disb = (float)ow / rnd_up(ow, w_block);
+                        thr_eff = (float)work_amount
+                                / rnd_up(work_amount, max_threads);
+                        thr_eff = (thr_eff + disb) / 2.f;
+                        if (thr_eff > thr_eff_treshold) break;
+
+                        if (w_block < static_cast<size_t>(simd_w))
+                            w_block = 0;
+                        else
+                            w_block = rnd_dn(w_block - simd_w, simd_w);
+                    } while (w_block > 0);
+                }
+                h_block = nstl::max(size_t {1}, h_block);
+                w_block = nstl::max(size_t {1}, w_block);
+                const size_t inner_work
+                        = div_up(os, simd_w) * div_up(oc, simd_w);
+                const float inner_thr_eff
+                        = (float)inner_work / rnd_up(inner_work, max_threads);
+                if (thr_eff >= inner_thr_eff / 2 && h_block > 0
+                        && w_block > 0) {
+                    jcp.oh_block = static_cast<int>(h_block);
+                    jcp.ow_block = static_cast<int>(w_block);
+                    jcp.outer_threading = true;
+                }
+                // updating jcp.im2col_sz
+                if (jcp.oh_block != 1) jcp.ow_block = static_cast<int>(ow);
+                jcp.im2col_sz
+                        = (ptrdiff_t)ic * jcp.ks * jcp.oh_block * jcp.ow_block;
+            }
+            //  For threading selection in fwd_d we do:
+            //  1. Rough estimation of efficiency for inner and outer threading.
+            //  2. Gemm size estimation, under the assumption that gemm is
+            //  not as efficient for small sizes.
+            //  64K is a heuristic threshold for the gemm size per thread.
+            constexpr size_t gemm_thrld = 64 * 1024;
+            if (!jcp.outer_threading && !is_3d) {
+                bool is_depthwise
+                        = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
+                const size_t outer_work = jcp.ngroups * jcp.mb;
+                const float outer_thr_eff
+                        = (float)outer_work / rnd_up(outer_work, max_threads);
+                const size_t inner_work
+                        = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w);
+                const float inner_thr_eff
+                        = (float)inner_work / rnd_up(inner_work, max_threads);
+                jcp.outer_threading
+                        = (is_depthwise
+                                  || (jcp.is / max_threads < 64 && jcp.mb != 1))
+                        && (outer_thr_eff / inner_thr_eff >= 1.f
+                                || (static_cast<size_t>(jcp.os) * jcp.ic
+                                           * jcp.oc)
+                                                / max_threads
+                                        < gemm_thrld);
+            }
+            jcp.nthr = jcp.outer_threading ? max_threads : 1;
+            const size_t gemm_col_datatype_size
+                    = is_bf16_conv ? sizeof(bfloat16_t) : sizeof(float);
+
+            scratchpad.book(key_conv_gemm_col, jcp.nthr * jcp.im2col_sz,
+                    gemm_col_datatype_size);
+            if (is_bf16_conv) {
+                scratchpad.book<float>(key_conv_gemm_acc,
+                        jcp.nthr * static_cast<size_t>(jcp.oh_block)
+                                * jcp.ow_block * jcp.oc);
+            }
+
+            scratchpad.book(key_conv_gemm_imtr,
+                    jcp.nthr * static_cast<size_t>(jcp.id) * jcp.is * jcp.ic,
+                    gemm_col_datatype_size);
+            if (is_bf16_conv && jcp.with_bias
+                    && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
+                            cd.bias_desc.data_type)) {
+                scratchpad.book<float>(
+                        key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc);
+            }
+
+        } else if (!jcp.is_nspc && is_fwd) {
+            const dim_t sh = jcp.stride_h;
+            const dim_t sw = jcp.stride_w;
+            const dim_t spatial = jcp.mb * jcp.ngroups * jcp.od * jcp.os;
+            dim_t K = jcp.ic * jcp.ks;
+
+            // There are some heuristics in the definition of the
+            // inner/outer threading cross point due to the nature of the
+            // gemm implementation, which we cannot control
+            bool is_blocking_applicable = true && !is_3d
+                    && (!jcp.im2col_sz
+                            // spatial is small
+                            || spatial >= max_threads * simd_w
+                            // inner threading work is greater than outer
+                            // threading work
+                            || jcp.os < jcp.mb * jcp.ngroups * jcp.od
+                            // im2col is big
+                            || (sw == 1 && K <= 0.05 * jcp.oc))
+                    // heuristic condition
+                    && (jcp.im2col_sz
+                            || (jcp.ic / jcp.oc < 42
+                                    && jcp.ic * jcp.oc * jcp.is < 1024));
+
+            if (is_blocking_applicable) {
+                const dim_t min_oc_block = 8;
+                const dim_t min_os_block = simd_w;
+                const float non_cache_access = 20;
+                const float strided_im2col_k = 8;
+                const float thr_disb_k = 8;
+                const float thr_mem_eff_k {1}, oc_disb_k {1}, os_disb_k {1},
+                        ic_disb_k {1}, reg_osb_disb_k {1}, gemm_eff_k {0.5},
+                        gemm_calc_eff_k {1};
+                const float k_sum = thr_disb_k + oc_disb_k + os_disb_k
+                        + ic_disb_k + reg_osb_disb_k + thr_mem_eff_k
+                        + gemm_eff_k + gemm_calc_eff_k;
+
+                auto calc_max_icb
+                        = [=](dim_t nthr_oc, dim_t ocb, dim_t osb,
+                                  dim_t oc_per_thr, dim_t os_per_thr) {
+                    const dim_t block_out_size = ocb * osb; // output tile
+                    // TODO: rough input estimate; needs a more precise formula
+                    // when the stride is larger than the kernel size
+                    const dim_t inp_row_size = sh * sw * osb;
+                    dim_t max_icb = 1;
+                    if (jcp.im2col_sz) {
+                        const dim_t col_row_size = jcp.ks * osb;
+                        if (osb >= os_per_thr) { // one pass over os
+                            const dim_t wei_col_size = jcp.ks * ocb;
+                            max_icb = L2 / (inp_row_size + col_row_size);
+                            if (ocb < oc_per_thr) {
+                                max_icb = nstl::min(max_icb,
+                                        (L2 - block_out_size)
+                                                / (col_row_size
+                                                        + wei_col_size));
+                            }
+                        } else {
+                            const dim_t wei_col_size = jcp.ks * oc_per_thr;
+                            max_icb = (L2 - block_out_size)
+                                    / (inp_row_size + col_row_size
+                                            + wei_col_size);
+                        }
+                    } else {
+                        if (osb >= os_per_thr)
+                            max_icb = L2 / inp_row_size;
+                        else {
+                            const dim_t wei_col_size = jcp.ks * oc_per_thr;
+                            max_icb = L2 / (inp_row_size + wei_col_size);
+                        }
+                    }
+                    if (max_icb < jcp.ic) { // not all ic fit in one block
+                        if (jcp.im2col_sz) {
+                            const dim_t col_row_size = jcp.ks * osb;
+                            const dim_t wei_col_size = jcp.ks * oc_per_thr;
+                            max_icb = (L2 - block_out_size)
+                                    / (inp_row_size + col_row_size
+                                            + wei_col_size);
+                        }
+                    }
+                    return max_icb;
+                };
+
+                dim_t best_ocb {1}, best_osb {1};
+                dim_t best_nthr_oc {1};
+                dim_t best_icb {jcp.ic};
+                float best_thr_eff = 0;
+
+                auto try_cfg = [&](dim_t nthr_oc, dim_t ocb, dim_t osb) {
+                    // For the given nthr_oc, oc block and os block:
+                    // 1. find the ic block that fits into cache
+                    // 2. estimate efficiency based on rules and heuristics:
+                    // - minimize the im2col cost
+                    // - maximize the ratio of FMA count to data size
+                    // - gemm works better if M is divisible by 48 and N by 8
+
+                    const dim_t max_oc = div_up(jcp.oc, nthr_oc);
+                    const dim_t min_oc = nstl::max(dim_t(1), jcp.oc / nthr_oc);
+                    const dim_t max_os
+                            = div_up(spatial, (dim_t)(max_threads / nthr_oc));
+                    ocb = utils::saturate(min_oc_block, max_oc, ocb);
+                    osb = utils::saturate(min_os_block, max_os, osb);
+
+                    // The computation of max_thr_size and min_thr_size is
+                    // based on work balance using:
+                    // balance2D(max_threads, i, spatial, sp_start, sp_end,
+                    //            jcp.oc, oc_start, oc_end, nthr_oc);
+                    size_t max_thr_size = 1;
+                    {
+                        const dim_t min_os = div_up(
+                                spatial, (dim_t)div_up(max_threads, nthr_oc));
+                        /* --- compute max_thr_size ------------
+                         may not necessarily be (max_oc * max_os)
+                         thr_size = thr_oc * (spatial / nthrs_in_slice);
+                         with spatial constant, thr_size is largest when
+                            (A: thr_oc is max) and (B: nthrs_in_slice is min)
+                        */
+                        if (jcp.oc % nthr_oc > max_threads % nthr_oc) {
+                            // If (A) and (B) are true together, then it is the
+                            // global max
+                            max_thr_size = max_oc * max_os;
+                        } else {
+                            const size_t oc_max_os_min = max_oc * min_os;
+                            const size_t oc_min_os_max = min_oc * max_os;
+                            max_thr_size
+                                    = nstl::max(oc_max_os_min, oc_min_os_max);
+                        }
+                    }
+
+                    size_t min_thr_size {1};
+                    {
+                        const dim_t min_os = nstl::max(dim_t(1),
+                                spatial / div_up(max_threads, nthr_oc));
+                        /* --- compute min_thr_size ------------
+                         may not necessarily be (min_oc * min_os)
+                         thr_size = thr_oc * (spatial / nthrs_in_slice);
+                         with spatial constant, thr_size is smallest when
+                            (A: thr_oc is min) and (B: nthrs_in_slice is max)
+                        */
+                        if (max_threads % nthr_oc > jcp.oc % nthr_oc) {
+                            // If (A) and (B) are true together, then it is the
+                            // global min
+                            min_thr_size = min_oc * min_os;
+                        } else {
+                            const size_t oc_max_os_min = max_oc * min_os;
+                            const size_t oc_min_os_max = min_oc
+                                    * (size_t)(spatial
+                                            / (dim_t)(max_threads / nthr_oc));
+                            min_thr_size
+                                    = nstl::min(oc_max_os_min, oc_min_os_max);
+                        }
+                    }
+                    auto thr_disb = (float)min_thr_size / max_thr_size;
+
+                    const dim_t oc_per_thr = max_oc;
+                    const dim_t os_per_thr = max_os;
+                    ocb = nstl::min(oc_per_thr, ocb);
+                    const dim_t os_max = nstl::min(jcp.os, os_per_thr);
+                    osb = nstl::min(os_max, osb);
+
+                    // -- selecting icb ---------------------
+                    dim_t max_ic_block = calc_max_icb(
+                            nthr_oc, ocb, osb, oc_per_thr, os_per_thr);
+                    // if we don't fit into cache then access to memory is
+                    // expensive
+                    dim_t mem_access_cost
+                            = (max_ic_block < 1) ? non_cache_access : 1;
+                    max_ic_block = nstl::max(dim_t(1), max_ic_block);
+                    dim_t icb = nstl::max(
+                            dim_t(1), jcp.ic / div_up(jcp.ic, max_ic_block));
+                    dim_t nb_ic = div_up(jcp.ic, icb);
+                    dim_t kb = icb * jcp.ks;
+                    dim_t kb_caligned = rnd_up(kb, simd_w);
+
+                    // -- mem efficiency ------------
+                    const size_t out_size
+                            = oc_per_thr * rnd_up(os_per_thr, simd_w);
+                    const size_t out_ops = mem_access_cost * out_size
+                            * ((icb == jcp.ic) ? 1 : (2 * nb_ic - 1));
+                    const dim_t osb_caligned = rnd_up(osb, simd_w);
+                    const size_t inp_size
+                            = jcp.ic * rnd_up(os_per_thr * sh * sw, simd_w);
+                    size_t inp_ops = 0;
+                    size_t col_ops = 0;
+                    // TODO: simplify calculations
+                    if (jcp.im2col_sz) {
+                        inp_ops = mem_access_cost * jcp.ks * inp_size;
+                        const float col_tail_koeff = (float)osb_caligned / osb;
+                        col_ops = mem_access_cost
+                                * (jcp.ks * inp_size * col_tail_koeff
+                                        + jcp.ks * inp_size * col_tail_koeff);
+                        if (sw != 1) // im2col with strides is much slower
+                            col_ops *= strided_im2col_k;
+                    } else {
+                        inp_ops = mem_access_cost * jcp.ks * inp_size;
+                    }
+                    // TODO: what about groups?
+                    const size_t wei_size = oc_per_thr * rnd_up(K, simd_w);
+                    const size_t wei_ops = mem_access_cost * wei_size;
+                    // ratio of real FMA to number of memory ops
+                    const float thr_mem_eff
+                            = (((float)os_per_thr / simd_w) * oc_per_thr * K)
+                            / (inp_ops + col_ops + wei_ops + out_ops);
+
+                    auto oc_disb = (float)oc_per_thr / rnd_up(oc_per_thr, ocb);
+                    auto os_disb = (float)os_max / rnd_up(os_max, osb);
+                    auto ic_disb = (float)jcp.ic / rnd_up(jcp.ic, icb);
+
+                    auto reg_osb_disb = (float)osb / rnd_up(osb, 3 * simd_w);
+
+                    // Heuristics
+                    const float gemm_eff = ((float)osb * ocb * kb)
+                            / ((float)oc_per_thr * os_per_thr * K);
+
+                    // ratio of FMA count to memory size
+                    const float gemm_calc_eff
+                            = (((float)osb / simd_w) * ocb * kb)
+                            / (osb_caligned * kb + ocb * kb_caligned
+                                    + ocb * osb_caligned);
+                    // optimization: skip pow when the corresponding weight is 1
+                    const float res_eff = pow(pow(thr_disb, thr_disb_k)
+                                    * oc_disb // pow(oc_disb, oc_disb_k)
+                                    * os_disb // pow(os_disb, os_disb_k)
+                                    * ic_disb // pow(ic_disb, ic_disb_k)
+                                    // pow(reg_osb_disb, reg_osb_disb_k)
+                                    * reg_osb_disb
+                                    //pow(thr_mem_eff, thr_mem_eff_k)
+                                    * thr_mem_eff
+                                    //pow(gemm_calc_eff, gemm_calc_eff_k)
+                                    * pow(gemm_eff, gemm_eff_k) * gemm_calc_eff,
+                            1.f / k_sum);
+
+                    if (res_eff > best_thr_eff) { // keep the best config so far
+                        best_thr_eff = res_eff;
+                        best_nthr_oc = nthr_oc;
+                        best_ocb = ocb;
+                        best_osb = osb;
+                        best_icb = icb;
+                    }
+                };
+
+                auto explore_cfg = [&](dim_t nthr_oc, dim_t ocb, dim_t osb) {
+                    try_cfg(nthr_oc, ocb, osb);
+                    // try a few rounded variants too: eff is better when ocb is
+                    // a multiple of 8 and osb a multiple of 48 or min_os_block.
+                    try_cfg(nthr_oc, rnd_dn(ocb, 8), rnd_dn(osb, 48));
+                    try_cfg(nthr_oc, rnd_up(ocb, 8), rnd_dn(osb, 48));
+                    try_cfg(nthr_oc, rnd_up(ocb, 8), rnd_up(osb, min_os_block));
+                    try_cfg(nthr_oc, rnd_up(ocb, 8), rnd_up(osb, 48));
+                };
+
+                for (dim_t nthr_oc = 1; nthr_oc <= max_threads; ++nthr_oc) {
+                    const dim_t max_oc_per_thr = div_up(jcp.oc, nthr_oc);
+                    dim_t max_os_per_thr
+                            = div_up(spatial, max_threads / nthr_oc);
+                    dim_t ocb {1}, osb {1}, icb {1};
+                    if (jcp.im2col_sz) {
+                        try_cfg(nthr_oc, max_oc_per_thr, max_os_per_thr);
+                        if ((best_ocb == max_oc_per_thr)
+                                && (best_osb == max_os_per_thr)
+                                && (best_icb == jcp.ic)) {
+                            // best case scenario
+                            continue;
+                        }
+
+                        /*
+                          memory eq from calc_max_icb():
+                            max_icb = (L2 - block_out_size)
+                                    / (inp_row_size + col_row_size
+                                            + wei_col_size);
+                            icb*sh*sw*osb + icb*jcp.ks*osb +
+                                jcp.ks*max_oc_per_thr*icb + osb *ocb = L2
+
+                            a_k*icb*osb + b_k*icb + osb*ocb = L2
+                            We would like to maximize icb*osb*ocb (FMA).
+
+                            Unfortunately, above eq and constraint doesn't have
+                            a single solution. So, based on experiments we try
+                            few scenarios.
+                            1. icb = jcp.ic
+                            2. Solving the constraint eq we get
+                              osb = (L2 - 2*b_k*icb)/(2*a_k*icb) >= min_oc_block
+                              => icb <= (L2)/(2* min_oc_block * a_k + 2 * b_k)
+                            3. Maximize channel compute:
+                              ocb = max_oc_per_thr;
+                              icb = jcp.ic;
+                        */
+                        dim_t a_k = sh * sw + jcp.ks;
+                        dim_t b_k = jcp.ks * max_oc_per_thr;
+
+                        // Note 1:
+                        icb = jcp.ic;
+                        ocb = utils::saturate(min_oc_block, max_oc_per_thr,
+                                (L2 - a_k * icb * min_os_block - b_k * icb)
+                                        / min_os_block);
+                        osb = utils::saturate(min_os_block, max_os_per_thr,
+                                (L2 - b_k * icb) / (a_k * icb + ocb));
+                        explore_cfg(nthr_oc, ocb, osb);
+
+                        // Note 2:
+                        const dim_t icb_max = nstl::max(dim_t(1),
+                                L2 / (2 * min_oc_block * a_k + 2 * b_k));
+                        if (icb_max < jcp.ic) {
+                            // adjust icb, such that it is evenly distributed.
+                            icb = jcp.ic
+                                    / nstl::max(dim_t(1), jcp.ic / icb_max);
+                            osb = nstl::max(dim_t(1),
+                                    (L2 - 2 * b_k * icb) / (2 * icb * a_k));
+                            ocb = L2 / 2 / osb;
+
+                            if (ocb > max_oc_per_thr) {
+                                ocb = max_oc_per_thr;
+                                // reduce mem eq by making ocb constant. we get
+                                osb = utils::saturate(min_os_block,
+                                        max_os_per_thr,
+                                        (L2 - b_k * icb) / (a_k * icb + ocb));
+                            } else if (osb > max_os_per_thr) {
+                                // reduce mem eq by making osb constant. we get
+                                osb = max_os_per_thr;
+                                ocb = utils::saturate(min_oc_block,
+                                        max_oc_per_thr,
+                                        (L2 - a_k * icb * osb - b_k * icb)
+                                                / (osb));
+                            }
+
+                            explore_cfg(nthr_oc, ocb, osb);
+                        }
+
+                        // Note 3:
+                        ocb = max_oc_per_thr;
+                        icb = jcp.ic;
+                        osb = nstl::max(min_os_block,
+                                rnd_dn((L2 - b_k * icb) / (a_k * icb + ocb),
+                                        min_os_block));
+                        explore_cfg(nthr_oc, ocb, osb);
+
+                    } else {
+                        // from calc_max_icb, memory eq is independent of ocb.
+                        // So, set it to maximum.
+                        ocb = max_oc_per_thr;
+                        osb = (L2 - jcp.ks * jcp.ic) / (sh * sw * jcp.ic);
+                        explore_cfg(nthr_oc, ocb, osb);
+                    }
+                }
+                jcp.outer_threading = true;
+                jcp.nthr_oc = best_nthr_oc;
+                jcp.oc_block = best_ocb;
+                jcp.os_block = best_osb;
+                jcp.ic_block = best_icb;
+
+                // TODO: define loop order
+                // if im2col then gemm_loop_rlb and gemm_loop_lrb looks
+                // preferable otherwise other loop orders are possible
+                jcp.loop_order = gemm_loop_rlb;
+            } else {
+                const size_t outer_work_amount = jcp.ngroups * jcp.mb * jcp.od;
+                const float outer_thr_eff = (float)outer_work_amount
+                        / rnd_up(outer_work_amount, max_threads);
+                const size_t inner_work_amount
+                        = div_up(jcp.os, simd_w) * div_up(jcp.oc, simd_w);
+                const float inner_thr_eff = (float)inner_work_amount
+                        / rnd_up(inner_work_amount, max_threads);
+                jcp.outer_threading = jcp.os / max_threads < 512
+                        && IMPLICATION(
+                                jcp.od == 1, jcp.mb != 1 || jcp.ngroups > 2)
+                        && (outer_thr_eff / inner_thr_eff >= 1.f
+                                || (jcp.os * jcp.ic * jcp.oc) / max_threads
+                                        < gemm_thrld);
+            }
+            jcp.os_nb_block = div_up(jcp.os, jcp.os_block);
+
+            // BF16: other loops should be explored for potential
+            // performance speedup, but BF16-dst post-processing implementation
+            // would require enabling this support.
+            if (is_bf16_conv) jcp.loop_order = gemm_loop_lbr;
+
+            if (jcp.im2col_sz)
+                jcp.im2col_sz = (ptrdiff_t)jcp.ic_block * jcp.ks * jcp.os_block;
+        } else if (jcp.is_nspc && is_bwd_d) {
+            jcp.im2col_sz
+                    = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                              jcp.od == jcp.id, jcp.stride_w == 1,
+                              jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
+                              !jcp.signed_input)
+                    ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os * jcp.od
+                    : 0;
+
+            bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
+            const size_t outer_work = jcp.ngroups * jcp.mb;
+            const float outer_thr_eff
+                    = (float)outer_work / rnd_up(outer_work, max_threads);
+            const size_t inner_work
+                    = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w);
+            const float inner_thr_eff
+                    = (float)inner_work / rnd_up(inner_work, max_threads);
+            jcp.outer_threading = !is_3d
+                    && (is_depthwise
+                            || (jcp.is / max_threads < 64 && jcp.mb != 1))
+                    && (outer_thr_eff / inner_thr_eff >= 1.f
+                            || (static_cast<size_t>(jcp.is) * jcp.ic * jcp.oc)
+                                            / max_threads
+                                    < gemm_thrld);
+
+            jcp.nthr = jcp.outer_threading ? max_threads : 1;
+            scratchpad.book<float>(key_conv_gemm_col, jcp.nthr * jcp.im2col_sz);
+            if (jcp.ngroups > 1 || is_bf16_conv)
+                scratchpad.book<float>(key_conv_gemm_acc,
+                        jcp.nthr * static_cast<size_t>(jcp.is) * jcp.id
+                                * jcp.ic);
+        } else if (!jcp.is_nspc && is_bwd_d) {
+            const size_t outer_work_amount = jcp.ngroups * jcp.mb;
+            const float outer_thr_eff = (float)outer_work_amount
+                    / rnd_up(outer_work_amount, max_threads);
+            const size_t inner_work
+                    = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w);
+            const float inner_thr_eff
+                    = (float)inner_work / rnd_up(inner_work, max_threads);
+            jcp.outer_threading = (jcp.os / max_threads < 512 || jcp.ks < 64)
+                    && (jcp.mb != 1 || jcp.ngroups > 2)
+                    && (outer_thr_eff / inner_thr_eff >= 1.f
+                            || (jcp.is * jcp.ic * jcp.oc) / max_threads
+                                    < gemm_thrld);
+        } else if (jcp.is_nspc && is_bwd_w) {
+            jcp.im2col_sz
+                    = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                              jcp.od == jcp.id, jcp.stride_w == 1,
+                              jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
+                              !jcp.signed_input)
+                    ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os
+                    : 0;
+            const size_t gemm_col_datatype_size
+                    = is_bf16_conv ? sizeof(bfloat16_t) : sizeof(float);
+
+            // Potential scratchpad memory requirement when outer threading is
+            // enabled during f32/bf16 BWD_W nspc convolution
+            size_t thr_mem_estimate = max_threads
+                    * (gemm_col_datatype_size * jcp.im2col_sz
+                            + gemm_col_datatype_size * jcp.id * jcp.is * jcp.ic
+                            + sizeof(float) * weights_d.size());
+            if (is_bf16_conv) {
+                thr_mem_estimate += sizeof(float) * weights_d.size();
+                if (jcp.with_bias
+                        && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
+                                cd.bias_desc.data_type))
+                    thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc;
+            }
+            const bool outer_threading_mem_ok
+                    = thr_mem_estimate < scratchpad_limit;
+
+            jcp.outer_threading = outer_threading_mem_ok
+                    && jcp.os / max_threads < 256
+                    && (jcp.mb != 1 || jcp.ngroups > 2);
+            jcp.nthr = jcp.outer_threading ? max_threads : 1;
+
+            scratchpad.book(key_conv_gemm_col, jcp.nthr * jcp.im2col_sz,
+                    gemm_col_datatype_size);
+
+            jcp.need_wei_reduction = jcp.mb != 1 && jcp.nthr != 1;
+            scratchpad.book<float>(
+                    key_conv_wei_reduction, jcp.nthr * weights_d.size());
+            scratchpad.book(key_conv_gemm_imtr,
+                    static_cast<size_t>(jcp.nthr) * jcp.id * jcp.is * jcp.ic,
+                    gemm_col_datatype_size);
+            if (is_bf16_conv) {
+                size_t conv_acc_buffer_size = weights_d.size();
+                scratchpad.book<float>(
+                        key_conv_int_dat_in_acc_dt, conv_acc_buffer_size);
+            }
+            if ((is_bf16_conv) && jcp.with_bias
+                    && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
+                            cd.bias_desc.data_type))
+                scratchpad.book<float>(
+                        key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc);
+        } else if (!jcp.is_nspc && is_bwd_w) {
+            // Potential scratchpad memory requirement when outer threading is
+            // enabled during f32/bf16 BWD_W blocked convolution
+            size_t thr_mem_estimate
+                    = sizeof(float) * max_threads * weights_d.size();
+            if (is_bf16_conv) {
+                thr_mem_estimate += sizeof(float) * weights_d.size();
+                if (jcp.with_bias
+                        && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
+                                cd.bias_desc.data_type))
+                    thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc;
+            }
+            const size_t gemm_col_datatype_size
+                    = is_bf16_conv ? sizeof(bfloat16_t) : sizeof(float);
+            // Minimum memory requirement as os_block >= simd_w
+            thr_mem_estimate += gemm_col_datatype_size * max_threads * jcp.ic
+                    * jcp.ks * simd_w;
+
+            const bool outer_threading_mem_ok
+                    = thr_mem_estimate < scratchpad_limit;
+            jcp.outer_threading = outer_threading_mem_ok
+                    && jcp.os / max_threads < 256
+                    && (jcp.mb != 1 || jcp.ngroups > 2);
+        }
+
+        if (!jcp.is_nspc) {
+            jcp.nthr = jcp.outer_threading ? max_threads : 1;
+            const int sizeof_cacheline_float = 16;
+            if (is_bwd_w) {
+                jcp.need_wei_reduction = jcp.mb != 1 && jcp.nthr != 1;
+                scratchpad.book<float>(
+                        key_conv_wei_reduction, jcp.nthr * weights_d.size());
+            }
+
+            if (is_bf16_conv) {
+                size_t conv_acc_buffer_size = 0;
+                if (is_fwd)
+                    conv_acc_buffer_size = jcp.nthr
+                            * rnd_up(jcp.oc_block * jcp.os_block,
+                                    sizeof_cacheline_float);
+                else if (is_bwd_d)
+                    conv_acc_buffer_size = jcp.nthr
+                            * rnd_up(jcp.ic * jcp.ih * jcp.iw * jcp.id,
+                                    sizeof_cacheline_float);
+                else if (is_bwd_w)
+                    conv_acc_buffer_size = weights_d.size();
+                scratchpad.book<float>(
+                        key_conv_int_dat_in_acc_dt, conv_acc_buffer_size);
+                if ((is_fwd || is_bwd_w) && jcp.with_bias
+                        && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
+                                cd.bias_desc.data_type))
+                    scratchpad.book<float>(key_conv_bias_bf16_convert_wsp,
+                            jcp.ngroups * jcp.oc);
+            }
+
+            const size_t gemm_col_datatype_size = is_bf16_conv && !is_bwd_d
+                    ? sizeof(bfloat16_t)
+                    : sizeof(float);
+            size_t gemm_col_memory_sz = jcp.nthr * jcp.im2col_sz;
+
+            if (is_bwd_d || is_bwd_w) {
+                // check available memory
+                VDISPATCH_CONV_IC(scratchpad_limit >= scratchpad.size(),
+                        VERBOSE_SCRATCHPAD_LIMIT);
+
+                const size_t available_mem
+                        = scratchpad_limit - scratchpad.size();
+                if (available_mem
+                        < gemm_col_memory_sz * gemm_col_datatype_size) {
+                    // Required memory in this scenario overflows the
+                    // available memory due to the large dimensions.
+                    const int min_os_block = simd_w;
+                    const int max_os_block = (int)available_mem
+                            / ((int)gemm_col_datatype_size * jcp.nthr
+                                    * (jcp.im2col_sz / jcp.os));
+                    // Choose an arbitrary small coefficient to reduce spatial
+                    // dimensions.
+                    // TODO: better heuristic to determine os_block based
+                    // on cache efficiency
+                    float _coef = is_bwd_w ? 0.05 : 0.1;
+                    jcp.os_block = nstl::max(
+                            min_os_block, (int)(max_os_block * _coef));
+                    jcp.os_nb_block = div_up(jcp.os, jcp.os_block);
+                    jcp.im2col_sz = (ptrdiff_t)jcp.ic * jcp.ks * jcp.os_block;
+                    gemm_col_memory_sz = jcp.nthr * jcp.im2col_sz;
+                }
+            }
+            scratchpad.book(key_conv_gemm_col, gemm_col_memory_sz,
+                    gemm_col_datatype_size);
+        }
+    }
+
+    jcp.bias_data_type = cd.bias_desc.data_type;
+    jcp.dst_data_type = dst_md.data_type;
+    jcp.sum_data_type = jcp.post_ops.get_sum_dt(jcp.dst_data_type);
+    jcp.dst_os_stride = dst_d.is_blocking_desc()
+            ? dst_d.blocking_desc().strides[ndims - 1]
+            : 0;
+    jcp.scale_idx_mult = attr.scales_.get_mask(DNNL_ARG_WEIGHTS) > 0;
+    jcp.with_dst_scale = !attr.scales_.has_default_values(DNNL_ARG_DST);
+    book_precomputed_scales(scratchpad, attr.scales_, jcp.ngroups * jcp.oc);
+
+    if (jcp.zp.src_exists) {
+        const auto size = zp_src_comp_size + zp_src_pad_comp_size;
+        if (size) scratchpad.book<int32_t>(key_conv_gemm_zp_src_comp, size);
+    }
+
+    VDISPATCH_CONV_IC(
+            scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT);
+
+    return status::success;
+}
+
+// Splits `ithr` into a group index (ithr / nthr_mb) and a minibatch index
+// (ithr % nthr_mb); both become -1 when the group index is >= ngroups.
+void bwd_weights_balance(int ithr, int nthr, int ngroups, int mb, int &ithr_g,
+        int &nthr_g, int &ithr_mb, int &nthr_mb) {
+    nthr_g = nstl::min(ngroups, nthr);
+    nthr_mb = nstl::min(mb, nthr / nthr_g);
+    const int group_slot = ithr / nthr_mb;
+    const bool is_idle = group_slot >= ngroups;
+    ithr_g = is_idle ? -1 : group_slot;
+    ithr_mb = is_idle ? -1 : ithr % nthr_mb;
+}
+
+// Sums the nthr partial buffers in `weights_reduce_ws` into `weights` over
+// the [first, last) range that balance211 returns for this thread.
+void bwd_weights_reduction_par_ncsp(int ithr, int nthr,
+        const conv_gemm_conf_t &jcp, const float *weights_reduce_ws,
+        float *weights) {
+    const size_t weights_g_size = jcp.ic * jcp.oc * jcp.ks;
+    size_t first {0}, last {0};
+    balance211(weights_g_size, nthr, ithr, first, last);
+    for (int t = 0; t < nthr; ++t) {
+        const float *partial = weights_reduce_ws + t * weights_g_size;
+        for (size_t e = first; e < last; ++e)
+            weights[e] = (t == 0 ? 0 : weights[e]) + partial[e];
+    }
+}
+
+void bwd_weights_reduction_par_nspc(int ithr, int nthr, size_t g_start,
+        size_t g_end, const conv_gemm_conf_t &jcp,
+        const float *weights_reduce_base, float *diff_weights) {
+    const dim_t weights_g_size = jcp.oc;
+    dim_t weights_start {0}, weights_end {0};
+    balance211(jcp.ks * jcp.ic, nthr, ithr, weights_start, weights_end);
+    // Reduces the nthr partial buffers into diff_weights for [g_start, g_end).
+    // Threads divide work w.r.t. mini-batch and groups, therefore
+    //   - weights_reduce_base format: spatial-input_channels-output_channels
+    //   - diff_weights format: spatial-input_channels-groups-output_channels
+    for (auto tidx = 0; tidx < nthr; ++tidx) {
+        const float *ws_base
+                = weights_reduce_base + tidx * weights_g_size * jcp.ks * jcp.ic;
+        for_(auto w = weights_start; w < weights_end; ++w)
+        for (auto g = g_start; g < g_end; ++g) {
+            float *__restrict dwei_ptr
+                    = diff_weights + (w * jcp.ngroups + g) * jcp.oc;
+            const float *__restrict ws_ptr = ws_base + w * jcp.oc;
+            if (tidx == 0) { // first buffer overwrites the destination
+                PRAGMA_OMP_SIMD()
+                for (auto oc = 0; oc < jcp.oc; ++oc) {
+                    dwei_ptr[oc] = ws_ptr[oc];
+                }
+            } else { // remaining buffers accumulate into it
+                PRAGMA_OMP_SIMD()
+                for (auto oc = 0; oc < jcp.oc; ++oc) {
+                    dwei_ptr[oc] += ws_ptr[oc];
+                }
+            }
+        }
+    }
+}
+
+bool padding_exists(const conv_gemm_conf_t &jcp) noexcept {
+    const dim_t any_w_h = jcp.l_pad | jcp.r_pad | jcp.t_pad | jcp.b_pad;
+    return (any_w_h | jcp.f_pad | jcp.e_pad) != 0; // any non-zero pad
+}
+
+} // namespace jit_gemm_convolution_utils
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/rv64/rvv_gemm_convolution_utils.hpp b/src/cpu/rv64/rvv_gemm_convolution_utils.hpp
new file mode 100644
index 00000000000..0659c7c91fa
--- /dev/null
+++ b/src/cpu/rv64/rvv_gemm_convolution_utils.hpp
@@ -0,0 +1,142 @@
+/*******************************************************************************
+* Copyright 2016 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_RVV_GEMM_CONVOLUTION_UTILS_HPP
+#define CPU_RV64_RVV_GEMM_CONVOLUTION_UTILS_HPP
+
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/memory_tracking.hpp"
+
+#include "cpu/cpu_convolution_pd.hpp"
+#include "cpu/cpu_engine.hpp"
+#include "cpu/zero_point_utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+enum conv_gemm_loop_order_t { gemm_loop_rlb, gemm_loop_lrb, gemm_loop_lbr };
+struct conv_gemm_conf_t { // passed as `jcp` to init_conf() and the helpers below
+    prop_kind_t prop_kind;
+
+    dim_t mb;
+    dim_t ngroups, ic, oc;
+    dim_t iw, ih, id, ow, oh, od;
+    dim_t l_pad, t_pad, f_pad, e_pad, b_pad, r_pad;
+    dim_t kh, kw, kd;
+    dim_t stride_h, stride_w, stride_d;
+    dim_t dilate_h, dilate_w, dilate_d;
+    bool with_bias;
+    bool with_eltwise;
+    bool with_binary;
+    bool with_sum;
+    post_ops_t post_ops;
+    bool is_nspc;
+
+    dim_t is, os, ks;
+    dim_t ic_block, oc_block;
+
+    int nthr; // max_threads if outer_threading, otherwise 1
+    ptrdiff_t im2col_sz; // key_conv_gemm_col is booked with nthr * im2col_sz
+    bool need_wei_reduction; // bwd_w: mb != 1 && nthr != 1
+    bool signed_input;
+    dim_t oh_block;
+    dim_t ow_block;
+    dim_t os_block, os_nb_block; // os_nb_block = div_up(os, os_block)
+    bool outer_threading;
+    conv_gemm_loop_order_t loop_order;
+    int nthr_oc;
+
+    zero_point_config_t zp;
+
+    data_type_t bias_data_type; // cd.bias_desc.data_type
+    data_type_t dst_data_type; // dst_md.data_type
+    data_type_t sum_data_type; // post_ops.get_sum_dt(dst_data_type)
+    size_t dst_os_stride; // dst strides[ndims - 1], or 0 if not a blocking desc
+    size_t scale_idx_mult; // 1 if the weights scales mask is > 0, else 0
+    bool with_dst_scale; // true when DST scales are not default
+};
+
+struct single_gemm_conv_chunk_desc_t {
+    single_gemm_conv_chunk_desc_t() = default; // leaves every field at 0
+    single_gemm_conv_chunk_desc_t(dim_t d_off, dim_t d_size, dim_t h_off,
+            dim_t h_size, dim_t w_off, dim_t w_size); // defined out of line
+    // NOTE(review): presumably offset/size pairs along d, h and w - confirm.
+    dim_t d_off_ = 0;
+    dim_t d_size_ = 0;
+    dim_t h_off_ = 0;
+    dim_t h_size_ = 0;
+    dim_t w_off_ = 0;
+    dim_t w_size_ = 0;
+};
+
+namespace jit_gemm_convolution_utils {
+template <typename data_type_t>
+void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im,
+        data_type_t *col, dim_t od, int spatial_step, int spatial_block);
+
+template <typename T>
+void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im,
+        T *__restrict imtr);
+
+template <typename im_dt, typename col_dt>
+void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict im,
+        col_dt *__restrict col, dim_t od);
+
+template <typename data_type_t>
+void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im,
+        data_type_t *__restrict col, dim_t ss, dim_t sb, dim_t cs, dim_t cb);
+
+template <typename im_dt, typename col_dt>
+void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im,
+        void *__restrict imtr, col_dt *__restrict col, dim_t hs, dim_t hb,
+        dim_t ws, dim_t wb);
+
+template <typename T>
+void col2im_dt(
+        const conv_gemm_conf_t &jcp, const T *__restrict col, T *__restrict im);
+void col2im_3d(const conv_gemm_conf_t &jcp, const float *col, float *im,
+        dim_t od, int spatial_step, int spatial_block);
+void col2im(const conv_gemm_conf_t &jcp, const float *col, float *im,
+        int spatial_step, int spatial_block);
+
+status_t init_conf(conv_gemm_conf_t &jcp,
+        memory_tracking::registrar_t &scratchpad, const convolution_desc_t &cd,
+        memory_desc_t &src_md, memory_desc_t &weights_md, memory_desc_t &dst_md,
+        memory_desc_t &bias_md, primitive_attr_t &attr, int max_threads,
+        bool check_postops = false);
+
+void bwd_weights_balance(int ithr, int nthr, int ngroups, int mb, int &ithr_g,
+        int &nthr_g, int &ithr_mb, int &nthr_mb);
+void bwd_weights_reduction_par_ncsp(int ithr, int nthr,
+        const conv_gemm_conf_t &jcp, const float *weights_reduce_ws,
+        float *weights);
+void bwd_weights_reduction_par_nspc(int ithr, int nthr, size_t g_start,
+        size_t g_end, const conv_gemm_conf_t &jcp,
+        const float *weights_reduce_base, float *diff_weights);
+
+bool padding_exists(const conv_gemm_conf_t &jcp) noexcept;
+
+} // namespace jit_gemm_convolution_utils
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif

From 6e5fca86fca17ff81097930144e5471b60d1f369 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Fri, 20 Mar 2026 08:48:26 +0300
Subject: [PATCH 02/13] [CPU][RV64] Add: xbyak_riscv, jit_rvv_1x1

---
 src/CMakeLists.txt                            |    2 +-
 src/cpu/cpu_convolution_list.cpp              |   15 +-
 src/cpu/rv64/cpu_isa_traits.cpp               |   44 +
 src/cpu/rv64/cpu_isa_traits.hpp               |  107 ++
 src/cpu/rv64/jit_generator.hpp                |  137 ++
 src/cpu/rv64/jit_primitive_conf.hpp           |   97 ++
 src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp      |  581 +++++++
 src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp      |  109 ++
 src/cpu/rv64/jit_rvv_1x1_convolution.cpp      |  144 ++
 src/cpu/rv64/jit_rvv_1x1_convolution.hpp      |  170 ++
 third_party/xbyak_riscv/xbyak_riscv.hpp       | 1383 +++++++++++++++++
 third_party/xbyak_riscv/xbyak_riscv_csr.hpp   |  112 ++
 .../xbyak_riscv/xbyak_riscv_mnemonic.hpp      |  231 +++
 third_party/xbyak_riscv/xbyak_riscv_util.hpp  |  271 ++++
 third_party/xbyak_riscv/xbyak_riscv_v.hpp     |  776 +++++++++
 15 files changed, 4175 insertions(+), 4 deletions(-)
 create mode 100644 src/cpu/rv64/cpu_isa_traits.cpp
 create mode 100644 src/cpu/rv64/cpu_isa_traits.hpp
 create mode 100644 src/cpu/rv64/jit_generator.hpp
 create mode 100644 src/cpu/rv64/jit_primitive_conf.hpp
 create mode 100644 src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
 create mode 100644 src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp
 create mode 100644 src/cpu/rv64/jit_rvv_1x1_convolution.cpp
 create mode 100644 src/cpu/rv64/jit_rvv_1x1_convolution.hpp
 create mode 100644 third_party/xbyak_riscv/xbyak_riscv.hpp
 create mode 100644 third_party/xbyak_riscv/xbyak_riscv_csr.hpp
 create mode 100644 third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp
 create mode 100644 third_party/xbyak_riscv/xbyak_riscv_util.hpp
 create mode 100644 third_party/xbyak_riscv/xbyak_riscv_v.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 08d882bfee0..e69a804d39a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -77,7 +77,7 @@ if(DNNL_EXPERIMENTAL)
 endif()
 
 if(DNNL_EXPERIMENTAL_UKERNEL)
-    if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64")
+    if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "RISCV64")
         message(STATUS "Experimental functionality for ukernels is enabled")
     else()
         message(FATAL_ERROR "ukernel API isn't supported for ${DNNL_TARGET_ARCH}.")
diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index a43ab39d7d0..350ac8e14e4 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -76,6 +76,10 @@ using namespace dnnl::impl::cpu::aarch64;
 #include "cpu/acl/acl_depthwise_convolution.hpp"
 #include "cpu/acl/acl_winograd_convolution.hpp"
 using namespace dnnl::impl::cpu::acl;
+#elif DNNL_RV64
+#include "cpu/rv64/rvv_gemm_convolution.hpp"
+#include "cpu/rv64/jit_rvv_1x1_convolution.hpp"
+using namespace dnnl::impl::cpu::rv64;
 #endif
 
 namespace dnnl {
@@ -175,9 +179,14 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t, sve_128)
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_128)
             // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t)
-            CPU_INSTANCE(gemm_convolution_fwd_t)
-            CPU_INSTANCE(ref_convolution_fwd_t)
-            CPU_INSTANCE(ref_fused_convolution_fwd_t)
+
+            // CPU_INSTANCE_RV64GCV(jit_rvv_1x1_convolution_fwd_t)
+            CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t)
+            // Keep the generic GEMM/reference fallbacks registered: this list is
+            // shared by every architecture, not only RV64.
+            CPU_INSTANCE(gemm_convolution_fwd_t)
+            CPU_INSTANCE(ref_convolution_fwd_t)
+            CPU_INSTANCE(ref_fused_convolution_fwd_t)
             nullptr,
         }},
         {{forward, f32, f16, f32}, {
diff --git a/src/cpu/rv64/cpu_isa_traits.cpp b/src/cpu/rv64/cpu_isa_traits.cpp
new file mode 100644
index 00000000000..b8c3fc658e0
--- /dev/null
+++ b/src/cpu/rv64/cpu_isa_traits.cpp
@@ -0,0 +1,44 @@
+/*******************************************************************************
+* Copyright 2019 Intel Corporation
+* Copyright 2025 Institute of Software, Chinese Academy of Sciences
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/rv64/cpu_isa_traits.hpp"
+#include "cpu/platform.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+struct isa_info_t {
+    cpu_isa_t isa; // ISA value carried by this record
+    isa_info_t(cpu_isa_t aisa) : isa(aisa) {}
+};
+
+static isa_info_t get_isa_info_t(void) {
+    // Probe zvfh (which includes the v bit) first, then plain v.
+    const cpu_isa_t best = mayiuse(zvfh) ? zvfh : (mayiuse(v) ? v : isa_undef);
+    return isa_info_t(best);
+}
+
+cpu_isa_t get_max_cpu_isa() {
+    return get_isa_info_t().isa; // zvfh if usable, else v, else isa_undef
+}
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/rv64/cpu_isa_traits.hpp b/src/cpu/rv64/cpu_isa_traits.hpp
new file mode 100644
index 00000000000..be5a4fc1d49
--- /dev/null
+++ b/src/cpu/rv64/cpu_isa_traits.hpp
@@ -0,0 +1,107 @@
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+* Copyright 2025 Institute of Software, Chinese Academy of Sciences
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_CPU_ISA_TRAITS_HPP
+#define CPU_RV64_CPU_ISA_TRAITS_HPP
+
+#include <type_traits>
+
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+#include "dnnl_types.h"
+
+#ifndef XBYAK_RISCV_V
+#define XBYAK_RISCV_V 1
+#endif
+
+#include "xbyak_riscv/xbyak_riscv_util.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+enum cpu_isa_bit_t : unsigned { // single-bit masks combined into cpu_isa_t
+    v_bit = 1u << 0, // bit 0
+    zvfh_bit = 1u << 1, // bit 1
+};
+
+enum cpu_isa_t : unsigned {
+    isa_undef = 0u, // no bits set; mayiuse() always accepts it
+    v = v_bit,
+    zvfh = zvfh_bit | v, // includes the v bit
+    isa_all = ~0u, // every bit set; mayiuse() always rejects it
+};
+
+// Caches which RISC-V vector extensions Xbyak_riscv::CPU reports; the probe
+// runs once, when the singleton is first requested.
+struct Riscv64Cpu {
+public:
+    // Returns the lazily constructed singleton.
+    static Riscv64Cpu &getInstance() {
+        static Riscv64Cpu instance;
+        return instance;
+    }
+
+    // Cached detection results.
+    bool get_has_v() const { return has_v; }
+    bool get_has_zvfh() const { return has_zvfh; }
+
+private:
+    bool has_v = false;
+    bool has_zvfh = false;
+
+    // Private: instances are only created through getInstance().
+    Riscv64Cpu() {
+        const auto &probe = Xbyak_riscv::CPU::getInstance();
+        has_v = probe.hasExtension(Xbyak_riscv::RISCVExtension::V);
+        // Zvfh is only queried (and only reported) when V is present.
+        has_zvfh = has_v
+                && probe.hasExtension(Xbyak_riscv::RISCVExtension::Zvfh);
+    }
+};
+
+// Reports whether `cpu_isa` can be used on the running CPU; `soft` is unused.
+inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
+    MAYBE_UNUSED(soft);
+    const Riscv64Cpu &cpu = Riscv64Cpu::getInstance();
+    const bool has_v = cpu.get_has_v();
+
+    if (cpu_isa == isa_undef) return true;
+    if (cpu_isa == v) return has_v;
+    if (cpu_isa == zvfh) return has_v && cpu.get_has_zvfh();
+    // isa_all and any value that is not a named ISA are never usable.
+    return false;
+}
+
+cpu_isa_t get_max_cpu_isa();
+
+#include "common/z_magic.hpp"
+/* clang-format off */
+#define JIT_IMPL_NAME_HELPER(prefix, isa, suffix_if_any) \
+    ((isa) == isa_undef ? prefix STRINGIFY(any) : \
+    ((isa) == v ? prefix STRINGIFY(rvv) : \
+    ((isa) == zvfh ? prefix STRINGIFY(rvv_zvfh) : \
+    prefix suffix_if_any)))
+/* clang-format on */
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/jit_generator.hpp b/src/cpu/rv64/jit_generator.hpp
new file mode 100644
index 00000000000..c795aba8c61
--- /dev/null
+++ b/src/cpu/rv64/jit_generator.hpp
@@ -0,0 +1,137 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_JIT_GENERATOR_HPP
+#define CPU_RV64_JIT_GENERATOR_HPP
+
+#include <cstdint>
+#include <utility>
+
+#include "common/c_types_map.hpp"
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+#include "cpu/jit_utils/jit_utils.hpp"
+
+#include "cpu/rv64/cpu_isa_traits.hpp"
+#include "xbyak_riscv/xbyak_riscv.hpp"
+
+#define DECLARE_CPU_JIT_AUX_FUNCTIONS(gen_name) \
+    const char *name() const override { \
+        return STRINGIFY(gen_name); \
+    } \
+    const char *source_file() const override { \
+        return __FILE__; \
+    }
+
+#define JIT_ASSERT(condition) \
+    do { \
+        assert(condition); \
+        if (!(condition)) XBYAK_RISCV_THROW(Xbyak_riscv::ERR_INTERNAL); \
+    } while (false)
+
+#define JIT_ASSERT_RET(condition, ret) \
+    do { \
+        assert(condition); \
+        if (!(condition)) \
+            XBYAK_RISCV_THROW_RET(Xbyak_riscv::ERR_INTERNAL, ret); \
+    } while (false)
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+// True when every bit set in `isa` is also set in `max_isa`.
+inline bool is_subset(cpu_isa_t isa, cpu_isa_t max_isa) {
+    using u_t = typename std::underlying_type<cpu_isa_t>::type;
+    const u_t wanted = static_cast<u_t>(isa);
+    return (wanted & static_cast<u_t>(max_isa)) == wanted;
+}
+
+// Minimal RV64 JIT generator base class. The generated code must not be
+// invoked before create_kernel() has returned status::success.
+class jit_generator_t : public Xbyak_riscv::CodeGenerator, public c_compatible {
+public:
+    using c_compatible::operator new;
+    using c_compatible::operator new[];
+    using c_compatible::operator delete;
+    using c_compatible::operator delete[];
+
+    // All JIT kernels must override these to provide a stable name used for
+    // debug/logging and jit code registration.
+    virtual const char *name() const = 0;
+    virtual const char *source_file() const = 0;
+
+    explicit jit_generator_t(const char * /*unused_name*/,
+            cpu_isa_t max_cpu_isa = get_max_cpu_isa())
+        : Xbyak_riscv::CodeGenerator(max_code_size)
+        , max_cpu_isa_(max_cpu_isa) {}
+
+    ~jit_generator_t() override = default;
+
+    // Entry point of the generated code; nullptr until create_kernel() succeeds.
+    const uint8_t *jit_ker() const { return jit_ker_; }
+
+    template <typename... kernel_args_t>
+    void operator()(kernel_args_t... args) const {
+        using jit_kernel_func_t = void (*)(const kernel_args_t...);
+        // This const_cast is required for Clang.
+        // Clang rejects reinterpret_cast from const uint8_t* to function pointer.
+        auto *fptr = reinterpret_cast<jit_kernel_func_t>(
+                const_cast<uint8_t *>(jit_ker_));
+        (*fptr)(std::forward<kernel_args_t>(args)...);
+    }
+
+    // Generates, finalizes and registers the kernel. Returns
+    // status::runtime_error if code generation or finalization throws, or if
+    // no code pointer is produced; otherwise status::success.
+    virtual status_t create_kernel() {
+        try {
+            generate();
+            // ready() may throw as well, so it stays inside the guard.
+            this->ready(Xbyak_riscv::CodeArray::PROTECT_RWE);
+            jit_ker_ = Xbyak_riscv::CodeGenerator::getCode();
+        } catch (...) { return status::runtime_error; }
+        if (!jit_ker_) return status::runtime_error;
+        jit_utils::register_jit_code(jit_ker_,
+                Xbyak_riscv::CodeArray::getSize(), name(), source_file());
+        return status::success;
+    }
+
+    inline cpu_isa_t max_cpu_isa() const noexcept { return max_cpu_isa_; }
+
+    // Helper to check that a requested ISA is both within the per-kernel limit
+    // and supported by the current CPU.
+    inline bool is_valid_isa(cpu_isa_t isa) const {
+        return is_subset(isa, max_cpu_isa_) && mayiuse(isa);
+    }
+
+protected:
+    virtual void generate() = 0;
+
+private:
+    static constexpr unsigned max_code_size = 256 * 1024;
+
+    const cpu_isa_t max_cpu_isa_;
+    const uint8_t *jit_ker_ = nullptr;
+};
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/jit_primitive_conf.hpp b/src/cpu/rv64/jit_primitive_conf.hpp
new file mode 100644
index 00000000000..dde5afb8d32
--- /dev/null
+++ b/src/cpu/rv64/jit_primitive_conf.hpp
@@ -0,0 +1,97 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_JIT_PRIMITIVE_CONF_HPP
+#define CPU_RV64_JIT_PRIMITIVE_CONF_HPP
+
+#include "common/c_types_map.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+struct jit_1x1_conv_conf_t {
+    prop_kind_t prop_kind; // set from cd.prop_kind in init_conf()
+    int mb;
+    int ngroups, ic, oc, oc_without_padding, ic_without_padding;
+    int iw, ih, id;
+    int ow, oh, od;
+    int os, is;
+    int kw, kh, kd;
+    int stride_w, stride_h, stride_d;
+    int t_pad, l_pad, f_pad;
+
+    int ic_block, oc_block;
+    int load_block, reduce_block;
+    int bcast_block;
+
+    dim_t load_dim, bcast_dim, reduce_dim;
+
+    int ur, ur_tail;
+    int load_loop_blk;
+    int reduce_loop_unroll;
+    int nthr; // set from the nthreads argument of init_conf()
+    int nb_bcast, nb_load, nb_reduce, load_grp_count;
+    int nb_load_blocking, nb_load_blocking_max;
+    int nb_bcast_blocking, nb_bcast_blocking_max;
+    int nb_reduce_blocking;
+
+    dim_t reduce_loop_bcast_step;
+    int reduce_loop_load_step;
+    int bcast_loop_bcast_step;
+    int bcast_loop_output_step;
+    int load_loop_load_step;
+    int load_loop_iter_step;
+
+    bool with_bias; // true when cd.bias_desc.format_kind != format_kind::undef
+    bool with_sum;
+    bool with_eltwise;
+    bool with_binary;
+    bool with_dw_conv;
+
+    int typesize_in;
+    int typesize_out;
+    int typesize_bia;
+    int typesize_acc;
+
+    format_tag_t src_tag, wei_tag, dst_tag;
+};
+
+struct jit_1x1_conv_args_t { // GET_OFF() in the kernel .cpp uses offsetof here
+    const void *bcast_data;
+    const void *load_data;
+    const void *output_data;
+    const void *bias_data;
+
+    size_t load_dim;
+    size_t bcast_dim;
+    size_t reduce_dim;
+
+    size_t first_last_flag;
+};
+
+enum { // NOTE(review): presumably bit flags for first_last_flag - confirm
+    FLAG_REDUCE_FIRST = 1 << 0,
+    FLAG_REDUCE_LAST = 1 << 1,
+};
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
new file mode 100644
index 00000000000..c63a375d13b
--- /dev/null
+++ b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
@@ -0,0 +1,581 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <assert.h>
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/memory.hpp"
+#include "common/utils.hpp"
+
+#include "cpu/rv64/jit_rvv_1x1_conv_kernel.hpp"
+
+#define GET_OFF(field) \
+    static_cast<int32_t>(offsetof(jit_1x1_conv_args_t, field))
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+using namespace dnnl::impl::format_tag;
+using namespace dnnl::impl::prop_kind;
+using namespace dnnl::impl::utils;
+using namespace Xbyak_riscv;
+
+jit_rvv_1x1_conv_kernel_t::jit_rvv_1x1_conv_kernel_t(
+        const jit_1x1_conv_conf_t &ajcp, const primitive_attr_t &attr,
+        const memory_desc_t &dst_md) // NOTE(review): dst_md is unused here
+    : jit_generator_t("jit_rvv_1x1_conv_kernel"), jcp(ajcp), attr_(attr) {
+    create_kernel(); // presumably runs generate(); attr_ is kept by reference
+}
+
+status_t jit_rvv_1x1_conv_kernel_t::init_conf(jit_1x1_conv_conf_t &jcp,
+        const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+        const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d,
+        const primitive_attr_t &attr, int nthreads, bool reduce_src) {
+    // Fills `jcp` for an f32 1x1 convolution: problem sizes taken from the
+    // memory descriptors, the unroll factors (ur, load_loop_blk), the block
+    // counts and the byte strides the generated kernel advances by.
+    // Always returns status::success; no shape or layout validation happens
+    // here, and `attr` / `reduce_src` are not read.
+    // reduce_loop_unroll is 4 for every load_loop_blk, so it is assigned once,
+    // ahead of the register-budget clamp on ur that depends on it.
+
+    const int ndims = src_d.ndims();
+
+    jcp.prop_kind = cd.prop_kind;
+    jcp.nthr = nthreads;
+
+    jcp.with_bias = cd.bias_desc.format_kind != format_kind::undef;
+
+    // Initialize dimensions
+    jcp.mb = src_d.dims()[0];
+    jcp.ngroups
+            = weights_d.ndims() == src_d.ndims() + 1 ? weights_d.dims()[0] : 1;
+    jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups;
+    jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups;
+    jcp.oc = jcp.oc_without_padding;
+    jcp.ic = jcp.ic_without_padding;
+
+    // Targeting SEW=32 (float), LMUL=1, VLEN=128 -> simd_w = 4
+    const int simd_w = 4;
+
+    // OC is padded to match oc_block in weights format (Oihw4o)
+    // IC is not padded; kernel handles IC tail processing
+    jcp.oc = rnd_up(jcp.oc, simd_w);
+
+    // 3D convolution support
+    jcp.id = (ndims == 5) ? src_d.dims()[2] : 1;
+    jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1;
+
+    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims - 2];
+    jcp.iw = src_d.dims()[ndims - 1];
+    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[ndims - 2];
+    jcp.ow = dst_d.dims()[ndims - 1];
+
+    // Spatial dimensions: D*H*W
+    jcp.os = jcp.od * jcp.oh * jcp.ow;
+    jcp.is = jcp.id * jcp.ih * jcp.iw;
+
+    jcp.oc_block = simd_w;
+    jcp.ic_block = simd_w;
+
+    // Dynamic parameter calculation
+    // Register constraint: (ur * load_loop_blk) + (unroll * load_loop_blk) + 1 <= 32
+    jcp.reduce_loop_unroll = 4;
+
+    const int SMALL_SPATIAL = 10;
+    const int BIG_SPATIAL = 65;
+    const int BIG_LOAD_DIM = (jcp.ic >= 512) ? 256 : 512;
+
+    // Initial load_loop_blk selection
+    if (jcp.oc % (2 * jcp.oc_block) == 0 && jcp.os >= 11) {
+        jcp.load_loop_blk = 2;
+    } else {
+        jcp.load_loop_blk = 1;
+    }
+
+    // Dynamic ur selection algorithm
+    int max_regs, min_regs, size_threshold;
+
+    const int spatial = jcp.od * jcp.oh;
+
+    // Select register range based on batch size and thread count
+    if ((8 * jcp.mb) / jcp.nthr >= 1 || jcp.mb == 1) {
+        max_regs = 9;
+        min_regs = 6;
+        size_threshold = 14;
+
+        // Special shape optimization
+        if (jcp.oc > 128 && jcp.oc < BIG_LOAD_DIM && spatial > SMALL_SPATIAL
+                && spatial < BIG_SPATIAL && jcp.ic < 256) {
+            max_regs = 6;
+            min_regs = 5;
+        }
+    } else {
+        max_regs = 30;
+        min_regs = 9;
+        size_threshold = 14;
+    }
+
+    // Initial ur
+    jcp.ur = 1;
+
+    // First pass: find largest ur that divides spatial evenly
+    for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) {
+        if ((spatial >= size_threshold && spatial % ur_w == 0)
+                || (spatial < size_threshold && jcp.os % ur_w == 0)) {
+            jcp.ur = ur_w;
+            break;
+        }
+    }
+
+    // If first pass fails, use heuristic
+    if (jcp.ur == 1) {
+        jcp.ur = nstl::min(max_regs, jcp.os);
+        int os_tail = jcp.os % max_regs;
+        for (int i = max_regs; i >= min_regs; i--) {
+            int i_tail = jcp.os % i;
+            if (i_tail > os_tail || i_tail == 0) {
+                jcp.ur = i;
+                os_tail = i_tail;
+                if (i_tail == 0) break;
+            }
+        }
+    }
+
+    // Adjust ur based on load_loop_blk (ensure register limit)
+    // Register constraint: ur * load_loop_blk + unroll * load_loop_blk + 1 <= 32
+    int max_ur_for_blk = (32 - 1 - jcp.reduce_loop_unroll * jcp.load_loop_blk)
+            / jcp.load_loop_blk;
+    if (jcp.ur > max_ur_for_blk) {
+        jcp.ur = max_ur_for_blk;
+        if (jcp.ur < 1) jcp.ur = 1;
+    }
+
+    jcp.load_block = jcp.oc_block;
+    jcp.reduce_block = jcp.ic_block;
+
+    jcp.bcast_block = jcp.ur;
+    jcp.load_dim = jcp.oc_without_padding;
+    jcp.bcast_dim = jcp.os;
+    jcp.reduce_dim = jcp.ic_without_padding;
+
+    jcp.ur_tail = jcp.bcast_dim % jcp.ur;
+
+    jcp.nb_bcast = div_up(jcp.os, jcp.bcast_block);
+    jcp.nb_load = div_up(jcp.oc_without_padding, jcp.load_block);
+    jcp.nb_reduce = div_up(jcp.ic_without_padding, jcp.reduce_block);
+    jcp.load_grp_count = 1;
+
+    // Blocking strategy for NHWC layout
+    jcp.nb_reduce_blocking = jcp.nb_reduce;
+    jcp.nb_load_blocking = jcp.nb_load;
+    jcp.nb_load_blocking_max = jcp.nb_load;
+
+    // Spatial dimension blocking (in ur units)
+    int target_bcast_blocking = 735;
+    jcp.nb_bcast_blocking
+            = nstl::min(jcp.nb_bcast, div_up(target_bcast_blocking, jcp.ur));
+    if (jcp.nb_bcast_blocking == 0) jcp.nb_bcast_blocking = 1;
+    jcp.nb_bcast_blocking_max = jcp.nb_bcast_blocking;
+
+    // Layout-dependent stride parameters (for NHWC)
+    jcp.typesize_in = sizeof(float);
+    jcp.typesize_out = sizeof(float);
+
+    jcp.reduce_loop_bcast_step = jcp.typesize_in;
+    jcp.reduce_loop_load_step = jcp.oc_block * jcp.typesize_in;
+
+    // Strides within bcast_loop (spatial dimensions)
+    jcp.bcast_loop_bcast_step
+            = jcp.ngroups * jcp.ic_without_padding * jcp.typesize_in;
+    jcp.bcast_loop_output_step
+            = jcp.ngroups * jcp.oc_without_padding * jcp.typesize_out;
+
+    // Strides within load_loop (OC dimension)
+    jcp.load_loop_load_step
+            = jcp.ic_without_padding * jcp.oc_block * jcp.typesize_in;
+    jcp.load_loop_iter_step = jcp.oc_block;
+
+    return status::success;
+}
+
+void jit_rvv_1x1_conv_kernel_t::init_scratchpad(
+        memory_tracking::registrar_t &scratchpad,
+        const jit_1x1_conv_conf_t &jcp) {
+    // Empty stub: nothing is registered in `scratchpad`; `jcp` is not read.
+}
+
+void jit_rvv_1x1_conv_kernel_t::balance(jit_1x1_conv_conf_t &jcp) {
+    // Empty stub: `jcp` is left unchanged.
+}
+
+void jit_rvv_1x1_conv_kernel_t::generate() {
+    // Emits the whole kernel: load the call arguments, then walk the output
+    // channels (load dimension), running bcast_loop() for every step. Full
+    // groups of jcp.load_loop_blk channel blocks are handled first; what is
+    // left is processed one block at a time with VL clamped to the remainder.
+    preamble();
+
+    // Set initial VL to oc_block (4)
+    li(reg_tmp_imm, jcp.oc_block);
+    vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32,
+            Xbyak_riscv::LMUL::m1);
+
+    // Load parameters
+    ld(reg_bcast_data, reg_param, GET_OFF(bcast_data));
+    ld(reg_load_data, reg_param, GET_OFF(load_data));
+    ld(reg_output_data, reg_param, GET_OFF(output_data));
+    if (jcp.with_bias) ld(reg_bias_data, reg_param, GET_OFF(bias_data));
+
+    ld(reg_load_loop_work, reg_param, GET_OFF(load_dim));
+    ld(reg_bcast_loop_work, reg_param, GET_OFF(bcast_dim));
+    ld(reg_reduce_loop_work, reg_param, GET_OFF(reduce_dim));
+    ld(reg_reduce_pos_flag, reg_param, GET_OFF(first_last_flag));
+
+    // Main loop generation
+    auto load_loop_body = [=](int load_loop_blk) {
+        bcast_loop(load_loop_blk);
+
+        // Update pointers and work counters
+        li(reg_tmp_imm, load_loop_blk * jcp.load_loop_load_step);
+        add(reg_load_data, reg_load_data, reg_tmp_imm);
+
+        if (jcp.with_bias) {
+            li(reg_tmp_imm, load_loop_blk * jcp.oc_block * jcp.typesize_out);
+            add(reg_bias_data, reg_bias_data, reg_tmp_imm);
+        }
+
+        li(reg_tmp_imm, load_loop_blk * jcp.oc_block * jcp.typesize_out);
+        add(reg_output_data, reg_output_data, reg_tmp_imm);
+
+        li(reg_tmp_imm, load_loop_blk * jcp.load_loop_iter_step);
+        sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm);
+    };
+
+    Label load_loop_label, load_loop_end, load_loop_tail;
+
+    if (jcp.load_loop_blk > 1) {
+        L(load_loop_label);
+        li(reg_tmp_imm, jcp.load_loop_blk * jcp.oc_block);
+        blt(reg_load_loop_work, reg_tmp_imm, load_loop_tail);
+
+        // Ensure VL is full
+        li(reg_tmp_imm, jcp.oc_block);
+        vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32,
+                Xbyak_riscv::LMUL::m1);
+
+        load_loop_body(jcp.load_loop_blk);
+        jal(x0, load_loop_label);
+    }
+
+    L(load_loop_tail);
+    {
+        Label tail_loop, vl_ready;
+        L(tail_loop);
+        blez(reg_load_loop_work, load_loop_end);
+
+        // Request min(remaining, oc_block) elements. The raw remaining count
+        // must not be used as AVL: for VLMAX < AVL < 2 * VLMAX the RVV spec
+        // only guarantees ceil(AVL / 2) <= vl <= VLMAX, and with VLEN > 128
+        // vl could exceed oc_block, while the pointer updates in
+        // load_loop_body() always advance by exactly oc_block channels.
+        li(reg_tmp_imm, jcp.oc_block);
+        bge(reg_load_loop_work, reg_tmp_imm, vl_ready);
+        mv(reg_tmp_imm, reg_load_loop_work);
+        L(vl_ready);
+        vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32,
+                Xbyak_riscv::LMUL::m1);
+
+        // Same pointer/counter updates as the main loop, one block per step
+        load_loop_body(1);
+
+        jal(x0, tail_loop);
+    }
+    L(load_loop_end);
+
+    postamble();
+}
+
+void jit_rvv_1x1_conv_kernel_t::preamble() {
+    // Open a 64-byte frame and spill ra, s0-s5 into slots sp+56 .. sp+8.
+    const Reg saved_regs[] = {ra, s0, s1, s2, s3, s4, s5};
+    addi(sp, sp, -64);
+    int slot = 56;
+    for (const Reg &r : saved_regs) {
+        sd(r, sp, slot);
+        slot -= 8;
+    }
+}
+
+void jit_rvv_1x1_conv_kernel_t::postamble() {
+    // Reload ra, s0-s5 from sp+56 .. sp+8, release the frame and return.
+    const Reg saved_regs[] = {ra, s0, s1, s2, s3, s4, s5};
+    int slot = 56;
+    for (const Reg &r : saved_regs) {
+        ld(r, sp, slot);
+        slot -= 8;
+    }
+    addi(sp, sp, 64);
+    ret();
+}
+
+void jit_rvv_1x1_conv_kernel_t::bcast_loop(int load_loop_blk) {
+    // Emits the loop over spatial points: full steps of jcp.ur points, then
+    // one reduce_loop() call for the jcp.ur_tail remainder, if there is one.
+    mv(reg_bcast_loop_iter, reg_bcast_loop_work);
+    mv(aux1_reg_bcast_data, reg_bcast_data);
+    mv(aux_reg_output_data, reg_output_data);
+    Label bcast_loop_label, bcast_loop_tail;
+    // Skip the main loop when fewer than jcp.ur points are left.
+    li(reg_tmp_imm, jcp.ur);
+    blt(reg_bcast_loop_iter, reg_tmp_imm, bcast_loop_tail);
+
+    L(bcast_loop_label);
+    {
+        reduce_loop(load_loop_blk, jcp.ur);
+        // Advance the source and destination cursors by jcp.ur points.
+        li(reg_tmp_imm, jcp.ur * jcp.bcast_loop_bcast_step);
+        add(aux1_reg_bcast_data, aux1_reg_bcast_data, reg_tmp_imm);
+        li(reg_tmp_imm, jcp.ur * jcp.bcast_loop_output_step);
+        add(aux_reg_output_data, aux_reg_output_data, reg_tmp_imm);
+
+        addi(reg_bcast_loop_iter, reg_bcast_loop_iter, -jcp.ur);
+        li(reg_tmp_imm, jcp.ur);
+        bge(reg_bcast_loop_iter, reg_tmp_imm, bcast_loop_label);
+    }
+
+    L(bcast_loop_tail);
+    if (jcp.ur_tail > 0) {
+        Label bcast_loop_tail_end;
+        blez(reg_bcast_loop_iter, bcast_loop_tail_end);
+        // The tail is generated for the fixed width jcp.ur_tail. NOTE(review):
+        // assumes the runtime remainder is 0 or jcp.ur_tail - confirm in driver.
+        reduce_loop(load_loop_blk, jcp.ur_tail);
+        L(bcast_loop_tail_end);
+    }
+}
+
+void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
+    // Emits the code for one tile of `ur` spatial points by `load_loop_blk`
+    // channel blocks: init accumulators, reduce over IC, store to dst.
+    mv(aux_reg_load_data, reg_load_data);
+    mv(aux_reg_bcast_data, aux1_reg_bcast_data);
+
+    auto init = [=]() {
+        Label init_zero, init_done;
+        andi(reg_tmp_imm, reg_reduce_pos_flag, FLAG_REDUCE_FIRST);
+        bnez(reg_tmp_imm, init_zero);
+        // Load from dst for accumulation
+        mv(reg_tmp_addr, aux_reg_output_data);
+        for (int i_ur = 0; i_ur < ur; ++i_ur) {
+            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                vle32_v(vreg_accum(i_load, i_ur), reg_tmp_addr);
+                if (i_load + 1 < load_loop_blk)
+                    addi(reg_tmp_addr, reg_tmp_addr,
+                            jcp.load_block * jcp.typesize_out);
+            }
+            li(reg_tmp_imm,
+                    jcp.bcast_loop_output_step
+                            - (load_loop_blk - 1) * jcp.load_block
+                                    * jcp.typesize_out);
+            add(reg_tmp_addr, reg_tmp_addr, reg_tmp_imm);
+        }
+        jal(x0, init_done);
+        L(init_zero);
+        for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+            if (jcp.with_bias) {
+                size_t bias_off
+                        = (size_t)i_load * jcp.oc_block * jcp.typesize_out;
+                if (bias_off == 0) {
+                    vle32_v(vreg_load(0), reg_bias_data);
+                } else {
+                    li(reg_tmp_addr, bias_off);
+                    add(reg_tmp_addr, reg_tmp_addr, reg_bias_data);
+                    vle32_v(vreg_load(0), reg_tmp_addr);
+                }
+            }
+            for (int i_ur = 0; i_ur < ur; ++i_ur) {
+                if (jcp.with_bias) {
+                    vmv_v_v(vreg_accum(i_load, i_ur), vreg_load(0));
+                } else {
+                    vxor_vv(vreg_accum(i_load, i_ur), vreg_accum(i_load, i_ur),
+                            vreg_accum(i_load, i_ur));
+                }
+            }
+        }
+        L(init_done);
+    };
+    // store: write every accumulator of the tile back to dst.
+    auto store = [=]() {
+        mv(reg_tmp_addr, aux_reg_output_data);
+        for (int i_ur = 0; i_ur < ur; ++i_ur) {
+            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                vse32_v(vreg_accum(i_load, i_ur), reg_tmp_addr);
+                if (i_load + 1 < load_loop_blk)
+                    addi(reg_tmp_addr, reg_tmp_addr,
+                            jcp.load_block * jcp.typesize_out);
+            }
+            li(reg_tmp_imm,
+                    jcp.bcast_loop_output_step
+                            - (load_loop_blk - 1) * jcp.load_block
+                                    * jcp.typesize_out);
+            add(reg_tmp_addr, reg_tmp_addr, reg_tmp_imm);
+        }
+    };
+    // fma_block: one unrolled reduce step over `current_unroll` channels.
+    auto fma_block = [=](int current_unroll, bool last_block) {
+        for (int i_unroll = 0; i_unroll < current_unroll; ++i_unroll) {
+            flw(freg_bcast, aux_reg_bcast_data, 0);
+
+            for (int i_ur = 0; i_ur < ur; ++i_ur) {
+                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                    vfmacc_vf(vreg_accum(i_load, i_ur), freg_bcast,
+                            vreg_load(i_load, i_unroll));
+                }
+
+                if (i_ur + 1 < ur) {
+                    size_t offset
+                            = (size_t)(i_ur + 1) * jcp.bcast_loop_bcast_step;
+                    if (offset <= 2047) {
+                        flw(freg_bcast, aux_reg_bcast_data, offset);
+                    } else {
+                        li(reg_tmp_addr, offset);
+                        add(reg_tmp_addr, reg_tmp_addr, aux_reg_bcast_data);
+                        flw(freg_bcast, reg_tmp_addr, 0);
+                    }
+                }
+            }
+            addi(aux_reg_bcast_data, aux_reg_bcast_data,
+                    jcp.reduce_loop_bcast_step);
+        }
+
+        // Update weight pointer to next unroll block
+        li(reg_tmp_imm, jcp.reduce_loop_unroll * jcp.reduce_loop_load_step);
+        add(aux_reg_load_data, aux_reg_load_data, reg_tmp_imm);
+
+        // Load weights for the next unrolled step (skipped on the last one)
+        if (!last_block) {
+            for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll;
+                    ++i_unroll) {
+                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                    size_t weight_off
+                            = (size_t)i_unroll * jcp.reduce_loop_load_step
+                            + (size_t)i_load * jcp.load_loop_load_step;
+                    li(reg_tmp_addr, weight_off);
+                    add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
+                    vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr);
+                }
+            }
+        }
+    };
+
+    init();
+    mv(reduce_loop_iter, reg_reduce_loop_work);
+    Label reduce_loop_label, reduce_loop_tail;
+    li(reg_tmp_imm, jcp.reduce_loop_unroll);
+    blt(reduce_loop_iter, reg_tmp_imm, reduce_loop_tail);
+
+    // Load first round of weights (IC=0..unroll-1). This runs only after the
+    // check above: with fewer than `unroll` input channels the loads would
+    // read past the end of the weights, and the tail loop reloads anyway.
+    for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) {
+        for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+            size_t weight_off = (size_t)i_unroll * jcp.reduce_loop_load_step
+                    + (size_t)i_load * jcp.load_loop_load_step;
+            if (weight_off == 0) {
+                vle32_v(vreg_load(i_load, i_unroll), aux_reg_load_data);
+            } else {
+                li(reg_tmp_addr, weight_off);
+                add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
+                vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr);
+            }
+        }
+    }
+
+    L(reduce_loop_label);
+    {
+        li(reg_tmp_imm, jcp.reduce_loop_unroll);
+        sub(reg_tmp_imm, reduce_loop_iter, reg_tmp_imm);
+        li(reg_tmp_addr, jcp.reduce_loop_unroll);
+        Label is_last, do_fma;
+        blt(reg_tmp_imm, reg_tmp_addr, is_last);
+        fma_block(jcp.reduce_loop_unroll, false);
+        jal(x0, do_fma);
+        L(is_last);
+        fma_block(jcp.reduce_loop_unroll, true);
+        L(do_fma);
+
+        addi(reduce_loop_iter, reduce_loop_iter, -jcp.reduce_loop_unroll);
+        li(reg_tmp_imm, jcp.reduce_loop_unroll);
+        bge(reduce_loop_iter, reg_tmp_imm, reduce_loop_label);
+    }
+
+    L(reduce_loop_tail);
+    {
+        Label tail_done;
+        blez(reduce_loop_iter, tail_done);
+        Label tail_loop;
+        L(tail_loop);
+        {
+            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                size_t weight_off = (size_t)i_load * jcp.load_loop_load_step;
+                if (weight_off == 0) {
+                    vle32_v(vreg_load(i_load, 0), aux_reg_load_data);
+                } else {
+                    li(reg_tmp_addr, weight_off);
+                    add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
+                    vle32_v(vreg_load(i_load, 0), reg_tmp_addr);
+                }
+            }
+
+            flw(freg_bcast, aux_reg_bcast_data, 0);
+            for (int i_ur = 0; i_ur < ur; ++i_ur) {
+                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+                    vfmacc_vf(vreg_accum(i_load, i_ur), freg_bcast,
+                            vreg_load(i_load, 0));
+                }
+                if (i_ur + 1 < ur) {
+                    size_t offset
+                            = (size_t)(i_ur + 1) * jcp.bcast_loop_bcast_step;
+                    if (offset <= 2047) {
+                        flw(freg_bcast, aux_reg_bcast_data, offset);
+                    } else {
+                        li(reg_tmp_addr, offset);
+                        add(reg_tmp_addr, reg_tmp_addr, aux_reg_bcast_data);
+                        flw(freg_bcast, reg_tmp_addr, 0);
+                    }
+                }
+            }
+
+            addi(aux_reg_bcast_data, aux_reg_bcast_data,
+                    jcp.reduce_loop_bcast_step);
+            addi(aux_reg_load_data, aux_reg_load_data,
+                    jcp.reduce_loop_load_step);
+            addi(reduce_loop_iter, reduce_loop_iter, -1);
+            bnez(reduce_loop_iter, tail_loop);
+        }
+        L(tail_done);
+    }
+
+    store();
+}
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp
new file mode 100644
index 00000000000..0fcd9774aec
--- /dev/null
+++ b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp
@@ -0,0 +1,109 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_JIT_RVV_1X1_CONV_KERNEL_HPP
+#define CPU_RV64_JIT_RVV_1X1_CONV_KERNEL_HPP
+
+#include "common/c_types_map.hpp"
+#include "common/memory_tracking.hpp"
+
+#include "cpu/rv64/jit_generator.hpp"
+#include "cpu/rv64/jit_primitive_conf.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+using namespace Xbyak_riscv;
+
+struct jit_rvv_1x1_conv_kernel_t : public jit_generator_t {
+    // JIT generator for the f32 RVV 1x1 convolution micro-kernel.
+    jit_rvv_1x1_conv_kernel_t(const jit_1x1_conv_conf_t &ajcp,
+            const primitive_attr_t &attr, const memory_desc_t &dst_md);
+
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_rvv_1x1_conv_kernel)
+
+    static status_t init_conf(jit_1x1_conv_conf_t &jcp,
+            const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+            const memory_desc_wrapper &weights_d,
+            const memory_desc_wrapper &dst_d, const primitive_attr_t &attr,
+            int nthreads, bool reduce_src);
+
+    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+            const jit_1x1_conv_conf_t &jcp);
+
+    static void balance(jit_1x1_conv_conf_t &jcp);
+
+    jit_1x1_conv_conf_t jcp; // copy of the configuration given to the ctor
+    const primitive_attr_t &attr_; // not owned; must outlive the kernel
+
+private:
+    using Reg = Xbyak_riscv::Reg;
+    using VReg = Xbyak_riscv::VReg;
+    using FReg = Xbyak_riscv::FReg;
+
+    const Reg reg_param = a0; // pointer to jit_1x1_conv_args_t
+    const Reg reg_bcast_data = a1; // args.bcast_data
+    const Reg reg_load_data = a2; // args.load_data, advanced per OC step
+    const Reg reg_output_data = a3; // args.output_data, advanced per OC step
+    const Reg reg_bias_data = a4; // args.bias_data (loaded only if with_bias)
+
+    const Reg reg_load_loop_work = t0; // output channels still to process
+    const Reg reg_bcast_loop_work = t1; // args.bcast_dim
+    const Reg reg_reduce_loop_work = t2; // args.reduce_dim
+
+    const Reg aux_reg_bcast_data = t3; // src cursor inside reduce_loop()
+    const Reg aux_reg_load_data = t4; // weights cursor inside reduce_loop()
+    const Reg aux_reg_output_data = t5; // dst cursor inside bcast_loop()
+    const Reg aux1_reg_bcast_data = t6; // src cursor inside bcast_loop()
+
+    const Reg reduce_loop_iter = s0; // input channels still to reduce
+    const Reg reg_bcast_loop_iter = s1; // spatial points still to process
+    const Reg reg_reduce_pos_flag = s2; // args.first_last_flag
+    const Reg reg_output_stride = s3; // NOTE(review): not used by the kernel
+
+    const Reg reg_tmp_imm = s4; // scratch: immediates and comparisons
+    const Reg reg_tmp_addr = s5; // scratch: computed addresses
+
+    VReg vreg_accum(int i_load, int i_ur) {
+        // Avoid v0, start from v1
+        return VReg(1 + i_ur * jcp.load_loop_blk + i_load);
+    }
+
+    VReg vreg_load(int i_load, int i_unroll = 0) {
+        // Allocate after accum to avoid conflicts
+        // accum uses v1 to v(ur * load_loop_blk)
+        return VReg(1 + jcp.ur * jcp.load_loop_blk
+                + i_unroll * jcp.load_loop_blk + i_load);
+    }
+
+    const FReg freg_bcast = fa0; // current source scalar for vfmacc.vf
+    const FReg freg_load = fa1; // NOTE(review): not used by the kernel
+
+    void generate() override;
+    void preamble();
+    void postamble();
+    void bcast_loop(int load_loop_blk);
+    void reduce_loop(int load_loop_blk, int ur);
+};
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/cpu/rv64/jit_rvv_1x1_convolution.cpp b/src/cpu/rv64/jit_rvv_1x1_convolution.cpp
new file mode 100644
index 00000000000..f744419990a
--- /dev/null
+++ b/src/cpu/rv64/jit_rvv_1x1_convolution.cpp
@@ -0,0 +1,144 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+
+#include "cpu/rv64/jit_rvv_1x1_convolution.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+using namespace dnnl::impl::status;
+using namespace dnnl::impl::utils;
+
+void jit_rvv_1x1_convolution_fwd_t::execute_forward(
+        const exec_ctx_t &ctx) const {
+    // Fetches the f32 src/weights/bias/dst buffers from the execution context
+    // and calls execute_forward_thr() via parallel() with pd()->jcp_.nthr.
+    auto src = CTX_IN_MEM(const float *, DNNL_ARG_SRC);
+    auto weights = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS);
+    auto bias = CTX_IN_MEM(const float *, DNNL_ARG_BIAS);
+    auto dst = CTX_OUT_MEM(float *, DNNL_ARG_DST);
+    const auto &scratchpad = ctx.get_scratchpad_grantor();
+    parallel(pd()->jcp_.nthr, [&](const int ithr, const int nthr) {
+        execute_forward_thr(ithr, nthr, src, weights, bias, dst, scratchpad);
+    });
+}
+
+void jit_rvv_1x1_convolution_fwd_t::execute_forward_thr(const int ithr,
+        const int nthr, const float *src, const float *weights,
+        const float *bias, float *dst,
+        const memory_tracking::grantor_t &scratchpad) const {
+
+    // Per-thread driver: takes this thread's share of the (mb, group, spatial
+    // block) x (output-channel block) work and calls the JIT kernel per tile.
+    // `scratchpad` is not used by this implementation.
+
+    const auto &jcp = pd()->jcp_;
+
+    auto step = [](int default_step, int remaining, int tail_step) {
+        assert(default_step <= tail_step);
+        return remaining < tail_step ? remaining : default_step;
+    };
+
+    // RVV 1x1 convolution uses NHWC layout.
+    // Spatial dimensions are collapsed into 'os'.
+    // Threading is balanced over (MB * groups * nb_bcast) and (nb_load).
+
+    const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
+    int bcast_start {0}, bcast_end {0}, ocb_start {0}, ocb_end {0};
+
+    balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, jcp.nb_load,
+            ocb_start, ocb_end, jcp.load_grp_count);
+
+    if (bcast_start >= bcast_end || ocb_start >= ocb_end) return;
+
+    auto p = jit_1x1_conv_args_t();
+
+    auto ker_1x1 = [&](int ocb, int load_step, int icb, int n, int g, int osb,
+                           int bcast_step) {
+        const int oc_off = g * jcp.oc_without_padding + ocb * jcp.oc_block;
+        const size_t dst_off
+                = (size_t)n * jcp.os * jcp.ngroups * jcp.oc_without_padding
+                + (size_t)osb * jcp.bcast_block * jcp.ngroups
+                        * jcp.oc_without_padding
+                + oc_off;
+
+        p.output_data = &dst[dst_off];
+        p.bias_data = bias ? &bias[oc_off] : nullptr;
+
+        const size_t wei_off = (size_t)g * jcp.oc * jcp.ic_without_padding
+                + (size_t)ocb * jcp.ic_without_padding * jcp.oc_block
+                + (size_t)icb * jcp.ic_block * jcp.oc_block;
+        p.load_data = &weights[wei_off];
+
+        const int ic_off = g * jcp.ic_without_padding + icb * jcp.ic_block;
+        const size_t src_off
+                = (size_t)n * jcp.is * jcp.ngroups * jcp.ic_without_padding
+                + (size_t)osb * jcp.bcast_block * jcp.ngroups
+                        * jcp.ic_without_padding
+                + ic_off;
+        p.bcast_data = &src[src_off];
+
+        p.bcast_dim = this_block_size(
+                osb * jcp.bcast_block, jcp.os, bcast_step * jcp.bcast_block);
+        p.load_dim = this_block_size(ocb * jcp.oc_block, jcp.oc_without_padding,
+                load_step * jcp.oc_block);
+        p.reduce_dim = this_block_size(icb * jcp.ic_block,
+                jcp.ic_without_padding, jcp.nb_reduce_blocking * jcp.ic_block);
+
+        p.first_last_flag = (icb == 0 ? FLAG_REDUCE_FIRST : 0)
+                | (icb + jcp.nb_reduce_blocking >= jcp.nb_reduce
+                                ? FLAG_REDUCE_LAST
+                                : 0);
+
+        (*kernel_)(&p);
+    };
+
+    // Loop order: Load -> Bcast -> Reduce (LBR)
+    // This order keeps weights in registers/L1 while iterating over spatial.
+    for (int ocb = ocb_start; ocb < ocb_end;) {
+        int load_step = step(
+                jcp.nb_load_blocking, ocb_end - ocb, jcp.nb_load_blocking_max);
+        int iwork = bcast_start;
+        while (iwork < bcast_end) {
+            int n {0}, g {0}, osb {0};
+            nd_iterator_init(
+                    iwork, n, jcp.mb, g, jcp.ngroups, osb, jcp.nb_bcast);
+
+            int bcast_step = step(jcp.nb_bcast_blocking, bcast_end - iwork,
+                    jcp.nb_bcast_blocking_max);
+            bcast_step = nstl::min(bcast_step, jcp.nb_bcast - osb);
+
+            for (int icb = 0; icb < jcp.nb_reduce;
+                    icb += jcp.nb_reduce_blocking) {
+                ker_1x1(ocb, load_step, icb, n, g, osb, bcast_step);
+            }
+            iwork += bcast_step;
+        }
+        ocb += load_step;
+    }
+}
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/rv64/jit_rvv_1x1_convolution.hpp b/src/cpu/rv64/jit_rvv_1x1_convolution.hpp
new file mode 100644
index 00000000000..2d379cc6ec9
--- /dev/null
+++ b/src/cpu/rv64/jit_rvv_1x1_convolution.hpp
@@ -0,0 +1,170 @@
+/*******************************************************************************
+* Copyright 2025 ZTE Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_RV64_JIT_RVV_1X1_CONVOLUTION_HPP
+#define CPU_RV64_JIT_RVV_1X1_CONVOLUTION_HPP
+
+#include "common/c_types_map.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/memory_tracking.hpp"
+#include "common/primitive.hpp"
+#include "common/utils.hpp"
+
+#include "cpu/cpu_convolution_pd.hpp"
+#include "cpu/platform.hpp"
+
+#include "cpu/rv64/jit_rvv_1x1_conv_kernel.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace rv64 {
+
+// Forward f32 1x1 convolution primitive backed by jit_rvv_1x1_conv_kernel_t.
+struct jit_rvv_1x1_convolution_fwd_t : public primitive_t {
+    struct pd_t : public cpu_convolution_fwd_pd_t {
+        using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t;
+
+        DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", v, ""),
+                jit_rvv_1x1_convolution_fwd_t);
+
+        status_t init(engine_t *engine) {
+            using namespace utils;
+            using namespace format_tag;
+
+            const memory_desc_wrapper src_d(src_md());
+            const memory_desc_wrapper weights_d(weights_md());
+            const memory_desc_wrapper dst_d(dst_md());
+
+            VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND);
+            VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct),
+                    VERBOSE_BAD_ALGORITHM);
+            VDISPATCH_CONV(
+                    expect_data_types(data_type::f32, data_type::f32,
+                            data_type::f32, data_type::f32, data_type::undef),
+                    VERBOSE_UNSUPPORTED_DT);
+            VDISPATCH_CONV(attr()->has_default_values(
+                                   primitive_attr_t::skip_mask_t::post_ops),
+                    VERBOSE_UNSUPPORTED_ATTR);
+            VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP);
+            VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, "");
+
+            // Only support: data = nwc/nhwc/ndhwc, weights = blocked formats (Oiw4o/gOiw4o/etc)
+            const int n = ndims();
+            const bool g = with_groups();
+            const auto dat_tag_nxc = utils::pick(n - 3, nwc, nhwc, ndhwc);
+            const auto wei_tag_blocked = utils::pick(2 * n - 6 + (g ? 1 : 0),
+                    Oiw4o, gOiw4o, Oihw4o, gOihw4o, Oidhw4o, gOidhw4o);
+
+            // Explicit non-matching tags are rejected; only `any` may differ.
+            VDISPATCH_CONV(IMPLICATION(src_d.matches_one_of_tag(dat_tag_nxc)
+                                           != dat_tag_nxc,
+                                   src_d.format_kind() == format_kind::any),
+                    VERBOSE_UNSUPPORTED_TAG);
+            VDISPATCH_CONV(IMPLICATION(dst_d.matches_one_of_tag(dat_tag_nxc)
+                                           != dat_tag_nxc,
+                                   dst_d.format_kind() == format_kind::any),
+                    VERBOSE_UNSUPPORTED_TAG);
+            VDISPATCH_CONV(
+                    IMPLICATION(weights_d.matches_one_of_tag(wei_tag_blocked)
+                                    != wei_tag_blocked,
+                            weights_d.format_kind() == format_kind::any),
+                    VERBOSE_UNSUPPORTED_TAG);
+
+            // Resolve format_kind::any to the nxc / 4o-blocked defaults
+            VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG);
+
+            // ISA check
+            VDISPATCH_CONV(mayiuse(v), VERBOSE_UNSUPPORTED_ISA);
+
+            // 1x1 check: unit kernel, unit stride, no begin/end padding
+            const int ndims = src_d.ndims();
+            const int weights_ndims = weights_d.ndims();
+            for (int i = 0; i < ndims - 2; ++i) {
+                VDISPATCH_CONV(
+                        weights_d.dims()[weights_ndims - (ndims - 2) + i] == 1,
+                        VERBOSE_UNSUPPORTED_FEATURE,
+                        "only 1x1 convolution is supported");
+                VDISPATCH_CONV(desc()->strides[i] == 1,
+                        VERBOSE_UNSUPPORTED_FEATURE,
+                        "only stride 1 is supported");
+                VDISPATCH_CONV(desc()->padding[0][i] == 0
+                                && desc()->padding[1][i] == 0,
+                        VERBOSE_UNSUPPORTED_FEATURE,
+                        "padding is not supported");
+            }
+
+            VDISPATCH_CONV_SC(jit_rvv_1x1_conv_kernel_t::init_conf(jcp_,
+                                      *desc(), src_d, weights_d, dst_d, *attr(),
+                                      dnnl_get_max_threads(), false),
+                    VERBOSE_UNSUPPORTED_FEATURE, "init_conf failed");
+
+            auto scratchpad = scratchpad_registry().registrar();
+            jit_rvv_1x1_conv_kernel_t::init_scratchpad(scratchpad, jcp_);
+
+            return status::success;
+        }
+
+        jit_1x1_conv_conf_t jcp_ = utils::zero<decltype(jcp_)>();
+
+    protected:
+        bool post_ops_ok() const {
+            // TODO: Post-ops support is not implemented yet.
+            return attr()->post_ops_.len() == 0;
+        }
+        bool set_default_formats() {
+            using namespace format_tag;
+            const int n = ndims();
+            const bool g = with_groups();
+            const auto dat_tag = utils::pick(n - 3, nwc, nhwc, ndhwc);
+            const auto wei_tag = utils::pick(2 * n - 6 + (g ? 1 : 0), Oiw4o,
+                    gOiw4o, Oihw4o, gOihw4o, Oidhw4o, gOidhw4o);
+
+            return set_default_formats_common(dat_tag, wei_tag, dat_tag);
+        }
+    };
+
+    jit_rvv_1x1_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+
+    status_t init(engine_t *engine) override {
+        CHECK(safe_ptr_assign(kernel_,
+                new jit_rvv_1x1_conv_kernel_t(
+                        pd()->jcp_, *pd()->attr(), *pd()->dst_md())));
+        return kernel_->create_kernel();
+    }
+
+    status_t execute(const exec_ctx_t &ctx) const override {
+        execute_forward(ctx);
+        return status::success;
+    }
+
+private:
+    void execute_forward(const exec_ctx_t &ctx) const;
+    void execute_forward_thr(const int ithr, const int nthr, const float *src,
+            const float *weights, const float *bias, float *dst,
+            const memory_tracking::grantor_t &scratchpad) const;
+
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+
+    std::unique_ptr<jit_rvv_1x1_conv_kernel_t> kernel_;
+};
+
+} // namespace rv64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/third_party/xbyak_riscv/xbyak_riscv.hpp b/third_party/xbyak_riscv/xbyak_riscv.hpp
new file mode 100644
index 00000000000..249553a36f9
--- /dev/null
+++ b/third_party/xbyak_riscv/xbyak_riscv.hpp
@@ -0,0 +1,1383 @@
+#pragma once
+/*!
+	@file xbyak_riscv.hpp
+	@brief Xbyak_riscv ; JIT assembler for RISC-V
+	@author herumi
+	@url https://github.com/herumi/xbyak_riscv
+	@note modified new BSD license
+	http://opensource.org/licenses/BSD-3-Clause
+*/
+
+// Copyright (C), 2023, KNS Group LLC (YADRO)
+
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <list>
+#include <string>
+#include <algorithm>
+#include <unordered_set>
+#include <unordered_map>
+
+#ifdef _WIN32
+	#ifndef WIN32_LEAN_AND_MEAN
+		#define WIN32_LEAN_AND_MEAN
+	#endif
+	#include <windows.h>
+	#include <malloc.h>
+#elif defined(__GNUC__)
+	#include <unistd.h>
+	#include <sys/mman.h>
+	#include <stdlib.h>
+#endif
+#if defined(__APPLE__)
+	#define XBYAK_RISCV_USE_MAP_JIT
+	#include <sys/sysctl.h>
+	#ifndef MAP_JIT
+		#define MAP_JIT 0x800
+	#endif
+#endif
+
+#if defined(__GNUC__) && !defined(__MINGW32__)
+	#define XBYAK_RISCV_USE_MMAP_ALLOCATOR
+#endif
+
+#ifdef NDEBUG
+	#define XBYAK_RISCV_ASSERT(x)
+#else
+	#define XBYAK_RISCV_ASSERT(x) assert(x)
+#endif
+
+// MFD_CLOEXEC defined only linux 3.17 or later.
+// Android wraps the memfd_create syscall from API version 30.
+#if !defined(MFD_CLOEXEC) || (defined(__ANDROID__) && __ANDROID_API__ < 30)
+	#undef XBYAK_RISCV_USE_MEMFD
+#endif
+
+#if defined(_WIN64) || defined(__MINGW64__) || (defined(__CYGWIN__) && defined(__x86_64__))
+	#define XBYAK_RISCV64_WIN
+#elif defined(__x86_64__)
+	#define XBYAK_RISCV64_GCC
+#endif
+#if !defined(XBYAK_RISCV64) && !defined(XBYAK_RISCV32)
+	#if defined(XBYAK_RISCV64_GCC) || defined(XBYAK_RISCV64_WIN)
+		#define XBYAK_RISCV64
+	#else
+		#define XBYAK_RISCV32
+	#endif
+#endif
+
+#ifdef _MSC_VER
+	#pragma warning(push)
+	#pragma warning(disable : 4514) /* remove inline function */
+	#pragma warning(disable : 4786) /* identifier is too long */
+	#pragma warning(disable : 4503) /* name is too long */
+	#pragma warning(disable : 4127) /* constant expresison */
+#endif
+
+#include "xbyak_riscv_csr.hpp"
+
+#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
+	#define XBYAK_RISCV_CONSTEXPR constexpr
+#else
+	#define XBYAK_RISCV_CONSTEXPR
+#endif
+
+namespace Xbyak_riscv {
+
+enum {
+	DEFAULT_MAX_CODE_SIZE = 4096,
+	VERSION = 0x1010 /* 0xABCD = A.BC.D */
+};
+
+inline uint32_t getVersion() { return VERSION; }
+
+enum {
+	ERR_NONE = 0, // must stay 0: the codes below index errTbl directly
+	ERR_OFFSET_IS_TOO_BIG,
+	ERR_CODE_IS_TOO_BIG,
+	ERR_IMM_IS_TOO_BIG,
+	ERR_INVALID_IMM_OF_JAL,
+	ERR_INVALID_IMM_OF_BTYPE,
+	ERR_LABEL_IS_NOT_FOUND,
+	ERR_LABEL_IS_REDEFINED,
+	ERR_LABEL_IS_TOO_FAR,
+	ERR_LABEL_IS_NOT_SET_BY_L,
+	ERR_LABEL_IS_ALREADY_SET_BY_L,
+	ERR_CANT_PROTECT,
+	ERR_CANT_ALLOC,
+	ERR_BAD_PARAMETER,
+	ERR_MUNMAP,
+	ERR_BAD_ALIGN,
+	ERR_INTERNAL // Put it at last.
+};
+
+inline const char *ConvertErrorToString(int err) // maps an ERR_* code to a static message; never returns null
+{
+	static const char *errTbl[] = { // one entry per enumerator, in declaration order
+		"none",
+		"offset is too big",
+		"code is too big",
+		"imm is too big",
+		"invalid imm of jal",
+		"invalid imm of Btype",
+		"label is not found",
+		"label is redefined",
+		"label is too far",
+		"label is not set by L",
+		"label is already set by L",
+		"can't protect",
+		"can't alloc",
+		"bad parameter",
+		"munmap",
+		"bad align",
+		"internal error"
+	};
+	static_assert(ERR_INTERNAL + 1 == static_cast<int>(sizeof(errTbl) / sizeof(*errTbl)), "errTbl must have one entry per error code");
+	return (0 <= err && err <= ERR_INTERNAL) ? errTbl[err] : "unknown err"; // reject negative and too-large codes
+}
+
+#ifdef XBYAK_RISCV_NO_EXCEPTION
+namespace local {
+
+inline int& GetErrorRef() { // accessor for the error slot used when exceptions are disabled
+	static thread_local int err = 0; // one slot per thread; 0 means no error recorded
+	return err;
+}
+
+inline void SetError(int err) { // records err unless an earlier error is still pending
+	if (local::GetErrorRef()) return; // keep the first err code
+	local::GetErrorRef() = err;
+}
+
+} // local
+
+inline void ClearError() {
+	local::GetErrorRef() = 0;
+}
+inline int GetError() { return Xbyak_riscv::local::GetErrorRef(); }
+
+#define XBYAK_RISCV_THROW(err) { Xbyak_riscv::local::SetError(err); return; }
+#define XBYAK_RISCV_THROW_RET(err, r) { Xbyak_riscv::local::SetError(err); return r; }
+
+#else
+class Error : public std::exception { // exception type thrown by XBYAK_RISCV_THROW*, carries one ERR_* code
+	int err_;
+public:
+	explicit Error(int err) : err_(err)
+	{
+		if (err_ < 0 || err_ > ERR_INTERNAL) { // codes outside [0, ERR_INTERNAL] collapse to ERR_INTERNAL
+			err_ = ERR_INTERNAL;
+		}
+	}
+	operator int() const { return err_; } // lets callers compare the exception against ERR_* values
+	const char *what() const noexcept override
+	{
+		return ConvertErrorToString(err_);
+	}
+};
+
+// dummy functions
+inline void ClearError() { }
+inline int GetError() { return 0; }
+
+inline const char *ConvertErrorToString(const Error& err)
+{
+	return err.what();
+}
+
+#define XBYAK_RISCV_THROW(err) { throw Error(err); }
+#define XBYAK_RISCV_THROW_RET(err, r) { throw Error(err); }
+
+#endif
+
+inline void *AlignedMalloc(size_t size, size_t alignment) // allocates size bytes aligned to alignment; Allocator pairs it with AlignedFree()
+{
+#ifdef __MINGW32__
+	return __mingw_aligned_malloc(size, alignment);
+#elif defined(_WIN32)
+	return _aligned_malloc(size, alignment);
+#else
+	void *p;
+	int ret = posix_memalign(&p, alignment, size); // note the argument order: (out, alignment, size)
+	return (ret == 0) ? p : 0; // null on failure; p is only read when posix_memalign succeeded
+#endif
+}
+
+inline void AlignedFree(void *p) // releases memory obtained from AlignedMalloc()
+{
+#ifdef __MINGW32__
+	__mingw_aligned_free(p);
+#elif defined(_MSC_VER)
+	_aligned_free(p); // NOTE(review): AlignedMalloc selects _aligned_malloc on _WIN32, this branch tests _MSC_VER — confirm they always agree
+#else
+	free(p);
+#endif
+}
+
+namespace local {
+
+static const size_t ALIGN_PAGE_SIZE = 4096;
+
+inline XBYAK_RISCV_CONSTEXPR uint32_t mask(size_t n) // returns a value with the low n bits set (n in [0, 32])
+{
+	XBYAK_RISCV_ASSERT(n <= 32);
+	return n == 32 ? 0xffffffff : (1u << n) - 1; // n == 32 is special-cased because 1u << 32 is undefined
+}
+// is x <= mask(n) ? i.e. does x fit in n unsigned bits
+inline XBYAK_RISCV_CONSTEXPR bool inBit(uint32_t x, size_t n)
+{
+	return x <= mask(n);
+}
+
+// is x a signed n-bit integer? i.e. -2^(n-1) <= x < 2^(n-1)
+inline XBYAK_RISCV_CONSTEXPR bool inSBit(int x, int n)
+{
+	return -(1 << (n-1)) <= x && x < (1 << (n-1)); // 1 << (n-1) is an int shift, so n must stay well below 32
+}
+
+// split x into a high part *pH (bits 31..12) and a low part *pL (bits 11..0, sign-extended) so that (H << 12) + L == x mod 2^32
+// return false, leaving *pH and *pL untouched, if x already fits in a 12-bit signed integer
+inline bool split32bit(int *pH, int* pL, int x) {
+	if (inSBit(x, 12)) return false;
+	int H = (x >> 12) & mask(20);
+	int L = x & mask(12);
+	if (x & (1 << 11)) { // bit 11 set: L will be negative once sign-extended
+		H++; // compensate for the negative L; H can reach 0x100000 when the high part was 0xfffff
+		L = L | (mask(20) << 12); // sign-extend the 12-bit low part to a full int
+	}
+	*pH = H;
+	*pL = L;
+	return true;
+}
+
+// @@@ embedded by bit_pattern.py (DON'T DELETE THIS LINE)
+inline size_t get20_10to1_11_19to12_z12(size_t v) { return ((v & (1<<20)) << 11)| ((v & (1023<<1)) << 20)| ((v & (1<<11)) << 9)| (v & (255<<12)); }
+inline size_t get12_10to5_z13_4to1_11_z7(size_t v) { return ((v & (1<<12)) << 19)| ((v & (63<<5)) << 20)| ((v & (15<<1)) << 7)| ((v & (1<<11)) >> 4); }
+inline size_t get5to4_9to6_2_3_z5(size_t v) { return ((v & (3<<4)) << 7)| ((v & (15<<6)) << 1)| ((v & (1<<2)) << 4)| ((v & (1<<3)) << 2); }
+inline size_t get9_z5_4_6_8to7_5_z2(size_t v) { return ((v & (1<<9)) << 3)| ((v & (1<<4)) << 2)| ((v & (1<<6)) >> 1)| ((v & (3<<7)) >> 4)| ((v & (1<<5)) >> 3); }
+inline size_t get5to3_z3_2_6_z5(size_t v) { return ((v & (7<<3)) << 7)| ((v & (1<<2)) << 4)| ((v & (1<<6)) >> 1); }
+inline size_t get5to3_z3_7_6_z5(size_t v) { return ((v & (7<<3)) << 7)| ((v & (1<<7)) >> 1)| ((v & (1<<6)) >> 1); }
+inline size_t get5_z5_4to0_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & 31) << 2); }
+inline size_t get11_4_9to8_10_6_7_3to1_5_z2(size_t v) { return ((v & (1<<11)) << 1)| ((v & (1<<4)) << 7)| ((v & (3<<8)) << 1)| ((v & (1<<10)) >> 2)| ((v & (1<<6)) << 1)| ((v & (1<<7)) >> 1)| ((v & (7<<1)) << 2)| ((v & (1<<5)) >> 3); }
+inline size_t get17_z5_16to12_z2(size_t v) { return ((v & (1<<17)) >> 5)| ((v & (31<<12)) >> 10); }
+inline size_t get5_z5_4to2_7to6_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & (7<<2)) << 2)| ((v & (3<<6)) >> 4); }
+inline size_t get5_z5_4to3_8to6_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & (3<<3)) << 2)| ((v & (7<<6)) >> 4); }
+inline size_t get5to2_7to6_z7(size_t v) { return ((v & (15<<2)) << 7)| ((v & (3<<6)) << 1); }
+inline size_t get5to3_8to6_z7(size_t v) { return ((v & (7<<3)) << 7)| ((v & (7<<6)) << 1); }
+// @@@ embedded by bit_pattern.py (DON'T DELETE THIS LINE)
+
+} // local
+
+/*
+	custom allocator: page-aligned heap buffers via AlignedMalloc/AlignedFree
+*/
+struct Allocator {
+	explicit Allocator(const std::string& = "") {} // same interface with MmapAllocator
+	virtual uint8_t *alloc(size_t size) { return reinterpret_cast<uint8_t*>(AlignedMalloc(size, local::ALIGN_PAGE_SIZE)); } // null when the allocation fails
+	virtual void free(uint8_t *p) { AlignedFree(p); }
+	virtual ~Allocator() {}
+	/* override to return false if you call protect() manually */
+	virtual bool useProtect() const { return true; } // CodeArray consults this before changing page protection
+};
+
+#ifdef XBYAK_RISCV_USE_MMAP_ALLOCATOR
+#ifdef XBYAK_RISCV_USE_MAP_JIT
+namespace local {
+
+inline int getMacOsVersionPure() // returns the leading major number of sysctl "kern.osrelease", or 0 on failure
+{
+	char buf[64];
+	size_t size = sizeof(buf);
+	int err = sysctlbyname("kern.osrelease", buf, &size, NULL, 0);
+	if (err != 0) return 0;
+	char *endp;
+	int major = strtol(buf, &endp, 10);
+	if (*endp != '.') return 0; // expect "<major>.<rest>"; anything else is treated as unknown
+	return major;
+}
+
+inline int getMacOsVersion() // cached wrapper around getMacOsVersionPure()
+{
+	static const int version = getMacOsVersionPure(); // queried once, on first call
+	return version;
+}
+
+} // local
+#endif
+class MmapAllocator : public Allocator { // Allocator backed by mmap/munmap; remembers each mapping's size for free()
+	struct Allocation {
+		size_t size;
+#if defined(XBYAK_RISCV_USE_MEMFD)
+		// fd is only used with XBYAK_RISCV_USE_MEMFD. We keep the file open
+		// during the lifetime of each allocation in order to support
+		// checkpoint/restore by unprivileged users.
+		int fd;
+#endif
+	};
+	const std::string name_; // only used with XBYAK_RISCV_USE_MEMFD
+	typedef std::unordered_map<uintptr_t, Allocation> AllocationList;
+	AllocationList allocList_; // keyed by the mapped base address
+public:
+	explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {}
+	uint8_t *alloc(size_t size) override
+	{
+		const size_t alignedSizeM1 = local::ALIGN_PAGE_SIZE - 1;
+		size = (size + alignedSizeM1) & ~alignedSizeM1; // round up to a multiple of ALIGN_PAGE_SIZE
+#if defined(MAP_ANONYMOUS)
+		int mode = MAP_PRIVATE | MAP_ANONYMOUS;
+#elif defined(MAP_ANON)
+		int mode = MAP_PRIVATE | MAP_ANON;
+#else
+		#error "not supported"
+#endif
+#if defined(XBYAK_RISCV_USE_MAP_JIT)
+		const int mojaveVersion = 18;
+		if (local::getMacOsVersion() >= mojaveVersion) mode |= MAP_JIT;
+#endif
+		int fd = -1; // stays -1 unless a memfd is created below, i.e. anonymous mapping
+#if defined(XBYAK_RISCV_USE_MEMFD)
+		fd = memfd_create(name_.c_str(), MFD_CLOEXEC);
+		if (fd != -1) {
+			mode = MAP_SHARED; // file-backed mapping replaces the anonymous flags
+			if (ftruncate(fd, size) != 0) {
+				close(fd);
+				XBYAK_RISCV_THROW_RET(ERR_CANT_ALLOC, 0)
+			}
+		}
+#endif
+		void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, fd, 0);
+		if (p == MAP_FAILED) {
+			if (fd != -1) close(fd);
+			XBYAK_RISCV_THROW_RET(ERR_CANT_ALLOC, 0)
+		}
+		assert(p);
+		Allocation &alloc = allocList_[(uintptr_t)p]; // record the rounded size so free() can munmap the whole mapping
+		alloc.size = size;
+#if defined(XBYAK_RISCV_USE_MEMFD)
+		alloc.fd = fd;
+#endif
+		return (uint8_t*)p;
+	}
+	void free(uint8_t *p) override
+	{
+		if (p == 0) return;
+		AllocationList::iterator i = allocList_.find((uintptr_t)p);
+		if (i == allocList_.end()) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER) // p was not returned by this allocator
+		if (munmap((void*)i->first, i->second.size) < 0) XBYAK_RISCV_THROW(ERR_MUNMAP)
+#if defined(XBYAK_RISCV_USE_MEMFD)
+		if (i->second.fd != -1) close(i->second.fd);
+#endif
+		allocList_.erase(i);
+	}
+};
+#endif
+
+namespace local {
+
+// Register Interface: common base holding a register index and its kind
+class IReg {
+public:
+	enum Kind {
+		GPR = 1,         // General purpose register
+		FReg = 1 << 1,   // Floating-point register
+		VECTOR = 1 << 2, // Vector register
+	};
+protected:
+	uint32_t idx_;
+	Kind kind_;
+public:
+	XBYAK_RISCV_CONSTEXPR IReg(uint32_t idx = 0, Kind kind = GPR)
+		: idx_(idx), kind_(kind)
+	{
+		XBYAK_RISCV_ASSERT(local::inBit(idx, 5)); // index must fit in 5 bits (0..31)
+	}
+	XBYAK_RISCV_CONSTEXPR int getIdx() const { return idx_; }
+	const char *toString() const // name such as "x10" / "f3" / "v31", looked up by kind and index
+	{
+		if (kind_ == GPR) {
+			static const char tbl[][4] = {
+				"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+				"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+				"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+				"x24", "x25", "x26", "x27", "x28", "x29", "x30", "x31",
+			};
+			return tbl[idx_];
+		} else if (kind_ == FReg) {
+			static const char tbl[][4] = {
+				"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
+				"f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15",
+				"f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
+				"f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
+			};
+			return tbl[idx_];
+		} else if (kind_ == VECTOR) {
+			static const char tbl[][4] = {
+				"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+				"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+				"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+				"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+			};
+			return tbl[idx_];
+		}
+		XBYAK_RISCV_THROW_RET(ERR_INTERNAL, 0); // unknown kind
+	}
+	bool operator==(const IReg& rhs) const
+	{
+		return idx_ == rhs.idx_ && kind_ == rhs.kind_;
+	}
+	bool operator!=(const IReg& rhs) const { return !operator==(rhs); }
+
+};
+
+} // local
+
+// General Purpose Register
+struct Reg : public local::IReg {
+	explicit XBYAK_RISCV_CONSTEXPR Reg(int idx = 0) : local::IReg(idx, IReg::Kind::GPR) { }
+};
+
+static XBYAK_RISCV_CONSTEXPR Reg x0(0), x1(1), x2(2), x3(3), x4(4), x5(5), x6(6), x7(7);
+static XBYAK_RISCV_CONSTEXPR Reg x8(8), x9(9), x10(10), x11(11), x12(12), x13(13), x14(14), x15(15);
+static XBYAK_RISCV_CONSTEXPR Reg x16(16), x17(17), x18(18), x19(19), x20(20), x21(21), x22(22), x23(23);
+static XBYAK_RISCV_CONSTEXPR Reg x24(24), x25(25), x26(26), x27(27), x28(28), x29(29), x30(30), x31(31);
+
+static XBYAK_RISCV_CONSTEXPR Reg zero(x0);
+static XBYAK_RISCV_CONSTEXPR Reg ra(x1);
+static XBYAK_RISCV_CONSTEXPR Reg sp(x2);
+static XBYAK_RISCV_CONSTEXPR Reg gp(x3);
+static XBYAK_RISCV_CONSTEXPR Reg tp(x4);
+static XBYAK_RISCV_CONSTEXPR Reg t0(x5);
+static XBYAK_RISCV_CONSTEXPR Reg t1(x6);
+static XBYAK_RISCV_CONSTEXPR Reg t2(x7);
+static XBYAK_RISCV_CONSTEXPR Reg fp(x8);
+static XBYAK_RISCV_CONSTEXPR Reg s0(x8);
+static XBYAK_RISCV_CONSTEXPR Reg s1(x9);
+static XBYAK_RISCV_CONSTEXPR Reg a0(x10), a1(x11), a2(x12), a3(x13), a4(x14), a5(x15), a6(x16), a7(x17);
+static XBYAK_RISCV_CONSTEXPR Reg s2(x18), s3(x19), s4(x20), s5(x21), s6(x22), s7(x23), s8(x24), s9(x25);
+static XBYAK_RISCV_CONSTEXPR Reg s10(x26), s11(x27);
+static XBYAK_RISCV_CONSTEXPR Reg t3(x28), t4(x29), t5(x30), t6(x31);
+
+// Floating Point Register
+struct FReg : public local::IReg {
+	explicit XBYAK_RISCV_CONSTEXPR FReg(int idx = 0) : local::IReg(idx, IReg::Kind::FReg) { }
+};
+
+static XBYAK_RISCV_CONSTEXPR FReg f0(0), f1(1), f2(2), f3(3), f4(4), f5(5), f6(6), f7(7);
+static XBYAK_RISCV_CONSTEXPR FReg f8(8), f9(9), f10(10), f11(11), f12(12), f13(13), f14(14), f15(15);
+static XBYAK_RISCV_CONSTEXPR FReg f16(16), f17(17), f18(18), f19(19), f20(20), f21(21), f22(22), f23(23);
+static XBYAK_RISCV_CONSTEXPR FReg f24(24), f25(25), f26(26), f27(27), f28(28), f29(29), f30(30), f31(31);
+// ABI name
+static XBYAK_RISCV_CONSTEXPR FReg ft0(0), ft1(1), ft2(2), ft3(3), ft4(4), ft5(5), ft6(6), ft7(7);
+static XBYAK_RISCV_CONSTEXPR FReg fs0(8), fs1(9), fa0(10), fa1(11), fa2(12), fa3(13), fa4(14), fa5(15), fa6(16), fa7(f17);
+static XBYAK_RISCV_CONSTEXPR FReg fs2(18), fs3(19), fs4(20), fs5(21), fs6(22), fs7(23), fs8(24), fs9(25), fs10(26), fs11(27);
+static XBYAK_RISCV_CONSTEXPR FReg ft8(28), ft9(29), ft10(30), ft11(31);
+
+#if defined(XBYAK_RISCV_V) && XBYAK_RISCV_V == 1
+// Vector Register
+struct VReg : public local::IReg {
+	explicit XBYAK_RISCV_CONSTEXPR VReg(int idx = 0) : local::IReg(idx, IReg::Kind::VECTOR) { }
+};
+
+static XBYAK_RISCV_CONSTEXPR VReg v0(0), v1(1), v2(2), v3(3), v4(4), v5(5), v6(6), v7(7);
+static XBYAK_RISCV_CONSTEXPR VReg v8(8), v9(9), v10(10), v11(11), v12(12), v13(13), v14(14), v15(15);
+static XBYAK_RISCV_CONSTEXPR VReg v16(16), v17(17), v18(18), v19(19), v20(20), v21(21), v22(22), v23(23);
+static XBYAK_RISCV_CONSTEXPR VReg v24(24), v25(25), v26(26), v27(27), v28(28), v29(29), v30(30), v31(31);
+#endif
+
+// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc)
+void *const DontSetProtectRWE = (void*)2; //-V566
+
+class CodeArray { // byte buffer that generated code is appended to, with optional page-protection control
+	enum Type {
+		USER_BUF = 1, // use userPtr(non alignment, non protect)
+		ALLOC_BUF // use new(alignment, protect)
+	};
+	CodeArray(const CodeArray& rhs); // private and without a body here — presumably to forbid copying
+	void operator=(const CodeArray&);
+	bool isAllocType() const { return type_ == ALLOC_BUF; }
+	const Type type_;
+#ifdef XBYAK_RISCV_USE_MMAP_ALLOCATOR
+	MmapAllocator defaultAllocator_;
+#else
+	Allocator defaultAllocator_;
+#endif
+	Allocator *alloc_; // allocator actually used: the caller's, or defaultAllocator_
+protected:
+	size_t maxSize_; // capacity of the buffer in bytes
+	uint8_t *top_; // start of the buffer
+	size_t size_; // number of bytes appended so far
+
+	bool useProtect() const { return alloc_->useProtect(); }
+public:
+	enum ProtectMode {
+		PROTECT_RW = 0, // read/write
+		PROTECT_RWE = 1, // read/write/exec
+		PROTECT_RE = 2 // read/exec
+	};
+	explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0)
+		: type_((userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF)
+		, alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_)
+		, maxSize_(maxSize)
+		, top_(type_ == USER_BUF ? reinterpret_cast<uint8_t*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1)))
+		, size_(0)
+	{
+		if (maxSize_ > 0 && top_ == 0) XBYAK_RISCV_THROW(ERR_CANT_ALLOC)
+		if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) { // owned buffers become RWE unless the caller opted out
+			alloc_->free(top_);
+			XBYAK_RISCV_THROW(ERR_CANT_PROTECT)
+		}
+	}
+	virtual ~CodeArray()
+	{
+		if (isAllocType()) { // user-supplied buffers are never freed here
+			if (useProtect()) setProtectModeRW(false); // drop exec permission before handing the memory back
+			alloc_->free(top_);
+		}
+	}
+	bool setProtectMode(ProtectMode mode, bool throwException = true)
+	{
+		bool isOK = protect(top_, maxSize_, mode);
+		if (isOK) return true;
+		if (throwException) XBYAK_RISCV_THROW_RET(ERR_CANT_PROTECT, false)
+		return false;
+	}
+	bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); }
+	bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); }
+	void resetSize()
+	{
+		size_ = 0;
+	}
+	void writeBytes(size_t offset, uint64_t v, size_t n) // stores the low n bytes of v at offset, least significant byte first
+	{
+		if (n > 8) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER)
+		if (offset + n > maxSize_) XBYAK_RISCV_THROW(ERR_CODE_IS_TOO_BIG)
+		uint8_t *const p = top_ + offset;
+		for (size_t i = 0; i < n; i++) {
+			p[i] = static_cast<uint8_t>(v >> (i * 8));
+		}
+	}
+	void writeBytes(const uint8_t *addr, uint64_t v, size_t n) // same, addressed by pointer into the buffer
+	{
+		writeBytes(addr - top_, v, n);
+	}
+	void appendBytes(uint64_t v, size_t n) // writes at the current end and advances size_
+	{
+		writeBytes(size_, v, n);
+		size_ += n;
+	}
+	void append4B(uint32_t code) { appendBytes(code, 4); }
+	void append2B(uint32_t code) { appendBytes(code, 2); }
+	void append1B(uint32_t code) { appendBytes(code, 1); }
+	void write4B(size_t offset, uint32_t v) { writeBytes(offset, v, 4); }
+	void dump(bool separate = false) const // prints the emitted bytes as hex to stdout
+	{
+		const uint8_t *p = getCode();
+		const size_t bufSize = getSize();
+		if (separate) { // one code unit per line, most significant byte first
+			size_t pos = 0;
+			while (pos < bufSize) {
+				uint32_t v = p[pos];
+				size_t n = (v & 3) == 3 ? 4 : 2; // low two bits == 3 selects a 4-byte unit, otherwise 2 bytes
+				if (pos + n <= bufSize) {
+					for (size_t i = 0; i < n; i++) {
+						printf("%02x", p[pos + n - 1 - i]);
+					}
+					printf("\n");
+					pos += n;
+				} else {
+					printf("%02x error\n", v); // truncated unit at the end of the buffer
+					return;
+				}
+			}
+			return;
+		}
+		size_t remain = bufSize;
+		for (int i = 0; i < 4; i++) { // raw dump is limited to the first 4 rows of 16 bytes
+			size_t disp = 16;
+			if (remain < 16) {
+				disp = remain;
+			}
+			for (size_t j = 0; j < 16; j++) {
+				if (j < disp) {
+					printf("%02x", p[i * 16 + j]);
+				}
+			}
+			putchar('\n');
+			remain -= disp;
+			if (remain == 0) {
+				break;
+			}
+		}
+	}
+	const uint8_t *getCode() const { return top_; }
+	template<class F>
+	const F getCode() const { return reinterpret_cast<F>(top_); }
+	const uint8_t *getCurr() const { return &top_[size_]; } // address where the next byte will be appended
+	template<class F>
+	const F getCurr() const { return reinterpret_cast<F>(&top_[size_]); }
+	size_t getSize() const { return size_; }
+	void setSize(size_t size)
+	{
+		if (size > maxSize_) XBYAK_RISCV_THROW(ERR_OFFSET_IS_TOO_BIG)
+		size_ = size;
+	}
+	/**
+		change exec permission of memory
+		@param addr [in] buffer address
+		@param size [in] buffer size
+		@param protectMode [in] mode(RW/RWE/RE)
+		@return true(success), false(failure)
+	*/
+	static inline bool protect(const void *addr, size_t size, int protectMode)
+	{
+#if defined(_WIN32)
+		const DWORD c_rw = PAGE_READWRITE;
+		const DWORD c_rwe = PAGE_EXECUTE_READWRITE;
+		const DWORD c_re = PAGE_EXECUTE_READ;
+		DWORD mode;
+#else
+		const int c_rw = PROT_READ | PROT_WRITE;
+		const int c_rwe = PROT_READ | PROT_WRITE | PROT_EXEC;
+		const int c_re = PROT_READ | PROT_EXEC;
+		int mode;
+#endif
+		switch (protectMode) {
+		case PROTECT_RW: mode = c_rw; break;
+		case PROTECT_RWE: mode = c_rwe; break;
+		case PROTECT_RE: mode = c_re; break;
+		default:
+			return false; // unknown mode value
+		}
+#if defined(_WIN32)
+		DWORD oldProtect;
+		return VirtualProtect(const_cast<void*>(addr), size, mode, &oldProtect) != 0;
+#elif defined(__GNUC__)
+		size_t pageSize = sysconf(_SC_PAGESIZE);
+		size_t iaddr = reinterpret_cast<size_t>(addr);
+		size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1)); // round addr down to its page start (assumes pageSize is a power of two)
+		return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0; // length grows by the bytes skipped when rounding down
+#else
+		return true;
+#endif
+	}
+	/**
+		get aligned memory pointer
+		@param addr [in] address
+		@param alignedSize [in] power of two
+		@return addr rounded up to a multiple of alignedSize
+	*/
+	static inline uint8_t *getAlignedAddress(uint8_t *addr, size_t alignedSize = 16)
+	{
+		return reinterpret_cast<uint8_t*>((reinterpret_cast<size_t>(addr) + alignedSize - 1) & ~(alignedSize - static_cast<size_t>(1)));
+	}
+};
+
+struct Jmp { // one pending or resolved jump: where it sits, its kind, and its already-encoded non-immediate bits
+	enum Type {
+		tJal,
+		tBtype,
+		tRawAddress,
+	} type;
+	const uint8_t* from; /* address of the jmp mnemonic */
+	uint32_t encoded; // opcode/register fields; the immediate is OR-ed in by encode()
+	size_t encSize() const // bytes written for this entry: a pointer for raw addresses, otherwise one 4-byte instruction
+	{
+		return (type == tRawAddress) ? sizeof(size_t) : 4;
+	}
+	// jal
+	Jmp(const uint8_t *from, uint32_t opcode, const Reg& rd)
+		: type(tJal)
+		, from(from)
+		, encoded((rd.getIdx() << 7) | opcode)
+	{
+	}
+	// B-type
+	Jmp(const uint8_t* from, uint32_t opcode, uint32_t funct3, const Reg& src1, const Reg& src2)
+		: type(tBtype)
+		, from(from)
+		, encoded((src2.getIdx() << 20) | (src1.getIdx() << 15) | (funct3 << 12) | opcode)
+	{
+	}
+	// raw address
+	explicit Jmp(const uint8_t* from)
+		: type(tRawAddress)
+		, from(from)
+		, encoded(0)
+	{
+	}
+	static inline bool isValidImm(size_t imm, size_t maskBit) // imm is a wrapped signed offset: must be even and within the signed range implied by maskBit
+	{
+		const size_t M = local::mask(maskBit);
+		return (imm < M || ~M <= imm) && (imm & 1) == 0;
+	}
+	size_t encode(const uint8_t* addr) const // returns the bytes to store for a jump to addr; 0 when addr is still unknown
+	{
+		if (addr == 0) return 0;
+		if (type == tRawAddress) return size_t(addr);
+		const size_t imm = addr - from; // offset relative to the jump instruction itself
+		if (type == tJal) {
+			if (!isValidImm(imm, 20)) XBYAK_RISCV_THROW_RET(ERR_INVALID_IMM_OF_JAL, 0) // _RET form keeps this well-formed without exceptions
+			return local::get20_10to1_11_19to12_z12(imm) | encoded;
+		} else {
+			if (!isValidImm(imm, 12)) XBYAK_RISCV_THROW_RET(ERR_INVALID_IMM_OF_BTYPE, 0) // B-type failures get their own error code
+			return local::get12_10to5_z13_4to1_11_z7(imm) | encoded;
+		}
+	}
+	// update jmp address by base->getCurr()
+	void update(CodeArray *base) const
+	{
+		base->writeBytes(from, encode(base->getCurr()), encSize());
+	}
+	// append jmp opcode with addr
+	void appendCode(CodeArray *base, const uint8_t *addr) const
+	{
+		base->appendBytes(encode(addr), encSize());
+	}
+};
+
+class LabelManager;
+
+class Label { // handle for a code position; its address is tracked by a LabelManager
+	mutable LabelManager *mgr; // null until LabelManager defines/assigns the label or it is copied from a managed one
+	mutable int id; // 0 means no id has been assigned yet
+	friend class LabelManager;
+public:
+	Label() : mgr(0), id(0) {}
+	Label(const Label& rhs);
+	Label& operator=(const Label& rhs);
+	~Label();
+	void clear() { mgr = 0; id = 0; } // detach from the manager and forget the id
+	int getId() const { return id; }
+	const uint8_t *getAddress() const;
+};
+
+class LabelManager { // tracks defined label addresses and jumps still waiting for their label
+	// for Label class
+	struct ClabelVal {
+		ClabelVal(const uint8_t* addr = 0) : addr(addr), refCount(1) {}
+		const uint8_t* addr;
+		int refCount; // number of Label objects sharing this id
+	};
+	typedef std::unordered_map<int, ClabelVal> ClabelDefList;
+	typedef std::unordered_multimap<int, Jmp> ClabelUndefList;
+	typedef std::unordered_set<Label*> LabelPtrList;
+
+	CodeArray *base_;
+	mutable int labelId_; // next id to hand out; starts at 1 because 0 marks an unassigned Label
+	ClabelDefList clabelDefList_; // label id -> defined address
+	ClabelUndefList clabelUndefList_; // label id -> jumps emitted before the label was defined
+	LabelPtrList labelPtrList_; // Label objects currently attached to this manager
+
+	int getId(const Label& label) const
+	{
+		if (label.id == 0) label.id = labelId_++; // lazily assign an id on first use
+		return label.id;
+	}
+	void define_inner(ClabelDefList& defList, ClabelUndefList& undefList, int labelId, const uint8_t* addr)
+	{
+		// add label
+		ClabelDefList::value_type item(labelId, addr);
+		std::pair<ClabelDefList::iterator, bool> ret = defList.insert(item);
+		if (!ret.second) XBYAK_RISCV_THROW(ERR_LABEL_IS_REDEFINED)
+		// search undefined label
+		for (;;) {
+			ClabelUndefList::iterator itr = undefList.find(labelId);
+			if (itr == undefList.end()) break;
+			const Jmp& jmp = itr->second;
+			jmp.update(base_); // NOTE(review): patches against base_->getCurr(), not addr — these differ when called from assign()
+			undefList.erase(itr);
+		}
+	}
+	friend class Label;
+	void incRefCount(int id, Label *label)
+	{
+		clabelDefList_[id].refCount++; // operator[] creates an entry (addr 0, refCount 1) if id is not defined yet
+		labelPtrList_.insert(label);
+	}
+	void decRefCount(int id, Label *label)
+	{
+		labelPtrList_.erase(label);
+		ClabelDefList::iterator i = clabelDefList_.find(id);
+		if (i == clabelDefList_.end()) return;
+		if (i->second.refCount == 1) { // last reference: drop the definition
+			clabelDefList_.erase(id);
+		} else {
+			--i->second.refCount;
+		}
+	}
+	template<class T>
+	bool hasUndefinedLabel_inner(const T& list) const
+	{
+		return !list.empty();
+	}
+	// detach all labels linked to LabelManager
+	void resetLabelPtrList()
+	{
+		for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) {
+			(*i)->clear();
+		}
+		labelPtrList_.clear();
+	}
+public:
+	LabelManager()
+	{
+		reset();
+	}
+	~LabelManager()
+	{
+		resetLabelPtrList();
+	}
+	void reset() // forget every label and pending jump, and detach from the code buffer
+	{
+		base_ = 0;
+		labelId_ = 1;
+		clabelDefList_.clear();
+		clabelUndefList_.clear();
+		resetLabelPtrList();
+	}
+	void set(CodeArray *base) { base_ = base; }
+	void defineClabel(Label& label) // binds label to the current end of the code buffer
+	{
+		define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getCurr());
+		label.mgr = this;
+		labelPtrList_.insert(&label);
+	}
+	void assign(Label& dst, const Label& src) // gives dst the address already defined for src
+	{
+		ClabelDefList::const_iterator i = clabelDefList_.find(src.id);
+		if (i == clabelDefList_.end()) XBYAK_RISCV_THROW(ERR_LABEL_IS_NOT_SET_BY_L)
+		define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.addr);
+		dst.mgr = this;
+		labelPtrList_.insert(&dst);
+	}
+	// return 0 unless label exists
+	const uint8_t* getAddr(const Label& label) const
+	{
+		ClabelDefList::const_iterator i = clabelDefList_.find(getId(label));
+		if (i == clabelDefList_.end()) return 0;
+		return i->second.addr;
+	}
+	void addUndefinedLabel(const Label& label, const Jmp& jmp) // remember a jump to patch once label is defined
+	{
+		clabelUndefList_.insert(ClabelUndefList::value_type(label.id, jmp));
+	}
+	bool hasUndefClabel() const { return hasUndefinedLabel_inner(clabelUndefList_); }
+	const uint8_t *getCode() const { return base_->getCode(); }
+};
+
+// copy shares rhs's id and manager; the copy is registered with the manager when there is one
+inline Label::Label(const Label& rhs)
+	: mgr(rhs.mgr), id(rhs.id)
+{
+	if (mgr) mgr->incRefCount(id, this);
+}
+// only a label that has no id yet may be assigned to; otherwise ERR_LABEL_IS_ALREADY_SET_BY_L is raised
+inline Label& Label::operator=(const Label& rhs) {
+	if (id) XBYAK_RISCV_THROW_RET(ERR_LABEL_IS_ALREADY_SET_BY_L, *this)
+	mgr = rhs.mgr;
+	id = rhs.id;
+	if (mgr) mgr->incRefCount(id, this);
+	return *this;
+}
+// unregister from the manager; a label without manager or id has nothing to release
+inline Label::~Label() {
+	if (mgr && id) mgr->decRefCount(id, this);
+}
+// address known to the manager for this label; 0 when detached or unknown
+inline const uint8_t* Label::getAddress() const {
+	if (!mgr) return 0;
+	return mgr->getAddr(*this);
+}
+
+namespace local {
+
+// value of an n-bit instruction field; implicitly constructible from raw values, registers and encoding enums
+template<size_t n>
+struct Bit {
+	uint32_t v;
+	// raw value, checked with inBit(v, n) under XBYAK_RISCV_ASSERT
+	Bit(uint32_t v) : v(v) {
+		XBYAK_RISCV_ASSERT(inBit(v, n));
+	}
+	// register operand: its index
+	Bit(const IReg& r)
+		: v(r.getIdx())
+	{}
+	// vector mask selector
+	Bit(VM vm)
+		: v(static_cast<uint32_t>(vm))
+	{}
+	// CSR number
+	Bit(CSR csr)
+		: v(static_cast<uint32_t>(csr))
+	{}
+	// rounding mode
+	Bit(RM rm)
+		: v(static_cast<uint32_t>(rm))
+	{}
+};
+
+} // local
+
+class CodeGenerator : public CodeArray {
+public:
+	enum AqRlType { // NOTE(review): presumably the two-bit `flag` argument of opAtomic() - confirm at the call sites
+		T_aq = 2, // bit 1
+		T_rl = 1, // bit 0
+		T_aqrl = 3, // both bits
+	};
+	typedef local::Bit<1> Bit1;
+	typedef local::Bit<2> Bit2;
+	typedef local::Bit<3> Bit3;
+	typedef local::Bit<5> Bit5;
+	typedef local::Bit<6> Bit6;
+	typedef local::Bit<7> Bit7;
+	typedef local::Bit<12> Bit12;
+	typedef local::Bit<32> Bit32;
+private:
+	CodeGenerator operator=(const CodeGenerator&) = delete;
+	LabelManager labelMgr_;
+	int XLEN_;
+	bool isRV32_;
+	bool supportRVC_;
+	// emit jmp toward label; if the label has no address yet, queue jmp so it is updated when the label is defined
+	void opJmp(const Label& label, const Jmp& jmp) {
+		const uint8_t* const target = labelMgr_.getAddr(label);
+		jmp.appendCode(this, target);
+		// a null target means the label manager has no address for the label
+		if (!target) labelMgr_.addUndefinedLabel(label, jmp);
+	}
+	uint32_t enc2(uint32_t a, uint32_t b) const { return (a<<7) | (b<<15); } // a placed at bit 7, b at bit 15
+	uint32_t enc3(uint32_t a, uint32_t b, uint32_t c) const { return enc2(a, b) | (c<<20); } // enc2(a, b) plus c at bit 20
+	// R-type word: funct7 at bit 25, rs2 at 20, rs1 at 15, funct3 at 12, rd at 7, opcode at 0
+	void Rtype(Bit7 opcode, Bit3 funct3, Bit7 funct7, Bit5 rd, Bit5 rs1, Bit5 rs2) {
+		const uint32_t fixed = (funct7.v<<25) | (funct3.v<<12) | opcode.v;
+		append4B(fixed | enc3(rd.v, rs1.v, rs2.v));
+	}
+	// I-type word: imm[11:0] at bit 20, rs1 at 15, funct3 at 12, rd at 7; raises ERR_IMM_IS_TOO_BIG unless imm fits in signed 12 bits
+	void Itype(Bit7 opcode, Bit3 funct3, Bit5 rd, Bit5 rs1, int imm) {
+		if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
+		// shift the immediate as unsigned: left-shifting a negative int is undefined before C++20
+		append4B((static_cast<uint32_t>(imm)<<20) | (funct3.v<<12) | opcode.v | enc2(rd.v, rs1.v));
+	}
+	// S-type word: imm[11:5] at bit 25, imm[4:0] at bit 7; raises ERR_IMM_IS_TOO_BIG unless imm fits in signed 12 bits
+	void Stype(Bit7 opcode, Bit3 funct3, Bit5 rs1, Bit5 rs2, int imm) {
+		if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
+		const uint32_t uimm = static_cast<uint32_t>(imm); // unsigned: left-shifting a negative int is undefined before C++20
+		append4B(((uimm>>5)<<25) | (funct3.v<<12) | opcode.v | enc3(uimm & local::mask(5), rs1.v, rs2.v));
+	}
+	// U-type word: imm at bit 12, rd at bit 7; raises ERR_IMM_IS_TOO_BIG when imm needs more than 20 bits
+	void Utype(Bit7 opcode, Bit5 rd, uint32_t imm) {
+		if ((imm >> 20) != 0) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
+		const uint32_t word = (imm<<12) | (rd.v<<7) | opcode.v;
+		append4B(word);
+	}
+	// shift by immediate; range is the number of shamt bits (0 selects 5 when RV32 is set, else 6)
+	void opShift(Bit7 pre, Bit3 funct3, Bit7 opcode, Bit5 rd, Bit5 rs1, uint32_t shamt, int range = 0) {
+		const int bits = (range != 0) ? range : (isRV32_ ? 5 : 6);
+		if (shamt >= (1u << bits)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
+		const uint32_t word = (pre.v<<25) | (funct3.v<<12) | opcode.v | enc3(rd.v, rs1.v, shamt);
+		append4B(word);
+	}
+	void opAtomic(Bit5 rd, Bit5 rs2, Bit5 addr, Bit5 funct5, Bit3 funct3, uint32_t flag) {
+		assert(flag <= 3); // flag fills the two low bits of funct7, below funct5
+		const uint32_t funct7 = (funct5.v << 2) | flag;
+		Rtype(0x2f, funct3.v, funct7, rd, addr, rs2);
+	}
+	// one 32-bit word: baseValue OR-ed with vm (bit 25), vs2 (bit 20), vs1 (bit 15) and vd (bit 7)
+	void opIVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 vd) {
+		/*
+		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			  func6    vm      vs2       vs1        func3       vd     opcode
+
+			func6, func3, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (vd.v<<7) | (vs1.v<<15) | (vs2.v<<20) | (vm.v<<25);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// one 32-bit word: baseValue OR-ed with vm (bit 25), vs2 (bit 20), vs1 (bit 15) and d (bit 7)
+	void opFVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 d) {
+		/*
+		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			  func6    vm      vs2       vs1        func3     vd/rd    opcode
+
+			func6, func3, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (d.v<<7) | (vs1.v<<15) | (vs2.v<<20) | (vm.v<<25);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// one 32-bit word: baseValue OR-ed with vm (bit 25), vs2 (bit 20), vs1 (bit 15) and d (bit 7)
+	void opMVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 d) {
+		/*
+		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			  func6    vm      vs2       vs1        func3     vd/rd    opcode
+
+			func6, func3, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (d.v<<7) | (vs1.v<<15) | (vs2.v<<20) | (vm.v<<25);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// one 32-bit word: baseValue OR-ed with vm (bit 25), vs2 (bit 20), the low 5 bits of imm (bit 15) and vd (bit 7)
+	void opIVI(Bit32 baseValue, Bit1 vm, Bit5 vs2, uint32_t imm, Bit5 vd) {
+		/*
+		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			  func6    vm      vs2       imm       func3       vd     opcode
+
+			func6, func3, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t imm5 = imm & local::mask(5); // higher bits of imm are dropped without a range check
+		const uint32_t fields = (vd.v<<7) | (imm5<<15) | (vs2.v<<20) | (vm.v<<25);
+		append4B(fields | baseValue.v);
+	}
+	// one 32-bit word: baseValue OR-ed with vm (bit 25), vs2 (bit 20), rs1 (bit 15) and vd (bit 7)
+	void opIVX(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 vd) {
+		/*
+		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			  func6    vm      vs2       rs1        func3       vd     opcode
+
+			func6, func3, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (vd.v<<7) | (rs1.v<<15) | (vs2.v<<20) | (vm.v<<25);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// one 32-bit word: baseValue OR-ed with vm (bit 25), vs2 (bit 20), rs1 (bit 15) and vd (bit 7)
+	void opFVF(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 vd) {
+		/*
+		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			  func6    vm      vs2       rs1        func3       vd     opcode
+
+			func6, func3, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (vd.v<<7) | (rs1.v<<15) | (vs2.v<<20) | (vm.v<<25);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// one 32-bit word: baseValue OR-ed with vm (bit 25), vs2 (bit 20), rs1 (bit 15) and d (bit 7)
+	void opMVX(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 d) {
+		/*
+		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			  func6    vm      vs2       rs1        func3     vd/rd    opcode
+
+			func6, func3, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (d.v<<7) | (rs1.v<<15) | (vs2.v<<20) | (vm.v<<25);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// vector load word: baseValue OR-ed with vm (bit 25), rs2_vs2 (bit 20), rs1 (bit 15) and vd (bit 7)
+	void opVectorLoad(Bit32 baseValue, Bit1 vm, Bit5 rs2_vs2, Bit5 rs1, Bit5 vd) {
+		/*
+		    31 .. 29 | 28 | 27 .. 26 | 25 |     24 .. 20     | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			   nf      mew     mop     vm     lumop/rs2/vs2      rs1        width       vd     opcode
+
+			mew, mop, width, lumop, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (vd.v<<7) | (rs1.v<<15) | (rs2_vs2.v<<20) | (vm.v<<25);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// vector store word: baseValue OR-ed with vm (bit 25), rs2_vs2 (bit 20), rs1 (bit 15) and vs3 (bit 7)
+	void opVectorStore(Bit32 baseValue, Bit1 vm, Bit5 rs2_vs2, Bit5 rs1, Bit5 vs3) {
+		/*
+		    31 .. 29 | 28 | 27 .. 26 | 25 |     24 .. 20     | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			   nf      mew     mop     vm     sumop/rs2/vs2       rs1        width     vs3     opcode
+
+			mew, mop, width, sumop, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (vs3.v<<7) | (rs1.v<<15) | (rs2_vs2.v<<20) | (vm.v<<25);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// CSR access word: baseValue OR-ed with csr (bit 20), rs1_uimm (bit 15) and rd (bit 7)
+	void opCSR(Bit32 baseValue, Bit12 csr, Bit5 rs1_uimm, Bit5 rd) {
+		/*
+		    31 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			   csr     rs1_uimm     func3       rd     opcode
+
+			func3 and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (rd.v<<7) | (rs1_uimm.v<<15) | (csr.v<<20);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// FP load word: imm[11:0] at bit 20, rs1 at 15, rd at 7; raises ERR_IMM_IS_TOO_BIG unless imm fits in signed 12 bits
+	void opLoadFP(Bit32 baseValue, int imm, Bit5 rs1, Bit5 rd) {
+		if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
+		/*
+			31 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			imm[11:0]     rs1       width       rd      opcode
+
+			width and opcode must be encoded in the baseValue
+		*/
+		// shift the immediate as unsigned: left-shifting a negative int is undefined before C++20
+		const uint32_t v = (static_cast<uint32_t>(imm)<<20) | (rs1.v<<15) | (rd.v<<7);
+		append4B(v | baseValue.v);
+	}
+	// FP store word: imm split into [11:5] (bit 25) and [4:0] (bit 7); raises ERR_IMM_IS_TOO_BIG unless imm fits in signed 12 bits
+	void opStoreFP(Bit32 baseValue, int imm, Bit5 rs2, Bit5 rs1) {
+		if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
+		/*
+			31 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			imm[11:5]     rs2        rs1       width    imm[4:0]   opcode
+
+			width and opcode must be encoded in the baseValue
+		*/
+		const uint32_t immHi = imm & (local::mask(7)<<5); // imm[11:5], still sitting at bits 11:5
+		const uint32_t immLo = imm & local::mask(5); // imm[4:0]
+		const uint32_t fields = (immHi<<20) | (rs2.v<<20) | (rs1.v<<15) | (immLo<<7);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// FP computational word: baseValue OR-ed with rs2 (bit 20), rs1 (bit 15), rm (bit 12) and rd (bit 7)
+	void opFP(Bit32 baseValue, Bit5 rs2, Bit5 rs1, Bit3 rm, Bit5 rd) {
+		/*
+			31 .. 27 | 26 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			  func5       fmt        rs2        rs1        rm         rd      opcode
+
+			func5, fmt, and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (rd.v<<7) | (rm.v<<12) | (rs1.v<<15) | (rs2.v<<20);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	// four-register word: baseValue OR-ed with rs3 (bit 27), rs2 (bit 20), rs1 (bit 15), rm (bit 12) and rd (bit 7)
+	void opR4(Bit32 baseValue, Bit5 rs3, Bit5 rs2, Bit5 rs1, Bit3 rm, Bit5 rd) {
+		/*
+			31 .. 27 | 26 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
+			   rs3        fmt        rs2        rs1        rm         rd      opcode
+
+			fmt and opcode must be encoded in the baseValue
+		*/
+		const uint32_t fields = (rd.v<<7) | (rm.v<<12) | (rs1.v<<15) | (rs2.v<<20) | (rs3.v<<27);
+		// baseValue is OR-ed in as is
+		append4B(fields | baseValue.v);
+	}
+	bool isValiCidx(uint32_t idx) const { return 8 <= idx && idx < 16; } // true for register indices 8..15
+	// c_addi, c_addiw: returns false, emitting nothing, when no 16-bit form applies
+	bool c_addi_inner(const Reg& rd, const Reg& rs, uint32_t imm, uint32_t funct3) {
+		const uint32_t dst = rd.getIdx();
+		const uint32_t src = rs.getIdx();
+		// source index 0: try the c_li form (funct3 2, op 1) first
+		if (src == 0 && c_li(rd, imm, 2, 1)) return true;
+		if (dst == 0 || dst != src || !local::inSBit(imm, 6)) return false;
+		const uint32_t immBits = ((imm & (1<<5))<<7) | ((imm & 31)<<2); // imm[5] -> bit 12, imm[4:0] -> bits 6:2
+		append2B((funct3<<13) | immBits | (dst<<7) | 1);
+		return true;
+	}
+	bool c_addi16sp(const Reg& rd, const Reg& rs, uint32_t imm) { // 16-bit form that needs rd and rs to both be sp
+		if (rd != sp || rs != sp) return false;
+		if (imm == 0 || (imm % 16) != 0 || (496 < imm && imm < ~512u)) return false; // nonzero multiple of 16 outside (496, ~512u)
+		const uint32_t code = (3<<13) | (2<<7) | 1 | local::get9_z5_4_6_8to7_5_z2(imm);
+		append2B(code);
+		return true;
+	}
+	// c_li, c_slli: refused (false) for rd == x0 or an immediate outside signed 6 bits
+	bool c_li(const Reg& rd, uint32_t imm, uint32_t funct3, uint32_t op) {
+		const bool encodable = !(rd == x0) && local::inSBit(imm, 6);
+		if (!encodable) return false;
+		const uint32_t code = (funct3<<13) | (rd.getIdx() << 7) | op | local::get5_z5_4to0_z2(imm);
+		append2B(code);
+		return true;
+	}
+	bool c_lui(const Reg& rd, uint32_t imm) { // refused for rd x0/x2, imm 0, or imm in [32, (1<<20)-32)
+		if (rd == x0 || rd == x2) return false;
+		if (imm == 0 || (32 <= imm && imm < (1<<20)-32)) return false;
+		const uint32_t code = (3<<13) | (rd.getIdx()<<7) | 1 | local::get5_z5_4to0_z2(imm);
+		append2B(code);
+		return true;
+	}
+	// try the 16-bit forms in turn: c_mv (imm == 0), c_addi_inner, c_addi16sp, then c.addi4spn; false if none fits
+	bool c_addi(const Reg& rd, const Reg& rs, uint32_t imm) {
+		if (imm == 0 && c_mv(rd, rs, 0)) return true;
+		if (c_addi_inner(rd, rs, imm, 0) || c_addi16sp(rd, rs, imm)) return true;
+		// c.addi4spn(rd, imm) = c.addi(rd, x2, imm)
+		const uint32_t dst = rd.getIdx();
+		const bool immOk = imm != 0 && (imm % 4) == 0 && imm < 1024;
+		if (rs != sp || !isValiCidx(dst) || !immOk) return false;
+		const uint32_t code = ((dst-8)<<2) | local::get5to4_9to6_2_3_z5(imm);
+		append2B(code);
+		return true;
+	}
+	uint32_t creg2(uint32_t a, uint32_t b) { return ((a-8)<<7) | ((b-8)<<2); } // a-8 at bit 7, b-8 at bit 2; callers check both with isValiCidx first
+	// c_lw, c_sw: both register indices in 8..15 and imm a multiple of 4 in [0, 128); otherwise false
+	bool c_lsw(const Reg& rd, const Reg& rs, int imm, uint32_t funct3) {
+		const uint32_t rdIdx = rd.getIdx();
+		const uint32_t rsIdx = rs.getIdx();
+		const bool immOk = imm >= 0 && imm < (1 << 7) && (imm % 4) == 0;
+		if (!isValiCidx(rdIdx) || !isValiCidx(rsIdx) || !immOk) return false;
+		const uint32_t code = (funct3<<13) | creg2(rsIdx, rdIdx) | local::get5to3_z3_2_6_z5(imm);
+		append2B(code);
+		return true;
+	}
+	// c_ld, c_sd: both register indices in 8..15 and imm a multiple of 8 in [0, 256); otherwise false
+	bool c_lsd(const Reg& rd, const Reg& rs, int imm, uint32_t funct3) {
+		const uint32_t rdIdx = rd.getIdx();
+		const uint32_t rsIdx = rs.getIdx();
+		const bool immOk = imm >= 0 && imm < (1 << 8) && (imm % 8) == 0;
+		if (!isValiCidx(rdIdx) || !isValiCidx(rsIdx) || !immOk) return false;
+		const uint32_t code = (funct3<<13) | creg2(rsIdx, rdIdx) | local::get5to3_z3_7_6_z5(imm);
+		append2B(code);
+		return true;
+	}
+	// c_srli, c_srai, c_andi; NOTE(review): only imm >= 64 is rejected - confirm callers never pass a negative imm (or 32..63 for c_andi)
+	bool c_srli(const Reg& rd, const Reg& rs, int imm, uint32_t funct2, bool allowImm0 = false) {
+		const uint32_t idx = rd.getIdx();
+		const uint32_t srcIdx = rs.getIdx();
+		if (idx != srcIdx || !isValiCidx(idx)) return false;
+		if ((imm == 0 && !allowImm0) || imm >= (1 << 6)) return false;
+		const uint32_t code = (4<<13) | (funct2<<10) | ((idx-8)<<7) | local::get5_z5_4to0_z2(imm) | 1;
+		append2B(code);
+		return true;
+	}
+	// c_sub, c_xor, c_or, c_and, c_subw: needs rd == rs1 and the indices of rd and rs2 in 8..15
+	// the `funct3` argument is placed at bit 10 (the mnemonic table passes 0x23 / 0x27 here)
+	bool c_noimm(const Reg& rd, const Reg& rs1, const Reg& rs2, uint32_t funct3, uint32_t funct2) {
+		const uint32_t dst = rd.getIdx();
+		const uint32_t src = rs2.getIdx();
+		if (dst != rs1.getIdx()) return false;
+		if (!isValiCidx(dst) || !isValiCidx(src)) return false;
+		const uint32_t code = (funct3<<10) | (funct2<<5) | ((dst-8)<<7) | ((src-8)<<2) | 1;
+		append2B(code);
+		return true;
+	}
+	// c_lwsp, c_flwsp: addr must be sp, imm a multiple of 4 with no bits above bit 7; NOTE(review): rd == x0 is not rejected here
+	bool c_lwsp(const Reg& rd, const Reg& addr, int imm, uint32_t funct3) {
+		if (addr != sp) return false;
+		if ((imm % 4) != 0 || (imm >> 8) != 0) return false;
+		const uint32_t rdField = rd.getIdx() << 7;
+		const uint32_t code = (funct3<<13) | rdField | local::get5_z5_4to2_7to6_z2(imm) | 2;
+		append2B(code);
+		return true;
+	}
+	// c_ldsp: addr must be sp, imm a multiple of 8 with no bits above bit 8; NOTE(review): rd == x0 is not rejected here
+	bool c_ldsp(const Reg& rd, const Reg& addr, int imm, uint32_t funct3) {
+		if (addr != sp) return false;
+		if ((imm % 8) != 0 || (imm >> 9) != 0) return false;
+		const uint32_t rdField = rd.getIdx() << 7;
+		const uint32_t code = (funct3<<13) | rdField | local::get5_z5_4to3_8to6_z2(imm) | 2;
+		append2B(code);
+		return true;
+	}
+	// c.mv (funct1 0), c.add (funct1 1): refused when either register is x0
+	bool c_mv(const Reg& rd, const Reg& rs, uint32_t funct1) {
+		if (rd == x0) return false;
+		if (rs == x0) return false;
+		const uint32_t code = (4<<13) | (funct1<<12) | (rd.getIdx()<<7) | (rs.getIdx()<<2) | 2;
+		append2B(code);
+		return true;
+	}
+	bool c_swsp(const Reg& rs, const Reg& addr, int imm, uint32_t funct3) { // addr must be sp, imm a multiple of 4 below 256
+		if (addr != sp) return false;
+		if ((imm % 4) != 0 || (imm >> 8) != 0) return false;
+		const uint32_t code = (funct3<<13) | (rs.getIdx()<<2) | local::get5to2_7to6_z7(imm) | 2;
+		append2B(code);
+		return true;
+	}
+	bool c_sdsp(const Reg& rs, const Reg& addr, int imm, uint32_t funct3) { // addr must be sp, imm a multiple of 8 below 512
+		if (addr != sp) return false;
+		if ((imm % 8) != 0 || (imm >> 9) != 0) return false;
+		const uint32_t code = (funct3<<13) | (rs.getIdx()<<2) | local::get5to3_8to6_z7(imm) | 2;
+		append2B(code);
+		return true;
+	}
+public:
+	void L(Label& label) { labelMgr_.defineClabel(label); } // define label through LabelManager::defineClabel
+	Label L() { Label label; L(label); return label; } // convenience: create a label, define it with L(label), return it by value
+	/*
+		assign src to dst (forwards to LabelManager::assign)
+		requirements:
+		dst : not yet defined by L()
+		src : already defined by L()
+	*/
+	void assignL(Label& dst, const Label& src) { labelMgr_.assign(dst, src); }
+	/*
+		emit a reference to label through opJmp(), using a Jmp built from the current position
+		NOTE(review): the original note says this puts the label's absolute address into the buffer,
+		4 bytes (32-bit) or 8 bytes (64-bit) - confirm against Jmp's one-argument constructor
+	*/
+	void putL(const Label &label) {
+		const Jmp here(getCurr());
+		opJmp(label, here);
+	}
+
+	// starts with XLEN 64, RV32 off and RVC off; the label manager is pointed at this object
+	CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = DontSetProtectRWE, Allocator *allocator = 0)
+		: CodeArray(maxSize, userPtr, allocator),
+		  XLEN_(64),
+		  isRV32_(false),
+		  supportRVC_(false)
+	{
+		labelMgr_.set(this);
+	}
+	void reset()
+	{
+		ClearError();
+		resetSize();
+		labelMgr_.reset();
+		labelMgr_.set(this);
+		XLEN_ = 64;
+		isRV32_ = false;
+		supportRVC_ = false;
+	}
+	// on: RV32 with XLEN 32; off: XLEN 64
+	void setRV32(bool on = true) {
+		XLEN_ = on ? 32 : 64;
+		isRV32_ = on;
+	}
+	// set the flag that lets the mnemonic emitters try their 16-bit c_* forms first
+	void supportRVC(bool on = true) {
+		supportRVC_ = on;
+	}
+	bool hasUndefinedLabel() const { return labelMgr_.hasUndefClabel(); } // true while a referenced label is still waiting for its definition
+	// flush the instruction cache for the n bytes starting at p (platform call chosen at compile time)
+	static inline void clearCache(void *p, size_t n) {
+#ifdef _WIN32
+		FlushInstructionCache(GetCurrentProcess(), p, n);
+#elif defined(__APPLE__)
+		sys_icache_invalidate(p, n);
+#else
+		__builtin___clear_cache((char *)p, (char *)p + n);
+#endif
+	}
+	/*
+		finish code generation:
+		raises ERR_LABEL_IS_NOT_FOUND while a referenced label is still undefined,
+		then applies `mode` when useProtect() is set and calls clearCache() on the generated code
+	*/
+	void ready(ProtectMode mode = PROTECT_RWE) {
+		if (labelMgr_.hasUndefClabel()) XBYAK_RISCV_THROW(ERR_LABEL_IS_NOT_FOUND)
+		if (useProtect()) setProtectMode(mode);
+		clearCache(top_, size_);
+	}
+	// ready() with PROTECT_RE (read/exec)
+	void readyRE() { return ready(PROTECT_RE); }
+
+	// call nop() (x - remain) / 4 times, remain being the current address modulo x; x must be 1 or a power of two >= 4
+	void align(size_t x) {
+		if (x == 1) return;
+		if (x < 4 || (x & (x - 1))) XBYAK_RISCV_THROW(ERR_BAD_ALIGN)
+		const size_t remain = size_t(getCurr()) % x;
+		if (remain % 4) XBYAK_RISCV_THROW(ERR_INTERNAL)
+		if (remain == 0) return;
+		const size_t padCount = (x - remain) / 4;
+		for (size_t i = 0; i < padCount; ++i) {
+			nop();
+		}
+	}
+
+#include "xbyak_riscv_mnemonic.hpp"
+#if defined(XBYAK_RISCV_V) && XBYAK_RISCV_V == 1
+#include "xbyak_riscv_v.hpp"
+#endif
+};
+
+#ifdef _MSC_VER
+	#pragma warning(pop)
+#endif
+} // Xbyak_riscv
+
diff --git a/third_party/xbyak_riscv/xbyak_riscv_csr.hpp b/third_party/xbyak_riscv/xbyak_riscv_csr.hpp
new file mode 100644
index 00000000000..5f04ed441a1
--- /dev/null
+++ b/third_party/xbyak_riscv/xbyak_riscv_csr.hpp
@@ -0,0 +1,112 @@
+/******************************************************************************
+* Copyright (C), 2023, KNS Group LLC (YADRO)
+*
+* Licensed under the 3-Clause BSD License
+* You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/
+*******************************************************************************/
+
+#pragma once
+namespace Xbyak_riscv {
+
+// Control and Status Register numbers
+enum class CSR : uint32_t {
+    // FP CSRs
+    fflags = 0x001, // Floating-Point Accrued Exceptions
+    frm    = 0x002, // Floating-Point Dynamic Rounding Mode
+    fcsr   = 0x003, // Floating-Point Control and Status register
+    // vector CSRs
+    vstart = 0x008, // Vector start position
+    vxsat  = 0x009, // Fixed-Point Saturate Flag
+    vxrm   = 0x00A, // Fixed-Point Rounding Mode
+    vcsr   = 0x00F, // Vector control and status register
+    vl     = 0xC20, // Vector length
+    vtype  = 0xC21, // Vector data type register
+    vlenb  = 0xC22, // VLEN/8 (vector register length in bytes)
+};
+
+
+// Selected Element Width
+enum class SEW : uint32_t {
+    e8  = 0x0,
+    e16 = 0x1,
+    e32 = 0x2,
+    e64 = 0x3
+};
+
+// Vector Length Multiplier
+enum class LMUL : uint32_t {
+    mf8 = 0x5,
+    mf4 = 0x6,
+    mf2 = 0x7,
+    m1  = 0x0,
+    m2  = 0x1,
+    m4  = 0x2,
+    m8  = 0x3
+};
+
+// Vector Mask Agnostic
+enum class VMA : uint32_t {
+    mu = 0, // undisturbed
+    ma = 1, // agnostic
+};
+
+// Vector Tail Agnostic
+enum class VTA : uint32_t {
+    tu = 0, // undisturbed
+    ta = 1, // agnostic
+};
+
+enum class VectorAddressingMode : uint32_t {
+    unitStride       = 0x0,
+    indexedUnordered = 0x1,
+    strided          = 0x2,
+    indexedOrdered   = 0x3
+    // other encodings are reserved
+};
+
+enum class UnitStrideVectorAddressingModeLoad : uint32_t {
+    load              = 0x0, // unit-stride load
+    wholeRegisterLoad = 0x8, // unit-stride, whole register load
+    maskLoad          = 0xb, // unit-stride, mask load, EEW=8
+    faultOnlyFirst    = 0x10  // unit-stride fault-only-first
+    // other encodings are reserved
+};
+
+enum class UnitStrideVectorAddressingModeStore : uint32_t {
+    store              = 0x0, // unit-stride store
+    wholeRegisterStore = 0x8, // unit-stride, whole register store
+    maskStore          = 0xb  // unit-stride, mask store, EEW=8
+    // other encodings are reserved
+};
+
+enum class WidthEncoding : uint32_t {
+    e8  = 0x0, // Vector 8-bit  element
+    e16 = 0x5, // Vector 16-bit element
+    e32 = 0x6, // Vector 32-bit element
+    e64 = 0x7, // Vector 64-bit element
+};
+
+enum class VM : uint32_t { // one-bit value; NOTE(review): presumably what the vector encoders place in the vm field - confirm
+    unmasked = 1,
+    masked = 0
+};
+
+enum class RM : uint32_t {
+    rne = 0x0, // Round to Nearest, ties to Even
+    rtz = 0x1, // Round towards Zero
+    rdn = 0x2, // Round Down (towards -infinity)
+    rup = 0x3, // Round Up (towards + infinity)
+    rmm = 0x4, // Round to Nearest, ties to Max Magnitude
+    dyn = 0x7  // In an instruction's rm field, selects dynamic rounding mode;
+               // In Rounding Mode register, reserved.
+};
+
+enum class FFlags : uint32_t { // one bit per flag
+    NV = 0x01, // Invalid Operation
+    DZ = 0x02, // Divide by Zero
+    OF = 0x04, // Overflow
+    UF = 0x08, // Underflow
+    NX = 0x10  // Inexact
+};
+
+} // Xbyak_riscv
diff --git a/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp b/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp
new file mode 100644
index 00000000000..b050d46cc75
--- /dev/null
+++ b/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp
@@ -0,0 +1,231 @@
+const char *getVersionString() const { return "1.01"; } // returns a fixed version string literal
+void add(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && rd == rs1 && c_mv(rd, rs2, 1)) return; Rtype(0x33, 0, 0x0, rd, rs1, rs2); } // register-register integer ops; when supportRVC_ is set a c_* helper is tried first and Rtype is skipped if it returns true
+void sub(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 0)) return; Rtype(0x33, 0, 0x20, rd, rs1, rs2); }
+void sll(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 1, 0x0, rd, rs1, rs2); }
+void slt(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 2, 0x0, rd, rs1, rs2); }
+void sltu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 3, 0x0, rd, rs1, rs2); }
+void xor_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 1)) return; Rtype(0x33, 4, 0x0, rd, rs1, rs2); }
+void srl(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x0, rd, rs1, rs2); }
+void sra(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x20, rd, rs1, rs2); }
+void or_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 2)) return; Rtype(0x33, 6, 0x0, rd, rs1, rs2); }
+void and_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 3)) return; Rtype(0x33, 7, 0x0, rd, rs1, rs2); }
+void addw(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x27, 1)) return; Rtype(0x3b, 0, 0x0, rd, rs1, rs2); } // the *w forms use 0x3b instead of 0x33 as the first Rtype argument
+void subw(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x27, 0)) return; Rtype(0x3b, 0, 0x20, rd, rs1, rs2); }
+void sllw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 1, 0x0, rd, rs1, rs2); }
+void srlw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 5, 0x0, rd, rs1, rs2); }
+void sraw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 5, 0x20, rd, rs1, rs2); }
+void mul(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 0, 0x1, rd, rs1, rs2); } // multiply/divide/remainder ops: third Rtype argument is 0x1
+void mulh(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 1, 0x1, rd, rs1, rs2); }
+void mulhsu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 2, 0x1, rd, rs1, rs2); }
+void mulhu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 3, 0x1, rd, rs1, rs2); }
+void div(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 4, 0x1, rd, rs1, rs2); }
+void divu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x1, rd, rs1, rs2); }
+void rem(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 6, 0x1, rd, rs1, rs2); }
+void remu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 7, 0x1, rd, rs1, rs2); }
+void mulw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 0, 0x1, rd, rs1, rs2); }
+void divw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 4, 0x1, rd, rs1, rs2); } // NOTE(review): no divuw wrapper is defined next to divw/remw/remuw — confirm that this is intentional
+void remw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 6, 0x1, rd, rs1, rs2); }
+void remuw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 7, 0x1, rd, rs1, rs2); }
+void addi(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_addi(rd, rs1, imm)) return; Itype(0x13, 0, rd, rs1, imm); } // register-immediate integer ops, emitted through Itype unless a c_* helper succeeds first
+void slti(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 2, rd, rs1, imm); }
+void sltiu(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 3, rd, rs1, imm); }
+void xori(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 4, rd, rs1, imm); }
+void ori(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 6, rd, rs1, imm); }
+void andi(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_srli(rd, rs1, imm, 2, true)) return; Itype(0x13, 7, rd, rs1, imm); }
+void addiw(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_addi_inner(rd, rs1, imm, 1)) return; Itype(0x1b, 0, rd, rs1, imm); }
+// jalr and the load-ops share the (rd, addr, imm) signature; a load-op is written `op rd, imm(addr)` and sets rd = addr[imm]; imm defaults to 0
+void jalr(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x67, 0, rd, addr, imm); }
+void lb(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 0, rd, addr, imm); }
+void lh(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 1, rd, addr, imm); }
+void lw(const Reg& rd, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_lwsp(rd, addr, imm, 2) || c_lsw(rd, addr, imm, 2))) return; Itype(0x3, 2, rd, addr, imm); }
+void lbu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 4, rd, addr, imm); }
+void lhu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 5, rd, addr, imm); }
+void lwu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 6, rd, addr, imm); }
+void ld(const Reg& rd, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_ldsp(rd, addr, imm, 3) || c_lsd(rd, addr, imm, 3))) return; Itype(0x3, 3, rd, addr, imm); }
+void auipc(const Reg& rd, uint32_t imm) { Utype(0x17, rd, imm); } // upper-immediate ops go through Utype
+void lui(const Reg& rd, uint32_t imm) { if (supportRVC_ && c_lui(rd, imm)) return; Utype(0x37, rd, imm); }
+void slli(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && rd == rs1 && shamt != 0 && c_li(rd, shamt, 0, 2)) return; opShift(0x0, 1, 0x13, rd, rs1, shamt); } // shift-by-immediate ops go through opShift
+void srli(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && c_srli(rd, rs1, shamt, 0)) return; opShift(0x0, 5, 0x13, rd, rs1, shamt); }
+void srai(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && c_srli(rd, rs1, shamt, 1)) return; opShift(0x20, 5, 0x13, rd, rs1, shamt); }
+void slliw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x0, 1, 0x1b, rd, rs1, shamt, 5); } // the *iw forms pass an extra 5 to opShift (presumably the shamt bit width — TODO confirm)
+void srliw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x0, 5, 0x1b, rd, rs1, shamt, 5); }
+void sraiw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x20, 5, 0x1b, rd, rs1, shamt, 5); }
+void fence_rw_rw() { append4B(0x330000f); } // fences and ecall append a fixed 32-bit word
+void fence_tso() { append4B(0x8330000f); }
+void fence_rw_w() { append4B(0x310000f); }
+void fence_r_rw() { append4B(0x230000f); }
+void fence_r_r() { append4B(0x220000f); }
+void fence_w_w() { append4B(0x110000f); }
+void fence_i() { append4B(0x100f); }
+void ecall() { append4B(0x73); }
+void ebreak() { if (supportRVC_) append2B(0x9002); else append4B(0x00100073); } // 2-byte word when supportRVC_ is set, otherwise the 4-byte word
+// store-op rs, imm(addr) ; addr[imm] = rs; imm defaults to 0. Note that Stype receives addr before rs.
+void sb(const Reg& rs, const Reg& addr, int imm = 0) { Stype(0x23, 0, addr, rs, imm); }
+void sh(const Reg& rs, const Reg& addr, int imm = 0) { Stype(0x23, 1, addr, rs, imm); }
+void sw(const Reg& rs, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_swsp(rs, addr, imm, 6) || c_lsw(rs, addr, imm, 6))) return; Stype(0x23, 2, addr, rs, imm); }
+void sd(const Reg& rs, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_sdsp(rs, addr, imm, 7) || c_lsd(rs, addr, imm, 7))) return; Stype(0x23, 3, addr, rs, imm); }
+void beq(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 0, rs1, rs2); opJmp(label, jmp); } // base branches: build a Jmp at the current position and hand it to opJmp with the target label
+void bne(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 1, rs1, rs2); opJmp(label, jmp); }
+void blt(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 4, rs1, rs2); opJmp(label, jmp); }
+void bge(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 5, rs1, rs2); opJmp(label, jmp); }
+void bltu(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 6, rs1, rs2); opJmp(label, jmp); }
+void bgeu(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 7, rs1, rs2); opJmp(label, jmp); }
+void beqz(const Reg& rs, const Label& label) { beq(rs, x0, label); } // pseudo-branches from here on: a base branch with x0 as one operand or with the two operands swapped
+void bnez(const Reg& rs, const Label& label) { bne(rs, x0, label); }
+void blez(const Reg& rs, const Label& label) { bge(x0, rs, label); }
+void bgez(const Reg& rs, const Label& label) { bge(rs, x0, label); }
+void bltz(const Reg& rs, const Label& label) { blt(rs, x0, label); }
+void bgtz(const Reg& rs, const Label& label) { blt(x0, rs, label); }
+void bgt(const Reg& rs, const Reg& rt, const Label& label) { blt(rt, rs, label); }
+void ble(const Reg& rs, const Reg& rt, const Label& label) { bge(rt, rs, label); }
+void bgtu(const Reg& rs, const Reg& rt, const Label& label) { bltu(rt, rs, label); }
+void bleu(const Reg& rs, const Reg& rt, const Label& label) { bgeu(rt, rs, label); }
+// sc.* and amo*: op rd, rs2, (addr); _w/_d variants pass 2/3 to opAtomic; flag (default 0) is forwarded unchanged
+void sc_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x3, 2, flag); }
+void sc_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x3, 3, flag); }
+void amoswap_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1, 2, flag); }
+void amoswap_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1, 3, flag); }
+void amoadd_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x0, 2, flag); }
+void amoadd_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x0, 3, flag); }
+void amoxor_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x4, 2, flag); }
+void amoxor_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x4, 3, flag); }
+void amoand_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0xc, 2, flag); }
+void amoand_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0xc, 3, flag); }
+void amoor_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x8, 2, flag); }
+void amoor_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x8, 3, flag); }
+void amomin_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x10, 2, flag); }
+void amomin_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x10, 3, flag); }
+void amomax_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x14, 2, flag); }
+void amomax_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x14, 3, flag); }
+void amominu_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x18, 2, flag); }
+void amominu_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x18, 3, flag); }
+void amomaxu_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1c, 2, flag); }
+void amomaxu_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1c, 3, flag); }
+void csrrw(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x1073, csr, rs1, rd); } // csrrw/csrrs/csrrc take a source register rs1
+void csrrs(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x2073, csr, rs1, rd); }
+void csrrc(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x3073, csr, rs1, rd); }
+void csrrwi(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x5073, csr, imm, rd); } // the *i forms pass an immediate to opCSR in place of rs1
+void csrrsi(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x6073, csr, imm, rd); }
+void csrrci(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x7073, csr, imm, rd); }
+void fadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x53, rs2, rs1, rm, rd); } // single-precision ops; opFP takes (base word, rs2, rs1, rm, rd); rm defaults to RM::dyn where it is a parameter
+void fclass_s(const Reg& rd, const FReg& rs1) { opFP(0xe0001053, 0, rs1, 0, rd); } // ops without an rs2 or rm parameter pass 0 in that position
+void fcvt_s_w(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0000053, 0, rs1, rm, rd); }
+void fcvt_s_wu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0100053, 0, rs1, rm, rd); }
+void fcvt_w_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0000053, 0, rs1, rm, rd); }
+void fcvt_wu_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0100053, 0, rs1, rm, rd); }
+void fdiv_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x18000053, rs2, rs1, rm, rd); }
+void feq_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0002053, rs2, rs1, 0, rd); }
+void fle_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0000053, rs2, rs1, 0, rd); }
+void flt_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0001053, rs2, rs1, 0, rd); }
+void fmax_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x28001053, rs2, rs1, 0, rd); }
+void fmin_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x28000053, rs2, rs1, 0, rd); }
+void fmul_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x10000053, rs2, rs1, rm, rd); }
+void fmv_w_x(const FReg& rd, const Reg& rs1) { opFP(0xf0000053, 0, rs1, 0, rd); }
+void fmv_x_w(const Reg& rd, const FReg& rs1) { opFP(0xe0000053, 0, rs1, 0, rd); }
+void fsgnj_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20000053, rs2, rs1, 0, rd); }
+void fsgnjn_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20001053, rs2, rs1, 0, rd); }
+void fsgnjx_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20002053, rs2, rs1, 0, rd); }
+void fsqrt_s(const FReg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0x58000053, 0, rs1, rm, rd); }
+void fsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x8000053, rs2, rs1, rm, rd); }
+void fcvt_l_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0200053, 0, rs1, rm, rd); }
+void fcvt_lu_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0300053, 0, rs1, rm, rd); }
+void fcvt_s_l(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0200053, 0, rs1, rm, rd); }
+void fcvt_s_lu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0300053, 0, rs1, rm, rd); }
+void fadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x4000053, rs2, rs1, rm, rd); } // half-precision ops, same shape as the _s group: opFP(base word, rs2, rs1, rm, rd)
+void fclass_h(const Reg& rd, const FReg& rs1) { opFP(0xe4001053, 0, rs1, 0, rd); }
+void fcvt_h_s(const Reg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0x44000053, 0, rs1, rm, rd); } // NOTE(review): rd/rs1 are declared as integer Reg although the neighbouring *_h ops use FReg for floating-point operands — confirm
+void fcvt_h_w(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4000053, 0, rs1, rm, rd); }
+void fcvt_h_wu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4100053, 0, rs1, rm, rd); }
+void fcvt_s_h(const Reg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0x40200053, 0, rs1, rm, rd); } // NOTE(review): same Reg-vs-FReg question as fcvt_h_s — confirm
+void fcvt_w_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4000053, 0, rs1, rm, rd); }
+void fcvt_wu_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4100053, 0, rs1, rm, rd); }
+void fdiv_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x1c000053, rs2, rs1, rm, rd); }
+void feq_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4002053, rs2, rs1, 0, rd); }
+void fle_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4000053, rs2, rs1, 0, rd); }
+void flt_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4001053, rs2, rs1, 0, rd); }
+void fmax_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x2c001053, rs2, rs1, 0, rd); }
+void fmin_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x2c000053, rs2, rs1, 0, rd); }
+void fmul_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x14000053, rs2, rs1, rm, rd); }
+void fmv_h_x(const FReg& rd, const Reg& rs1) { opFP(0xf4000053, 0, rs1, 0, rd); }
+void fmv_x_h(const Reg& rd, const FReg& rs1) { opFP(0xe4000053, 0, rs1, 0, rd); }
+void fsgnj_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24000053, rs2, rs1, 0, rd); }
+void fsgnjn_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24001053, rs2, rs1, 0, rd); }
+void fsgnjx_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24002053, rs2, rs1, 0, rd); }
+void fsqrt_h(const FReg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0x5c000053, 0, rs1, rm, rd); }
+void fsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0xc000053, rs2, rs1, rm, rd); }
+void fcvt_h_l(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4200053, 0, rs1, rm, rd); }
+void fcvt_h_lu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4300053, 0, rs1, rm, rd); }
+void fcvt_l_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4200053, 0, rs1, rm, rd); }
+void fcvt_lu_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4300053, 0, rs1, rm, rd); }
+
+void fmadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x43, rs3, rs2, rs1, rm, rd); } // four-operand ops: opR4 receives the sources in reverse order (rs3, rs2, rs1), then rm and rd
+void fmsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x47, rs3, rs2, rs1, rm, rd); }
+void fnmsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4b, rs3, rs2, rs1, rm, rd); }
+void fnmadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4f, rs3, rs2, rs1, rm, rd); }
+
+void fmadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4000043, rs3, rs2, rs1, rm, rd); } // _h variants: same base words as the _s group with 0x4000000 added
+void fmsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4000047, rs3, rs2, rs1, rm, rd); }
+void fnmsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x400004b, rs3, rs2, rs1, rm, rd); }
+void fnmadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x400004f, rs3, rs2, rs1, rm, rd); }
+
+
+void flq(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x4007, imm12, rs1, rd); } // FP loads: rd is the FP destination, rs1 the integer base register, imm12 the offset (default 0)
+void fsq(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x4027, imm12, rs2, rs1); } // FP stores: rs2 is the FP source, rs1 the integer base register
+void fld(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x3007, imm12, rs1, rd); }
+void fsd(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x3027, imm12, rs2, rs1); }
+void flw(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x2007, imm12, rs1, rd); }
+void fsw(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x2027, imm12, rs2, rs1); }
+void flh(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x1007, imm12, rs1, rd); }
+void fsh(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x1027, imm12, rs2, rs1); }
+
+
+void nop() { if (supportRVC_) { append2B(0x0001); return; } addi(x0, x0, 0); } // 2-byte word 0x0001 when supportRVC_ is set, otherwise addi x0, x0, 0
+void li(const Reg& rd, uint32_t imm)
+{
+	const bool luiOnly = imm != 0 && (imm & local::mask(12)) == 0; // non-zero with the low 12 bits clear
+	if (luiOnly) {
+		lui(rd, uint32_t(imm >> 12));
+		return;
+	}
+	int upper, lower;
+	const bool wasSplit = local::split32bit(&upper, &lower, imm);
+	if (!wasSplit) { // split32bit returned false: a single addi from the zero register is emitted
+		addi(rd, zero, imm);
+		return;
+	}
+	lui(rd, upper);
+	// second half of the pair: addi when isRV32_ is set, addiw otherwise
+	if (!isRV32_) { addiw(rd, rd, lower); return; }
+	addi(rd, rd, lower);
+}
+void mv(const Reg& rd, const Reg& rs) { addi(rd, rs, 0); } // pseudo-instructions: each one forwards to the base mnemonic(s) shown in its body
+void not_(const Reg& rd, const Reg& rs) { xori(rd, rs, -1); }
+void neg(const Reg& rd, const Reg& rs) { sub(rd, x0, rs); }
+void negw(const Reg& rd, const Reg& rs) { subw(rd, x0, rs); }
+void sext_b(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 8); srai(rd, rd, XLEN_ - 8); } // shift left, then arithmetic shift right by the same amount
+void sext_h(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 16); srai(rd, rd, XLEN_ - 16); }
+void sext_w(const Reg& rd, const Reg& rs) { addiw(rd, rs, 0); }
+void zext_b(const Reg& rd, const Reg& rs) { andi(rd, rs, 255); }
+void zext_h(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 16); srli(rd, rd, XLEN_ - 16); } // shift left, then logical shift right by the same amount
+void zext_w(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 32); srli(rd, rd, XLEN_ - 32); }
+void seqz(const Reg& rd, const Reg& rs) { sltiu(rd, rs, 1); }
+void snez(const Reg& rd, const Reg& rs) { sltu(rd, x0, rs); }
+void sltz(const Reg& rd, const Reg& rs) { slt(rd, rs, x0); }
+void sgtz(const Reg& rd, const Reg& rs) { slt(rd, x0, rs); }
+void fence() { append4B(0x0ff0000f); }
+void j_(const Label& label) { jal(x0, label); }
+void jal(const Reg& rd, const Label& label) { Jmp jmp(getCurr(), 0x6f, rd); opJmp(label, jmp); }
+void jr(const Reg& rs) { jalr(x0, rs, 0); }
+void jalr(const Reg& rs) { jalr(x1, rs, 0); } // single-operand overload: uses x1 as rd and offset 0
+void ret() { jalr(x0, x1); }
+// lr rd, (addr): passes 0 in the rs2 position of opAtomic; flag (default 0) is forwarded unchanged
+void lr_w(const Reg& rd, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, 0, addr, 2, 2, flag); }
+void lr_d(const Reg& rd, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, 0, addr, 2, 3, flag); }
+void csrr(const Reg& rd, CSR csr) { csrrs(rd, csr, x0); } // CSR pseudo-instructions: the csrr* forms with x0 as the unused source or destination
+void csrw(CSR csr, const Reg& rs) { csrrw(x0, csr, rs); }
+void csrs(CSR csr, const Reg& rs) { csrrs(x0, csr, rs); }
+void csrc(CSR csr, const Reg& rs) { csrrc(x0, csr, rs); }
+void csrwi(CSR csr, uint32_t imm) { csrrwi(x0, csr, imm); }
+void csrsi(CSR csr, uint32_t imm) { csrrsi(x0, csr, imm); }
+void csrci(CSR csr, uint32_t imm) { csrrci(x0, csr, imm); }
+
diff --git a/third_party/xbyak_riscv/xbyak_riscv_util.hpp b/third_party/xbyak_riscv/xbyak_riscv_util.hpp
new file mode 100644
index 00000000000..6fdeab13b0e
--- /dev/null
+++ b/third_party/xbyak_riscv/xbyak_riscv_util.hpp
@@ -0,0 +1,271 @@
+/******************************************************************************
+* Copyright (C), 2023, KNS Group LLC (YADRO)
+*
+* Licensed under the 3-Clause BSD License
+* You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/
+*******************************************************************************/
+
+#pragma once
+
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include "xbyak_riscv_csr.hpp"
+#include "xbyak_riscv.hpp"
+
+#if defined(__linux__) && defined(__riscv)
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <sys/utsname.h>
+#include <asm/hwcap.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+
+namespace Xbyak_riscv {
+
+// Legacy HWCAP bit masks, one bit per single-letter ISA extension (bit index = letter - 'A'); each is defined only if not already defined
+#ifndef COMPAT_HWCAP_ISA_I
+#define COMPAT_HWCAP_ISA_I  (1U << ('I' - 'A'))
+#endif
+
+#ifndef COMPAT_HWCAP_ISA_M
+#define COMPAT_HWCAP_ISA_M  (1U << ('M' - 'A'))
+#endif
+
+#ifndef COMPAT_HWCAP_ISA_A
+#define COMPAT_HWCAP_ISA_A  (1U << ('A' - 'A'))
+#endif
+
+#ifndef COMPAT_HWCAP_ISA_F
+#define COMPAT_HWCAP_ISA_F  (1U << ('F' - 'A'))
+#endif
+
+#ifndef COMPAT_HWCAP_ISA_D
+#define COMPAT_HWCAP_ISA_D  (1U << ('D' - 'A'))
+#endif
+
+#ifndef COMPAT_HWCAP_ISA_C
+#define COMPAT_HWCAP_ISA_C  (1U << ('C' - 'A'))
+#endif
+
+#ifndef COMPAT_HWCAP_ISA_V
+#define COMPAT_HWCAP_ISA_V  (1U << ('V' - 'A'))
+#endif
+
+#if defined(__linux__) && defined(__riscv)
+// Fallback definitions for the riscv_hwprobe syscall (Linux 6.4+); each macro is defined only if not already defined
+#ifndef __NR_riscv_hwprobe
+#define __NR_riscv_hwprobe 258
+#endif
+
+#ifndef RISCV_HWPROBE_KEY_IMA_EXT_0
+#define RISCV_HWPROBE_KEY_IMA_EXT_0 4
+#endif
+
+#ifndef RISCV_HWPROBE_IMA_V
+#define RISCV_HWPROBE_IMA_V (1ULL << 2)
+#endif
+
+#ifndef RISCV_HWPROBE_EXT_ZVBB
+#define RISCV_HWPROBE_EXT_ZVBB (1ULL << 17)
+#endif
+
+#ifndef RISCV_HWPROBE_EXT_ZVBC
+#define RISCV_HWPROBE_EXT_ZVBC (1ULL << 18)
+#endif
+
+#ifndef RISCV_HWPROBE_EXT_ZVKG
+#define RISCV_HWPROBE_EXT_ZVKG (1ULL << 20)
+#endif
+
+#ifndef RISCV_HWPROBE_EXT_ZVFH
+#define RISCV_HWPROBE_EXT_ZVFH (1ULL << 30)
+#endif
+
+struct riscv_hwprobe { // one key/value pair exchanged with the riscv_hwprobe syscall
+    int64_t key;    // key to query, e.g. RISCV_HWPROBE_KEY_IMA_EXT_0
+    uint64_t value; // result for that key; for IMA_EXT_0 the RISCV_HWPROBE_* bits above are tested against it
+};
+#endif
+
+enum class RISCVExtension : uint64_t {
+    // Bits 0-25: single-letter extensions, reusing the COMPAT_HWCAP_ISA_* masks (bit index = letter - 'A')
+    I = COMPAT_HWCAP_ISA_I,
+    M = COMPAT_HWCAP_ISA_M,
+    A = COMPAT_HWCAP_ISA_A,
+    F = COMPAT_HWCAP_ISA_F,
+    D = COMPAT_HWCAP_ISA_D,
+    C = COMPAT_HWCAP_ISA_C,
+    V = COMPAT_HWCAP_ISA_V,
+
+    // Bits 26 and up: Z-extensions. These are library-local bit positions, not the
+    // RISCV_HWPROBE_EXT_* bit positions, so they cannot collide with the letter bits.
+    Zvfh = 1ULL << 26,
+    Zvbb = 1ULL << 27,
+    Zvbc = 1ULL << 28,
+    Zvkg = 1ULL << 29
+};
+
+template <CSR csr> // code generator emitting a tiny function that reads `csr` into a0 and returns
+struct CSRReader : public CodeGenerator {
+    // Buffer capacity in bytes for the 2 instructions emitted below (assumes 4 bytes each — TODO confirm no compressed forms are emitted).
+    static constexpr size_t capacity = 8;
+
+    CSRReader() : CodeGenerator(capacity) {
+        csrrs(a0, csr, x0); // same call the csrr pseudo-instruction makes: csr is read into a0
+        ret();
+    }
+};
+
+/**
+ * Detects properties of the RISC-V CPU: ISA extensions, XLEN/VLEN/FLEN, core count and data caches. All detection runs in the constructor.
+ */
+class CPU final {
+public:
+    static const CPU& getInstance() { // shared instance, constructed on first use
+        static const CPU cpu;
+        return cpu;
+    }
+
+    CPU() {
+        hwcapFeatures = 0;
+        xlen = sizeof(void*) * 8; // Fallback if sysconf fails
+
+#if defined(__linux__) && defined(__riscv)
+        // Set hwcapFeatures with AT_HWCAP value from
+        // the Linux auxiliary vector to check for base extensions support.
+        hwcapFeatures = getauxval(AT_HWCAP) & (
+            COMPAT_HWCAP_ISA_I |
+            COMPAT_HWCAP_ISA_M |
+            COMPAT_HWCAP_ISA_A |
+            COMPAT_HWCAP_ISA_F |
+            COMPAT_HWCAP_ISA_D |
+            COMPAT_HWCAP_ISA_C |
+            COMPAT_HWCAP_ISA_V
+        );
+
+        // Try to use riscv_hwprobe to detect Z-extensions
+        struct riscv_hwprobe requests[] = {
+            {RISCV_HWPROBE_KEY_IMA_EXT_0, 0}
+        };
+
+        const long ret = syscall(__NR_riscv_hwprobe, requests, sizeof(requests) / sizeof(requests[0]), 0UL, nullptr, 0U);
+
+        if (ret == 0) { // any other result (e.g. syscall not available): keep the AT_HWCAP bits only
+            uint64_t v = requests[0].value;
+            // Update V support from hwprobe if present
+            if (v & RISCV_HWPROBE_IMA_V) hwcapFeatures |= static_cast<uint64_t>(RISCVExtension::V);
+
+            // Detect Z-extensions using the table
+            const struct {
+                RISCVExtension id;
+                uint64_t hwprobe_bit; // Bit in RISCV_HWPROBE_KEY_IMA_EXT_0
+            } table[] = {
+                { RISCVExtension::Zvfh, RISCV_HWPROBE_EXT_ZVFH },
+                { RISCVExtension::Zvbb, RISCV_HWPROBE_EXT_ZVBB },
+                { RISCVExtension::Zvbc, RISCV_HWPROBE_EXT_ZVBC },
+                { RISCVExtension::Zvkg, RISCV_HWPROBE_EXT_ZVKG }
+            };
+            for (const auto& entry : table) {
+                if (v & entry.hwprobe_bit) {
+                    hwcapFeatures |= static_cast<uint64_t>(entry.id);
+                }
+            }
+        }
+
+        // Set xlen, number of cores, cache info. sysconf() returns -1 on failure, which must not be stored in the unsigned members.
+        const auto query = [](int name, uint32_t fallback) -> uint32_t {
+            const long v = sysconf(name);
+            return v > 0 ? static_cast<uint32_t>(v) : fallback;
+        };
+        xlen = query(_SC_LONG_BIT, xlen);
+        numCores = query(_SC_NPROCESSORS_ONLN, numCores);
+        const int sizeNames[] = {_SC_LEVEL1_DCACHE_SIZE, _SC_LEVEL2_CACHE_SIZE, _SC_LEVEL3_CACHE_SIZE, _SC_LEVEL4_CACHE_SIZE};
+        const int lineNames[] = {_SC_LEVEL1_DCACHE_LINESIZE, _SC_LEVEL2_CACHE_LINESIZE, _SC_LEVEL3_CACHE_LINESIZE, _SC_LEVEL4_CACHE_LINESIZE};
+        for (size_t i = 0; i < maxNumberCacheLevels; i++) {
+            dataCacheSize_[i] = query(sizeNames[i], 0);
+            dataCacheLineSize_[i] = query(lineNames[i], 0);
+        }
+#endif
+
+        // Set vlen: generate and call a tiny function that reads the vlenb CSR (bytes)
+        if(hasExtension(RISCVExtension::V)) {
+            CSRReader<CSR::vlenb> csrReaderGenerator;
+            csrReaderGenerator.ready();
+            const auto csrReader = csrReaderGenerator.getCode<uint32_t (*)()>();
+            vlen = csrReader() * 8 /* bit */;
+        }
+
+        // Set flen (bit); stays 0 when neither D nor F was detected
+        if (hasExtension(RISCVExtension::D)) {
+            flen = 64;
+        } else if (hasExtension(RISCVExtension::F)) {
+            flen = 32;
+        }
+    }
+
+    /**
+     * Checks if a particular RISC-V extension is available.
+     *
+     * @param extension The extension to check.
+     */
+    bool hasExtension(RISCVExtension extension) const {
+        return (hwcapFeatures & static_cast<uint64_t>(extension)) != 0;
+    }
+
+    /**
+     * Get vector register width in bits (0 if V was not detected)
+    */
+    uint32_t getVlen() const {
+        return vlen;
+    }
+
+    /**
+     * Get general purpose register width in bits
+    */
+    uint32_t getXlen() const {
+        return xlen;
+    }
+
+    /**
+     * Get floating-point register width in bits (0 if neither F nor D was detected)
+    */
+    uint32_t getFlen() const {
+        return flen;
+    }
+
+    uint32_t getNumCores() const { // online processor count; 0 if it could not be queried
+        return numCores;
+    }
+
+    /**
+     * Get data cache size in bytes (0 if not reported)
+     * @param lvl Cache level 1..4
+    */
+    uint32_t getDataCacheSize(uint32_t lvl) const {
+        if (lvl == 0 || lvl > maxNumberCacheLevels) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER);
+        return dataCacheSize_[lvl - 1];
+    }
+
+    /**
+     * Get data cache line size in bytes (0 if not reported)
+     * @param lvl Cache level 1..4
+    */
+    uint32_t getDataCacheLineSize(uint32_t lvl) const {
+        if (lvl == 0 || lvl > maxNumberCacheLevels) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER);
+        return dataCacheLineSize_[lvl - 1];
+    }
+
+private:
+    uint64_t hwcapFeatures = 0; // bitwise OR of the detected RISCVExtension values
+    static constexpr size_t maxNumberCacheLevels = 4;
+    uint32_t dataCacheSize_[maxNumberCacheLevels] = {0, 0, 0, 0};
+    uint32_t dataCacheLineSize_[maxNumberCacheLevels] = {0, 0, 0, 0};
+    uint32_t numCores = 0;
+    uint32_t xlen = 0;
+    uint32_t vlen = 0;
+    uint32_t flen = 0;
+};
+
+} // Xbyak_riscv
diff --git a/third_party/xbyak_riscv/xbyak_riscv_v.hpp b/third_party/xbyak_riscv/xbyak_riscv_v.hpp
new file mode 100644
index 00000000000..7bff4daf391
--- /dev/null
+++ b/third_party/xbyak_riscv/xbyak_riscv_v.hpp
@@ -0,0 +1,776 @@
+/*
+	Copyright (C), 2023, MITSUNARI Shigeo
+	Copyright (C), 2023, KNS Group LLC (YADRO)
+	Licensed under the 3-Clause BSD License
+	You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/
+*/
+void vaadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opMVV(0x24002057, vm, vs2, vs1, vd); }
+void vaadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm = VM::unmasked) { opMVX(0x24006057, vm, vs2, rs1, vd); }
+void vaaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opMVV(0x20002057, vm, vs2, vs1, vd); }
+void vaaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm = VM::unmasked) { opMVX(0x20006057, vm, vs2, rs1, vd); }
+void vadc_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x40003057, 0, vs2, simm5, vd); } // vm argument is fixed to 0
+void vadc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x40000057, 0, vs2, vs1, vd); } // vm argument is fixed to 0
+void vadc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x40004057, 0, vs2, rs1, vd); } // vm argument is fixed to 0
+void vadd_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm = VM::unmasked) { opIVI(0x00003057, vm, vs2, simm5, vd); }
+void vadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opIVV(0x00000057, vm, vs2, vs1, vd); }
+void vadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm = VM::unmasked) { opIVX(0x00004057, vm, vs2, rs1, vd); }
+void vand_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm = VM::unmasked) { opIVI(0x24003057, vm, vs2, simm5, vd); }
+void vand_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opIVV(0x24000057, vm, vs2, vs1, vd); }
+void vand_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm = VM::unmasked) { opIVX(0x24004057, vm, vs2, rs1, vd); }
+void vasub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opMVV(0x2c002057, vm, vs2, vs1, vd); }
+void vasub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm = VM::unmasked) { opMVX(0x2c006057, vm, vs2, rs1, vd); }
+void vasubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opMVV(0x28002057, vm, vs2, vs1, vd); }
+void vasubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm = VM::unmasked) { opMVX(0x28006057, vm, vs2, rs1, vd); }
+void vcompress_vm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opMVV(0x5e002057, 0, vs2, vs1, vd); } // vm argument is fixed to 0
+void vcpop_m(const Reg& rd, const VReg& vs2, VM vm = VM::unmasked) { opMVV(0x40082057, vm, vs2, 0, rd); } // Reg rd is passed in the destination slot
+void vdiv_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opMVV(0x84002057, vm, vs2, vs1, vd); }
+void vdiv_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm = VM::unmasked) { opMVX(0x84006057, vm, vs2, rs1, vd); }
+void vdivu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opMVV(0x80002057, vm, vs2, vs1, vd); }
+void vdivu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm = VM::unmasked) { opMVX(0x80006057, vm, vs2, rs1, vd); }
+void vfadd_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x00005057, vm, vs2, rs1, vd); }
+void vfadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x00001057, vm, vs2, vs1, vd); }
+void vfclass_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x4c081057, vm, vs2, 0, vd); }
+void vfcvt_f_x_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48019057, vm, vs2, 0, vd); }
+void vfcvt_f_xu_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48011057, vm, vs2, 0, vd); }
+void vfcvt_rtz_x_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48039057, vm, vs2, 0, vd); }
+void vfcvt_rtz_xu_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48031057, vm, vs2, 0, vd); }
+void vfcvt_x_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48009057, vm, vs2, 0, vd); }
+void vfcvt_xu_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48001057, vm, vs2, 0, vd); }
+void vfdiv_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x80005057, vm, vs2, rs1, vd); }
+void vfdiv_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x80001057, vm, vs2, vs1, vd); }
+void vfirst_m(const Reg& rd, const VReg& vs2, VM vm = VM::unmasked) { opMVV(0x4008a057, vm, vs2, 0, rd); } // Reg rd is passed in the destination slot
+void vfmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xb0005057, vm, vs2, rs1, vd); } // signature takes rs1 before vs2
+void vfmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xb0001057, vm, vs2, vs1, vd); } // signature takes vs1 before vs2
+void vfmadd_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xa0005057, vm, vs2, rs1, vd); }
+void vfmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xa0001057, vm, vs2, vs1, vd); }
+void vfmax_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x18005057, vm, vs2, rs1, vd); }
+void vfmax_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x18001057, vm, vs2, vs1, vd); }
+void vfmerge_vfm(const VReg& vd, const VReg& vs2, const FReg& rs1) { opFVF(0x5c005057, 0, vs2, rs1, vd); } // vm argument is fixed to 0
+void vfmin_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x10005057, vm, vs2, rs1, vd); }
+void vfmin_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x10001057, vm, vs2, vs1, vd); }
+void vfmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xb8005057, vm, vs2, rs1, vd); }
+void vfmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xb8001057, vm, vs2, vs1, vd); }
+void vfmsub_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xa8005057, vm, vs2, rs1, vd); }
+void vfmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xa8001057, vm, vs2, vs1, vd); }
+void vfmul_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x90005057, vm, vs2, rs1, vd); }
+void vfmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x90001057, vm, vs2, vs1, vd); }
+void vfmv_f_s(const FReg& rd, const VReg& vs2) { opFVV(0x42001057, 0, vs2, 0, rd); } // FReg rd is passed in the destination slot
+void vfmv_s_f(const VReg& vd, const FReg& rs1) { opFVF(0x42005057, 0, 0, rs1, vd); } // vm and vs2 arguments are fixed to 0
+void vfmv_v_f(const VReg& vd, const FReg& rs1) { opFVF(0x5e005057, 0, 0, rs1, vd); } // vm and vs2 arguments are fixed to 0
+void vfncvt_f_f_w(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x480a1057, vm, vs2, 0, vd); }
+void vfncvt_f_x_w(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48099057, vm, vs2, 0, vd); }
+void vfncvt_f_xu_w(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48091057, vm, vs2, 0, vd); }
+void vfncvt_rod_f_f_w(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x480a9057, vm, vs2, 0, vd); }
+void vfncvt_rtz_x_f_w(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x480b9057, vm, vs2, 0, vd); }
+void vfncvt_rtz_xu_f_w(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x480b1057, vm, vs2, 0, vd); }
+void vfncvt_x_f_w(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48089057, vm, vs2, 0, vd); }
+void vfncvt_xu_f_w(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48081057, vm, vs2, 0, vd); }
+void vfnmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xb4005057, vm, vs2, rs1, vd); }
+void vfnmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xb4001057, vm, vs2, vs1, vd); }
+void vfnmadd_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xa4005057, vm, vs2, rs1, vd); }
+void vfnmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xa4001057, vm, vs2, vs1, vd); }
+void vfnmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xbc005057, vm, vs2, rs1, vd); }
+void vfnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xbc001057, vm, vs2, vs1, vd); }
+void vfnmsub_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xac005057, vm, vs2, rs1, vd); }
+void vfnmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xac001057, vm, vs2, vs1, vd); }
+void vfrdiv_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x84005057, vm, vs2, rs1, vd); }
+void vfrec7_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x4c029057, vm, vs2, 0, vd); }
+void vfredmax_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x1c001057, vm, vs2, vs1, vd); }
+void vfredmin_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x14001057, vm, vs2, vs1, vd); }
+void vfredosum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x0c001057, vm, vs2, vs1, vd); }
+void vfredusum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x04001057, vm, vs2, vs1, vd); }
+void vfrsqrt7_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x4c021057, vm, vs2, 0, vd); }
+void vfrsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x9c005057, vm, vs2, rs1, vd); }
+void vfsgnj_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x20005057, vm, vs2, rs1, vd); }
+void vfsgnj_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x20001057, vm, vs2, vs1, vd); }
+void vfsgnjn_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x24005057, vm, vs2, rs1, vd); }
+void vfsgnjn_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x24001057, vm, vs2, vs1, vd); }
+void vfsgnjx_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x28005057, vm, vs2, rs1, vd); }
+void vfsgnjx_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x28001057, vm, vs2, vs1, vd); }
+void vfslide1down_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x3c005057, vm, vs2, rs1, vd); }
+void vfslide1up_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x38005057, vm, vs2, rs1, vd); }
+void vfsqrt_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x4c001057, vm, vs2, 0, vd); }
+void vfsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0x08005057, vm, vs2, rs1, vd); }
+void vfsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0x08001057, vm, vs2, vs1, vd); }
+void vfwadd_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0xc0005057, vm, vs2, rs1, vd); }
+void vfwadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0xc0001057, vm, vs2, vs1, vd); }
+void vfwadd_wf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0xd0005057, vm, vs2, rs1, vd); }
+void vfwadd_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0xd0001057, vm, vs2, vs1, vd); }
+void vfwcvt_f_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48061057, vm, vs2, 0, vd); }
+void vfwcvt_f_x_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48059057, vm, vs2, 0, vd); }
+void vfwcvt_f_xu_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48051057, vm, vs2, 0, vd); }
+void vfwcvt_rtz_x_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48079057, vm, vs2, 0, vd); }
+void vfwcvt_rtz_xu_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48071057, vm, vs2, 0, vd); }
+void vfwcvt_x_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48049057, vm, vs2, 0, vd); }
+void vfwcvt_xu_f_v(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0x48041057, vm, vs2, 0, vd); }
+void vfwmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xf0005057, vm, vs2, rs1, vd); }
+void vfwmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xf0001057, vm, vs2, vs1, vd); }
+void vfwmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xf8005057, vm, vs2, rs1, vd); }
+void vfwmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xf8001057, vm, vs2, vs1, vd); }
+void vfwmul_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0xe0005057, vm, vs2, rs1, vd); }
+void vfwmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0xe0001057, vm, vs2, vs1, vd); }
+void vfwnmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xf4005057, vm, vs2, rs1, vd); }
+void vfwnmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xf4001057, vm, vs2, vs1, vd); }
+void vfwnmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm = VM::unmasked) { opFVF(0xfc005057, vm, vs2, rs1, vd); }
+void vfwnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm = VM::unmasked) { opFVV(0xfc001057, vm, vs2, vs1, vd); }
+void vfwredosum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0xcc001057, vm, vs2, vs1, vd); }
+void vfwredusum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0xc4001057, vm, vs2, vs1, vd); }
+void vfwsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0xc8005057, vm, vs2, rs1, vd); }
+void vfwsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0xc8001057, vm, vs2, vs1, vd); }
+void vfwsub_wf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm = VM::unmasked) { opFVF(0xd8005057, vm, vs2, rs1, vd); }
+void vfwsub_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm = VM::unmasked) { opFVV(0xd8001057, vm, vs2, vs1, vd); }
+void vid_v(const VReg& vd, VM vm = VM::unmasked) { opMVV(0x5008a057, vm, 0, 0, vd); } // both source-operand arguments are fixed to 0
+void viota_m(const VReg& vd, const VReg& vs2, VM vm = VM::unmasked) { opMVV(0x50082057, vm, vs2, 0, vd); }
+void vl1re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x02805007, 0, 0, rs1, vd); } // vm and vs2 arguments are fixed to 0
+void vl1re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x02806007, 0, 0, rs1, vd); } // vm and vs2 arguments are fixed to 0
+void vl1re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x02807007, 0, 0, rs1, vd); } // vm and vs2 arguments are fixed to 0
+void vl1re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x02800007, 0, 0, rs1, vd); } // vm and vs2 arguments are fixed to 0
+void vl2re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x22805007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 1 -> 2 registers
+void vl2re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x22806007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 1 -> 2 registers
+void vl2re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x22807007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 1 -> 2 registers
+void vl2re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x22800007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 1 -> 2 registers
+void vl4re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x62805007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 3 -> 4 registers
+void vl4re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x62806007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 3 -> 4 registers
+void vl4re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x62807007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 3 -> 4 registers
+void vl4re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x62800007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 3 -> 4 registers
+void vl8re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0xe2805007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 7 -> 8 registers
+void vl8re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0xe2806007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 7 -> 8 registers
+void vl8re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0xe2807007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 7 -> 8 registers
+void vl8re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0xe2800007, 0, 0, rs1, vd); } // whole-register load, nf (bits 31:29) = 7 -> 8 registers
+void vlseg1e1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x10007007, vm, 0, rs1, vd); }
+void vlseg2e1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x30007007, vm, 0, rs1, vd); }
+void vlseg3e1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x50007007, vm, 0, rs1, vd); }
+void vlseg4e1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x70007007, vm, 0, rs1, vd); }
+void vlseg5e1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x90007007, vm, 0, rs1, vd); }
+void vlseg6e1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xb0007007, vm, 0, rs1, vd); }
+void vlseg7e1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xd0007007, vm, 0, rs1, vd); }
+void vlseg8e1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xf0007007, vm, 0, rs1, vd); }
+void vle1024_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x10007007, vm, 0, rs1, vd); } // same encoding as vlseg1e1024_v
+void vlseg1e1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x11007007, vm, 0, rs1, vd); }
+void vlseg2e1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x31007007, vm, 0, rs1, vd); }
+void vlseg3e1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x51007007, vm, 0, rs1, vd); }
+void vlseg4e1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x71007007, vm, 0, rs1, vd); }
+void vlseg5e1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x91007007, vm, 0, rs1, vd); }
+void vlseg6e1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xb1007007, vm, 0, rs1, vd); }
+void vlseg7e1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xd1007007, vm, 0, rs1, vd); }
+void vlseg8e1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xf1007007, vm, 0, rs1, vd); }
+void vle1024ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x11007007, vm, 0, rs1, vd); } // same encoding as vlseg1e1024ff_v
+void vlseg1e128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x10000007, vm, 0, rs1, vd); }
+void vlseg2e128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x30000007, vm, 0, rs1, vd); }
+void vlseg3e128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x50000007, vm, 0, rs1, vd); }
+void vlseg4e128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x70000007, vm, 0, rs1, vd); }
+void vlseg5e128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x90000007, vm, 0, rs1, vd); }
+void vlseg6e128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xb0000007, vm, 0, rs1, vd); }
+void vlseg7e128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xd0000007, vm, 0, rs1, vd); }
+void vlseg8e128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xf0000007, vm, 0, rs1, vd); }
+void vle128_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x10000007, vm, 0, rs1, vd); } // same encoding as vlseg1e128_v
+void vlseg1e128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x11000007, vm, 0, rs1, vd); }
+void vlseg2e128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x31000007, vm, 0, rs1, vd); }
+void vlseg3e128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x51000007, vm, 0, rs1, vd); }
+void vlseg4e128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x71000007, vm, 0, rs1, vd); }
+void vlseg5e128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x91000007, vm, 0, rs1, vd); }
+void vlseg6e128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xb1000007, vm, 0, rs1, vd); }
+void vlseg7e128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xd1000007, vm, 0, rs1, vd); }
+void vlseg8e128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xf1000007, vm, 0, rs1, vd); }
+void vle128ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x11000007, vm, 0, rs1, vd); } // same encoding as vlseg1e128ff_v
+void vlseg1e16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x00005007, vm, 0, rs1, vd); }
+void vlseg2e16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x20005007, vm, 0, rs1, vd); }
+void vlseg3e16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x40005007, vm, 0, rs1, vd); }
+void vlseg4e16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x60005007, vm, 0, rs1, vd); }
+void vlseg5e16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x80005007, vm, 0, rs1, vd); }
+void vlseg6e16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xa0005007, vm, 0, rs1, vd); }
+void vlseg7e16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xc0005007, vm, 0, rs1, vd); }
+void vlseg8e16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xe0005007, vm, 0, rs1, vd); }
+void vle16_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x00005007, vm, 0, rs1, vd); } // same encoding as vlseg1e16_v
+void vlseg1e16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x01005007, vm, 0, rs1, vd); }
+void vlseg2e16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x21005007, vm, 0, rs1, vd); }
+void vlseg3e16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x41005007, vm, 0, rs1, vd); }
+void vlseg4e16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x61005007, vm, 0, rs1, vd); }
+void vlseg5e16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x81005007, vm, 0, rs1, vd); }
+void vlseg6e16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xa1005007, vm, 0, rs1, vd); }
+void vlseg7e16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xc1005007, vm, 0, rs1, vd); }
+void vlseg8e16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xe1005007, vm, 0, rs1, vd); }
+void vle16ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x01005007, vm, 0, rs1, vd); } // same encoding as vlseg1e16ff_v
+void vlseg1e256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x10005007, vm, 0, rs1, vd); }
+void vlseg2e256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x30005007, vm, 0, rs1, vd); }
+void vlseg3e256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x50005007, vm, 0, rs1, vd); }
+void vlseg4e256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x70005007, vm, 0, rs1, vd); }
+void vlseg5e256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x90005007, vm, 0, rs1, vd); }
+void vlseg6e256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xb0005007, vm, 0, rs1, vd); }
+void vlseg7e256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xd0005007, vm, 0, rs1, vd); }
+void vlseg8e256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xf0005007, vm, 0, rs1, vd); }
+void vle256_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x10005007, vm, 0, rs1, vd); } // same encoding as vlseg1e256_v
+void vlseg1e256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x11005007, vm, 0, rs1, vd); }
+void vlseg2e256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x31005007, vm, 0, rs1, vd); }
+void vlseg3e256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x51005007, vm, 0, rs1, vd); }
+void vlseg4e256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x71005007, vm, 0, rs1, vd); }
+void vlseg5e256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x91005007, vm, 0, rs1, vd); }
+void vlseg6e256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xb1005007, vm, 0, rs1, vd); }
+void vlseg7e256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xd1005007, vm, 0, rs1, vd); }
+void vlseg8e256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xf1005007, vm, 0, rs1, vd); }
+void vle256ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x11005007, vm, 0, rs1, vd); } // same encoding as vlseg1e256ff_v
+void vlseg1e32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x00006007, vm, 0, rs1, vd); }
+void vlseg2e32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x20006007, vm, 0, rs1, vd); }
+void vlseg3e32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x40006007, vm, 0, rs1, vd); }
+void vlseg4e32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x60006007, vm, 0, rs1, vd); }
+void vlseg5e32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x80006007, vm, 0, rs1, vd); }
+void vlseg6e32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xa0006007, vm, 0, rs1, vd); }
+void vlseg7e32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xc0006007, vm, 0, rs1, vd); }
+void vlseg8e32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xe0006007, vm, 0, rs1, vd); }
+void vle32_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x00006007, vm, 0, rs1, vd); } // same encoding as vlseg1e32_v
+void vlseg1e32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x01006007, vm, 0, rs1, vd); }
+void vlseg2e32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x21006007, vm, 0, rs1, vd); }
+void vlseg3e32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x41006007, vm, 0, rs1, vd); }
+void vlseg4e32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x61006007, vm, 0, rs1, vd); }
+void vlseg5e32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x81006007, vm, 0, rs1, vd); }
+void vlseg6e32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xa1006007, vm, 0, rs1, vd); }
+void vlseg7e32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xc1006007, vm, 0, rs1, vd); }
+void vlseg8e32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xe1006007, vm, 0, rs1, vd); }
+void vle32ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x01006007, vm, 0, rs1, vd); } // same encoding as vlseg1e32ff_v
+void vlseg1e512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x10006007, vm, 0, rs1, vd); }
+void vlseg2e512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x30006007, vm, 0, rs1, vd); }
+void vlseg3e512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x50006007, vm, 0, rs1, vd); }
+void vlseg4e512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x70006007, vm, 0, rs1, vd); }
+void vlseg5e512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x90006007, vm, 0, rs1, vd); }
+void vlseg6e512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xb0006007, vm, 0, rs1, vd); }
+void vlseg7e512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xd0006007, vm, 0, rs1, vd); }
+void vlseg8e512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xf0006007, vm, 0, rs1, vd); }
+void vle512_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x10006007, vm, 0, rs1, vd); } // same encoding as vlseg1e512_v
+void vlseg1e512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x11006007, vm, 0, rs1, vd); }
+void vlseg2e512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x31006007, vm, 0, rs1, vd); }
+void vlseg3e512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x51006007, vm, 0, rs1, vd); }
+void vlseg4e512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x71006007, vm, 0, rs1, vd); }
+void vlseg5e512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x91006007, vm, 0, rs1, vd); }
+void vlseg6e512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xb1006007, vm, 0, rs1, vd); }
+void vlseg7e512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xd1006007, vm, 0, rs1, vd); }
+void vlseg8e512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xf1006007, vm, 0, rs1, vd); }
+void vle512ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x11006007, vm, 0, rs1, vd); } // same encoding as vlseg1e512ff_v
+void vlseg1e64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x00007007, vm, 0, rs1, vd); }
+void vlseg2e64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x20007007, vm, 0, rs1, vd); }
+void vlseg3e64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x40007007, vm, 0, rs1, vd); }
+void vlseg4e64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x60007007, vm, 0, rs1, vd); }
+void vlseg5e64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x80007007, vm, 0, rs1, vd); }
+void vlseg6e64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xa0007007, vm, 0, rs1, vd); }
+void vlseg7e64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xc0007007, vm, 0, rs1, vd); }
+void vlseg8e64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xe0007007, vm, 0, rs1, vd); }
+void vle64_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x00007007, vm, 0, rs1, vd); } // same encoding as vlseg1e64_v
+void vlseg1e64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x01007007, vm, 0, rs1, vd); }
+void vlseg2e64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x21007007, vm, 0, rs1, vd); }
+void vlseg3e64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x41007007, vm, 0, rs1, vd); }
+void vlseg4e64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x61007007, vm, 0, rs1, vd); }
+void vlseg5e64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x81007007, vm, 0, rs1, vd); }
+void vlseg6e64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xa1007007, vm, 0, rs1, vd); }
+void vlseg7e64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xc1007007, vm, 0, rs1, vd); }
+void vlseg8e64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xe1007007, vm, 0, rs1, vd); }
+void vle64ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x01007007, vm, 0, rs1, vd); } // same encoding as vlseg1e64ff_v
+void vlseg1e8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x00000007, vm, 0, rs1, vd); }
+void vlseg2e8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x20000007, vm, 0, rs1, vd); }
+void vlseg3e8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x40000007, vm, 0, rs1, vd); }
+void vlseg4e8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x60000007, vm, 0, rs1, vd); }
+void vlseg5e8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x80000007, vm, 0, rs1, vd); }
+void vlseg6e8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xa0000007, vm, 0, rs1, vd); }
+void vlseg7e8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xc0000007, vm, 0, rs1, vd); }
+void vlseg8e8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xe0000007, vm, 0, rs1, vd); }
+void vle8_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x00000007, vm, 0, rs1, vd); } // same encoding as vlseg1e8_v
+void vlseg1e8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x01000007, vm, 0, rs1, vd); }
+void vlseg2e8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x21000007, vm, 0, rs1, vd); }
+void vlseg3e8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x41000007, vm, 0, rs1, vd); }
+void vlseg4e8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x61000007, vm, 0, rs1, vd); }
+void vlseg5e8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x81000007, vm, 0, rs1, vd); }
+void vlseg6e8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xa1000007, vm, 0, rs1, vd); }
+void vlseg7e8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xc1000007, vm, 0, rs1, vd); }
+void vlseg8e8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0xe1000007, vm, 0, rs1, vd); }
+void vle8ff_v(const VReg& vd, const Reg& rs1, VM vm = VM::unmasked) { opVectorLoad(0x01000007, vm, 0, rs1, vd); } // same encoding as vlseg1e8ff_v
+void vlm_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2b00007, 0, 0, rs1, vd); } // vlm.v: no VM parameter; bit 25 is already set in the base opcode and 0 is passed in the vm slot. NOTE(review): how opVectorLoad merges that 0 is defined outside this chunk
+void vloxei1024_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c007007, vm, vs2, rs1, vd); } // vloxei<EEW>.v: vs2 supplies the index vector; EEW 128..1024 reuse the 8/16/32/64 width bits with bit 28 set (0x1c.. vs 0x0c..)
+void vloxei128_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c000007, vm, vs2, rs1, vd); }
+void vloxei16_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc005007, vm, vs2, rs1, vd); }
+void vloxei256_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c005007, vm, vs2, rs1, vd); }
+void vloxei32_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc006007, vm, vs2, rs1, vd); }
+void vloxei512_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c006007, vm, vs2, rs1, vd); }
+void vloxei64_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc007007, vm, vs2, rs1, vd); }
+void vloxei8_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc000007, vm, vs2, rs1, vd); }
+void vlsseg1e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18007007, vm, rs2, rs1, vd); } // vlsseg<nf>e1024.v: rs2 (a scalar Reg) is forwarded in the third opVectorLoad slot; nf-1 in bits 31:29
+void vlsseg2e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38007007, vm, rs2, rs1, vd); }
+void vlsseg3e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58007007, vm, rs2, rs1, vd); }
+void vlsseg4e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78007007, vm, rs2, rs1, vd); }
+void vlsseg5e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98007007, vm, rs2, rs1, vd); }
+void vlsseg6e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8007007, vm, rs2, rs1, vd); }
+void vlsseg7e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8007007, vm, rs2, rs1, vd); }
+void vlsseg8e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8007007, vm, rs2, rs1, vd); }
+void vlse1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18007007, vm, rs2, rs1, vd); } // same encoding as vlsseg1e1024_v
+void vlsseg1e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18000007, vm, rs2, rs1, vd); } // vlsseg<nf>e128.v: rs2 forwarded in the third opVectorLoad slot; nf-1 in bits 31:29
+void vlsseg2e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38000007, vm, rs2, rs1, vd); }
+void vlsseg3e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58000007, vm, rs2, rs1, vd); }
+void vlsseg4e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78000007, vm, rs2, rs1, vd); }
+void vlsseg5e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98000007, vm, rs2, rs1, vd); }
+void vlsseg6e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8000007, vm, rs2, rs1, vd); }
+void vlsseg7e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8000007, vm, rs2, rs1, vd); }
+void vlsseg8e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8000007, vm, rs2, rs1, vd); }
+void vlse128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18000007, vm, rs2, rs1, vd); } // same encoding as vlsseg1e128_v
+void vlsseg1e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8005007, vm, rs2, rs1, vd); } // vlsseg<nf>e16.v: rs2 forwarded in the third opVectorLoad slot; nf-1 in bits 31:29
+void vlsseg2e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28005007, vm, rs2, rs1, vd); }
+void vlsseg3e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48005007, vm, rs2, rs1, vd); }
+void vlsseg4e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68005007, vm, rs2, rs1, vd); }
+void vlsseg5e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88005007, vm, rs2, rs1, vd); }
+void vlsseg6e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8005007, vm, rs2, rs1, vd); }
+void vlsseg7e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8005007, vm, rs2, rs1, vd); }
+void vlsseg8e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8005007, vm, rs2, rs1, vd); }
+void vlse16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8005007, vm, rs2, rs1, vd); } // same encoding as vlsseg1e16_v
+void vlsseg1e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18005007, vm, rs2, rs1, vd); } // vlsseg<nf>e256.v: rs2 forwarded in the third opVectorLoad slot; nf-1 in bits 31:29
+void vlsseg2e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38005007, vm, rs2, rs1, vd); }
+void vlsseg3e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58005007, vm, rs2, rs1, vd); }
+void vlsseg4e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78005007, vm, rs2, rs1, vd); }
+void vlsseg5e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98005007, vm, rs2, rs1, vd); }
+void vlsseg6e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8005007, vm, rs2, rs1, vd); }
+void vlsseg7e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8005007, vm, rs2, rs1, vd); }
+void vlsseg8e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8005007, vm, rs2, rs1, vd); }
+void vlse256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18005007, vm, rs2, rs1, vd); } // same encoding as vlsseg1e256_v
+void vlsseg1e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8006007, vm, rs2, rs1, vd); } // vlsseg<nf>e32.v: rs2 forwarded in the third opVectorLoad slot; nf-1 in bits 31:29
+void vlsseg2e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28006007, vm, rs2, rs1, vd); }
+void vlsseg3e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48006007, vm, rs2, rs1, vd); }
+void vlsseg4e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68006007, vm, rs2, rs1, vd); }
+void vlsseg5e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88006007, vm, rs2, rs1, vd); }
+void vlsseg6e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8006007, vm, rs2, rs1, vd); }
+void vlsseg7e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8006007, vm, rs2, rs1, vd); }
+void vlsseg8e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8006007, vm, rs2, rs1, vd); }
+void vlse32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8006007, vm, rs2, rs1, vd); } // same encoding as vlsseg1e32_v
+void vlsseg1e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18006007, vm, rs2, rs1, vd); } // vlsseg<nf>e512.v: rs2 forwarded in the third opVectorLoad slot; nf-1 in bits 31:29
+void vlsseg2e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38006007, vm, rs2, rs1, vd); }
+void vlsseg3e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58006007, vm, rs2, rs1, vd); }
+void vlsseg4e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78006007, vm, rs2, rs1, vd); }
+void vlsseg5e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98006007, vm, rs2, rs1, vd); }
+void vlsseg6e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8006007, vm, rs2, rs1, vd); }
+void vlsseg7e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8006007, vm, rs2, rs1, vd); }
+void vlsseg8e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8006007, vm, rs2, rs1, vd); }
+void vlse512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18006007, vm, rs2, rs1, vd); } // same encoding as vlsseg1e512_v
+void vlsseg1e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8007007, vm, rs2, rs1, vd); } // vlsseg<nf>e64.v: rs2 forwarded in the third opVectorLoad slot; nf-1 in bits 31:29
+void vlsseg2e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28007007, vm, rs2, rs1, vd); }
+void vlsseg3e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48007007, vm, rs2, rs1, vd); }
+void vlsseg4e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68007007, vm, rs2, rs1, vd); }
+void vlsseg5e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88007007, vm, rs2, rs1, vd); }
+void vlsseg6e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8007007, vm, rs2, rs1, vd); }
+void vlsseg7e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8007007, vm, rs2, rs1, vd); }
+void vlsseg8e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8007007, vm, rs2, rs1, vd); }
+void vlse64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8007007, vm, rs2, rs1, vd); } // same encoding as vlsseg1e64_v
+void vlsseg1e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8000007, vm, rs2, rs1, vd); } // vlsseg<nf>e8.v: rs2 forwarded in the third opVectorLoad slot; nf-1 in bits 31:29
+void vlsseg2e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28000007, vm, rs2, rs1, vd); }
+void vlsseg3e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48000007, vm, rs2, rs1, vd); }
+void vlsseg4e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68000007, vm, rs2, rs1, vd); }
+void vlsseg5e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88000007, vm, rs2, rs1, vd); }
+void vlsseg6e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8000007, vm, rs2, rs1, vd); }
+void vlsseg7e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8000007, vm, rs2, rs1, vd); }
+void vlsseg8e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8000007, vm, rs2, rs1, vd); }
+void vlse8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8000007, vm, rs2, rs1, vd); } // same encoding as vlsseg1e8_v
+void vluxei1024_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14007007, vm, vs2, rs1, vd); } // vluxei<EEW>.v: same layout as the vloxei* emitters but with 0x04000000 in place of 0x0c000000; bit 28 set for EEW >= 128
+void vluxei128_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14000007, vm, vs2, rs1, vd); }
+void vluxei16_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4005007, vm, vs2, rs1, vd); }
+void vluxei256_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14005007, vm, vs2, rs1, vd); }
+void vluxei32_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4006007, vm, vs2, rs1, vd); }
+void vluxei512_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14006007, vm, vs2, rs1, vd); }
+void vluxei64_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4007007, vm, vs2, rs1, vd); }
+void vluxei8_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4000007, vm, vs2, rs1, vd); }
+void vmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xb4002057, vm, vs2, vs1, vd); } // parameter order is (vd, vs1, vs2) here, unlike the (vd, vs2, vs1) order used by most emitters in this table; the helper call still receives vs2 first
+void vmacc_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xb4006057, vm, vs2, rs1, vd); } // scalar form: rs1 precedes vs2 in the signature
+void vmadc_vi(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x46003057, 0, vs2, simm5, vd); } // vmadc family: no VM parameter; the plain and *m forms differ only in bit 25 of the base opcode (0x46.. vs 0x44..)
+void vmadc_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x44003057, 0, vs2, simm5, vd); }
+void vmadc_vv(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x46000057, 0, vs2, vs1, vd); }
+void vmadc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x44000057, 0, vs2, vs1, vd); }
+void vmadc_vx(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x46004057, 0, vs2, rs1, vd); }
+void vmadc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x44004057, 0, vs2, rs1, vd); }
+void vmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xa4002057, vm, vs2, vs1, vd); } // parameter order is (vd, vs1, vs2), matching vmacc_vv; the helper call still receives vs2 first
+void vmadd_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xa4006057, vm, vs2, rs1, vd); } // scalar form: rs1 precedes vs2 in the signature
+void vmand_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x64002057, vm, vs2, vs1, vd); } // vmand.mm; NOTE(review): a VM parameter is exposed although the RVV spec defines mask-logical ops as always unmasked - confirm callers never pass a masked VM
+void vmandn_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x60002057, vm, vs2, vs1, vd); } // vmandn.mm
+void vmax_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x1c000057, vm, vs2, vs1, vd); } // vmax / vmaxu: the two mnemonics differ only in the top opcode bits (0x1c.. vs 0x18..)
+void vmax_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x1c004057, vm, vs2, rs1, vd); }
+void vmaxu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x18000057, vm, vs2, vs1, vd); }
+void vmaxu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x18004057, vm, vs2, rs1, vd); }
+void vmerge_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x5c003057, 0, vs2, simm5, vd); } // vmerge forms take no VM parameter; 0 is always passed in the vm slot of the helper
+void vmerge_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x5c000057, 0, vs2, vs1, vd); }
+void vmerge_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x5c004057, 0, vs2, rs1, vd); }
+void vmfeq_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x60005057, vm, vs2, rs1, vd); } // vmf* compares: .vf forms go through opFVF with an FReg scalar, .vv forms through opFVV
+void vmfeq_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x60001057, vm, vs2, vs1, vd); }
+void vmfge_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x7c005057, vm, vs2, rs1, vd); } // only a .vf form is emitted for vmfge / vmfgt in this table
+void vmfgt_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x74005057, vm, vs2, rs1, vd); }
+void vmfle_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x64005057, vm, vs2, rs1, vd); }
+void vmfle_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x64001057, vm, vs2, vs1, vd); }
+void vmflt_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x6c005057, vm, vs2, rs1, vd); }
+void vmflt_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x6c001057, vm, vs2, vs1, vd); }
+void vmfne_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x70005057, vm, vs2, rs1, vd); }
+void vmfne_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x70001057, vm, vs2, vs1, vd); }
+void vmin_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x14000057, vm, vs2, vs1, vd); } // vmin / vminu: the two mnemonics differ only in the top opcode bits (0x14.. vs 0x10..)
+void vmin_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x14004057, vm, vs2, rs1, vd); }
+void vminu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x10000057, vm, vs2, vs1, vd); }
+void vminu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x10004057, vm, vs2, rs1, vd); }
+void vmnand_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x74002057, vm, vs2, vs1, vd); } // mask-logical *.mm emitters, all routed through opMVV; NOTE(review): the RVV spec defines these as always unmasked - confirm callers leave vm at its default
+void vmnor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x78002057, vm, vs2, vs1, vd); }
+void vmor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x68002057, vm, vs2, vs1, vd); }
+void vmorn_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x70002057, vm, vs2, vs1, vd); }
+void vmsbc_vv(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x4e000057, 0, vs2, vs1, vd); } // vmsbc family: no VM parameter; the plain and *m forms differ only in bit 25 of the base opcode (0x4e.. vs 0x4c..)
+void vmsbc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x4c000057, 0, vs2, vs1, vd); }
+void vmsbc_vx(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x4e004057, 0, vs2, rs1, vd); }
+void vmsbc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x4c004057, 0, vs2, rs1, vd); }
+void vmsbf_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x5000a057, vm, vs2, 0, vd); } // unary mask op: 0 is passed in the vs1 slot because the selector (bits 19:15 = 1) is already part of the base opcode
+void vmseq_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x60003057, vm, vs2, simm5, vd); } // integer compares: .vi via opIVI, .vv via opIVV, .vx via opIVX
+void vmseq_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x60000057, vm, vs2, vs1, vd); }
+void vmseq_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x60004057, vm, vs2, rs1, vd); }
+void vmsgt_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x7c003057, vm, vs2, simm5, vd); } // vmsgt / vmsgtu are emitted only in .vi and .vx forms here
+void vmsgt_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x7c004057, vm, vs2, rs1, vd); }
+void vmsgtu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x78003057, vm, vs2, simm5, vd); }
+void vmsgtu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x78004057, vm, vs2, rs1, vd); }
+void vmsif_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x5001a057, vm, vs2, 0, vd); } // unary mask op: 0 is passed in the vs1 slot because the selector (bits 19:15 = 3) is already part of the base opcode
+void vmsle_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x74003057, vm, vs2, simm5, vd); } // integer compares: .vi via opIVI, .vv via opIVV, .vx via opIVX
+void vmsle_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x74000057, vm, vs2, vs1, vd); }
+void vmsle_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x74004057, vm, vs2, rs1, vd); }
+void vmsleu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x70003057, vm, vs2, simm5, vd); }
+void vmsleu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x70000057, vm, vs2, vs1, vd); }
+void vmsleu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x70004057, vm, vs2, rs1, vd); }
+void vmslt_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x6c000057, vm, vs2, vs1, vd); } // vmslt / vmsltu are emitted only in .vv and .vx forms here
+void vmslt_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x6c004057, vm, vs2, rs1, vd); }
+void vmsltu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x68000057, vm, vs2, vs1, vd); }
+void vmsltu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x68004057, vm, vs2, rs1, vd); }
+void vmsne_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x64003057, vm, vs2, simm5, vd); }
+void vmsne_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x64000057, vm, vs2, vs1, vd); }
+void vmsne_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x64004057, vm, vs2, rs1, vd); }
+void vmsof_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x50012057, vm, vs2, 0, vd); } // unary mask op: 0 is passed in the vs1 slot because the selector (bits 19:15 = 2) is already part of the base opcode
+void vmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x94002057, vm, vs2, vs1, vd); } // vmul / vmulh / vmulhsu / vmulhu: .vv forms via opMVV, .vx forms via opMVX
+void vmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x94006057, vm, vs2, rs1, vd); }
+void vmulh_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x9c002057, vm, vs2, vs1, vd); }
+void vmulh_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x9c006057, vm, vs2, rs1, vd); }
+void vmulhsu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x98002057, vm, vs2, vs1, vd); }
+void vmulhsu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x98006057, vm, vs2, rs1, vd); }
+void vmulhu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x90002057, vm, vs2, vs1, vd); }
+void vmulhu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x90006057, vm, vs2, rs1, vd); }
+void vmv1r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e003057, 0, vs2, 0, vd); } // vmv<N>r.v: the register count is baked into bits 19:15 of the base opcode as N-1 (0, 1, 3, 7); the immediate slot is passed 0
+void vmv2r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e00b057, 0, vs2, 0, vd); }
+void vmv4r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e01b057, 0, vs2, 0, vd); }
+void vmv8r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e03b057, 0, vs2, 0, vd); }
+void vmv_s_x(const VReg& vd, const Reg& rs1) { opMVX(0x42006057, 0, 0, rs1, vd); } // vmv.* moves take no VM parameter; slots for operands the instruction does not have are passed 0
+void vmv_v_i(const VReg& vd, int32_t simm5) { opIVI(0x5e003057, 0, 0, simm5, vd); }
+void vmv_v_v(const VReg& vd, const VReg& vs1) { opIVV(0x5e000057, 0, 0, vs1, vd); }
+void vmv_v_x(const VReg& vd, const Reg& rs1) { opIVX(0x5e004057, 0, 0, rs1, vd); }
+void vmv_x_s(const Reg& rd, const VReg& vs2) { opMVV(0x42002057, 0, vs2, 0, rd); } // destination is a scalar Reg, passed in the slot other emitters use for vd
+void vmxnor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x7c002057, vm, vs2, vs1, vd); } // mask-logical *.mm emitters via opMVV; NOTE(review): the RVV spec defines these as always unmasked - confirm callers leave vm at its default
+void vmxor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x6c002057, vm, vs2, vs1, vd); }
+void vnclip_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xbc003057, vm, vs2, simm5, vd); } // vnclip / vnclipu: .wi via opIVI, .wv via opIVV, .wx via opIVX; the two mnemonics differ only in the top opcode bits (0xbc.. vs 0xb8..)
+void vnclip_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xbc000057, vm, vs2, vs1, vd); }
+void vnclip_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xbc004057, vm, vs2, rs1, vd); }
+void vnclipu_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb8003057, vm, vs2, simm5, vd); }
+void vnclipu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb8000057, vm, vs2, vs1, vd); }
+void vnclipu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb8004057, vm, vs2, rs1, vd); }
+void vnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xbc002057, vm, vs2, vs1, vd); } // parameter order is (vd, vs1/rs1, vs2), as for vmacc/vmadd; the helper call still receives vs2 first
+void vnmsac_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xbc006057, vm, vs2, rs1, vd); }
+void vnmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xac002057, vm, vs2, vs1, vd); }
+void vnmsub_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xac006057, vm, vs2, rs1, vd); }
+void vnsra_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb4003057, vm, vs2, simm5, vd); } // vnsra / vnsrl: .wi via opIVI, .wv via opIVV, .wx via opIVX; the two mnemonics differ only in the top opcode bits (0xb4.. vs 0xb0..)
+void vnsra_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb4000057, vm, vs2, vs1, vd); }
+void vnsra_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb4004057, vm, vs2, rs1, vd); }
+void vnsrl_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb0003057, vm, vs2, simm5, vd); }
+void vnsrl_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb0000057, vm, vs2, vs1, vd); }
+void vnsrl_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb0004057, vm, vs2, rs1, vd); }
+void vor_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x28003057, vm, vs2, simm5, vd); } // vor: immediate, vector and scalar-register operand forms share the 0x28...... base and differ only in bits 14:12
+void vor_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x28000057, vm, vs2, vs1, vd); }
+void vor_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x28004057, vm, vs2, rs1, vd); }
+void vredand_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x4002057, vm, vs2, vs1, vd); } // vred*.vs emitters: all go through opMVV and differ only in the top six opcode bits
+void vredmax_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x1c002057, vm, vs2, vs1, vd); }
+void vredmaxu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x18002057, vm, vs2, vs1, vd); }
+void vredmin_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x14002057, vm, vs2, vs1, vd); }
+void vredminu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x10002057, vm, vs2, vs1, vd); }
+void vredor_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x8002057, vm, vs2, vs1, vd); }
+void vredsum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x2057, vm, vs2, vs1, vd); }
+void vredxor_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc002057, vm, vs2, vs1, vd); }
+void vrem_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x8c002057, vm, vs2, vs1, vd); } // vrem / vremu: .vv via opMVV, .vx via opMVX; the two mnemonics differ only in the top opcode bits (0x8c.. vs 0x88..)
+void vrem_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x8c006057, vm, vs2, rs1, vd); }
+void vremu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x88002057, vm, vs2, vs1, vd); }
+void vremu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x88006057, vm, vs2, rs1, vd); }
+void vrgather_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x30003057, vm, vs2, simm5, vd); } // vrgather: .vi via opIVI, .vv via opIVV, .vx via opIVX
+void vrgather_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x30000057, vm, vs2, vs1, vd); }
+void vrgather_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x30004057, vm, vs2, rs1, vd); }
+void vrgatherei16_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x38000057, vm, vs2, vs1, vd); } // separate base opcode (0x38..); only a .vv form is emitted
+void vrsub_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xc003057, vm, vs2, simm5, vd); } // vrsub is emitted only in .vi and .vx forms here (no .vv emitter)
+void vrsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xc004057, vm, vs2, rs1, vd); }
+void vs1r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); } // vs<N>r.v whole-register stores: N-1 goes in the nf field (bits 31:29) of the base opcode; here nf = 0 -> 1 register
+void vs2r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x22800027, 0, 0, rs1, vs3); } // nf = 1 -> 2 registers (must not share vs1r_v's opcode, which would store only one register)
+void vs4r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x62800027, 0, 0, rs1, vs3); } // nf = 3 -> 4 registers
+void vs8r_v(VReg vs3, const Reg& rs1) { opVectorStore(0xe2800027, 0, 0, rs1, vs3); } // nf = 7 -> 8 registers
+void vsadd_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x84003057, vm, vs2, simm5, vd); } // vsadd / vsaddu: .vi via opIVI, .vv via opIVV, .vx via opIVX; the two mnemonics differ only in the top opcode bits (0x84.. vs 0x80..)
+void vsadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x84000057, vm, vs2, vs1, vd); }
+void vsadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x84004057, vm, vs2, rs1, vd); }
+void vsaddu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x80003057, vm, vs2, simm5, vd); }
+void vsaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x80000057, vm, vs2, vs1, vd); }
+void vsaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x80004057, vm, vs2, rs1, vd); }
+void vsbc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x48000057, 0, vs2, vs1, vd); } // vsbc: only the *m forms are emitted; no VM parameter, 0 is passed in the vm slot
+void vsbc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x48004057, 0, vs2, rs1, vd); }
+void vsseg1e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10007027, vm, 0, rs1, vs3); } // vsseg<nf>e1024.v: vs3 is the source data register (taken by value); nf-1 in bits 31:29 of the base opcode
+void vsseg2e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30007027, vm, 0, rs1, vs3); }
+void vsseg3e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50007027, vm, 0, rs1, vs3); }
+void vsseg4e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70007027, vm, 0, rs1, vs3); }
+void vsseg5e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90007027, vm, 0, rs1, vs3); }
+void vsseg6e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0007027, vm, 0, rs1, vs3); }
+void vsseg7e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0007027, vm, 0, rs1, vs3); }
+void vsseg8e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0007027, vm, 0, rs1, vs3); }
+void vse1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10007027, vm, 0, rs1, vs3); } // same encoding as vsseg1e1024_v
+void vsseg1e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10000027, vm, 0, rs1, vs3); } // vsseg<nf>e128.v: vs3 is the source data register; nf-1 in bits 31:29 of the base opcode
+void vsseg2e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30000027, vm, 0, rs1, vs3); }
+void vsseg3e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50000027, vm, 0, rs1, vs3); }
+void vsseg4e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70000027, vm, 0, rs1, vs3); }
+void vsseg5e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90000027, vm, 0, rs1, vs3); }
+void vsseg6e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0000027, vm, 0, rs1, vs3); }
+void vsseg7e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0000027, vm, 0, rs1, vs3); }
+void vsseg8e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0000027, vm, 0, rs1, vs3); }
+void vse128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10000027, vm, 0, rs1, vs3); } // same encoding as vsseg1e128_v
+void vsseg1e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x5027, vm, 0, rs1, vs3); } // vsseg<nf>e16.v: vs3 is the source data register; nf-1 in bits 31:29 of the base opcode
+void vsseg2e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20005027, vm, 0, rs1, vs3); }
+void vsseg3e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40005027, vm, 0, rs1, vs3); }
+void vsseg4e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60005027, vm, 0, rs1, vs3); }
+void vsseg5e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80005027, vm, 0, rs1, vs3); }
+void vsseg6e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0005027, vm, 0, rs1, vs3); }
+void vsseg7e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0005027, vm, 0, rs1, vs3); }
+void vsseg8e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0005027, vm, 0, rs1, vs3); }
+void vse16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x5027, vm, 0, rs1, vs3); } // same encoding as vsseg1e16_v
+void vsseg1e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10005027, vm, 0, rs1, vs3); } // vsseg<nf>e256.v: vs3 is the source data register; nf-1 in bits 31:29 of the base opcode
+void vsseg2e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30005027, vm, 0, rs1, vs3); }
+void vsseg3e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50005027, vm, 0, rs1, vs3); }
+void vsseg4e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70005027, vm, 0, rs1, vs3); }
+void vsseg5e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90005027, vm, 0, rs1, vs3); }
+void vsseg6e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0005027, vm, 0, rs1, vs3); }
+void vsseg7e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0005027, vm, 0, rs1, vs3); }
+void vsseg8e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0005027, vm, 0, rs1, vs3); }
+void vse256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10005027, vm, 0, rs1, vs3); } // same encoding as vsseg1e256_v
+void vsseg1e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x6027, vm, 0, rs1, vs3); } // unit-stride stores, 32-bit elements: width bits 14:12 = 0b110; bits 31:29 of the base word = field count - 1
+void vsseg2e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20006027, vm, 0, rs1, vs3); }
+void vsseg3e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40006027, vm, 0, rs1, vs3); }
+void vsseg4e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60006027, vm, 0, rs1, vs3); }
+void vsseg5e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80006027, vm, 0, rs1, vs3); }
+void vsseg6e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0006027, vm, 0, rs1, vs3); }
+void vsseg7e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0006027, vm, 0, rs1, vs3); }
+void vsseg8e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0006027, vm, 0, rs1, vs3); }
+void vse32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x6027, vm, 0, rs1, vs3); } // same base word as vsseg1e32_v: the plain store is the single-field case
+void vsseg1e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10006027, vm, 0, rs1, vs3); } // unit-stride stores, 512-bit elements: the e32 width code (0b110) plus bit 28. NOTE(review): RVV 1.0 lists bit-28 (mew) encodings as reserved -- confirm target support
+void vsseg2e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30006027, vm, 0, rs1, vs3); }
+void vsseg3e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50006027, vm, 0, rs1, vs3); }
+void vsseg4e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70006027, vm, 0, rs1, vs3); }
+void vsseg5e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90006027, vm, 0, rs1, vs3); }
+void vsseg6e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0006027, vm, 0, rs1, vs3); }
+void vsseg7e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0006027, vm, 0, rs1, vs3); }
+void vsseg8e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0006027, vm, 0, rs1, vs3); }
+void vse512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10006027, vm, 0, rs1, vs3); } // same base word as vsseg1e512_v
+void vsseg1e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x7027, vm, 0, rs1, vs3); } // unit-stride stores, 64-bit elements: width bits 14:12 = 0b111; bits 31:29 of the base word = field count - 1
+void vsseg2e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20007027, vm, 0, rs1, vs3); }
+void vsseg3e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40007027, vm, 0, rs1, vs3); }
+void vsseg4e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60007027, vm, 0, rs1, vs3); }
+void vsseg5e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80007027, vm, 0, rs1, vs3); }
+void vsseg6e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0007027, vm, 0, rs1, vs3); }
+void vsseg7e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0007027, vm, 0, rs1, vs3); }
+void vsseg8e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0007027, vm, 0, rs1, vs3); }
+void vse64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x7027, vm, 0, rs1, vs3); } // same base word as vsseg1e64_v: the plain store is the single-field case
+void vsseg1e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x27, vm, 0, rs1, vs3); } // unit-stride stores, 8-bit elements: width bits 14:12 = 0b000; bits 31:29 of the base word = field count - 1
+void vsseg2e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20000027, vm, 0, rs1, vs3); }
+void vsseg3e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40000027, vm, 0, rs1, vs3); }
+void vsseg4e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60000027, vm, 0, rs1, vs3); }
+void vsseg5e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80000027, vm, 0, rs1, vs3); }
+void vsseg6e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0000027, vm, 0, rs1, vs3); }
+void vsseg7e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0000027, vm, 0, rs1, vs3); }
+void vsseg8e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0000027, vm, 0, rs1, vs3); }
+void vse8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x27, vm, 0, rs1, vs3); } // same base word as vsseg1e8_v: the plain store is the single-field case
+void vsext_vf2(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4803a057, vm, vs2, 0, vd); } // sign-extend (vf2 variant); 0 is passed for the vs1 operand because the variant selector already sits in bits 19:15 of the base word (presumably ORed in by opMVV -- confirm)
+void vsext_vf4(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4802a057, vm, vs2, 0, vd); } // vf4 variant: bits 19:15 of the base word = 0b00101
+void vsext_vf8(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4801a057, vm, vs2, 0, vd); } // vf8 variant: bits 19:15 of the base word = 0b00011
+void vslide1down_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x3c006057, vm, vs2, rs1, vd); } // slide-by-one forms go through opMVX (funct3 bits 14:12 = 0b110), unlike the general .vx slides below
+void vslide1up_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x38006057, vm, vs2, rs1, vd); }
+void vslidedown_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x3c003057, vm, vs2, simm5, vd); } // NOTE(review): RVV defines the slide offset as an unsigned 5-bit immediate although the parameter is named simm5 -- confirm how opIVI range-checks it
+void vslidedown_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x3c004057, vm, vs2, rs1, vd); } // general slide by a scalar register amount: opIVX, funct3 = 0b100
+void vslideup_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x38003057, vm, vs2, simm5, vd); } // same immediate caveat as vslidedown_vi
+void vslideup_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x38004057, vm, vs2, rs1, vd); }
+void vsll_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x94003057, vm, vs2, simm5, vd); } // shift left by immediate (funct3 = 0b011). NOTE(review): RVV shift amounts are unsigned 5-bit immediates although the parameter is named simm5 -- confirm opIVI accepts 16..31
+void vsll_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x94000057, vm, vs2, vs1, vd); } // per-element shift amounts from vs1 (funct3 = 0b000)
+void vsll_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x94004057, vm, vs2, rs1, vd); } // shift amount from scalar rs1 (funct3 = 0b100)
+void vsm_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2b00027, 0, 0, rs1, vs3); } // mask store, no vm parameter: bit 25 is already set in the base word and 0 is passed in the vm slot (presumably ORed into bit 25 by opVectorStore -- confirm)
+void vsmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x9c000057, vm, vs2, vs1, vd); } // vsmul with a vector second operand (funct3 = 0b000)
+void vsmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x9c004057, vm, vs2, rs1, vd); } // vsmul with a scalar register second operand (funct3 = 0b100); no .vi form is provided here
+void vsoxei1024_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c007027, vm, vs2, rs1, vs3); } // indexed store with bits 27:26 of the base word = 0b11; vs2 supplies the index vector. The 128..1024 widths also set bit 28. NOTE(review): RVV 1.0 lists bit-28 (mew) encodings as reserved -- confirm target support
+void vsoxei128_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c000027, vm, vs2, rs1, vs3); }
+void vsoxei16_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc005027, vm, vs2, rs1, vs3); } // 16-bit indices: width bits 14:12 = 0b101
+void vsoxei256_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c005027, vm, vs2, rs1, vs3); }
+void vsoxei32_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc006027, vm, vs2, rs1, vs3); } // 32-bit indices: width bits 14:12 = 0b110
+void vsoxei512_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c006027, vm, vs2, rs1, vs3); }
+void vsoxei64_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc007027, vm, vs2, rs1, vs3); } // 64-bit indices: width bits 14:12 = 0b111
+void vsoxei8_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc000027, vm, vs2, rs1, vs3); } // 8-bit indices: width bits 14:12 = 0b000
+void vsra_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa4003057, vm, vs2, simm5, vd); } // shift right arithmetic by immediate. NOTE(review): RVV shift amounts are unsigned 5-bit immediates although the parameter is named simm5 -- confirm opIVI accepts 16..31
+void vsra_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa4000057, vm, vs2, vs1, vd); }
+void vsra_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa4004057, vm, vs2, rs1, vd); }
+void vsrl_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa0003057, vm, vs2, simm5, vd); } // shift right logical by immediate; same immediate caveat as vsra_vi
+void vsrl_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa0000057, vm, vs2, vs1, vd); }
+void vsrl_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa0004057, vm, vs2, rs1, vd); }
+void vssseg1e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18007027, vm, rs2, rs1, vs3); } // strided stores, 1024-bit elements: bit 27 set (bits 27:26 = 0b10), the e64 width code plus bit 28; rs2 goes in the slot the indexed forms use for vs2. NOTE(review): RVV 1.0 lists bit-28 (mew) encodings as reserved -- confirm target support
+void vssseg2e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38007027, vm, rs2, rs1, vs3); }
+void vssseg3e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58007027, vm, rs2, rs1, vs3); }
+void vssseg4e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78007027, vm, rs2, rs1, vs3); }
+void vssseg5e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98007027, vm, rs2, rs1, vs3); }
+void vssseg6e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8007027, vm, rs2, rs1, vs3); }
+void vssseg7e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8007027, vm, rs2, rs1, vs3); }
+void vssseg8e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8007027, vm, rs2, rs1, vs3); }
+void vsse1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18007027, vm, rs2, rs1, vs3); } // same base word as vssseg1e1024_v
+void vssseg1e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18000027, vm, rs2, rs1, vs3); } // strided stores, 128-bit elements: bit 27 set (bits 27:26 = 0b10), width bits 14:12 = 0b000 plus bit 28. NOTE(review): RVV 1.0 lists bit-28 (mew) encodings as reserved -- confirm target support
+void vssseg2e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38000027, vm, rs2, rs1, vs3); }
+void vssseg3e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58000027, vm, rs2, rs1, vs3); }
+void vssseg4e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78000027, vm, rs2, rs1, vs3); }
+void vssseg5e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98000027, vm, rs2, rs1, vs3); }
+void vssseg6e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8000027, vm, rs2, rs1, vs3); }
+void vssseg7e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8000027, vm, rs2, rs1, vs3); }
+void vssseg8e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8000027, vm, rs2, rs1, vs3); }
+void vsse128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18000027, vm, rs2, rs1, vs3); } // same base word as vssseg1e128_v
+void vssseg1e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8005027, vm, rs2, rs1, vs3); } // strided stores, 16-bit elements: bit 27 set (bits 27:26 = 0b10), width bits 14:12 = 0b101; rs2 goes in the slot the indexed forms use for vs2
+void vssseg2e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28005027, vm, rs2, rs1, vs3); }
+void vssseg3e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48005027, vm, rs2, rs1, vs3); }
+void vssseg4e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68005027, vm, rs2, rs1, vs3); }
+void vssseg5e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88005027, vm, rs2, rs1, vs3); }
+void vssseg6e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8005027, vm, rs2, rs1, vs3); }
+void vssseg7e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8005027, vm, rs2, rs1, vs3); }
+void vssseg8e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8005027, vm, rs2, rs1, vs3); }
+void vsse16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8005027, vm, rs2, rs1, vs3); } // same base word as vssseg1e16_v: the plain strided store is the single-field case
+void vssseg1e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18005027, vm, rs2, rs1, vs3); } // strided stores, 256-bit elements: bit 27 set (bits 27:26 = 0b10), the e16 width code plus bit 28. NOTE(review): RVV 1.0 lists bit-28 (mew) encodings as reserved -- confirm target support
+void vssseg2e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38005027, vm, rs2, rs1, vs3); }
+void vssseg3e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58005027, vm, rs2, rs1, vs3); }
+void vssseg4e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78005027, vm, rs2, rs1, vs3); }
+void vssseg5e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98005027, vm, rs2, rs1, vs3); }
+void vssseg6e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8005027, vm, rs2, rs1, vs3); }
+void vssseg7e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8005027, vm, rs2, rs1, vs3); }
+void vssseg8e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8005027, vm, rs2, rs1, vs3); }
+void vsse256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18005027, vm, rs2, rs1, vs3); } // same base word as vssseg1e256_v
+void vssseg1e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8006027, vm, rs2, rs1, vs3); } // strided stores, 32-bit elements: bit 27 set (bits 27:26 = 0b10), width bits 14:12 = 0b110; rs2 goes in the slot the indexed forms use for vs2
+void vssseg2e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28006027, vm, rs2, rs1, vs3); }
+void vssseg3e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48006027, vm, rs2, rs1, vs3); }
+void vssseg4e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68006027, vm, rs2, rs1, vs3); }
+void vssseg5e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88006027, vm, rs2, rs1, vs3); }
+void vssseg6e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8006027, vm, rs2, rs1, vs3); }
+void vssseg7e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8006027, vm, rs2, rs1, vs3); }
+void vssseg8e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8006027, vm, rs2, rs1, vs3); }
+void vsse32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8006027, vm, rs2, rs1, vs3); } // same base word as vssseg1e32_v: the plain strided store is the single-field case
+void vssseg1e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18006027, vm, rs2, rs1, vs3); } // strided stores, 512-bit elements: bit 27 set (bits 27:26 = 0b10), the e32 width code plus bit 28. NOTE(review): RVV 1.0 lists bit-28 (mew) encodings as reserved -- confirm target support
+void vssseg2e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38006027, vm, rs2, rs1, vs3); }
+void vssseg3e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58006027, vm, rs2, rs1, vs3); }
+void vssseg4e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78006027, vm, rs2, rs1, vs3); }
+void vssseg5e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98006027, vm, rs2, rs1, vs3); }
+void vssseg6e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8006027, vm, rs2, rs1, vs3); }
+void vssseg7e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8006027, vm, rs2, rs1, vs3); }
+void vssseg8e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8006027, vm, rs2, rs1, vs3); }
+void vsse512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18006027, vm, rs2, rs1, vs3); } // same base word as vssseg1e512_v
+void vssseg1e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8007027, vm, rs2, rs1, vs3); } // strided stores, 64-bit elements: bit 27 set (bits 27:26 = 0b10), width bits 14:12 = 0b111; rs2 goes in the slot the indexed forms use for vs2
+void vssseg2e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28007027, vm, rs2, rs1, vs3); }
+void vssseg3e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48007027, vm, rs2, rs1, vs3); }
+void vssseg4e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68007027, vm, rs2, rs1, vs3); }
+void vssseg5e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88007027, vm, rs2, rs1, vs3); }
+void vssseg6e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8007027, vm, rs2, rs1, vs3); }
+void vssseg7e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8007027, vm, rs2, rs1, vs3); }
+void vssseg8e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8007027, vm, rs2, rs1, vs3); }
+void vsse64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8007027, vm, rs2, rs1, vs3); } // same base word as vssseg1e64_v: the plain strided store is the single-field case
+void vssseg1e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8000027, vm, rs2, rs1, vs3); } // strided stores, 8-bit elements: bit 27 set (bits 27:26 = 0b10), width bits 14:12 = 0b000; rs2 goes in the slot the indexed forms use for vs2
+void vssseg2e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28000027, vm, rs2, rs1, vs3); }
+void vssseg3e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48000027, vm, rs2, rs1, vs3); }
+void vssseg4e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68000027, vm, rs2, rs1, vs3); }
+void vssseg5e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88000027, vm, rs2, rs1, vs3); }
+void vssseg6e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8000027, vm, rs2, rs1, vs3); }
+void vssseg7e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8000027, vm, rs2, rs1, vs3); }
+void vssseg8e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8000027, vm, rs2, rs1, vs3); }
+void vsse8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8000027, vm, rs2, rs1, vs3); } // same base word as vssseg1e8_v: the plain strided store is the single-field case
+void vssra_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xac003057, vm, vs2, simm5, vd); } // vssra by immediate. NOTE(review): RVV shift amounts are unsigned 5-bit immediates although the parameter is named simm5 -- confirm opIVI accepts 16..31
+void vssra_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xac000057, vm, vs2, vs1, vd); }
+void vssra_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xac004057, vm, vs2, rs1, vd); }
+void vssrl_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa8003057, vm, vs2, simm5, vd); } // vssrl by immediate; same immediate caveat as vssra_vi
+void vssrl_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa8000057, vm, vs2, vs1, vd); }
+void vssrl_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa8004057, vm, vs2, rs1, vd); }
+void vssub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x8c000057, vm, vs2, vs1, vd); } // subtract family: only .vv (funct3 = 0b000) and .vx (funct3 = 0b100) forms are provided here, no .vi
+void vssub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x8c004057, vm, vs2, rs1, vd); }
+void vssubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x88000057, vm, vs2, vs1, vd); }
+void vssubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x88004057, vm, vs2, rs1, vd); }
+void vsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x8000057, vm, vs2, vs1, vd); }
+void vsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x8004057, vm, vs2, rs1, vd); }
+void vsuxei1024_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14007027, vm, vs2, rs1, vs3); } // indexed store with bits 27:26 of the base word = 0b01 (vsoxei uses 0b11); vs2 supplies the index vector. The 128..1024 widths also set bit 28. NOTE(review): RVV 1.0 lists bit-28 (mew) encodings as reserved -- confirm target support
+void vsuxei128_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14000027, vm, vs2, rs1, vs3); }
+void vsuxei16_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4005027, vm, vs2, rs1, vs3); } // 16-bit indices: width bits 14:12 = 0b101
+void vsuxei256_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14005027, vm, vs2, rs1, vs3); }
+void vsuxei32_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4006027, vm, vs2, rs1, vs3); } // 32-bit indices: width bits 14:12 = 0b110
+void vsuxei512_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14006027, vm, vs2, rs1, vs3); }
+void vsuxei64_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4007027, vm, vs2, rs1, vs3); } // 64-bit indices: width bits 14:12 = 0b111
+void vsuxei8_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4000027, vm, vs2, rs1, vs3); } // 8-bit indices: width bits 14:12 = 0b000
+void vwadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc4002057, vm, vs2, vs1, vd); } // widening add: encoded through opMVV/opMVX (funct3 = 0b010 / 0b110)
+void vwadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc4006057, vm, vs2, rs1, vd); }
+void vwadd_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd4002057, vm, vs2, vs1, vd); } // .wv/.wx differ from .vv/.vx only in the base word (0xd4... vs 0xc4...); per the RVV mnemonic, vs2 is the already-widened operand
+void vwadd_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd4006057, vm, vs2, rs1, vd); }
+void vwaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc0002057, vm, vs2, vs1, vd); } // unsigned counterparts: base words 0xc0... / 0xd0...
+void vwaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc0006057, vm, vs2, rs1, vd); }
+void vwaddu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd0002057, vm, vs2, vs1, vd); }
+void vwaddu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd0006057, vm, vs2, rs1, vd); }
+void vwmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xf4002057, vm, vs2, vs1, vd); } // multiply-accumulate wrappers take vs1/rs1 BEFORE vs2 (other ops in this file take vs2 first); the opMVV/opMVX call still passes vs2 first
+void vwmacc_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf4006057, vm, vs2, rs1, vd); }
+void vwmaccsu_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xfc002057, vm, vs2, vs1, vd); }
+void vwmaccsu_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xfc006057, vm, vs2, rs1, vd); }
+void vwmaccu_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xf0002057, vm, vs2, vs1, vd); }
+void vwmaccu_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf0006057, vm, vs2, rs1, vd); }
+void vwmaccus_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf8006057, vm, vs2, rs1, vd); } // only a .vx form is provided for vwmaccus
+void vwmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xec002057, vm, vs2, vs1, vd); } // widening multiply: .vv through opMVV (funct3 = 0b010), .vx through opMVX (funct3 = 0b110)
+void vwmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xec006057, vm, vs2, rs1, vd); }
+void vwmulsu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xe8002057, vm, vs2, vs1, vd); }
+void vwmulsu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xe8006057, vm, vs2, rs1, vd); }
+void vwmulu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xe0002057, vm, vs2, vs1, vd); }
+void vwmulu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xe0006057, vm, vs2, rs1, vd); }
+void vwredsum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xc4000057, vm, vs2, vs1, vd); } // widening reduction: goes through opIVV (funct3 = 0b000), unlike the other vw* ops here which use opMVV/opMVX
+void vwredsumu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xc0000057, vm, vs2, vs1, vd); } // unsigned counterpart: base word 0xc0... instead of 0xc4...
+void vwsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xcc002057, vm, vs2, vs1, vd); } // widening subtract: encoded through opMVV/opMVX (funct3 = 0b010 / 0b110)
+void vwsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xcc006057, vm, vs2, rs1, vd); }
+void vwsub_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xdc002057, vm, vs2, vs1, vd); } // .wv/.wx differ from .vv/.vx only in the base word (0xdc... vs 0xcc...); per the RVV mnemonic, vs2 is the already-widened operand
+void vwsub_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xdc006057, vm, vs2, rs1, vd); }
+void vwsubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc8002057, vm, vs2, vs1, vd); } // unsigned counterparts: base words 0xc8... / 0xd8...
+void vwsubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc8006057, vm, vs2, rs1, vd); }
+void vwsubu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd8002057, vm, vs2, vs1, vd); }
+void vwsubu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd8006057, vm, vs2, rs1, vd); }
+void vxor_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x2c003057, vm, vs2, simm5, vd); } // xor with a 5-bit immediate (funct3 = 0b011)
+void vxor_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x2c000057, vm, vs2, vs1, vd); } // xor with vector vs1 (funct3 = 0b000)
+void vxor_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x2c004057, vm, vs2, rs1, vd); } // xor with scalar rs1 (funct3 = 0b100)
+void vzext_vf2(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48032057, vm, vs2, 0, vd); } // zero-extend (vf2 variant); 0 is passed for the vs1 operand because the variant selector already sits in bits 19:15 of the base word (presumably ORed in by opMVV -- confirm)
+void vzext_vf4(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48022057, vm, vs2, 0, vd); } // vf4 variant: bits 19:15 of the base word = 0b00100
+void vzext_vf8(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48012057, vm, vs2, 0, vd); } // vf8 variant: bits 19:15 of the base word = 0b00010
+
+void vsetivli(const Reg& rd, uint32_t uimm, SEW sew, LMUL lmul=LMUL::m1, VTA vta=VTA::tu, VMA vma=VMA::mu) { // emits vsetivli: immediate in bits 19:15, vtype settings in the zimm field starting at bit 20, rd index in bits 11:7
+    uint32_t zimm = (static_cast<uint32_t>(vma)<<7) | // vma -> bit 7
+                    (static_cast<uint32_t>(vta)<<6) | // vta -> bit 6
+                    (static_cast<uint32_t>(sew)<<3) | // sew -> from bit 3
+                    (static_cast<uint32_t>(lmul));    // lmul -> low bits
+    uint32_t v = (0x3u<<30) | (zimm<<20) | (uimm<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57); // 0x3u: shift as unsigned, 3<<30 does not fit in int. NOTE(review): uimm is not range-checked; values above 31 spill into the zimm field at bit 20
+    append4B(v);
+}
+
+void vsetvli(const Reg& rd, const Reg& rs1, SEW sew, LMUL lmul=LMUL::m1, VTA vta=VTA::tu, VMA vma=VMA::mu) {
+    // Pack the vtype immediate: lmul in the low bits, sew from bit 3, vta at bit 6, vma at bit 7.
+    const uint32_t vtypeBits = static_cast<uint32_t>(lmul) | (static_cast<uint32_t>(sew)<<3)
+                             | (static_cast<uint32_t>(vta)<<6) | (static_cast<uint32_t>(vma)<<7);
+    // Bit 31 stays clear; rs1 index lands in bits 19:15, rd index in bits 11:7, funct3 = 0b111, opcode 0x57.
+    uint32_t insn = (vtypeBits<<20) | (rs1.getIdx()<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57);
+    append4B(insn);
+}
+
+void vsetvl(const Reg& rd, const Reg& rs1, const Reg& rs2) { // emits vsetvl: rs2 index in bits 24:20, rs1 index in bits 19:15, rd index in bits 11:7
+    uint32_t v = (0x40u<<25) | (rs2.getIdx()<<20) | (rs1.getIdx()<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57); // 0x40u: 0x40<<25 sets bit 31 and does not fit in a signed int, so shift as unsigned
+    append4B(v);
+}
+
+
+// vmmv.m vd, vs -- copy a mask register; emitted as vmand.mm vd, vs, vs
+void vmmv_m(const VReg& vd, const VReg& vs) { vmand_mm(vd, vs, vs); }
+// vmclr.m vd -- clear a mask register; emitted as vmxor.mm vd, vd, vd
+void vmclr_m(const VReg& vd) { vmxor_mm(vd, vd, vd); }
+// vmset.m vd -- set a mask register; emitted as vmxnor.mm vd, vd, vd
+void vmset_m(const VReg& vd) { vmxnor_mm(vd, vd, vd); }
+// vmnot.m vd, vs -- invert mask bits; emitted as vmnand.mm vd, vs, vs
+void vmnot_m(const VReg& vd, const VReg& vs) { vmnand_mm(vd, vs, vs); }
+
+
+// vector compare pseudoinstructions: the "greater" forms reuse the "less" encoders with the two source operands swapped
+void vmfgt_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { vmflt_vv(vd, vs2, vs1, vm); }
+void vmfge_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { vmfle_vv(vd, vs2, vs1, vm); }
+
+// sign-related pseudoinstructions: sign-injection encoders called with the same register as both sources
+void vfabs_v(const VReg& vd, const VReg& vs, VM vm=VM::unmasked) { vfsgnjx_vv(vd, vs, vs, vm); }
+void vfneg_v(const VReg& vd, const VReg& vs, VM vm=VM::unmasked) { vfsgnjn_vv(vd, vs, vs, vm); }

From 231fcc0f76097c11249e05eb7d3ee1c7c47e96a3 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Mon, 23 Mar 2026 19:29:28 +0000
Subject: [PATCH 03/13] [CPU][RV64] Edit:CMakeLists.txt

Reason: fix jit_utils
---
 src/CMakeLists.txt               | 2 +-
 src/cpu/CMakeLists.txt           | 2 +-
 src/cpu/cpu_convolution_list.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e69a804d39a..a70b63dad37 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -77,7 +77,7 @@ if(DNNL_EXPERIMENTAL)
 endif()
 
 if(DNNL_EXPERIMENTAL_UKERNEL)
-    if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "RISCV64")
+    if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "RV64")
         message(STATUS "Experimental functionality for ukernels is enabled")
     else()
         message(FATAL_ERROR "ukernel API isn't supported for ${DNNL_TARGET_ARCH}.")
diff --git a/src/cpu/CMakeLists.txt b/src/cpu/CMakeLists.txt
index ab791ee7b2c..19923b7b12d 100644
--- a/src/cpu/CMakeLists.txt
+++ b/src/cpu/CMakeLists.txt
@@ -29,7 +29,7 @@ foreach(SOURCE_FILE ${SOURCES_EXTRA})
     list(APPEND SOURCES "${SOURCE_FILE}")
 endforeach()
 
-if((DNNL_TARGET_ARCH STREQUAL "X64") OR (DNNL_TARGET_ARCH STREQUAL "AARCH64"))
+if((DNNL_TARGET_ARCH STREQUAL "X64") OR (DNNL_TARGET_ARCH STREQUAL "AARCH64") OR (DNNL_TARGET_ARCH STREQUAL "RV64"))
     file(GLOB_RECURSE SOURCES_JIT_UTILS
         ${CMAKE_CURRENT_SOURCE_DIR}/jit_utils/*.[ch]
         ${CMAKE_CURRENT_SOURCE_DIR}/jit_utils/*.[ch]pp
diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 350ac8e14e4..47b0fb1494b 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -180,7 +180,7 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_128)
             // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t)
 
-            //CPU_INSTANCE_RV64GCV(jit_rvv_1x1_convolution_fwd_t)
+            CPU_INSTANCE_RV64GCV(jit_rvv_1x1_convolution_fwd_t)
             CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t)
             
 

From a51ec7d084b1f967288e01bf7d7e22a051a9deb3 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Wed, 22 Apr 2026 15:39:51 +0000
Subject: [PATCH 04/13] [CPU][RV64] Edit:CMake src/

---
 src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a70b63dad37..08d882bfee0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -77,7 +77,7 @@ if(DNNL_EXPERIMENTAL)
 endif()
 
 if(DNNL_EXPERIMENTAL_UKERNEL)
-    if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "RV64")
+    if(DNNL_TARGET_ARCH STREQUAL "X64" OR DNNL_TARGET_ARCH STREQUAL "AARCH64")
         message(STATUS "Experimental functionality for ukernels is enabled")
     else()
         message(FATAL_ERROR "ukernel API isn't supported for ${DNNL_TARGET_ARCH}.")

From 3f044236329a3f848f25e707ad2c09319cd964b9 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Wed, 22 Apr 2026 15:47:06 +0000
Subject: [PATCH 05/13] [CPU][RV64] Edit: Return ref impl src/

---
 src/cpu/cpu_convolution_list.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 47b0fb1494b..b91ac65b790 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -184,9 +184,9 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t)
             
 
-            // CPU_INSTANCE(gemm_convolution_fwd_t)
-            // CPU_INSTANCE(ref_convolution_fwd_t)
-            // CPU_INSTANCE(ref_fused_convolution_fwd_t)
+            CPU_INSTANCE(gemm_convolution_fwd_t)
+            CPU_INSTANCE(ref_convolution_fwd_t)
+            CPU_INSTANCE(ref_fused_convolution_fwd_t)
             nullptr,
         }},
         {{forward, f32, f16, f32}, {

From 981f6827985614efd7bb6eb35a455c6e21627de2 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Wed, 22 Apr 2026 16:13:46 +0000
Subject: [PATCH 06/13] [CPU][RV64] optimize jit_rvv & im2col

---
 src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp | 34 +++++++++++++++++-------
 src/cpu/rv64/rvv_gemm_convolution.cpp    | 13 ++++++---
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
index c63a375d13b..243b93ff36e 100644
--- a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
+++ b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
@@ -225,12 +225,17 @@ void jit_rvv_1x1_conv_kernel_t::balance(jit_1x1_conv_conf_t &jcp) {
 }
 
 void jit_rvv_1x1_conv_kernel_t::generate() {
+    static_assert(sizeof(size_t) == 8, "oneDNN RV64 requires 64-bit pointer arithmetic");
+
     preamble();
 
     // Set initial VL to oc_block (4)
-    li(reg_tmp_imm, jcp.oc_block);
-    vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32,
-            Xbyak_riscv::LMUL::m1);
+    if (jcp.oc_block <= 31) {
+        vsetivli(reg_tmp_imm, jcp.oc_block, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1);
+    } else {
+        li(reg_tmp_imm, jcp.oc_block);
+        vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1);
+    }
 
     // Load parameters
     ld(reg_bcast_data, reg_param, GET_OFF(bcast_data));
@@ -423,9 +428,14 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
 
     auto store = [=]() {
         mv(reg_tmp_addr, aux_reg_output_data);
+
+        bool has_relu = false;
+
         for (int i_ur = 0; i_ur < ur; ++i_ur) {
             for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
+
                 vse32_v(vreg_accum(i_load, i_ur), reg_tmp_addr);
+
                 if (i_load + 1 < load_loop_blk)
                     addi(reg_tmp_addr, reg_tmp_addr,
                             jcp.load_block * jcp.typesize_out);
@@ -449,8 +459,8 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
                 }
 
                 if (i_ur + 1 < ur) {
-                    size_t offset
-                            = (size_t)(i_ur + 1) * jcp.bcast_loop_bcast_step;
+                    ptrdiff_t offset
+                            = (ptrdiff_t)(i_ur + 1) * jcp.bcast_loop_bcast_step;
                     if (offset <= 2047) {
                         flw(freg_bcast, aux_reg_bcast_data, offset);
                     } else {
@@ -470,12 +480,16 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
 
         // Prefetch weights for next iteration
         if (!last_block) {
-            for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll;
-                    ++i_unroll) {
+            for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) {
                 for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-                    size_t weight_off
-                            = (size_t)i_unroll * jcp.reduce_loop_load_step
-                            + (size_t)i_load * jcp.load_loop_load_step;
+                    ptrdiff_t weight_off = (ptrdiff_t)i_unroll * jcp.reduce_loop_load_step
+                                         + (ptrdiff_t)i_load * jcp.load_loop_load_step;
+
+                    li(reg_tmp_addr, weight_off + 256);
+
+                    add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
+                    flw(x0, reg_tmp_addr, 0); 
+
                     li(reg_tmp_addr, weight_off);
                     add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
                     vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr);
diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp
index fc20fb2fecf..ed75ccfc0ea 100644
--- a/src/cpu/rv64/rvv_gemm_convolution.cpp
+++ b/src/cpu/rv64/rvv_gemm_convolution.cpp
@@ -106,12 +106,17 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
         // jit_gemm_convolution_utils::im2col_dt_3d() requires external
         // data initialization by zeroes
 
+        const size_t total_sz = jcp.im2col_sz;
+        const size_t vlmax = __riscv_vsetvlmax_e32m1();
+        const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
         ptrdiff_t i = 0;
-        while (i < jcp.im2col_sz) {
-            size_t vl = __riscv_vsetvl_e32m1(jcp.im2col_sz - i);
-            vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+
+        for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; i += (ptrdiff_t)vlmax) {
+            __riscv_vse32_v_f32m1(col + i, v_zero, vlmax);
+        }
+        if (i < (ptrdiff_t)total_sz) {
+            size_t vl = __riscv_vsetvl_e32m1(total_sz - i);
             __riscv_vse32_v_f32m1(col + i, v_zero, vl);
-            i += vl;
         }
     }
 

From 9b7837fc5bca28f171be3059fc87c717285fe1a7 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Wed, 22 Apr 2026 19:46:13 +0000
Subject: [PATCH 07/13] [CPU][RV64] Fix jit_rvv_1x1 tail loop exit and VL restore

---
 src/cpu/cpu_convolution_list.cpp         |  6 ++--
 src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp | 46 ++++++++++++++++--------
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index b91ac65b790..47b0fb1494b 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -184,9 +184,9 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t)
             
 
-            CPU_INSTANCE(gemm_convolution_fwd_t)
-            CPU_INSTANCE(ref_convolution_fwd_t)
-            CPU_INSTANCE(ref_fused_convolution_fwd_t)
+            // CPU_INSTANCE(gemm_convolution_fwd_t)
+            // CPU_INSTANCE(ref_convolution_fwd_t)
+            // CPU_INSTANCE(ref_fused_convolution_fwd_t)
             nullptr,
         }},
         {{forward, f32, f16, f32}, {
diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
index 243b93ff36e..e88b18214b3 100644
--- a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
+++ b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
@@ -211,6 +211,10 @@ status_t jit_rvv_1x1_conv_kernel_t::init_conf(jit_1x1_conv_conf_t &jcp,
             = jcp.ic_without_padding * jcp.oc_block * jcp.typesize_in;
     jcp.load_loop_iter_step = jcp.oc_block;
 
+    if (jcp.reduce_loop_load_step > (1LL << 40) / jcp.reduce_loop_unroll) {
+        return status::unimplemented;
+    }
+
     return status::success;
 }
 
@@ -225,7 +229,6 @@ void jit_rvv_1x1_conv_kernel_t::balance(jit_1x1_conv_conf_t &jcp) {
 }
 
 void jit_rvv_1x1_conv_kernel_t::generate() {
-    static_assert(sizeof(size_t) == 8, "oneDNN RV64 requires 64-bit pointer arithmetic");
 
     preamble();
 
@@ -286,13 +289,17 @@ void jit_rvv_1x1_conv_kernel_t::generate() {
 
     L(load_loop_tail);
     {
-        Label tail_loop;
+        Label tail_loop, tail_end;
         L(tail_loop);
-        blez(reg_load_loop_work, load_loop_end);
+        blez(reg_load_loop_work, tail_end);
 
         // Last block may be partial, use vsetvli to set VL dynamically
-        vsetvli(reg_tmp_imm, reg_load_loop_work, Xbyak_riscv::SEW::e32,
-                Xbyak_riscv::LMUL::m1);
+        if (jcp.oc_block <= 31) {
+            vsetivli(reg_tmp_imm, jcp.oc_block, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1);
+        } else {
+            li(reg_tmp_imm, jcp.oc_block);
+            vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1);
+        }
 
         bcast_loop(1);
 
@@ -305,11 +312,12 @@ void jit_rvv_1x1_conv_kernel_t::generate() {
         }
         li(reg_tmp_imm, jcp.oc_block * jcp.typesize_out);
         add(reg_output_data, reg_output_data, reg_tmp_imm);
-
+    
         li(reg_tmp_imm, jcp.oc_block);
         sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm);
-
+    
         jal(x0, tail_loop);
+        L(tail_end);
     }
     L(load_loop_end);
 
@@ -369,8 +377,17 @@ void jit_rvv_1x1_conv_kernel_t::bcast_loop(int load_loop_blk) {
         Label bcast_loop_tail_end;
         blez(reg_bcast_loop_iter, bcast_loop_tail_end);
 
-        reduce_loop(load_loop_blk, jcp.ur_tail);
+        auto restore_vl = [=]() {
+            if (jcp.oc_block <= 31) {
+                vsetivli(reg_tmp_imm, jcp.oc_block, SEW::e32, LMUL::m1);
+            } else {
+                li(reg_tmp_imm, jcp.oc_block);
+                vsetvli(reg_tmp_imm, reg_tmp_imm, SEW::e32, LMUL::m1);
+            }
+        };
 
+        reduce_loop(load_loop_blk, jcp.ur_tail);
+        restore_vl(); 
         L(bcast_loop_tail_end);
     }
 }
@@ -429,8 +446,6 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
     auto store = [=]() {
         mv(reg_tmp_addr, aux_reg_output_data);
 
-        bool has_relu = false;
-
         for (int i_ur = 0; i_ur < ur; ++i_ur) {
             for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
 
@@ -488,7 +503,10 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
                     li(reg_tmp_addr, weight_off + 256);
 
                     add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
-                    flw(x0, reg_tmp_addr, 0); 
+                    #if defined(__riscv_zicbom)
+                        // cbo.prefetch.i: 0b0000000_00010_00000_010_00000_0001111
+                        asm volatile(".word 0x0020000f" : : "r"(reg_tmp_addr));
+                    #endif
 
                     li(reg_tmp_addr, weight_off);
                     add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
@@ -503,8 +521,8 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
     // Load first round of weights (IC=0..unroll-1)
     for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) {
         for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-            size_t weight_off = (size_t)i_unroll * jcp.reduce_loop_load_step
-                    + (size_t)i_load * jcp.load_loop_load_step;
+            ptrdiff_t weight_off = (ptrdiff_t)i_unroll * jcp.reduce_loop_load_step
+                    + (ptrdiff_t)i_load * jcp.load_loop_load_step;
             if (weight_off == 0) {
                 vle32_v(vreg_load(i_load, i_unroll), aux_reg_load_data);
             } else {
@@ -547,7 +565,7 @@ void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
         L(tail_loop);
         {
             for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-                size_t weight_off = (size_t)i_load * jcp.load_loop_load_step;
+                ptrdiff_t weight_off = (ptrdiff_t)i_load * jcp.load_loop_load_step;
                 if (weight_off == 0) {
                     vle32_v(vreg_load(i_load, 0), aux_reg_load_data);
                 } else {

From e55a495ceb45cdfb207275e42a1fe621ec2ab720 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Fri, 24 Apr 2026 17:19:49 +0000
Subject: [PATCH 08/13] Update branch

---
 src/cpu/rv64/rvv_postops.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cpu/rv64/rvv_postops.hpp b/src/cpu/rv64/rvv_postops.hpp
index 28c54f2e77e..bb4171b8412 100644
--- a/src/cpu/rv64/rvv_postops.hpp
+++ b/src/cpu/rv64/rvv_postops.hpp
@@ -18,6 +18,7 @@
 
 #include <riscv_vector.h>
 
+
 namespace dnnl {
 namespace impl {
 namespace cpu {

From 874b0f38d85d4c67236d18ffdc4864ab9e05f35d Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Fri, 24 Apr 2026 17:20:43 +0000
Subject: [PATCH 09/13] Reset

---
 src/cpu/rv64/rvv_postops.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/cpu/rv64/rvv_postops.hpp b/src/cpu/rv64/rvv_postops.hpp
index bb4171b8412..28c54f2e77e 100644
--- a/src/cpu/rv64/rvv_postops.hpp
+++ b/src/cpu/rv64/rvv_postops.hpp
@@ -18,7 +18,6 @@
 
 #include <riscv_vector.h>
 
-
 namespace dnnl {
 namespace impl {
 namespace cpu {

From a9d51bfa917c98439857aeea8aa95f14589caac0 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Sun, 26 Apr 2026 14:50:04 +0000
Subject: [PATCH 10/13] [CPU][RV64] Restore ref implementations & add debug prints

---
 src/cpu/cpu_convolution_list.cpp      |  6 +++---
 src/cpu/rv64/rvv_gemm_convolution.hpp | 10 +++++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 2e6bb28fe58..2913a81313b 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -184,9 +184,9 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t)
             
 
-            // CPU_INSTANCE(gemm_convolution_fwd_t)
-            // CPU_INSTANCE(ref_convolution_fwd_t)
-            // CPU_INSTANCE(ref_fused_convolution_fwd_t)
+            CPU_INSTANCE(gemm_convolution_fwd_t)
+            CPU_INSTANCE(ref_convolution_fwd_t)
+            CPU_INSTANCE(ref_fused_convolution_fwd_t)
             nullptr,
         }},
         {{forward, f32, f16, f32}, {
diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp
index 7bcda8e9462..e0f2afe3c07 100644
--- a/src/cpu/rv64/rvv_gemm_convolution.hpp
+++ b/src/cpu/rv64/rvv_gemm_convolution.hpp
@@ -68,6 +68,8 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
 
             // TODO: make `init_conf` assign initialized object to `jcp_`
             jcp_ = conv_gemm_conf_t();
+
+            std::cout << "GEMM INIT CONSTRUCTION" << std::endl;
             return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad,
                     *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_,
                     dnnl_get_max_threads());
@@ -113,18 +115,24 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
         : primitive_t(apd), post_ops_(nullptr) {}
 
     status_t init(engine_t *engine) override {
+        std::cout << "GEMM INIT" << std::endl;
         const auto &jcp = pd()->jcp_;
 
         if (jcp.with_eltwise || jcp.with_binary) {
             CHECK(safe_ptr_assign(post_ops_, new ref_post_ops_t(jcp.post_ops)));
             CHECK(post_ops_->init(pd()->dst_md()));
         }
+
+        std::cout << "GEMM SUCCESS" << std::endl;
         return status::success;
     }
 
     using data_t = typename prec_traits_t<data_type::f32>::type;
 
     status_t execute(const exec_ctx_t &ctx) const override {
+        fprintf(stderr, "[RVV EXECUTE] Layer executed!\n");
+        fflush(stderr);
+
         bool is_nspc = pd()->jcp_.is_nspc;
         return is_nspc ? execute_forward_nspc(ctx) : execute_forward_ncsp(ctx);
     }
@@ -146,4 +154,4 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
 } // namespace impl
 } // namespace dnnl
 
-#endif
+#endif
\ No newline at end of file

From 7c82da94cd9c864ff4285ba09282ce3e80e5e4b0 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Sun, 26 Apr 2026 16:01:13 +0000
Subject: [PATCH 11/13] [CPU][RV64] Optimize im2col, RVV(Bias + ReLU)

---
 src/cpu/cpu_convolution_list.cpp              |    8 +-
 src/cpu/rv64/cpu_isa_traits.cpp               |   44 -
 src/cpu/rv64/cpu_isa_traits.hpp               |  107 --
 src/cpu/rv64/jit_generator.hpp                |  137 --
 src/cpu/rv64/jit_primitive_conf.hpp           |   97 --
 src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp      |  613 --------
 src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp      |  109 --
 src/cpu/rv64/jit_rvv_1x1_convolution.cpp      |  144 --
 src/cpu/rv64/jit_rvv_1x1_convolution.hpp      |  170 --
 src/cpu/rv64/rvv_gemm_convolution.cpp         |  266 ++--
 src/cpu/rv64/rvv_gemm_convolution.hpp         |    1 -
 third_party/xbyak_riscv/xbyak_riscv.hpp       | 1383 -----------------
 third_party/xbyak_riscv/xbyak_riscv_csr.hpp   |  112 --
 .../xbyak_riscv/xbyak_riscv_mnemonic.hpp      |  231 ---
 third_party/xbyak_riscv/xbyak_riscv_util.hpp  |  271 ----
 third_party/xbyak_riscv/xbyak_riscv_v.hpp     |  776 ---------
 16 files changed, 175 insertions(+), 4294 deletions(-)
 delete mode 100644 src/cpu/rv64/cpu_isa_traits.cpp
 delete mode 100644 src/cpu/rv64/cpu_isa_traits.hpp
 delete mode 100644 src/cpu/rv64/jit_generator.hpp
 delete mode 100644 src/cpu/rv64/jit_primitive_conf.hpp
 delete mode 100644 src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
 delete mode 100644 src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp
 delete mode 100644 src/cpu/rv64/jit_rvv_1x1_convolution.cpp
 delete mode 100644 src/cpu/rv64/jit_rvv_1x1_convolution.hpp
 delete mode 100644 third_party/xbyak_riscv/xbyak_riscv.hpp
 delete mode 100644 third_party/xbyak_riscv/xbyak_riscv_csr.hpp
 delete mode 100644 third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp
 delete mode 100644 third_party/xbyak_riscv/xbyak_riscv_util.hpp
 delete mode 100644 third_party/xbyak_riscv/xbyak_riscv_v.hpp

diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index 2913a81313b..c8f41b8e947 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -78,7 +78,6 @@ using namespace dnnl::impl::cpu::aarch64;
 using namespace dnnl::impl::cpu::acl;
 #elif DNNL_RV64
 #include "cpu/rv64/rvv_gemm_convolution.hpp"
-#include "cpu/rv64/jit_rvv_1x1_convolution.hpp"
 using namespace dnnl::impl::cpu::rv64;
 #endif
 
@@ -180,13 +179,12 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t, sve_128)
             // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t)
 
-            CPU_INSTANCE_RV64GCV(jit_rvv_1x1_convolution_fwd_t)
             CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t)
             
 
-            CPU_INSTANCE(gemm_convolution_fwd_t)
-            CPU_INSTANCE(ref_convolution_fwd_t)
-            CPU_INSTANCE(ref_fused_convolution_fwd_t)
+            // CPU_INSTANCE(gemm_convolution_fwd_t)
+            // CPU_INSTANCE(ref_convolution_fwd_t)
+            // CPU_INSTANCE(ref_fused_convolution_fwd_t)
             nullptr,
         }},
         {{forward, f32, f16, f32}, {
diff --git a/src/cpu/rv64/cpu_isa_traits.cpp b/src/cpu/rv64/cpu_isa_traits.cpp
deleted file mode 100644
index b8c3fc658e0..00000000000
--- a/src/cpu/rv64/cpu_isa_traits.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*******************************************************************************
-* Copyright 2019 Intel Corporation
-* Copyright 2025 Institute of Software, Chinese Academy of Sciences
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "cpu/rv64/cpu_isa_traits.hpp"
-#include "cpu/platform.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace rv64 {
-
-struct isa_info_t {
-    isa_info_t(cpu_isa_t aisa) : isa(aisa) {};
-    cpu_isa_t isa;
-};
-
-static isa_info_t get_isa_info_t(void) {
-    if (mayiuse(zvfh)) return isa_info_t(zvfh);
-    if (mayiuse(v)) return isa_info_t(v);
-    return isa_info_t(isa_undef);
-}
-
-cpu_isa_t get_max_cpu_isa() {
-    return get_isa_info_t().isa;
-}
-
-} // namespace rv64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
diff --git a/src/cpu/rv64/cpu_isa_traits.hpp b/src/cpu/rv64/cpu_isa_traits.hpp
deleted file mode 100644
index be5a4fc1d49..00000000000
--- a/src/cpu/rv64/cpu_isa_traits.hpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*******************************************************************************
-* Copyright 2018 Intel Corporation
-* Copyright 2025 Institute of Software, Chinese Academy of Sciences
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef CPU_RV64_CPU_ISA_TRAITS_HPP
-#define CPU_RV64_CPU_ISA_TRAITS_HPP
-
-#include <type_traits>
-
-#include "common/type_helpers.hpp"
-#include "common/utils.hpp"
-#include "dnnl_types.h"
-
-#ifndef XBYAK_RISCV_V
-#define XBYAK_RISCV_V 1
-#endif
-
-#include "xbyak_riscv/xbyak_riscv_util.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace rv64 {
-
-enum cpu_isa_bit_t : unsigned {
-    v_bit = 1u << 0,
-    zvfh_bit = 1u << 1,
-};
-
-enum cpu_isa_t : unsigned {
-    isa_undef = 0u,
-    v = v_bit,
-    zvfh = zvfh_bit | v,
-    isa_all = ~0u,
-};
-
-struct Riscv64Cpu {
-public:
-    static Riscv64Cpu &getInstance() {
-        static Riscv64Cpu instance;
-        return instance;
-    }
-
-    bool get_has_v() const { return has_v; }
-    bool get_has_zvfh() const { return has_zvfh; }
-
-private:
-    bool has_v = false;
-    bool has_zvfh = false;
-
-    Riscv64Cpu() {
-        const auto &xbyak_cpu = Xbyak_riscv::CPU::getInstance();
-
-        has_v = xbyak_cpu.hasExtension(Xbyak_riscv::RISCVExtension::V);
-
-        if (has_v) {
-            has_zvfh
-                    = xbyak_cpu.hasExtension(Xbyak_riscv::RISCVExtension::Zvfh);
-        } else {
-            has_zvfh = false;
-        }
-    }
-};
-
-inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
-    MAYBE_UNUSED(soft);
-    const Riscv64Cpu &cpu = Riscv64Cpu::getInstance();
-
-    switch (cpu_isa) {
-        case v: return cpu.get_has_v();
-        case zvfh: return cpu.get_has_v() && cpu.get_has_zvfh();
-        case isa_undef: return true;
-        case isa_all: return false;
-    }
-    return false;
-}
-
-cpu_isa_t get_max_cpu_isa();
-
-#include "common/z_magic.hpp"
-/* clang-format off */
-#define JIT_IMPL_NAME_HELPER(prefix, isa, suffix_if_any) \
-    ((isa) == isa_undef ? prefix STRINGIFY(any) : \
-    ((isa) == v ? prefix STRINGIFY(rvv) : \
-    ((isa) == zvfh ? prefix STRINGIFY(rvv_zvfh) : \
-    prefix suffix_if_any)))
-/* clang-format on */
-
-} // namespace rv64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
diff --git a/src/cpu/rv64/jit_generator.hpp b/src/cpu/rv64/jit_generator.hpp
deleted file mode 100644
index c795aba8c61..00000000000
--- a/src/cpu/rv64/jit_generator.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/*******************************************************************************
-* Copyright 2025 ZTE Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef CPU_RV64_JIT_GENERATOR_HPP
-#define CPU_RV64_JIT_GENERATOR_HPP
-
-#include <cstdint>
-#include <utility>
-
-#include "common/c_types_map.hpp"
-#include "common/type_helpers.hpp"
-#include "common/utils.hpp"
-#include "cpu/jit_utils/jit_utils.hpp"
-
-#include "cpu/rv64/cpu_isa_traits.hpp"
-#include "xbyak_riscv/xbyak_riscv.hpp"
-
-#define DECLARE_CPU_JIT_AUX_FUNCTIONS(gen_name) \
-    const char *name() const override { \
-        return STRINGIFY(gen_name); \
-    } \
-    const char *source_file() const override { \
-        return __FILE__; \
-    }
-
-#define JIT_ASSERT(condition) \
-    do { \
-        assert(condition); \
-        if (!(condition)) XBYAK_RISCV_THROW(Xbyak_riscv::ERR_INTERNAL); \
-    } while (false)
-
-#define JIT_ASSERT_RET(condition, ret) \
-    do { \
-        assert(condition); \
-        if (!(condition)) \
-            XBYAK_RISCV_THROW_RET(Xbyak_riscv::ERR_INTERNAL, ret); \
-    } while (false)
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace rv64 {
-
-// Simple helper to check subset relation between two ISA masks.
-inline bool is_subset(cpu_isa_t isa, cpu_isa_t max_isa) {
-    using u_t = typename std::underlying_type<cpu_isa_t>::type;
-    return (static_cast<u_t>(isa) & static_cast<u_t>(max_isa))
-            == static_cast<u_t>(isa);
-}
-
-// Minimal RV64 JIT generator base class.
-class jit_generator_t : public Xbyak_riscv::CodeGenerator, public c_compatible {
-public:
-    using c_compatible::operator new;
-    using c_compatible::operator new[];
-    using c_compatible::operator delete;
-    using c_compatible::operator delete[];
-
-    // All JIT kernels must override these to provide a stable name used for
-    // debug/logging and jit code registration.
-    virtual const char *name() const = 0;
-    virtual const char *source_file() const = 0;
-
-    explicit jit_generator_t(const char * /*unused_name*/,
-            cpu_isa_t max_cpu_isa = get_max_cpu_isa())
-        : Xbyak_riscv::CodeGenerator(max_code_size)
-        , max_cpu_isa_(max_cpu_isa) {}
-
-    ~jit_generator_t() override = default;
-
-    const uint8_t *jit_ker() const { return jit_ker_; }
-
-    template <typename... kernel_args_t>
-    void operator()(kernel_args_t... args) const {
-        using jit_kernel_func_t = void (*)(const kernel_args_t...);
-        // This const_cast is required for Clang.
-        // Clang rejects reinterpret_cast from const uint8_t* to function pointer.
-        auto *fptr = reinterpret_cast<jit_kernel_func_t>(
-                const_cast<uint8_t *>(jit_ker_));
-        (*fptr)(std::forward<kernel_args_t>(args)...);
-    }
-
-    virtual status_t create_kernel() {
-        try {
-            generate();
-        } catch (...) { return status::runtime_error; }
-
-        this->ready(Xbyak_riscv::CodeArray::PROTECT_RWE);
-
-        jit_ker_ = Xbyak_riscv::CodeGenerator::getCode();
-
-        if (jit_ker_) {
-            jit_utils::register_jit_code(jit_ker_,
-                    Xbyak_riscv::CodeArray::getSize(), name(), source_file());
-            return status::success;
-        }
-
-        return status::runtime_error;
-    }
-
-    inline cpu_isa_t max_cpu_isa() const noexcept { return max_cpu_isa_; }
-
-    // Helper to check that a requested ISA is both within the per‑kernel limit
-    // and supported by the current CPU.
-    inline bool is_valid_isa(cpu_isa_t isa) const {
-        return is_subset(isa, max_cpu_isa_) && mayiuse(isa);
-    }
-
-protected:
-    virtual void generate() = 0;
-
-private:
-    static constexpr unsigned max_code_size = 256 * 1024;
-
-    const cpu_isa_t max_cpu_isa_;
-    const uint8_t *jit_ker_ = nullptr;
-};
-
-} // namespace rv64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
diff --git a/src/cpu/rv64/jit_primitive_conf.hpp b/src/cpu/rv64/jit_primitive_conf.hpp
deleted file mode 100644
index dde5afb8d32..00000000000
--- a/src/cpu/rv64/jit_primitive_conf.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/*******************************************************************************
-* Copyright 2025 ZTE Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef CPU_RV64_JIT_PRIMITIVE_CONF_HPP
-#define CPU_RV64_JIT_PRIMITIVE_CONF_HPP
-
-#include "common/c_types_map.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace rv64 {
-
-struct jit_1x1_conv_conf_t {
-    prop_kind_t prop_kind;
-    int mb;
-    int ngroups, ic, oc, oc_without_padding, ic_without_padding;
-    int iw, ih, id;
-    int ow, oh, od;
-    int os, is;
-    int kw, kh, kd;
-    int stride_w, stride_h, stride_d;
-    int t_pad, l_pad, f_pad;
-
-    int ic_block, oc_block;
-    int load_block, reduce_block;
-    int bcast_block;
-
-    dim_t load_dim, bcast_dim, reduce_dim;
-
-    int ur, ur_tail;
-    int load_loop_blk;
-    int reduce_loop_unroll;
-    int nthr;
-    int nb_bcast, nb_load, nb_reduce, load_grp_count;
-    int nb_load_blocking, nb_load_blocking_max;
-    int nb_bcast_blocking, nb_bcast_blocking_max;
-    int nb_reduce_blocking;
-
-    dim_t reduce_loop_bcast_step;
-    int reduce_loop_load_step;
-    int bcast_loop_bcast_step;
-    int bcast_loop_output_step;
-    int load_loop_load_step;
-    int load_loop_iter_step;
-
-    bool with_bias;
-    bool with_sum;
-    bool with_eltwise;
-    bool with_binary;
-    bool with_dw_conv;
-
-    int typesize_in;
-    int typesize_out;
-    int typesize_bia;
-    int typesize_acc;
-
-    format_tag_t src_tag, wei_tag, dst_tag;
-};
-
-struct jit_1x1_conv_args_t {
-    const void *bcast_data;
-    const void *load_data;
-    const void *output_data;
-    const void *bias_data;
-
-    size_t load_dim;
-    size_t bcast_dim;
-    size_t reduce_dim;
-
-    size_t first_last_flag;
-};
-
-enum {
-    FLAG_REDUCE_FIRST = 1 << 0,
-    FLAG_REDUCE_LAST = 1 << 1,
-};
-
-} // namespace rv64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
deleted file mode 100644
index e88b18214b3..00000000000
--- a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.cpp
+++ /dev/null
@@ -1,613 +0,0 @@
-/*******************************************************************************
-* Copyright 2025 ZTE Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include <assert.h>
-#include "common/c_types_map.hpp"
-#include "common/dnnl_thread.hpp"
-#include "common/memory.hpp"
-#include "common/utils.hpp"
-
-#include "cpu/rv64/jit_rvv_1x1_conv_kernel.hpp"
-
-#define GET_OFF(field) \
-    static_cast<int32_t>(offsetof(jit_1x1_conv_args_t, field))
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace rv64 {
-
-using namespace dnnl::impl::format_tag;
-using namespace dnnl::impl::prop_kind;
-using namespace dnnl::impl::utils;
-using namespace Xbyak_riscv;
-
-jit_rvv_1x1_conv_kernel_t::jit_rvv_1x1_conv_kernel_t(
-        const jit_1x1_conv_conf_t &ajcp, const primitive_attr_t &attr,
-        const memory_desc_t &dst_md)
-    : jit_generator_t("jit_rvv_1x1_conv_kernel"), jcp(ajcp), attr_(attr) {
-    create_kernel();
-}
-
-status_t jit_rvv_1x1_conv_kernel_t::init_conf(jit_1x1_conv_conf_t &jcp,
-        const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
-        const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d,
-        const primitive_attr_t &attr, int nthreads, bool reduce_src) {
-
-    const int ndims = src_d.ndims();
-
-    jcp.prop_kind = cd.prop_kind;
-    jcp.nthr = nthreads;
-
-    jcp.with_bias = cd.bias_desc.format_kind != format_kind::undef;
-
-    // Initialize dimensions
-    jcp.mb = src_d.dims()[0];
-    jcp.ngroups
-            = weights_d.ndims() == src_d.ndims() + 1 ? weights_d.dims()[0] : 1;
-    jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups;
-    jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups;
-    jcp.oc = jcp.oc_without_padding;
-    jcp.ic = jcp.ic_without_padding;
-
-    // Targeting SEW=32 (float), LMUL=1, VLEN=128 -> simd_w = 4
-    const int simd_w = 4;
-
-    // OC is padded to match oc_block in weights format (Oihw4o)
-    // IC is not padded; kernel handles IC tail processing
-    jcp.oc = rnd_up(jcp.oc, simd_w);
-
-    // 3D convolution support
-    jcp.id = (ndims == 5) ? src_d.dims()[2] : 1;
-    jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1;
-
-    jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims - 2];
-    jcp.iw = src_d.dims()[ndims - 1];
-    jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[ndims - 2];
-    jcp.ow = dst_d.dims()[ndims - 1];
-
-    // Spatial dimensions: D*H*W
-    jcp.os = jcp.od * jcp.oh * jcp.ow;
-    jcp.is = jcp.id * jcp.ih * jcp.iw;
-
-    jcp.oc_block = simd_w;
-    jcp.ic_block = simd_w;
-
-    // Dynamic parameter calculation
-    // Register constraint: (ur * load_loop_blk) + (unroll * load_loop_blk) + 1 <= 32
-    jcp.reduce_loop_unroll = 4;
-
-    const int SMALL_SPATIAL = 10;
-    const int BIG_SPATIAL = 65;
-    const int BIG_LOAD_DIM = (jcp.ic >= 512) ? 256 : 512;
-
-    // Initial load_loop_blk selection
-    if (jcp.oc % (2 * jcp.oc_block) == 0 && jcp.os >= 11) {
-        jcp.load_loop_blk = 2;
-    } else {
-        jcp.load_loop_blk = 1;
-    }
-
-    // Dynamic ur selection algorithm
-    int max_regs, min_regs, size_threshold;
-
-    const int spatial = jcp.od * jcp.oh;
-
-    // Select register range based on batch size and thread count
-    if ((8 * jcp.mb) / jcp.nthr >= 1 || jcp.mb == 1) {
-        max_regs = 9;
-        min_regs = 6;
-        size_threshold = 14;
-
-        // Special shape optimization
-        if (jcp.oc > 128 && jcp.oc < BIG_LOAD_DIM && spatial > SMALL_SPATIAL
-                && spatial < BIG_SPATIAL && jcp.ic < 256) {
-            max_regs = 6;
-            min_regs = 5;
-        }
-    } else {
-        max_regs = 30;
-        min_regs = 9;
-        size_threshold = 14;
-    }
-
-    // Initial ur
-    jcp.ur = 1;
-
-    // First pass: find largest ur that divides spatial evenly
-    for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) {
-        if ((spatial >= size_threshold && spatial % ur_w == 0)
-                || (spatial < size_threshold && jcp.os % ur_w == 0)) {
-            jcp.ur = ur_w;
-            break;
-        }
-    }
-
-    // If first pass fails, use heuristic
-    if (jcp.ur == 1) {
-        jcp.ur = nstl::min(max_regs, jcp.os);
-        int os_tail = jcp.os % max_regs;
-        for (int i = max_regs; i >= min_regs; i--) {
-            int i_tail = jcp.os % i;
-            if (i_tail > os_tail || i_tail == 0) {
-                jcp.ur = i;
-                os_tail = i_tail;
-                if (i_tail == 0) break;
-            }
-        }
-    }
-
-    // Adjust ur based on load_loop_blk (ensure register limit)
-    // Register constraint: ur * load_loop_blk + unroll * load_loop_blk + 1 <= 32
-    int max_ur_for_blk = (32 - 1 - jcp.reduce_loop_unroll * jcp.load_loop_blk)
-            / jcp.load_loop_blk;
-    if (jcp.ur > max_ur_for_blk) {
-        jcp.ur = max_ur_for_blk;
-        if (jcp.ur < 1) jcp.ur = 1;
-    }
-
-    jcp.load_block = jcp.oc_block;
-    jcp.reduce_block = jcp.ic_block;
-
-    jcp.bcast_block = jcp.ur;
-    jcp.load_dim = jcp.oc_without_padding;
-    jcp.bcast_dim = jcp.os;
-    jcp.reduce_dim = jcp.ic_without_padding;
-
-    jcp.ur_tail = jcp.bcast_dim % jcp.ur;
-
-    jcp.nb_bcast = div_up(jcp.os, jcp.bcast_block);
-    jcp.nb_load = div_up(jcp.oc_without_padding, jcp.load_block);
-    jcp.nb_reduce = div_up(jcp.ic_without_padding, jcp.reduce_block);
-    jcp.load_grp_count = 1;
-
-    // Blocking strategy for NHWC layout
-    jcp.nb_reduce_blocking = jcp.nb_reduce;
-    jcp.nb_load_blocking = jcp.nb_load;
-    jcp.nb_load_blocking_max = jcp.nb_load;
-
-    // Spatial dimension blocking (in ur units)
-    int target_bcast_blocking = 735;
-    jcp.nb_bcast_blocking
-            = nstl::min(jcp.nb_bcast, div_up(target_bcast_blocking, jcp.ur));
-    if (jcp.nb_bcast_blocking == 0) jcp.nb_bcast_blocking = 1;
-    jcp.nb_bcast_blocking_max = jcp.nb_bcast_blocking;
-
-    // Optimize reduce_loop_unroll based on available registers
-    if (jcp.load_loop_blk == 2) {
-        jcp.reduce_loop_unroll = 4;
-    } else {
-        jcp.reduce_loop_unroll = 4;
-    }
-
-    // Layout-dependent stride parameters (for NHWC)
-    jcp.typesize_in = sizeof(float);
-    jcp.typesize_out = sizeof(float);
-
-    jcp.reduce_loop_bcast_step = jcp.typesize_in;
-    jcp.reduce_loop_load_step = jcp.oc_block * jcp.typesize_in;
-
-    // Strides within bcast_loop (spatial dimensions)
-    jcp.bcast_loop_bcast_step
-            = jcp.ngroups * jcp.ic_without_padding * jcp.typesize_in;
-    jcp.bcast_loop_output_step
-            = jcp.ngroups * jcp.oc_without_padding * jcp.typesize_out;
-
-    // Strides within load_loop (OC dimension)
-    jcp.load_loop_load_step
-            = jcp.ic_without_padding * jcp.oc_block * jcp.typesize_in;
-    jcp.load_loop_iter_step = jcp.oc_block;
-
-    if (jcp.reduce_loop_load_step > (1LL << 40) / jcp.reduce_loop_unroll) {
-        return status::unimplemented;
-    }
-
-    return status::success;
-}
-
-void jit_rvv_1x1_conv_kernel_t::init_scratchpad(
-        memory_tracking::registrar_t &scratchpad,
-        const jit_1x1_conv_conf_t &jcp) {
-    // Not implemented
-}
-
-void jit_rvv_1x1_conv_kernel_t::balance(jit_1x1_conv_conf_t &jcp) {
-    // Not implemented
-}
-
-void jit_rvv_1x1_conv_kernel_t::generate() {
-
-    preamble();
-
-    // Set initial VL to oc_block (4)
-    if (jcp.oc_block <= 31) {
-        vsetivli(reg_tmp_imm, jcp.oc_block, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1);
-    } else {
-        li(reg_tmp_imm, jcp.oc_block);
-        vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1);
-    }
-
-    // Load parameters
-    ld(reg_bcast_data, reg_param, GET_OFF(bcast_data));
-    ld(reg_load_data, reg_param, GET_OFF(load_data));
-    ld(reg_output_data, reg_param, GET_OFF(output_data));
-    if (jcp.with_bias) ld(reg_bias_data, reg_param, GET_OFF(bias_data));
-
-    ld(reg_load_loop_work, reg_param, GET_OFF(load_dim));
-    ld(reg_bcast_loop_work, reg_param, GET_OFF(bcast_dim));
-    ld(reg_reduce_loop_work, reg_param, GET_OFF(reduce_dim));
-    ld(reg_reduce_pos_flag, reg_param, GET_OFF(first_last_flag));
-
-    // Main loop generation
-    auto load_loop_body = [=](int load_loop_blk) {
-        bcast_loop(load_loop_blk);
-
-        // Update pointers and work counters
-        li(reg_tmp_imm, load_loop_blk * jcp.load_loop_load_step);
-        add(reg_load_data, reg_load_data, reg_tmp_imm);
-
-        if (jcp.with_bias) {
-            li(reg_tmp_imm, load_loop_blk * jcp.oc_block * jcp.typesize_out);
-            add(reg_bias_data, reg_bias_data, reg_tmp_imm);
-        }
-
-        li(reg_tmp_imm, load_loop_blk * jcp.oc_block * jcp.typesize_out);
-        add(reg_output_data, reg_output_data, reg_tmp_imm);
-
-        li(reg_tmp_imm, load_loop_blk * jcp.load_loop_iter_step);
-        sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm);
-    };
-
-    Label load_loop_label, load_loop_end, load_loop_tail;
-
-    if (jcp.load_loop_blk > 1) {
-        L(load_loop_label);
-        li(reg_tmp_imm, jcp.load_loop_blk * jcp.oc_block);
-        blt(reg_load_loop_work, reg_tmp_imm, load_loop_tail);
-
-        // Ensure VL is full
-        li(reg_tmp_imm, jcp.oc_block);
-        vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32,
-                Xbyak_riscv::LMUL::m1);
-
-        load_loop_body(jcp.load_loop_blk);
-        jal(x0, load_loop_label);
-    }
-
-    L(load_loop_tail);
-    {
-        Label tail_loop, tail_end;
-        L(tail_loop);
-        blez(reg_load_loop_work, tail_end);
-
-        // Last block may be partial, use vsetvli to set VL dynamically
-        if (jcp.oc_block <= 31) {
-            vsetivli(reg_tmp_imm, jcp.oc_block, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1);
-        } else {
-            li(reg_tmp_imm, jcp.oc_block);
-            vsetvli(reg_tmp_imm, reg_tmp_imm, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1);
-        }
-
-        bcast_loop(1);
-
-        // Update pointers and work counters (tail loop)
-        li(reg_tmp_imm, jcp.load_loop_load_step);
-        add(reg_load_data, reg_load_data, reg_tmp_imm);
-        if (jcp.with_bias) {
-            li(reg_tmp_imm, jcp.oc_block * jcp.typesize_out);
-            add(reg_bias_data, reg_bias_data, reg_tmp_imm);
-        }
-        li(reg_tmp_imm, jcp.oc_block * jcp.typesize_out);
-        add(reg_output_data, reg_output_data, reg_tmp_imm);
-    
-        li(reg_tmp_imm, jcp.oc_block);
-        sub(reg_load_loop_work, reg_load_loop_work, reg_tmp_imm);
-    
-        jal(x0, tail_loop);
-        L(tail_end);
-    }
-    L(load_loop_end);
-
-    postamble();
-}
-
-void jit_rvv_1x1_conv_kernel_t::preamble() {
-    addi(sp, sp, -64);
-    sd(ra, sp, 56);
-    sd(s0, sp, 48);
-    sd(s1, sp, 40);
-    sd(s2, sp, 32);
-    sd(s3, sp, 24);
-    sd(s4, sp, 16);
-    sd(s5, sp, 8);
-}
-
-void jit_rvv_1x1_conv_kernel_t::postamble() {
-    ld(ra, sp, 56);
-    ld(s0, sp, 48);
-    ld(s1, sp, 40);
-    ld(s2, sp, 32);
-    ld(s3, sp, 24);
-    ld(s4, sp, 16);
-    ld(s5, sp, 8);
-    addi(sp, sp, 64);
-    ret();
-}
-
-void jit_rvv_1x1_conv_kernel_t::bcast_loop(int load_loop_blk) {
-    mv(reg_bcast_loop_iter, reg_bcast_loop_work);
-    mv(aux1_reg_bcast_data, reg_bcast_data);
-    mv(aux_reg_output_data, reg_output_data);
-
-    Label bcast_loop_label, bcast_loop_tail;
-
-    li(reg_tmp_imm, jcp.ur);
-    blt(reg_bcast_loop_iter, reg_tmp_imm, bcast_loop_tail);
-
-    L(bcast_loop_label);
-    {
-        reduce_loop(load_loop_blk, jcp.ur);
-
-        li(reg_tmp_imm, jcp.ur * jcp.bcast_loop_bcast_step);
-        add(aux1_reg_bcast_data, aux1_reg_bcast_data, reg_tmp_imm);
-
-        li(reg_tmp_imm, jcp.ur * jcp.bcast_loop_output_step);
-        add(aux_reg_output_data, aux_reg_output_data, reg_tmp_imm);
-
-        addi(reg_bcast_loop_iter, reg_bcast_loop_iter, -jcp.ur);
-        li(reg_tmp_imm, jcp.ur);
-        bge(reg_bcast_loop_iter, reg_tmp_imm, bcast_loop_label);
-    }
-
-    L(bcast_loop_tail);
-    if (jcp.ur_tail > 0) {
-        Label bcast_loop_tail_end;
-        blez(reg_bcast_loop_iter, bcast_loop_tail_end);
-
-        auto restore_vl = [=]() {
-            if (jcp.oc_block <= 31) {
-                vsetivli(reg_tmp_imm, jcp.oc_block, SEW::e32, LMUL::m1);
-            } else {
-                li(reg_tmp_imm, jcp.oc_block);
-                vsetvli(reg_tmp_imm, reg_tmp_imm, SEW::e32, LMUL::m1);
-            }
-        };
-
-        reduce_loop(load_loop_blk, jcp.ur_tail);
-        restore_vl(); 
-        L(bcast_loop_tail_end);
-    }
-}
-
-void jit_rvv_1x1_conv_kernel_t::reduce_loop(int load_loop_blk, int ur) {
-    mv(aux_reg_load_data, reg_load_data);
-    mv(aux_reg_bcast_data, aux1_reg_bcast_data);
-
-    auto init = [=]() {
-        Label init_zero, init_done;
-        andi(reg_tmp_imm, reg_reduce_pos_flag, FLAG_REDUCE_FIRST);
-        bnez(reg_tmp_imm, init_zero);
-
-        // Load from dst for accumulation
-        mv(reg_tmp_addr, aux_reg_output_data);
-        for (int i_ur = 0; i_ur < ur; ++i_ur) {
-            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-                vle32_v(vreg_accum(i_load, i_ur), reg_tmp_addr);
-                if (i_load + 1 < load_loop_blk)
-                    addi(reg_tmp_addr, reg_tmp_addr,
-                            jcp.load_block * jcp.typesize_out);
-            }
-            li(reg_tmp_imm,
-                    jcp.bcast_loop_output_step
-                            - (load_loop_blk - 1) * jcp.load_block
-                                    * jcp.typesize_out);
-            add(reg_tmp_addr, reg_tmp_addr, reg_tmp_imm);
-        }
-        jal(x0, init_done);
-
-        L(init_zero);
-        for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-            if (jcp.with_bias) {
-                size_t bias_off
-                        = (size_t)i_load * jcp.oc_block * jcp.typesize_out;
-                if (bias_off == 0) {
-                    vle32_v(vreg_load(0), reg_bias_data);
-                } else {
-                    li(reg_tmp_addr, bias_off);
-                    add(reg_tmp_addr, reg_tmp_addr, reg_bias_data);
-                    vle32_v(vreg_load(0), reg_tmp_addr);
-                }
-            }
-            for (int i_ur = 0; i_ur < ur; ++i_ur) {
-                if (jcp.with_bias) {
-                    vmv_v_v(vreg_accum(i_load, i_ur), vreg_load(0));
-                } else {
-                    vxor_vv(vreg_accum(i_load, i_ur), vreg_accum(i_load, i_ur),
-                            vreg_accum(i_load, i_ur));
-                }
-            }
-        }
-        L(init_done);
-    };
-
-    auto store = [=]() {
-        mv(reg_tmp_addr, aux_reg_output_data);
-
-        for (int i_ur = 0; i_ur < ur; ++i_ur) {
-            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-
-                vse32_v(vreg_accum(i_load, i_ur), reg_tmp_addr);
-
-                if (i_load + 1 < load_loop_blk)
-                    addi(reg_tmp_addr, reg_tmp_addr,
-                            jcp.load_block * jcp.typesize_out);
-            }
-            li(reg_tmp_imm,
-                    jcp.bcast_loop_output_step
-                            - (load_loop_blk - 1) * jcp.load_block
-                                    * jcp.typesize_out);
-            add(reg_tmp_addr, reg_tmp_addr, reg_tmp_imm);
-        }
-    };
-
-    auto fma_block = [=](int current_unroll, bool last_block) {
-        for (int i_unroll = 0; i_unroll < current_unroll; ++i_unroll) {
-            flw(freg_bcast, aux_reg_bcast_data, 0);
-
-            for (int i_ur = 0; i_ur < ur; ++i_ur) {
-                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-                    vfmacc_vf(vreg_accum(i_load, i_ur), freg_bcast,
-                            vreg_load(i_load, i_unroll));
-                }
-
-                if (i_ur + 1 < ur) {
-                    ptrdiff_t offset
-                            = (ptrdiff_t)(i_ur + 1) * jcp.bcast_loop_bcast_step;
-                    if (offset <= 2047) {
-                        flw(freg_bcast, aux_reg_bcast_data, offset);
-                    } else {
-                        li(reg_tmp_addr, offset);
-                        add(reg_tmp_addr, reg_tmp_addr, aux_reg_bcast_data);
-                        flw(freg_bcast, reg_tmp_addr, 0);
-                    }
-                }
-            }
-            addi(aux_reg_bcast_data, aux_reg_bcast_data,
-                    jcp.reduce_loop_bcast_step);
-        }
-
-        // Update weight pointer to next unroll block
-        li(reg_tmp_imm, jcp.reduce_loop_unroll * jcp.reduce_loop_load_step);
-        add(aux_reg_load_data, aux_reg_load_data, reg_tmp_imm);
-
-        // Prefetch weights for next iteration
-        if (!last_block) {
-            for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) {
-                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-                    ptrdiff_t weight_off = (ptrdiff_t)i_unroll * jcp.reduce_loop_load_step
-                                         + (ptrdiff_t)i_load * jcp.load_loop_load_step;
-
-                    li(reg_tmp_addr, weight_off + 256);
-
-                    add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
-                    #if defined(__riscv_zicbom)
-                        // cbo.prefetch.i: 0b0000000_00010_00000_010_00000_0001111
-                        asm volatile(".word 0x0020000f" : : "r"(reg_tmp_addr));
-                    #endif
-
-                    li(reg_tmp_addr, weight_off);
-                    add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
-                    vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr);
-                }
-            }
-        }
-    };
-
-    init();
-
-    // Load first round of weights (IC=0..unroll-1)
-    for (int i_unroll = 0; i_unroll < jcp.reduce_loop_unroll; ++i_unroll) {
-        for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-            ptrdiff_t weight_off = (ptrdiff_t)i_unroll * jcp.reduce_loop_load_step
-                    + (ptrdiff_t)i_load * jcp.load_loop_load_step;
-            if (weight_off == 0) {
-                vle32_v(vreg_load(i_load, i_unroll), aux_reg_load_data);
-            } else {
-                li(reg_tmp_addr, weight_off);
-                add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
-                vle32_v(vreg_load(i_load, i_unroll), reg_tmp_addr);
-            }
-        }
-    }
-
-    mv(reduce_loop_iter, reg_reduce_loop_work);
-    Label reduce_loop_label, reduce_loop_tail;
-
-    li(reg_tmp_imm, jcp.reduce_loop_unroll);
-    blt(reduce_loop_iter, reg_tmp_imm, reduce_loop_tail);
-
-    L(reduce_loop_label);
-    {
-        li(reg_tmp_imm, jcp.reduce_loop_unroll);
-        sub(reg_tmp_imm, reduce_loop_iter, reg_tmp_imm);
-        li(reg_tmp_addr, jcp.reduce_loop_unroll);
-        Label is_last, do_fma;
-        blt(reg_tmp_imm, reg_tmp_addr, is_last);
-        fma_block(jcp.reduce_loop_unroll, false);
-        jal(x0, do_fma);
-        L(is_last);
-        fma_block(jcp.reduce_loop_unroll, true);
-        L(do_fma);
-
-        addi(reduce_loop_iter, reduce_loop_iter, -jcp.reduce_loop_unroll);
-        li(reg_tmp_imm, jcp.reduce_loop_unroll);
-        bge(reduce_loop_iter, reg_tmp_imm, reduce_loop_label);
-    }
-
-    L(reduce_loop_tail);
-    {
-        Label tail_done;
-        blez(reduce_loop_iter, tail_done);
-        Label tail_loop;
-        L(tail_loop);
-        {
-            for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-                ptrdiff_t weight_off = (ptrdiff_t)i_load * jcp.load_loop_load_step;
-                if (weight_off == 0) {
-                    vle32_v(vreg_load(i_load, 0), aux_reg_load_data);
-                } else {
-                    li(reg_tmp_addr, weight_off);
-                    add(reg_tmp_addr, aux_reg_load_data, reg_tmp_addr);
-                    vle32_v(vreg_load(i_load, 0), reg_tmp_addr);
-                }
-            }
-
-            flw(freg_bcast, aux_reg_bcast_data, 0);
-            for (int i_ur = 0; i_ur < ur; ++i_ur) {
-                for (int i_load = 0; i_load < load_loop_blk; ++i_load) {
-                    vfmacc_vf(vreg_accum(i_load, i_ur), freg_bcast,
-                            vreg_load(i_load, 0));
-                }
-                if (i_ur + 1 < ur) {
-                    size_t offset
-                            = (size_t)(i_ur + 1) * jcp.bcast_loop_bcast_step;
-                    if (offset <= 2047) {
-                        flw(freg_bcast, aux_reg_bcast_data, offset);
-                    } else {
-                        li(reg_tmp_addr, offset);
-                        add(reg_tmp_addr, reg_tmp_addr, aux_reg_bcast_data);
-                        flw(freg_bcast, reg_tmp_addr, 0);
-                    }
-                }
-            }
-
-            addi(aux_reg_bcast_data, aux_reg_bcast_data,
-                    jcp.reduce_loop_bcast_step);
-            addi(aux_reg_load_data, aux_reg_load_data,
-                    jcp.reduce_loop_load_step);
-            addi(reduce_loop_iter, reduce_loop_iter, -1);
-            bnez(reduce_loop_iter, tail_loop);
-        }
-        L(tail_done);
-    }
-
-    store();
-}
-
-} // namespace rv64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
diff --git a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp b/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp
deleted file mode 100644
index 0fcd9774aec..00000000000
--- a/src/cpu/rv64/jit_rvv_1x1_conv_kernel.hpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*******************************************************************************
-* Copyright 2025 ZTE Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef CPU_RV64_JIT_RVV_1X1_CONV_KERNEL_HPP
-#define CPU_RV64_JIT_RVV_1X1_CONV_KERNEL_HPP
-
-#include "common/c_types_map.hpp"
-#include "common/memory_tracking.hpp"
-
-#include "cpu/rv64/jit_generator.hpp"
-#include "cpu/rv64/jit_primitive_conf.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace rv64 {
-
-using namespace Xbyak_riscv;
-
-struct jit_rvv_1x1_conv_kernel_t : public jit_generator_t {
-    jit_rvv_1x1_conv_kernel_t(const jit_1x1_conv_conf_t &ajcp,
-            const primitive_attr_t &attr, const memory_desc_t &dst_md);
-
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_rvv_1x1_conv_kernel)
-
-    static status_t init_conf(jit_1x1_conv_conf_t &jcp,
-            const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
-            const memory_desc_wrapper &weights_d,
-            const memory_desc_wrapper &dst_d, const primitive_attr_t &attr,
-            int nthreads, bool reduce_src);
-
-    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
-            const jit_1x1_conv_conf_t &jcp);
-
-    static void balance(jit_1x1_conv_conf_t &jcp);
-
-    jit_1x1_conv_conf_t jcp;
-    const primitive_attr_t &attr_;
-
-private:
-    using Reg = Xbyak_riscv::Reg;
-    using VReg = Xbyak_riscv::VReg;
-    using FReg = Xbyak_riscv::FReg;
-
-    const Reg reg_param = a0;
-    const Reg reg_bcast_data = a1;
-    const Reg reg_load_data = a2;
-    const Reg reg_output_data = a3;
-    const Reg reg_bias_data = a4;
-
-    const Reg reg_load_loop_work = t0;
-    const Reg reg_bcast_loop_work = t1;
-    const Reg reg_reduce_loop_work = t2;
-
-    const Reg aux_reg_bcast_data = t3;
-    const Reg aux_reg_load_data = t4;
-    const Reg aux_reg_output_data = t5;
-    const Reg aux1_reg_bcast_data = t6;
-
-    const Reg reduce_loop_iter = s0;
-    const Reg reg_bcast_loop_iter = s1;
-    const Reg reg_reduce_pos_flag = s2;
-    const Reg reg_output_stride = s3;
-
-    const Reg reg_tmp_imm = s4;
-    const Reg reg_tmp_addr = s5;
-
-    VReg vreg_accum(int i_load, int i_ur) {
-        // Avoid v0, start from v1
-        return VReg(1 + i_ur * jcp.load_loop_blk + i_load);
-    }
-
-    VReg vreg_load(int i_load, int i_unroll = 0) {
-        // Allocate after accum to avoid conflicts
-        // accum uses v1 to v(ur * load_loop_blk)
-        return VReg(1 + jcp.ur * jcp.load_loop_blk
-                + i_unroll * jcp.load_loop_blk + i_load);
-    }
-
-    const FReg freg_bcast = fa0;
-    const FReg freg_load = fa1;
-
-    void generate() override;
-    void preamble();
-    void postamble();
-    void bcast_loop(int load_loop_blk);
-    void reduce_loop(int load_loop_blk, int ur);
-    void fma_block(int load_loop_blk, int ur);
-};
-
-} // namespace rv64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
diff --git a/src/cpu/rv64/jit_rvv_1x1_convolution.cpp b/src/cpu/rv64/jit_rvv_1x1_convolution.cpp
deleted file mode 100644
index f744419990a..00000000000
--- a/src/cpu/rv64/jit_rvv_1x1_convolution.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*******************************************************************************
-* Copyright 2025 ZTE Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "common/c_types_map.hpp"
-#include "common/dnnl_thread.hpp"
-#include "common/type_helpers.hpp"
-#include "common/utils.hpp"
-
-#include "cpu/rv64/jit_rvv_1x1_convolution.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace rv64 {
-
-using namespace dnnl::impl::status;
-using namespace dnnl::impl::utils;
-
-void jit_rvv_1x1_convolution_fwd_t::execute_forward(
-        const exec_ctx_t &ctx) const {
-    auto src = CTX_IN_MEM(const float *, DNNL_ARG_SRC);
-    auto weights = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS);
-    auto bias = CTX_IN_MEM(const float *, DNNL_ARG_BIAS);
-    auto dst = CTX_OUT_MEM(float *, DNNL_ARG_DST);
-
-    const auto &scratchpad = ctx.get_scratchpad_grantor();
-
-    parallel(pd()->jcp_.nthr, [&](const int ithr, const int nthr) {
-        execute_forward_thr(ithr, nthr, src, weights, bias, dst, scratchpad);
-    });
-}
-
-void jit_rvv_1x1_convolution_fwd_t::execute_forward_thr(const int ithr,
-        const int nthr, const float *src, const float *weights,
-        const float *bias, float *dst,
-        const memory_tracking::grantor_t &scratchpad) const {
-
-    const memory_desc_wrapper src_d(pd()->src_md());
-    const memory_desc_wrapper dst_d(pd()->dst_md());
-    const memory_desc_wrapper weights_d(pd()->weights_md(0));
-
-    const auto &jcp = pd()->jcp_;
-
-    auto step = [](int default_step, int remaining, int tail_step) {
-        assert(default_step <= tail_step);
-        return remaining < tail_step ? remaining : default_step;
-    };
-
-    // RVV 1x1 convolution uses NHWC layout.
-    // Spatial dimensions are collapsed into 'os'.
-    // Threading is balanced over (MB * groups * nb_bcast) and (nb_load).
-
-    const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
-    int bcast_start {0}, bcast_end {0}, ocb_start {0}, ocb_end {0};
-
-    balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, jcp.nb_load,
-            ocb_start, ocb_end, jcp.load_grp_count);
-
-    if (bcast_start >= bcast_end || ocb_start >= ocb_end) return;
-
-    auto p = jit_1x1_conv_args_t();
-
-    auto ker_1x1 = [&](int ocb, int load_step, int icb, int n, int g, int osb,
-                           int bcast_step) {
-        const int oc_off = g * jcp.oc_without_padding + ocb * jcp.oc_block;
-        const size_t dst_off
-                = (size_t)n * jcp.os * jcp.ngroups * jcp.oc_without_padding
-                + (size_t)osb * jcp.bcast_block * jcp.ngroups
-                        * jcp.oc_without_padding
-                + oc_off;
-
-        p.output_data = &dst[dst_off];
-        p.bias_data = bias ? &bias[oc_off] : nullptr;
-
-        const size_t wei_off = (size_t)g * jcp.oc * jcp.ic_without_padding
-                + (size_t)ocb * jcp.ic_without_padding * jcp.oc_block
-                + (size_t)icb * jcp.ic_block * jcp.oc_block;
-        p.load_data = &weights[wei_off];
-
-        const int ic_off = g * jcp.ic_without_padding + icb * jcp.ic_block;
-        const size_t src_off
-                = (size_t)n * jcp.is * jcp.ngroups * jcp.ic_without_padding
-                + (size_t)osb * jcp.bcast_block * jcp.ngroups
-                        * jcp.ic_without_padding
-                + ic_off;
-        p.bcast_data = &src[src_off];
-
-        p.bcast_dim = this_block_size(
-                osb * jcp.bcast_block, jcp.os, bcast_step * jcp.bcast_block);
-        p.load_dim = this_block_size(ocb * jcp.oc_block, jcp.oc_without_padding,
-                load_step * jcp.oc_block);
-        p.reduce_dim = this_block_size(icb * jcp.ic_block,
-                jcp.ic_without_padding, jcp.nb_reduce_blocking * jcp.ic_block);
-
-        p.first_last_flag = (icb == 0 ? FLAG_REDUCE_FIRST : 0)
-                | (icb + jcp.nb_reduce_blocking >= jcp.nb_reduce
-                                ? FLAG_REDUCE_LAST
-                                : 0);
-
-        (*kernel_)(&p);
-    };
-
-    // Loop order: Load -> Bcast -> Reduce (LBR)
-    // This order keeps weights in registers/L1 while iterating over spatial.
-    for (int ocb = ocb_start; ocb < ocb_end;) {
-        int load_step = step(
-                jcp.nb_load_blocking, ocb_end - ocb, jcp.nb_load_blocking_max);
-        int iwork = bcast_start;
-        while (iwork < bcast_end) {
-            int n {0}, g {0}, osb {0};
-            nd_iterator_init(
-                    iwork, n, jcp.mb, g, jcp.ngroups, osb, jcp.nb_bcast);
-
-            int bcast_step = step(jcp.nb_bcast_blocking, bcast_end - iwork,
-                    jcp.nb_bcast_blocking_max);
-            bcast_step = nstl::min(bcast_step, jcp.nb_bcast - osb);
-
-            for (int icb = 0; icb < jcp.nb_reduce;
-                    icb += jcp.nb_reduce_blocking) {
-                ker_1x1(ocb, load_step, icb, n, g, osb, bcast_step);
-            }
-            iwork += bcast_step;
-        }
-        ocb += load_step;
-    }
-}
-
-} // namespace rv64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
diff --git a/src/cpu/rv64/jit_rvv_1x1_convolution.hpp b/src/cpu/rv64/jit_rvv_1x1_convolution.hpp
deleted file mode 100644
index 2d379cc6ec9..00000000000
--- a/src/cpu/rv64/jit_rvv_1x1_convolution.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*******************************************************************************
-* Copyright 2025 ZTE Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef CPU_RV64_JIT_RVV_1X1_CONVOLUTION_HPP
-#define CPU_RV64_JIT_RVV_1X1_CONVOLUTION_HPP
-
-#include "common/c_types_map.hpp"
-#include "common/dnnl_thread.hpp"
-#include "common/memory_tracking.hpp"
-#include "common/primitive.hpp"
-#include "common/utils.hpp"
-
-#include "cpu/cpu_convolution_pd.hpp"
-#include "cpu/platform.hpp"
-
-#include "cpu/rv64/jit_rvv_1x1_conv_kernel.hpp"
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace rv64 {
-
-struct jit_rvv_1x1_convolution_fwd_t : public primitive_t {
-    struct pd_t : public cpu_convolution_fwd_pd_t {
-        using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t;
-
-        DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", v, ""),
-                jit_rvv_1x1_convolution_fwd_t);
-
-        status_t init(engine_t *engine) {
-            using namespace utils;
-            using namespace format_tag;
-
-            const memory_desc_wrapper src_d(src_md());
-            const memory_desc_wrapper weights_d(weights_md());
-            const memory_desc_wrapper dst_d(dst_md());
-
-            VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND);
-            VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct),
-                    VERBOSE_BAD_ALGORITHM);
-            VDISPATCH_CONV(
-                    expect_data_types(data_type::f32, data_type::f32,
-                            data_type::f32, data_type::f32, data_type::undef),
-                    VERBOSE_UNSUPPORTED_DT);
-            VDISPATCH_CONV(attr()->has_default_values(
-                                   primitive_attr_t::skip_mask_t::post_ops),
-                    VERBOSE_UNSUPPORTED_ATTR);
-            VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP);
-            VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, "");
-
-            // Only support: data = nwc/nhwc/ndhwc, weights = blocked formats (Oiw4o/gOiw4o/etc)
-            const int n = ndims();
-            const bool g = with_groups();
-            const auto dat_tag_nxc = utils::pick(n - 3, nwc, nhwc, ndhwc);
-            const auto wei_tag_blocked = utils::pick(2 * n - 6 + (g ? 1 : 0),
-                    Oiw4o, gOiw4o, Oihw4o, gOihw4o, Oidhw4o, gOidhw4o);
-
-            // Check if src/dst match supported format (nxc)
-            // Only accept format_kind::any as a fallback, reject explicit
-            // unsupported formats
-            VDISPATCH_CONV(IMPLICATION(src_d.matches_one_of_tag(dat_tag_nxc)
-                                           != dat_tag_nxc,
-                                   src_d.format_kind() == format_kind::any),
-                    VERBOSE_UNSUPPORTED_TAG);
-            VDISPATCH_CONV(IMPLICATION(dst_d.matches_one_of_tag(dat_tag_nxc)
-                                           != dat_tag_nxc,
-                                   dst_d.format_kind() == format_kind::any),
-                    VERBOSE_UNSUPPORTED_TAG);
-            VDISPATCH_CONV(
-                    IMPLICATION(weights_d.matches_one_of_tag(wei_tag_blocked)
-                                    != wei_tag_blocked,
-                            weights_d.format_kind() == format_kind::any),
-                    VERBOSE_UNSUPPORTED_TAG);
-
-            // Set default formats if format_kind == any
-            VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG);
-
-            // ISA check
-            VDISPATCH_CONV(mayiuse(v), VERBOSE_UNSUPPORTED_ISA);
-
-            // 1x1 convolution check
-            const int ndims = src_d.ndims();
-            const int weights_ndims = weights_d.ndims();
-            for (int i = 0; i < ndims - 2; ++i) {
-                VDISPATCH_CONV(
-                        weights_d.dims()[weights_ndims - (ndims - 2) + i] == 1,
-                        VERBOSE_UNSUPPORTED_FEATURE,
-                        "only 1x1 convolution is supported");
-                VDISPATCH_CONV(desc()->strides[i] == 1,
-                        VERBOSE_UNSUPPORTED_FEATURE,
-                        "only stride 1 is supported");
-                VDISPATCH_CONV(desc()->padding[0][i] == 0,
-                        VERBOSE_UNSUPPORTED_FEATURE,
-                        "padding is not supported");
-            }
-
-            VDISPATCH_CONV_SC(jit_rvv_1x1_conv_kernel_t::init_conf(jcp_,
-                                      *desc(), src_d, weights_d, dst_d, *attr(),
-                                      dnnl_get_max_threads(), false),
-                    VERBOSE_UNSUPPORTED_FEATURE, "init_conf failed");
-
-            auto scratchpad = scratchpad_registry().registrar();
-            jit_rvv_1x1_conv_kernel_t::init_scratchpad(scratchpad, jcp_);
-
-            return status::success;
-        }
-
-        jit_1x1_conv_conf_t jcp_ = utils::zero<decltype(jcp_)>();
-
-    protected:
-        bool post_ops_ok() const {
-            // TODO: Post-ops support is not implemented yet.
-            return attr()->post_ops_.len() == 0;
-        }
-        bool set_default_formats() {
-            using namespace format_tag;
-            const int n = ndims();
-            const bool g = with_groups();
-            const auto dat_tag = utils::pick(n - 3, nwc, nhwc, ndhwc);
-            const auto wei_tag = utils::pick(2 * n - 6 + (g ? 1 : 0), Oiw4o,
-                    gOiw4o, Oihw4o, gOihw4o, Oidhw4o, gOidhw4o);
-
-            return set_default_formats_common(dat_tag, wei_tag, dat_tag);
-        }
-    };
-
-    jit_rvv_1x1_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
-
-    status_t init(engine_t *engine) override {
-        CHECK(safe_ptr_assign(kernel_,
-                new jit_rvv_1x1_conv_kernel_t(
-                        pd()->jcp_, *pd()->attr(), *pd()->dst_md())));
-        return kernel_->create_kernel();
-    }
-
-    status_t execute(const exec_ctx_t &ctx) const override {
-        execute_forward(ctx);
-        return status::success;
-    }
-
-private:
-    void execute_forward(const exec_ctx_t &ctx) const;
-    void execute_forward_thr(const int ithr, const int nthr, const float *src,
-            const float *weights, const float *bias, float *dst,
-            const memory_tracking::grantor_t &scratchpad) const;
-
-    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-
-    std::unique_ptr<jit_rvv_1x1_conv_kernel_t> kernel_;
-};
-
-} // namespace rv64
-} // namespace cpu
-} // namespace impl
-} // namespace dnnl
-
-#endif
diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp
index ed75ccfc0ea..dfb271575d0 100644
--- a/src/cpu/rv64/rvv_gemm_convolution.cpp
+++ b/src/cpu/rv64/rvv_gemm_convolution.cpp
@@ -1,22 +1,17 @@
 /*******************************************************************************
-* Copyright 2016 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
+* Copyright 2016 Intel Corporation
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*     http://www.apache.org/licenses/LICENSE-2.0
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
 *******************************************************************************/
-
 #include <atomic>
 #include <riscv_vector.h>
-
 #include "common/c_types_map.hpp"
 #include "common/dnnl_thread.hpp"
 #include "common/type_helpers.hpp"
@@ -38,10 +33,102 @@ struct im_pos_t {
     dim_t n, g, od, sp, ic, oc;
     bool do_im2col(const im_pos_t &prev) const {
         return true
-                && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp
-                        || ic != prev.ic);
+            && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp
+                    || ic != prev.ic);
     }
 };
+
+// Applies the bias and, when the post-op chain is a single ReLU, that ReLU to
+// the inclusive output-channel range [start_oc, end_oc] of one NSPC
+// destination row using RVV. Any other post-op chain is finished by the
+// reference post-ops.
+//
+// bia_arr  - bias of the current group (nullptr is tolerated: bias skipped)
+// dst_arr  - destination row; channel `oc` lives at dst_arr[oc]
+// start_oc, end_oc - inclusive channel range inside the row
+// with_bias, with_eltwise - which of the two steps the caller requested
+// post_ops, ctx, dst_md - used only by the reference post-ops fallback
+// g        - group index, used to build the post-ops logical offset
+// os_offset_factor - currently unused, kept for interface compatibility
+// float is used explicitly because data_t is float in this implementation.
+static void apply_bias_eltwise_rvv_nspc(const float *__restrict bia_arr,
+        float *__restrict dst_arr, size_t start_oc, size_t end_oc,
+        bool with_bias, bool with_eltwise, const ref_post_ops_t *post_ops,
+        const exec_ctx_t &ctx, const memory_desc_t *dst_md,
+        const conv_gemm_conf_t &jcp, size_t g, size_t os_offset_factor) {
+    (void)os_offset_factor;
+
+    // Empty or inverted range: nothing to do (also avoids size_t wrap-around).
+    if (end_oc < start_oc) return;
+    const size_t n_elems = end_oc - start_oc + 1;
+
+    // Never dereference a missing bias pointer.
+    const bool add_bias = with_bias && bia_arr != nullptr;
+    const float *b_ptr = add_bias ? bia_arr + start_oc : nullptr;
+    float *d_ptr = dst_arr + start_oc;
+
+    // The fast path covers a post-op chain that is exactly one ReLU.
+    bool is_fast_relu = false;
+    float eltwise_alpha = 0.0f;
+    float eltwise_scale = 1.0f;
+    if (with_eltwise && jcp.post_ops.len() == 1) {
+        const auto &eltwise = jcp.post_ops.entry_.back().eltwise;
+        if (eltwise.alg == alg_kind::eltwise_relu) {
+            is_fast_relu = true;
+            eltwise_alpha = eltwise.alpha;
+            eltwise_scale = eltwise.scale;
+        }
+    }
+
+    // Vector pass: bias and/or ReLU. The result is always stored, so the
+    // bias-only case stays vectorized as well.
+    if (add_bias || is_fast_relu) {
+        for (size_t oc = 0; oc < n_elems;) {
+            const size_t vl = __riscv_vsetvl_e32m1(n_elems - oc);
+            vfloat32m1_t v_dst = __riscv_vle32_v_f32m1(d_ptr + oc, vl);
+
+            if (add_bias) {
+                const vfloat32m1_t v_bias
+                        = __riscv_vle32_v_f32m1(b_ptr + oc, vl);
+                v_dst = __riscv_vfadd_vv_f32m1(v_dst, v_bias, vl);
+            }
+
+            if (is_fast_relu) {
+                if (eltwise_alpha == 0.0f) {
+                    v_dst = __riscv_vfmax_vf_f32m1(v_dst, 0.0f, vl);
+                } else {
+                    // Scale negative lanes only. The mask-undisturbed (_mu)
+                    // form is required: plain _m is mask-agnostic and would
+                    // leave the non-negative lanes unspecified.
+                    const vbool32_t neg
+                            = __riscv_vmflt_vf_f32m1_b32(v_dst, 0.0f, vl);
+                    v_dst = __riscv_vfmul_vf_f32m1_mu(
+                            neg, v_dst, v_dst, eltwise_alpha, vl);
+                }
+                if (eltwise_scale != 1.0f)
+                    v_dst = __riscv_vfmul_vf_f32m1(v_dst, eltwise_scale, vl);
+            }
+
+            __riscv_vse32_v_f32m1(d_ptr + oc, v_dst, vl);
+            oc += vl;
+        }
+    }
+
+    // Generic post-ops (non-ReLU eltwise, binary, ...) go through the
+    // reference implementation one element at a time; bias is already added.
+    if (!is_fast_relu && (with_eltwise || jcp.with_binary)) {
+        ref_post_ops_t::args_t args;
+        args.ctx = &ctx;
+        args.dst_md = dst_md;
+        for (size_t oc = start_oc; oc <= end_oc; ++oc) {
+            // jcp.od is not part of jcp.os, so multiply by jcp.od to get the
+            // spatial offset.
+            args.l_offset = (g * jcp.oc + oc) * (jcp.os * jcp.od);
+            post_ops->execute(dst_arr[oc], args);
+        }
+    }
+}
+
 } // namespace
 
 status_t riscv_gemm_convolution_fwd_t::execute_forward_nspc(
@@ -50,7 +137,6 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_nspc(
     auto wei_base = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
     auto bia_base = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS);
     auto dst_base = CTX_OUT_MEM(data_t *, DNNL_ARG_DST);
-
     auto scratchpad = ctx.get_scratchpad_grantor();
     const conv_gemm_conf_t &jcp = pd()->jcp_;
     std::atomic<status_t> st(status::success);
@@ -58,7 +144,11 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_nspc(
     parallel(jcp.nthr, [&](const int ithr, const int nthr) {
         status_t st_thr = execute_forward_thr_nspc(ctx, ithr, nthr, src_base,
                 wei_base, bia_base, dst_base, scratchpad);
-        if (st_thr != status::success) st = st_thr;
+
+        if (st_thr != status::success) {
+            status_t expected = status::success;
+            st.compare_exchange_strong(expected, st_thr);
+        }
     });
 
     return st;
@@ -69,7 +159,6 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
         const data_t *src_base, const data_t *wei_base, const data_t *bia_base,
         data_t *dst_base, const memory_tracking::grantor_t &scratchpad) const {
     const conv_gemm_conf_t &jcp = pd()->jcp_;
-
     // Src Format: mb-spatial-groups-input_channels
     const dim_t src_mb_stride = jcp.id * jcp.ih * jcp.iw * jcp.ngroups * jcp.ic;
     const dim_t src_g_stride = jcp.ic;
@@ -92,7 +181,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
 
     assert(IMPLICATION(is_problem_3d,
             jcp.oh_block == jcp.oh && jcp.ow_block == jcp.ow
-                    && jcp.ic_block == jcp.ic));
+                     && jcp.ic_block == jcp.ic));
     assert(IMPLICATION(jcp.ow_block != jcp.ow, jcp.oh_block == 1));
 
     const dim_t nb_oh = div_up(jcp.oh, jcp.oh_block);
@@ -102,10 +191,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
     balance211(work_amount, nthr, ithr, start, end);
     nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow);
 
+    // Pre-zeroing for 3D problem if needed (outside loop)
     if (jcp.im2col_sz && is_problem_3d) {
-        // jit_gemm_convolution_utils::im2col_dt_3d() requires external
-        // data initialization by zeroes
-
         const size_t total_sz = jcp.im2col_sz;
         const size_t vlmax = __riscv_vsetvlmax_e32m1();
         const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
@@ -120,6 +207,10 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
         }
     }
 
+    // Cache post_ops pointer and dst_md
+    const ref_post_ops_t *post_ops_ptr = post_ops_.get();
+    const memory_desc_t *dst_md_ptr = pd()->dst_md();
+
     for (dim_t iwork = start; iwork < end; ++iwork) {
         dim_t oh = ohb * jcp.oh_block;
         dim_t ow = owb * jcp.ow_block;
@@ -129,14 +220,16 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
 
         const int h_step = nstl::min(jcp.oh_block, jcp.oh - oh);
         const int w_step = nstl::min(jcp.ow_block, jcp.ow - ow);
+        
         if (jcp.im2col_sz && is_problem_3d) {
-            jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr);
+             jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr);
         }
 
         for (int od = 0; od < jcp.od; od++) {
             data_t *__restrict dst = dst_base + n * dst_mb_stride
                     + g * dst_g_stride
                     + ((od * jcp.oh + oh) * jcp.ow + ow) * dst_os_stride;
+            
             if (jcp.im2col_sz) {
                 if (is_problem_3d)
                     jit_gemm_convolution_utils::im2col_dt_3d<data_t, data_t>(
@@ -152,25 +245,27 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
             const dim_t LDA = M * jcp.ngroups;
             const dim_t LDB = jcp.im2col_sz ? N : K * jcp.ngroups;
             const dim_t LDC = M * jcp.ngroups;
-            const char *BT = jcp.im2col_sz ? "T" : "N";
+            const char *BT = jcp.im2col_sz ? "T " : "N ";
             const data_t onef = 1.f;
             const float beta = jcp.with_sum ? 1.0f : 0.0f;
             const data_t *__restrict src_od
                     = src + od * jcp.oh * jcp.ow * jcp.ngroups * jcp.ic;
-            status_t st = extended_sgemm("N", BT, &M, &N, &K, &onef, wei, &LDA,
+            
+            status_t st = extended_sgemm("N ", BT, &M, &N, &K, &onef, wei, &LDA,
                     jcp.im2col_sz ? col : (data_t *)src_od, &LDB, &beta, dst,
                     &LDC);
             if (st != status::success) return st;
 
             if (jcp.with_bias || jcp.with_eltwise || jcp.with_binary) {
-                parallel(0, [&](int ithr, int nthr) {
-                    dim_t start, end;
-                    balance211(N * jcp.oc, nthr, ithr, start, end);
+                // Nested parallel: split the N * jcp.oc outputs across threads.
+                parallel(0, [&](int ithr_inner, int nthr_inner) {
+                    dim_t start_inner, end_inner;
+                    balance211(N * jcp.oc, nthr_inner, ithr_inner, start_inner, end_inner);
 
-                    const size_t first_oc = start % jcp.oc;
-                    const size_t last_oc = (end - 1) % jcp.oc;
-                    const size_t first_os = start / jcp.oc;
-                    const size_t last_os = (end - 1) / jcp.oc;
+                    const size_t first_oc = start_inner % jcp.oc;
+                    const size_t last_oc = (end_inner - 1) % jcp.oc;
+                    const size_t first_os = start_inner / jcp.oc;
+                    const size_t last_os = (end_inner - 1) / jcp.oc;
 
                     for (size_t os = first_os; os <= last_os; ++os) {
                         const size_t start_oc = (os == first_os) ? first_oc : 0;
@@ -181,60 +276,36 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
                                 = bia_base ? bia_base + g * jcp.oc : nullptr;
                         data_t *__restrict dst_arr = dst + os * dst_os_stride;
 
-                        if (jcp.with_bias) {
-                            size_t n_elems = end_oc - start_oc + 1;
-                            if (n_elems > 0) {
-                                size_t oc = 0;
-                                const data_t *b_ptr = bia_arr + start_oc;
-                                data_t *d_ptr = dst_arr + start_oc;
-
-                                while (oc < n_elems) {
-                                    size_t vl = __riscv_vsetvl_e32m1(
-                                            n_elems - oc);
-                                    vfloat32m1_t v_dst = __riscv_vle32_v_f32m1(
-                                            d_ptr + oc, vl);
-                                    vfloat32m1_t v_bias = __riscv_vle32_v_f32m1(
-                                            b_ptr + oc, vl);
-                                    v_dst = __riscv_vfadd_vv_f32m1(
-                                            v_dst, v_bias, vl);
-                                    __riscv_vse32_v_f32m1(
-                                            d_ptr + oc, v_dst, vl);
-                                    oc += vl;
-                                }
-                            }
-                        }
-
-                        if (jcp.with_eltwise || jcp.with_binary) {
-                            bool fast_relu_done = false;
-                            if (jcp.with_eltwise && jcp.post_ops.len() == 1) {
-                                // fast branch for ReLU case
-                                const auto &eltwise
-                                        = jcp.post_ops.entry_.back().eltwise;
-
-                                if (eltwise.alg == alg_kind::eltwise_relu) {
-                                    const auto alpha = eltwise.alpha;
-                                    const auto scale = eltwise.scale;
-                                    PRAGMA_OMP_SIMD()
-                                    for (size_t oc = start_oc; oc <= end_oc;
-                                            oc++) {
-                                        if (dst_arr[oc] < 0)
-                                            dst_arr[oc] *= alpha;
-                                        dst_arr[oc] *= scale;
+                        // Check if we can use optimized RVV path
+                        bool has_binary = jcp.with_binary;
+                        bool has_complex_eltwise = jcp.with_eltwise && !(jcp.post_ops.len() == 1 && jcp.post_ops.entry_.back().eltwise.alg == alg_kind::eltwise_relu);
+                        
+                        if (!has_binary && !has_complex_eltwise) {
+                             apply_bias_eltwise_rvv_nspc(
+                                (const float*)bia_arr, (float*)dst_arr, start_oc, end_oc,
+                                jcp.with_bias, jcp.with_eltwise,
+                                post_ops_ptr, ctx, dst_md_ptr, jcp, g, 0);
+                        } else {
+                            // Fallback to original scalar logic for complex cases
+                            if (jcp.with_bias) {
+                                size_t n_elems = end_oc - start_oc + 1;
+                                if (n_elems > 0) {
+                                    // Scalar bias add
+                                    for(size_t k=0; k<n_elems; ++k) {
+                                        dst_arr[start_oc + k] += bia_arr[start_oc + k];
                                     }
-                                    fast_relu_done = true;
                                 }
                             }
-                            if (!fast_relu_done) {
+                            
+                            if (jcp.with_eltwise || jcp.with_binary) {
                                 ref_post_ops_t::args_t args;
                                 args.ctx = &ctx;
-                                args.dst_md = pd()->dst_md();
-
+                                args.dst_md = dst_md_ptr;
+                                
                                 for (size_t oc = start_oc; oc <= end_oc; oc++) {
-                                    // jcp.od is not part of jcp.os, so multiply
-                                    // jcp.od to get spatial offset.
                                     args.l_offset = (g * jcp.oc + oc)
                                             * (jcp.os * jcp.od);
-                                    post_ops_->execute(dst_arr[oc], args);
+                                    post_ops_ptr->execute(dst_arr[oc], args);
                                 }
                             }
                         }
@@ -253,7 +324,6 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
     auto weights = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
     auto bias = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS);
     auto dst = CTX_OUT_MEM(data_t *, DNNL_ARG_DST);
-
     auto col = ctx.get_scratchpad_grantor().get<data_t>(key_conv_gemm_col);
 
     const conv_gemm_conf_t &jcp = this->pd()->jcp_;
@@ -278,7 +348,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
 
     assert(IMPLICATION(is_problem_3d,
             jcp.os_block == jcp.os && jcp.ic_block == jcp.ic
-                    && jcp.os_nb_block == 1));
+                     && jcp.os_nb_block == 1));
 
     status_t st = status::success;
     parallel(jcp.nthr, [&](const int ithr, const int nthr) {
@@ -288,9 +358,20 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
         // external data initialization by zeroes
         const bool outer_padding = jcp.os_nb_block == 1;
         if (outer_padding && is_problem_3d) {
-            for (ptrdiff_t i = 0; i < jcp.im2col_sz; i++)
-                _col[i] = (data_t)0;
+            // OPTIMIZATION: Vectorized zeroing
+            const size_t total_sz = jcp.im2col_sz;
+            const size_t vlmax = __riscv_vsetvlmax_e32m1();
+            const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
+            ptrdiff_t i = 0;
+            for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; i += (ptrdiff_t)vlmax) {
+                __riscv_vse32_v_f32m1(_col + i, v_zero, vlmax);
+            }
+            if (i < (ptrdiff_t)total_sz) {
+                size_t vl = __riscv_vsetvl_e32m1(total_sz - i);
+                __riscv_vse32_v_f32m1(_col + i, v_zero, vl);
+            }
         }
+        
         auto inner_ker = [&](int spatial, const im_pos_t &curr, im_pos_t &prev,
                                  im_pos_t &step, const im_pos_t &end) {
             const data_t *_src
@@ -315,7 +396,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
             const data_t one = 1.0;
 
             const dim_t M = jcp.os * jcp.od;
-            const dim_t m = step.sp;
+            const dim_t m = step.sp ;
             const dim_t LDA = jcp.im2col_sz ? m : M;
             data_t *_dst = dst + curr.n * dst_mb_stride + curr.g * dst_g_stride
                     + curr.oc * M + curr.od * jcp.os + curr.sp;
@@ -331,14 +412,11 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
             const data_t *_weights = weights + curr.g * weights_g_size
                     + curr.oc * weights_oc_size + curr.ic * jcp.ks;
 
-            status_t st = extended_sgemm("N", "N", &m, &N, &K, &one, _source,
+            status_t st = extended_sgemm("N ", "N ", &m, &N, &K, &one, _source,
                     &LDA, _weights, &LDB, &beta, _dst, &M);
             if (st != status::success) return st;
 
             if (curr.ic == jcp.ic - step.ic) {
-                // TODO: for "outer threading" we have parallel section within
-                // outermost "parallel". It is not good. Consider to use
-                // "parallel" here with number of threads passed as parameter
                 const int oc_start = curr.g * jcp.oc + curr.oc;
                 if (jcp.with_eltwise || jcp.with_binary) {
                     bool fast_relu_done = false;
@@ -364,11 +442,11 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
                                                 v_d, b, vl); // Add bias
 
                                         v_d = __riscv_vfmax_vf_f32m1(
-                                                v_d, 0.0f, vl);
+                                                 v_d, 0.0f, vl);
 
                                         if (eltwise.scale != 1.0f) {
                                             v_d = __riscv_vfmul_vf_f32m1(
-                                                    v_d, eltwise.scale, vl);
+                                                     v_d, eltwise.scale, vl);
                                         }
 
                                         __riscv_vse32_v_f32m1(d_ + oS, v_d, vl);
@@ -385,10 +463,10 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
                                         v_d = __riscv_vfadd_vf_f32m1(
                                                 v_d, b, vl); // Add bias
                                         vbool32_t mask
-                                                = __riscv_vmflt_vf_f32m1_b32(
+                                                 = __riscv_vmflt_vf_f32m1_b32(
                                                         v_d, 0.0f, vl);
                                         v_d = __riscv_vfmul_vf_f32m1_m(
-                                                mask, v_d, eltwise.alpha, vl);
+                                                 mask, v_d, eltwise.alpha, vl);
                                         v_d = __riscv_vfmul_vf_f32m1(
                                                 v_d, eltwise.scale, vl);
                                         __riscv_vse32_v_f32m1(d_ + oS, v_d, vl);
@@ -499,4 +577,4 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
 } // namespace rv64
 } // namespace cpu
 } // namespace impl
-} // namespace dnnl
+} // namespace dnnl
\ No newline at end of file
diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp
index e0f2afe3c07..19f4289920c 100644
--- a/src/cpu/rv64/rvv_gemm_convolution.hpp
+++ b/src/cpu/rv64/rvv_gemm_convolution.hpp
@@ -69,7 +69,6 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
             // TODO: make `init_conf` assign initialized object to `jcp_`
             jcp_ = conv_gemm_conf_t();
 
-            std::cout << "GEMM INIT CONSTRUCTION" << std::endl;
             return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad,
                     *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_,
                     dnnl_get_max_threads());
diff --git a/third_party/xbyak_riscv/xbyak_riscv.hpp b/third_party/xbyak_riscv/xbyak_riscv.hpp
deleted file mode 100644
index 249553a36f9..00000000000
--- a/third_party/xbyak_riscv/xbyak_riscv.hpp
+++ /dev/null
@@ -1,1383 +0,0 @@
-#pragma once
-/*!
-	@file xbyak_riscv.hpp
-	@brief Xbyak_riscv ; JIT assembler for RISC-V
-	@author herumi
-	@url https://github.com/herumi/xbyak_riscv
-	@note modified new BSD license
-	http://opensource.org/licenses/BSD-3-Clause
-*/
-
-// Copyright (C), 2023, KNS Group LLC (YADRO)
-
-#include <stdio.h>
-#include <stdint.h>
-#include <assert.h>
-#include <list>
-#include <string>
-#include <algorithm>
-#include <unordered_set>
-#include <unordered_map>
-
-#ifdef _WIN32
-	#ifndef WIN32_LEAN_AND_MEAN
-		#define WIN32_LEAN_AND_MEAN
-	#endif
-	#include <windows.h>
-	#include <malloc.h>
-#elif defined(__GNUC__)
-	#include <unistd.h>
-	#include <sys/mman.h>
-	#include <stdlib.h>
-#endif
-#if defined(__APPLE__)
-	#define XBYAK_RISCV_USE_MAP_JIT
-	#include <sys/sysctl.h>
-	#ifndef MAP_JIT
-		#define MAP_JIT 0x800
-	#endif
-#endif
-
-#if defined(__GNUC__) && !defined(__MINGW32__)
-	#define XBYAK_RISCV_USE_MMAP_ALLOCATOR
-#endif
-
-#ifdef NDEBUG
-	#define XBYAK_RISCV_ASSERT(x)
-#else
-	#define XBYAK_RISCV_ASSERT(x) assert(x)
-#endif
-
-// MFD_CLOEXEC defined only linux 3.17 or later.
-// Android wraps the memfd_create syscall from API version 30.
-#if !defined(MFD_CLOEXEC) || (defined(__ANDROID__) && __ANDROID_API__ < 30)
-	#undef XBYAK_RISCV_USE_MEMFD
-#endif
-
-#if defined(_WIN64) || defined(__MINGW64__) || (defined(__CYGWIN__) && defined(__x86_64__))
-	#define XBYAK_RISCV64_WIN
-#elif defined(__x86_64__)
-	#define XBYAK_RISCV64_GCC
-#endif
-#if !defined(XBYAK_RISCV64) && !defined(XBYAK_RISCV32)
-	#if defined(XBYAK_RISCV64_GCC) || defined(XBYAK_RISCV64_WIN)
-		#define XBYAK_RISCV64
-	#else
-		#define XBYAK_RISCV32
-	#endif
-#endif
-
-#ifdef _MSC_VER
-	#pragma warning(push)
-	#pragma warning(disable : 4514) /* remove inline function */
-	#pragma warning(disable : 4786) /* identifier is too long */
-	#pragma warning(disable : 4503) /* name is too long */
-	#pragma warning(disable : 4127) /* constant expresison */
-#endif
-
-#include "xbyak_riscv_csr.hpp"
-
-#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
-	#define XBYAK_RISCV_CONSTEXPR constexpr
-#else
-	#define XBYAK_RISCV_CONSTEXPR
-#endif
-
-namespace Xbyak_riscv {
-
-enum {
-	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x1010 /* 0xABCD = A.BC.D */
-};
-
-inline uint32_t getVersion() { return VERSION; }
-
-enum {
-	ERR_NONE = 1,
-	ERR_OFFSET_IS_TOO_BIG,
-	ERR_CODE_IS_TOO_BIG,
-	ERR_IMM_IS_TOO_BIG,
-	ERR_INVALID_IMM_OF_JAL,
-	ERR_INVALID_IMM_OF_BTYPE,
-	ERR_LABEL_IS_NOT_FOUND,
-	ERR_LABEL_IS_REDEFINED,
-	ERR_LABEL_IS_TOO_FAR,
-	ERR_LABEL_IS_NOT_SET_BY_L,
-	ERR_LABEL_IS_ALREADY_SET_BY_L,
-	ERR_CANT_PROTECT,
-	ERR_CANT_ALLOC,
-	ERR_BAD_PARAMETER,
-	ERR_MUNMAP,
-	ERR_BAD_ALIGN,
-	ERR_INTERNAL // Put it at last.
-};
-
-inline const char *ConvertErrorToString(int err)
-{
-	static const char *errTbl[] = {
-		"none",
-		"offset is too big",
-		"code is too big",
-		"imm is too big",
-		"invalid imm of jal",
-		"invalid imm of Btype",
-		"label is not found",
-		"label is redefined",
-		"label is too far",
-		"label is not set by L",
-		"label is already set by L",
-		"can't protect",
-		"can't alloc",
-		"bad parameter",
-		"munmap",
-		"bad align",
-		"internal error"
-	};
-	assert(ERR_INTERNAL == sizeof(errTbl) / sizeof(*errTbl));
-	return err <= ERR_INTERNAL ? errTbl[err] : "unknown err";
-}
-
-#ifdef XBYAK_RISCV_NO_EXCEPTION
-namespace local {
-
-inline int& GetErrorRef() {
-	static thread_local int err = 0;
-	return err;
-}
-
-inline void SetError(int err) {
-	if (local::GetErrorRef()) return; // keep the first err code
-	local::GetErrorRef() = err;
-}
-
-} // local
-
-inline void ClearError() {
-	local::GetErrorRef() = 0;
-}
-inline int GetError() { return Xbyak_riscv::local::GetErrorRef(); }
-
-#define XBYAK_RISCV_THROW(err) { Xbyak_riscv::local::SetError(err); return; }
-#define XBYAK_RISCV_THROW_RET(err, r) { Xbyak_riscv::local::SetError(err); return r; }
-
-#else
-class Error : public std::exception {
-	int err_;
-public:
-	explicit Error(int err) : err_(err)
-	{
-		if (err_ < 0 || err_ > ERR_INTERNAL) {
-			err_ = ERR_INTERNAL;
-		}
-	}
-	operator int() const { return err_; }
-	const char *what() const noexcept override
-	{
-		return ConvertErrorToString(err_);
-	}
-};
-
-// dummy functions
-inline void ClearError() { }
-inline int GetError() { return 0; }
-
-inline const char *ConvertErrorToString(const Error& err)
-{
-	return err.what();
-}
-
-#define XBYAK_RISCV_THROW(err) { throw Error(err); }
-#define XBYAK_RISCV_THROW_RET(err, r) { throw Error(err); }
-
-#endif
-
-inline void *AlignedMalloc(size_t size, size_t alignment)
-{
-#ifdef __MINGW32__
-	return __mingw_aligned_malloc(size, alignment);
-#elif defined(_WIN32)
-	return _aligned_malloc(size, alignment);
-#else
-	void *p;
-	int ret = posix_memalign(&p, alignment, size);
-	return (ret == 0) ? p : 0;
-#endif
-}
-
-inline void AlignedFree(void *p)
-{
-#ifdef __MINGW32__
-	__mingw_aligned_free(p);
-#elif defined(_MSC_VER)
-	_aligned_free(p);
-#else
-	free(p);
-#endif
-}
-
-namespace local {
-
-static const size_t ALIGN_PAGE_SIZE = 4096;
-
-inline XBYAK_RISCV_CONSTEXPR uint32_t mask(size_t n)
-{
-	XBYAK_RISCV_ASSERT(n <= 32);
-	return n == 32 ? 0xffffffff : (1u << n) - 1;
-}
-// is x <= mask(n) ?
-inline XBYAK_RISCV_CONSTEXPR bool inBit(uint32_t x, size_t n)
-{
-	return x <= mask(n);
-}
-
-// is x a signed n-bit integer?
-inline XBYAK_RISCV_CONSTEXPR bool inSBit(int x, int n)
-{
-	return -(1 << (n-1)) <= x && x < (1 << (n-1));
-}
-
-// split x to hi20bits and low12bits
-// return false if x in 12-bit signed integer
-inline bool split32bit(int *pH, int* pL, int x) {
-	if (inSBit(x, 12)) return false;
-	int H = (x >> 12) & mask(20);
-	int L = x & mask(12);
-	if (x & (1 << 11)) {
-		H++;
-		L = L | (mask(20) << 12);
-	}
-	*pH = H;
-	*pL = L;
-	return true;
-}
-
-// @@@ embedded by bit_pattern.py (DON'T DELETE THIS LINE)
-inline size_t get20_10to1_11_19to12_z12(size_t v) { return ((v & (1<<20)) << 11)| ((v & (1023<<1)) << 20)| ((v & (1<<11)) << 9)| (v & (255<<12)); }
-inline size_t get12_10to5_z13_4to1_11_z7(size_t v) { return ((v & (1<<12)) << 19)| ((v & (63<<5)) << 20)| ((v & (15<<1)) << 7)| ((v & (1<<11)) >> 4); }
-inline size_t get5to4_9to6_2_3_z5(size_t v) { return ((v & (3<<4)) << 7)| ((v & (15<<6)) << 1)| ((v & (1<<2)) << 4)| ((v & (1<<3)) << 2); }
-inline size_t get9_z5_4_6_8to7_5_z2(size_t v) { return ((v & (1<<9)) << 3)| ((v & (1<<4)) << 2)| ((v & (1<<6)) >> 1)| ((v & (3<<7)) >> 4)| ((v & (1<<5)) >> 3); }
-inline size_t get5to3_z3_2_6_z5(size_t v) { return ((v & (7<<3)) << 7)| ((v & (1<<2)) << 4)| ((v & (1<<6)) >> 1); }
-inline size_t get5to3_z3_7_6_z5(size_t v) { return ((v & (7<<3)) << 7)| ((v & (1<<7)) >> 1)| ((v & (1<<6)) >> 1); }
-inline size_t get5_z5_4to0_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & 31) << 2); }
-inline size_t get11_4_9to8_10_6_7_3to1_5_z2(size_t v) { return ((v & (1<<11)) << 1)| ((v & (1<<4)) << 7)| ((v & (3<<8)) << 1)| ((v & (1<<10)) >> 2)| ((v & (1<<6)) << 1)| ((v & (1<<7)) >> 1)| ((v & (7<<1)) << 2)| ((v & (1<<5)) >> 3); }
-inline size_t get17_z5_16to12_z2(size_t v) { return ((v & (1<<17)) >> 5)| ((v & (31<<12)) >> 10); }
-inline size_t get5_z5_4to2_7to6_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & (7<<2)) << 2)| ((v & (3<<6)) >> 4); }
-inline size_t get5_z5_4to3_8to6_z2(size_t v) { return ((v & (1<<5)) << 7)| ((v & (3<<3)) << 2)| ((v & (7<<6)) >> 4); }
-inline size_t get5to2_7to6_z7(size_t v) { return ((v & (15<<2)) << 7)| ((v & (3<<6)) << 1); }
-inline size_t get5to3_8to6_z7(size_t v) { return ((v & (7<<3)) << 7)| ((v & (7<<6)) << 1); }
-// @@@ embedded by bit_pattern.py (DON'T DELETE THIS LINE)
-
-} // local
-
-/*
-	custom allocator
-*/
-struct Allocator {
-	explicit Allocator(const std::string& = "") {} // same interface with MmapAllocator
-	virtual uint8_t *alloc(size_t size) { return reinterpret_cast<uint8_t*>(AlignedMalloc(size, local::ALIGN_PAGE_SIZE)); }
-	virtual void free(uint8_t *p) { AlignedFree(p); }
-	virtual ~Allocator() {}
-	/* override to return false if you call protect() manually */
-	virtual bool useProtect() const { return true; }
-};
-
-#ifdef XBYAK_RISCV_USE_MMAP_ALLOCATOR
-#ifdef XBYAK_RISCV_USE_MAP_JIT
-namespace local {
-
-inline int getMacOsVersionPure()
-{
-	char buf[64];
-	size_t size = sizeof(buf);
-	int err = sysctlbyname("kern.osrelease", buf, &size, NULL, 0);
-	if (err != 0) return 0;
-	char *endp;
-	int major = strtol(buf, &endp, 10);
-	if (*endp != '.') return 0;
-	return major;
-}
-
-inline int getMacOsVersion()
-{
-	static const int version = getMacOsVersionPure();
-	return version;
-}
-
-} // local
-#endif
-class MmapAllocator : public Allocator {
-	struct Allocation {
-		size_t size;
-#if defined(XBYAK_RISCV_USE_MEMFD)
-		// fd_ is only used with XBYAK_RISCV_USE_MEMFD. We keep the file open
-		// during the lifetime of each allocation in order to support
-		// checkpoint/restore by unprivileged users.
-		int fd;
-#endif
-	};
-	const std::string name_; // only used with XBYAK_RISCV_USE_MEMFD
-	typedef std::unordered_map<uintptr_t, Allocation> AllocationList;
-	AllocationList allocList_;
-public:
-	explicit MmapAllocator(const std::string& name = "xbyak") : name_(name) {}
-	uint8_t *alloc(size_t size) override
-	{
-		const size_t alignedSizeM1 = local::ALIGN_PAGE_SIZE - 1;
-		size = (size + alignedSizeM1) & ~alignedSizeM1;
-#if defined(MAP_ANONYMOUS)
-		int mode = MAP_PRIVATE | MAP_ANONYMOUS;
-#elif defined(MAP_ANON)
-		int mode = MAP_PRIVATE | MAP_ANON;
-#else
-		#error "not supported"
-#endif
-#if defined(XBYAK_RISCV_USE_MAP_JIT)
-		const int mojaveVersion = 18;
-		if (local::getMacOsVersion() >= mojaveVersion) mode |= MAP_JIT;
-#endif
-		int fd = -1;
-#if defined(XBYAK_RISCV_USE_MEMFD)
-		fd = memfd_create(name_.c_str(), MFD_CLOEXEC);
-		if (fd != -1) {
-			mode = MAP_SHARED;
-			if (ftruncate(fd, size) != 0) {
-				close(fd);
-				XBYAK_RISCV_THROW_RET(ERR_CANT_ALLOC, 0)
-			}
-		}
-#endif
-		void *p = mmap(NULL, size, PROT_READ | PROT_WRITE, mode, fd, 0);
-		if (p == MAP_FAILED) {
-			if (fd != -1) close(fd);
-			XBYAK_RISCV_THROW_RET(ERR_CANT_ALLOC, 0)
-		}
-		assert(p);
-		Allocation &alloc = allocList_[(uintptr_t)p];
-		alloc.size = size;
-#if defined(XBYAK_RISCV_USE_MEMFD)
-		alloc.fd = fd;
-#endif
-		return (uint8_t*)p;
-	}
-	void free(uint8_t *p) override
-	{
-		if (p == 0) return;
-		AllocationList::iterator i = allocList_.find((uintptr_t)p);
-		if (i == allocList_.end()) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER)
-		if (munmap((void*)i->first, i->second.size) < 0) XBYAK_RISCV_THROW(ERR_MUNMAP)
-#if defined(XBYAK_RISCV_USE_MEMFD)
-		if (i->second.fd != -1) close(i->second.fd);
-#endif
-		allocList_.erase(i);
-	}
-};
-#endif
-
-namespace local {
-
-// Register Interface
-class IReg {
-public:
-	enum Kind {
-		GPR = 1,         // General purpose register
-		FReg = 1 << 1,   // Floating-point register
-		VECTOR = 1 << 2, // Vector register
-	};
-protected:
-	uint32_t idx_;
-	Kind kind_;
-public:
-	XBYAK_RISCV_CONSTEXPR IReg(uint32_t idx = 0, Kind kind = GPR)
-		: idx_(idx), kind_(kind)
-	{
-		XBYAK_RISCV_ASSERT(local::inBit(idx, 5));
-	}
-	XBYAK_RISCV_CONSTEXPR int getIdx() const { return idx_; }
-	const char *toString() const
-	{
-		if (kind_ == GPR) {
-			static const char tbl[][4] = {
-				"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
-				"x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
-				"x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
-				"x24", "x25", "x26", "x27", "x28", "x29", "x30", "x31",
-			};
-			return tbl[idx_];
-		} else if (kind_ == FReg) {
-			static const char tbl[][4] = {
-				"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
-				"f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15",
-				"f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
-				"f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
-			};
-			return tbl[idx_];
-		} else if (kind_ == VECTOR) {
-			static const char tbl[][4] = {
-				"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-				"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
-				"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
-				"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
-			};
-			return tbl[idx_];
-		}
-		XBYAK_RISCV_THROW_RET(ERR_INTERNAL, 0);
-	}
-	bool operator==(const IReg& rhs) const
-	{
-		return idx_ == rhs.idx_ && kind_ == rhs.kind_;
-	}
-	bool operator!=(const IReg& rhs) const { return !operator==(rhs); }
-
-};
-
-} // local
-
-// General Purpose Register
-struct Reg : public local::IReg {
-	explicit XBYAK_RISCV_CONSTEXPR Reg(int idx = 0) : local::IReg(idx, IReg::Kind::GPR) { }
-};
-
-static XBYAK_RISCV_CONSTEXPR Reg x0(0), x1(1), x2(2), x3(3), x4(4), x5(5), x6(6), x7(7);
-static XBYAK_RISCV_CONSTEXPR Reg x8(8), x9(9), x10(10), x11(11), x12(12), x13(13), x14(14), x15(15);
-static XBYAK_RISCV_CONSTEXPR Reg x16(16), x17(17), x18(18), x19(19), x20(20), x21(21), x22(22), x23(23);
-static XBYAK_RISCV_CONSTEXPR Reg x24(24), x25(25), x26(26), x27(27), x28(28), x29(29), x30(30), x31(31);
-
-static XBYAK_RISCV_CONSTEXPR Reg zero(x0);
-static XBYAK_RISCV_CONSTEXPR Reg ra(x1);
-static XBYAK_RISCV_CONSTEXPR Reg sp(x2);
-static XBYAK_RISCV_CONSTEXPR Reg gp(x3);
-static XBYAK_RISCV_CONSTEXPR Reg tp(x4);
-static XBYAK_RISCV_CONSTEXPR Reg t0(x5);
-static XBYAK_RISCV_CONSTEXPR Reg t1(x6);
-static XBYAK_RISCV_CONSTEXPR Reg t2(x7);
-static XBYAK_RISCV_CONSTEXPR Reg fp(x8);
-static XBYAK_RISCV_CONSTEXPR Reg s0(x8);
-static XBYAK_RISCV_CONSTEXPR Reg s1(x9);
-static XBYAK_RISCV_CONSTEXPR Reg a0(x10), a1(x11), a2(x12), a3(x13), a4(x14), a5(x15), a6(x16), a7(x17);
-static XBYAK_RISCV_CONSTEXPR Reg s2(x18), s3(x19), s4(x20), s5(x21), s6(x22), s7(x23), s8(x24), s9(x25);
-static XBYAK_RISCV_CONSTEXPR Reg s10(x26), s11(x27);
-static XBYAK_RISCV_CONSTEXPR Reg t3(x28), t4(x29), t5(x30), t6(x31);
-
-// Floating Point Register
-struct FReg : public local::IReg {
-	explicit XBYAK_RISCV_CONSTEXPR FReg(int idx = 0) : local::IReg(idx, IReg::Kind::FReg) { }
-};
-
-static XBYAK_RISCV_CONSTEXPR FReg f0(0), f1(1), f2(2), f3(3), f4(4), f5(5), f6(6), f7(7);
-static XBYAK_RISCV_CONSTEXPR FReg f8(8), f9(9), f10(10), f11(11), f12(12), f13(13), f14(14), f15(15);
-static XBYAK_RISCV_CONSTEXPR FReg f16(16), f17(17), f18(18), f19(19), f20(20), f21(21), f22(22), f23(23);
-static XBYAK_RISCV_CONSTEXPR FReg f24(24), f25(25), f26(26), f27(27), f28(28), f29(29), f30(30), f31(31);
-// ABI name
-static XBYAK_RISCV_CONSTEXPR FReg ft0(0), ft1(1), ft2(2), ft3(3), ft4(4), ft5(5), ft6(6), ft7(7);
-static XBYAK_RISCV_CONSTEXPR FReg fs0(8), fs1(9), fa0(10), fa1(11), fa2(12), fa3(13), fa4(14), fa5(15), fa6(16), fa7(f17);
-static XBYAK_RISCV_CONSTEXPR FReg fs2(18), fs3(19), fs4(20), fs5(21), fs6(22), fs7(23), fs8(24), fs9(25), fs10(26), fs11(27);
-static XBYAK_RISCV_CONSTEXPR FReg ft8(28), ft9(29), ft10(30), ft11(31);
-
-#if defined(XBYAK_RISCV_V) && XBYAK_RISCV_V == 1
-// Vector Register
-struct VReg : public local::IReg {
-	explicit XBYAK_RISCV_CONSTEXPR VReg(int idx = 0) : local::IReg(idx, IReg::Kind::VECTOR) { }
-};
-
-static XBYAK_RISCV_CONSTEXPR VReg v0(0), v1(1), v2(2), v3(3), v4(4), v5(5), v6(6), v7(7);
-static XBYAK_RISCV_CONSTEXPR VReg v8(8), v9(9), v10(10), v11(11), v12(12), v13(13), v14(14), v15(15);
-static XBYAK_RISCV_CONSTEXPR VReg v16(16), v17(17), v18(18), v19(19), v20(20), v21(21), v22(22), v23(23);
-static XBYAK_RISCV_CONSTEXPR VReg v24(24), v25(25), v26(26), v27(27), v28(28), v29(29), v30(30), v31(31);
-#endif
-
-// 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc)
-void *const DontSetProtectRWE = (void*)2; //-V566
-
-class CodeArray {
-	enum Type {
-		USER_BUF = 1, // use userPtr(non alignment, non protect)
-		ALLOC_BUF // use new(alignment, protect)
-	};
-	CodeArray(const CodeArray& rhs);
-	void operator=(const CodeArray&);
-	bool isAllocType() const { return type_ == ALLOC_BUF; }
-	const Type type_;
-#ifdef XBYAK_RISCV_USE_MMAP_ALLOCATOR
-	MmapAllocator defaultAllocator_;
-#else
-	Allocator defaultAllocator_;
-#endif
-	Allocator *alloc_;
-protected:
-	size_t maxSize_;
-	uint8_t *top_;
-	size_t size_;
-
-	bool useProtect() const { return alloc_->useProtect(); }
-public:
-	enum ProtectMode {
-		PROTECT_RW = 0, // read/write
-		PROTECT_RWE = 1, // read/write/exec
-		PROTECT_RE = 2 // read/exec
-	};
-	explicit CodeArray(size_t maxSize, void *userPtr = 0, Allocator *allocator = 0)
-		: type_((userPtr == 0 || userPtr == DontSetProtectRWE) ? ALLOC_BUF : USER_BUF)
-		, alloc_(allocator ? allocator : (Allocator*)&defaultAllocator_)
-		, maxSize_(maxSize)
-		, top_(type_ == USER_BUF ? reinterpret_cast<uint8_t*>(userPtr) : alloc_->alloc((std::max<size_t>)(maxSize, 1)))
-		, size_(0)
-	{
-		if (maxSize_ > 0 && top_ == 0) XBYAK_RISCV_THROW(ERR_CANT_ALLOC)
-		if ((type_ == ALLOC_BUF && userPtr != DontSetProtectRWE && useProtect()) && !setProtectMode(PROTECT_RWE, false)) {
-			alloc_->free(top_);
-			XBYAK_RISCV_THROW(ERR_CANT_PROTECT)
-		}
-	}
-	virtual ~CodeArray()
-	{
-		if (isAllocType()) {
-			if (useProtect()) setProtectModeRW(false);
-			alloc_->free(top_);
-		}
-	}
-	bool setProtectMode(ProtectMode mode, bool throwException = true)
-	{
-		bool isOK = protect(top_, maxSize_, mode);
-		if (isOK) return true;
-		if (throwException) XBYAK_RISCV_THROW_RET(ERR_CANT_PROTECT, false)
-		return false;
-	}
-	bool setProtectModeRE(bool throwException = true) { return setProtectMode(PROTECT_RE, throwException); }
-	bool setProtectModeRW(bool throwException = true) { return setProtectMode(PROTECT_RW, throwException); }
-	void resetSize()
-	{
-		size_ = 0;
-	}
-	void writeBytes(size_t offset, uint64_t v, size_t n)
-	{
-		if (n > 8) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER)
-		if (offset + n > maxSize_) XBYAK_RISCV_THROW(ERR_CODE_IS_TOO_BIG)
-		uint8_t *const p = top_ + offset;
-		for (size_t i = 0; i < n; i++) {
-			p[i] = static_cast<uint8_t>(v >> (i * 8));
-		}
-	}
-	void writeBytes(const uint8_t *addr, uint64_t v, size_t n)
-	{
-		writeBytes(addr - top_, v, n);
-	}
-	void appendBytes(uint64_t v, size_t n)
-	{
-		writeBytes(size_, v, n);
-		size_ += n;
-	}
-	void append4B(uint32_t code) { appendBytes(code, 4); }
-	void append2B(uint32_t code) { appendBytes(code, 2); }
-	void append1B(uint32_t code) { appendBytes(code, 1); }
-	void write4B(size_t offset, uint32_t v) { writeBytes(offset, v, 4); }
-	void dump(bool separate = false) const
-	{
-		const uint8_t *p = getCode();
-		const size_t bufSize = getSize();
-		if (separate) {
-			size_t pos = 0;
-			while (pos < bufSize) {
-				uint32_t v = p[pos];
-				size_t n = (v & 3) == 3 ? 4 : 2;
-				if (pos + n <= bufSize) {
-					for (size_t i = 0; i < n; i++) {
-						printf("%02x", p[pos + n - 1 - i]);
-					}
-					printf("\n");
-					pos += n;
-				} else {
-					printf("%02x error\n", v);
-					return;
-				}
-			}
-			return;
-		}
-		size_t remain = bufSize;
-		for (int i = 0; i < 4; i++) {
-			size_t disp = 16;
-			if (remain < 16) {
-				disp = remain;
-			}
-			for (size_t j = 0; j < 16; j++) {
-				if (j < disp) {
-					printf("%02x", p[i * 16 + j]);
-				}
-			}
-			putchar('\n');
-			remain -= disp;
-			if (remain == 0) {
-				break;
-			}
-		}
-	}
-	const uint8_t *getCode() const { return top_; }
-	template<class F>
-	const F getCode() const { return reinterpret_cast<F>(top_); }
-	const uint8_t *getCurr() const { return &top_[size_]; }
-	template<class F>
-	const F getCurr() const { return reinterpret_cast<F>(&top_[size_]); }
-	size_t getSize() const { return size_; }
-	void setSize(size_t size)
-	{
-		if (size > maxSize_) XBYAK_RISCV_THROW(ERR_OFFSET_IS_TOO_BIG)
-		size_ = size;
-	}
-	/**
-		change exec permission of memory
-		@param addr [in] buffer address
-		@param size [in] buffer size
-		@param protectMode [in] mode(RW/RWE/RE)
-		@return true(success), false(failure)
-	*/
-	static inline bool protect(const void *addr, size_t size, int protectMode)
-	{
-#if defined(_WIN32)
-		const DWORD c_rw = PAGE_READWRITE;
-		const DWORD c_rwe = PAGE_EXECUTE_READWRITE;
-		const DWORD c_re = PAGE_EXECUTE_READ;
-		DWORD mode;
-#else
-		const int c_rw = PROT_READ | PROT_WRITE;
-		const int c_rwe = PROT_READ | PROT_WRITE | PROT_EXEC;
-		const int c_re = PROT_READ | PROT_EXEC;
-		int mode;
-#endif
-		switch (protectMode) {
-		case PROTECT_RW: mode = c_rw; break;
-		case PROTECT_RWE: mode = c_rwe; break;
-		case PROTECT_RE: mode = c_re; break;
-		default:
-			return false;
-		}
-#if defined(_WIN32)
-		DWORD oldProtect;
-		return VirtualProtect(const_cast<void*>(addr), size, mode, &oldProtect) != 0;
-#elif defined(__GNUC__)
-		size_t pageSize = sysconf(_SC_PAGESIZE);
-		size_t iaddr = reinterpret_cast<size_t>(addr);
-		size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
-		return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
-#else
-		return true;
-#endif
-	}
-	/**
-		get aligned memory pointer
-		@param addr [in] address
-		@param alignedSize [in] power of two
-		@return aligned addr by alingedSize
-	*/
-	static inline uint8_t *getAlignedAddress(uint8_t *addr, size_t alignedSize = 16)
-	{
-		return reinterpret_cast<uint8_t*>((reinterpret_cast<size_t>(addr) + alignedSize - 1) & ~(alignedSize - static_cast<size_t>(1)));
-	}
-};
-
-struct Jmp {
-	enum Type {
-		tJal,
-		tBtype,
-		tRawAddress,
-	} type;
-	const uint8_t* from; /* address of the jmp mnemonic */
-	uint32_t encoded;
-	size_t encSize() const
-	{
-		return (type == tRawAddress) ? sizeof(size_t) : 4;
-	}
-	// jal
-	Jmp(const uint8_t *from, uint32_t opcode, const Reg& rd)
-		: type(tJal)
-		, from(from)
-		, encoded((rd.getIdx() << 7) | opcode)
-	{
-	}
-	// B-type
-	Jmp(const uint8_t* from, uint32_t opcode, uint32_t funct3, const Reg& src1, const Reg& src2)
-		: type(tBtype)
-		, from(from)
-		, encoded((src2.getIdx() << 20) | (src1.getIdx() << 15) | (funct3 << 12) | opcode)
-	{
-	}
-	// raw address
-	explicit Jmp(const uint8_t* from)
-		: type(tRawAddress)
-		, from(from)
-		, encoded(0)
-	{
-	}
-	static inline bool isValidImm(size_t imm, size_t maskBit)
-	{
-		const size_t M = local::mask(maskBit);
-		return (imm < M || ~M <= imm) && (imm & 1) == 0;
-	}
-	size_t encode(const uint8_t* addr) const
-	{
-		if (addr == 0) return 0;
-		if (type == tRawAddress) return size_t(addr);
-		const size_t imm = addr - from;
-		if (type == tJal) {
-			if (!isValidImm(imm, 20)) XBYAK_RISCV_THROW(ERR_INVALID_IMM_OF_JAL)
-			return local::get20_10to1_11_19to12_z12(imm) | encoded;
-		} else {
-			if (!isValidImm(imm, 12)) XBYAK_RISCV_THROW(ERR_INVALID_IMM_OF_JAL)
-			return local::get12_10to5_z13_4to1_11_z7(imm) | encoded;
-		}
-	}
-	// update jmp address by base->getCurr()
-	void update(CodeArray *base) const
-	{
-		base->writeBytes(from, encode(base->getCurr()), encSize());
-	}
-	// append jmp opcode with addr
-	void appendCode(CodeArray *base, const uint8_t *addr) const
-	{
-		base->appendBytes(encode(addr), encSize());
-	}
-};
-
-class LabelManager;
-
-class Label {
-	mutable LabelManager *mgr;
-	mutable int id;
-	friend class LabelManager;
-public:
-	Label() : mgr(0), id(0) {}
-	Label(const Label& rhs);
-	Label& operator=(const Label& rhs);
-	~Label();
-	void clear() { mgr = 0; id = 0; }
-	int getId() const { return id; }
-	const uint8_t *getAddress() const;
-};
-
-class LabelManager {
-	// for Label class
-	struct ClabelVal {
-		ClabelVal(const uint8_t* addr = 0) : addr(addr), refCount(1) {}
-		const uint8_t* addr;
-		int refCount;
-	};
-	typedef std::unordered_map<int, ClabelVal> ClabelDefList;
-	typedef std::unordered_multimap<int, Jmp> ClabelUndefList;
-	typedef std::unordered_set<Label*> LabelPtrList;
-
-	CodeArray *base_;
-	mutable int labelId_;
-	ClabelDefList clabelDefList_;
-	ClabelUndefList clabelUndefList_;
-	LabelPtrList labelPtrList_;
-
-	int getId(const Label& label) const
-	{
-		if (label.id == 0) label.id = labelId_++;
-		return label.id;
-	}
-	void define_inner(ClabelDefList& defList, ClabelUndefList& undefList, int labelId, const uint8_t* addr)
-	{
-		// add label
-		ClabelDefList::value_type item(labelId, addr);
-		std::pair<ClabelDefList::iterator, bool> ret = defList.insert(item);
-		if (!ret.second) XBYAK_RISCV_THROW(ERR_LABEL_IS_REDEFINED)
-		// search undefined label
-		for (;;) {
-			ClabelUndefList::iterator itr = undefList.find(labelId);
-			if (itr == undefList.end()) break;
-			const Jmp& jmp = itr->second;
-			jmp.update(base_);
-			undefList.erase(itr);
-		}
-	}
-	friend class Label;
-	void incRefCount(int id, Label *label)
-	{
-		clabelDefList_[id].refCount++;
-		labelPtrList_.insert(label);
-	}
-	void decRefCount(int id, Label *label)
-	{
-		labelPtrList_.erase(label);
-		ClabelDefList::iterator i = clabelDefList_.find(id);
-		if (i == clabelDefList_.end()) return;
-		if (i->second.refCount == 1) {
-			clabelDefList_.erase(id);
-		} else {
-			--i->second.refCount;
-		}
-	}
-	template<class T>
-	bool hasUndefinedLabel_inner(const T& list) const
-	{
-		return !list.empty();
-	}
-	// detach all labels linked to LabelManager
-	void resetLabelPtrList()
-	{
-		for (LabelPtrList::iterator i = labelPtrList_.begin(), ie = labelPtrList_.end(); i != ie; ++i) {
-			(*i)->clear();
-		}
-		labelPtrList_.clear();
-	}
-public:
-	LabelManager()
-	{
-		reset();
-	}
-	~LabelManager()
-	{
-		resetLabelPtrList();
-	}
-	void reset()
-	{
-		base_ = 0;
-		labelId_ = 1;
-		clabelDefList_.clear();
-		clabelUndefList_.clear();
-		resetLabelPtrList();
-	}
-	void set(CodeArray *base) { base_ = base; }
-	void defineClabel(Label& label)
-	{
-		define_inner(clabelDefList_, clabelUndefList_, getId(label), base_->getCurr());
-		label.mgr = this;
-		labelPtrList_.insert(&label);
-	}
-	void assign(Label& dst, const Label& src)
-	{
-		ClabelDefList::const_iterator i = clabelDefList_.find(src.id);
-		if (i == clabelDefList_.end()) XBYAK_RISCV_THROW(ERR_LABEL_IS_NOT_SET_BY_L)
-		define_inner(clabelDefList_, clabelUndefList_, dst.id, i->second.addr);
-		dst.mgr = this;
-		labelPtrList_.insert(&dst);
-	}
-	// return 0 unless label exists
-	const uint8_t* getAddr(const Label& label) const
-	{
-		ClabelDefList::const_iterator i = clabelDefList_.find(getId(label));
-		if (i == clabelDefList_.end()) return 0;
-		return i->second.addr;
-	}
-	void addUndefinedLabel(const Label& label, const Jmp& jmp)
-	{
-		clabelUndefList_.insert(ClabelUndefList::value_type(label.id, jmp));
-	}
-	bool hasUndefClabel() const { return hasUndefinedLabel_inner(clabelUndefList_); }
-	const uint8_t *getCode() const { return base_->getCode(); }
-};
-
-inline Label::Label(const Label& rhs)
-{
-	id = rhs.id;
-	mgr = rhs.mgr;
-	if (mgr) mgr->incRefCount(id, this);
-}
-inline Label& Label::operator=(const Label& rhs)
-{
-	if (id) XBYAK_RISCV_THROW_RET(ERR_LABEL_IS_ALREADY_SET_BY_L, *this)
-	id = rhs.id;
-	mgr = rhs.mgr;
-	if (mgr) mgr->incRefCount(id, this);
-	return *this;
-}
-inline Label::~Label()
-{
-	if (id && mgr) mgr->decRefCount(id, this);
-}
-inline const uint8_t* Label::getAddress() const
-{
-	if (mgr == 0) return 0;
-	return mgr->getAddr(*this);
-}
-
-namespace local {
-
-template<size_t n>
-struct Bit {
-	uint32_t v;
-	Bit(uint32_t v)
-		: v(v)
-	{
-		XBYAK_RISCV_ASSERT(inBit(v, n));
-	}
-	Bit(const IReg& r)
-		: v(r.getIdx())
-	{
-	}
-	Bit(VM vm)
-		: v(static_cast<uint32_t>(vm))
-	{
-	}
-	Bit(CSR csr)
-		: v(static_cast<uint32_t>(csr))
-	{
-	}
-	Bit(RM rm)
-		: v(static_cast<uint32_t>(rm))
-	{
-	}
-};
-
-} // local
-
-class CodeGenerator : public CodeArray {
-public:
-	enum AqRlType {
-		T_aq = 2,
-		T_rl = 1,
-		T_aqrl = 3,
-	};
-	typedef local::Bit<1> Bit1;
-	typedef local::Bit<2> Bit2;
-	typedef local::Bit<3> Bit3;
-	typedef local::Bit<5> Bit5;
-	typedef local::Bit<6> Bit6;
-	typedef local::Bit<7> Bit7;
-	typedef local::Bit<12> Bit12;
-	typedef local::Bit<32> Bit32;
-private:
-	CodeGenerator operator=(const CodeGenerator&) = delete;
-	LabelManager labelMgr_;
-	int XLEN_;
-	bool isRV32_;
-	bool supportRVC_;
-	void opJmp(const Label& label, const Jmp& jmp)
-	{
-		const uint8_t* addr = labelMgr_.getAddr(label);
-		jmp.appendCode(this, addr);
-		if (addr) return;
-		labelMgr_.addUndefinedLabel(label, jmp);
-	}
-	uint32_t enc2(uint32_t a, uint32_t b) const { return (a<<7) | (b<<15); }
-	uint32_t enc3(uint32_t a, uint32_t b, uint32_t c) const { return enc2(a, b) | (c<<20); }
-	void Rtype(Bit7 opcode, Bit3 funct3, Bit7 funct7, Bit5 rd, Bit5 rs1, Bit5 rs2)
-	{
-		uint32_t v = (funct7.v<<25) | (funct3.v<<12) | opcode.v | enc3(rd.v, rs1.v, rs2.v);
-		append4B(v);
-	}
-	void Itype(Bit7 opcode, Bit3 funct3, Bit5 rd, Bit5 rs1, int imm)
-	{
-		if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
-		uint32_t v = (imm<<20) | (funct3.v<<12) | opcode.v | enc2(rd.v, rs1.v);
-		append4B(v);
-	}
-	void Stype(Bit7 opcode, Bit3 funct3, Bit5 rs1, Bit5 rs2, int imm)
-	{
-		if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
-		uint32_t v = ((imm>>5)<<25) | (funct3.v<<12) | opcode.v | enc3(imm & local::mask(5), rs1.v, rs2.v);
-		append4B(v);
-	}
-	void Utype(Bit7 opcode, Bit5 rd, uint32_t imm)
-	{
-		if (imm >= (1u << 20)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
-		uint32_t v = (imm<<12) | opcode.v | (rd.v<<7);
-		append4B(v);
-	}
-	void opShift(Bit7 pre, Bit3 funct3, Bit7 opcode, Bit5 rd, Bit5 rs1, uint32_t shamt, int range = 0)
-	{
-		if (range == 0) range = isRV32_ ? 5 : 6;
-		if (shamt >= (1u << range)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
-		uint32_t v = (pre.v<<25) | (funct3.v<<12) | opcode.v | enc3(rd.v, rs1.v, shamt);
-		append4B(v);
-	}
-	void opAtomic(Bit5 rd, Bit5 rs2, Bit5 addr, Bit5 funct5, Bit3 funct3, uint32_t flag)
-	{
-		assert(flag <= 3);
-		Rtype(0x2f, funct3.v, (funct5.v << 2) | flag, rd, addr, rs2);
-	}
-	void opIVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 vd)
-	{
-		/*
-		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			  func6    vm      vs2       vs1        func3       vd     opcode
-
-			func6, func3, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (vd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opFVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 d)
-	{
-		/*
-		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			  func6    vm      vs2       vs1        func3     vd/rd    opcode
-
-			func6, func3, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (d.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opMVV(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 vs1, Bit5 d)
-	{
-		/*
-		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			  func6    vm      vs2       vs1        func3     vd/rd    opcode
-
-			func6, func3, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (vs2.v<<20) | (vs1.v<<15) | (d.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opIVI(Bit32 baseValue, Bit1 vm, Bit5 vs2, uint32_t imm, Bit5 vd)
-	{
-		/*
-		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			  func6    vm      vs2       imm       func3       vd     opcode
-
-			func6, func3, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (vs2.v<<20) | ((imm & local::mask(5))<<15) | (vd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opIVX(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 vd)
-	{
-		/*
-		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			  func6    vm      vs2       rs1        func3       vd     opcode
-
-			func6, func3, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (vd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opFVF(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 vd)
-	{
-		/*
-		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			  func6    vm      vs2       rs1        func3       vd     opcode
-
-			func6, func3, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (vd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opMVX(Bit32 baseValue, Bit1 vm, Bit5 vs2, Bit5 rs1, Bit5 d)
-	{
-		/*
-		    31 .. 26 | 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			  func6    vm      vs2       rs1        func3     vd/rd    opcode
-
-			func6, func3, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (vs2.v<<20) | (rs1.v<<15) | (d.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opVectorLoad(Bit32 baseValue, Bit1 vm, Bit5 rs2_vs2, Bit5 rs1, Bit5 vd)
-	{
-		/*
-		    31 .. 29 | 28 | 27 .. 26 | 25 |     24 .. 20     | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			   nf      mew     mop     vm     lumop/rs2/vs2      rs1        width       vd     opcode
-
-			mew, mop, width, lumop, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (rs2_vs2.v<<20) | (rs1.v<<15) | (vd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opVectorStore(Bit32 baseValue, Bit1 vm, Bit5 rs2_vs2, Bit5 rs1, Bit5 vs3)
-	{
-		/*
-		    31 .. 29 | 28 | 27 .. 26 | 25 |     24 .. 20     | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			   nf      mew     mop     vm     sumop/rs2/vs2       rs1        width     vd      opcode
-
-			mew, mop, width, sumop, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (vm.v<<25) | (rs2_vs2.v<<20) | (rs1.v<<15) | (vs3.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opCSR(Bit32 baseValue, Bit12 csr, Bit5 rs1_uimm, Bit5 rd)
-	{
-		/*
-		    31 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			   csr     rs1_uimm     func3       rd     opcode
-
-			func3 and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (csr.v<<20) | (rs1_uimm.v<<15) | (rd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opLoadFP(Bit32 baseValue, int imm, Bit5 rs1, Bit5 rd)
-	{
-		if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
-		/*
-			31 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			imm[11:0]     rs1       width       rd      opcode
-
-			width and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (imm<<20) | (rs1.v<<15) | (rd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opStoreFP(Bit32 baseValue, int imm, Bit5 rs2, Bit5 rs1)
-	{
-		if (!local::inSBit(imm, 12)) XBYAK_RISCV_THROW(ERR_IMM_IS_TOO_BIG)
-		/*
-			31 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			imm[11:5]     rs2        rs1       width    imm[4:0]   opcode
-
-			width and opcode must be encoded in the baseValue
-		*/
-		uint32_t imm_11_5 = imm & (local::mask(7)<<5);
-		uint32_t imm_4_0 = imm & local::mask(5);
-		uint32_t v = (imm_11_5<<20) | (rs2.v<<20) | (rs1.v<<15) | (imm_4_0<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opFP(Bit32 baseValue, Bit5 rs2, Bit5 rs1, Bit3 rm, Bit5 rd)
-	{
-		/*
-			31 .. 27 | 26 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			  func5       fmt        rs2        rs1        rm         rd      opcode
-
-			func5, fmt, and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (rs2.v<<20) | (rs1.v<<15) | (rm.v<<12) | (rd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	void opR4(Bit32 baseValue, Bit5 rs3, Bit5 rs2, Bit5 rs1, Bit3 rm, Bit5 rd)
-	{
-		/*
-			31 .. 27 | 26 .. 25 | 24 .. 20 | 19 .. 15 | 14 .. 12 | 11 .. 7 | 6 .. 0
-			   rs3        fmt        rs2        rs1        rm         rd      opcode
-
-			fmt and opcode must be encoded in the baseValue
-		*/
-		uint32_t v = (rs3.v<<27) | (rs2.v<<20) | (rs1.v<<15) | (rm.v<<12) | (rd.v<<7);
-		v |= baseValue.v; // force-encode base value
-		append4B(v);
-	}
-	bool isValiCidx(uint32_t idx) const { return 8 <= idx && idx < 16; }
-	// c_addi, c_addiw
-	bool c_addi_inner(const Reg& rd, const Reg& rs, uint32_t imm, uint32_t funct3)
-	{
-		uint32_t dIdx = rd.getIdx();
-		uint32_t sIdx = rs.getIdx();
-		if (sIdx == 0 && c_li(rd, imm, 2, 1)) return true;
-		if (dIdx == 0 || dIdx != sIdx || !local::inSBit(imm, 6)) return false;
-		uint32_t v = (funct3<<13) | ((imm & (1<<5))<<7) | (dIdx<<7) | ((imm & 31)<<2)| 1;
-		append2B(v);
-		return true;
-	}
-	bool c_addi16sp(const Reg& rd, const Reg& rs, uint32_t imm)
-	{
-		if (rd != sp || rs != sp || (imm % 16) != 0 || (496 < imm && imm < ~512u) || imm == 0) return false;
-		uint32_t v = (3<<13) | (2<<7) | 1 | local::get9_z5_4_6_8to7_5_z2(imm);
-		append2B(v);
-		return true;
-	}
-	// c_li, c_slli
-	bool c_li(const Reg& rd, uint32_t imm, uint32_t funct3, uint32_t op)
-	{
-		if (rd == x0 || !local::inSBit(imm, 6)) return false;
-		uint32_t v = (funct3<<13) | (rd.getIdx() << 7) | op | local::get5_z5_4to0_z2(imm);
-		append2B(v);
-		return true;
-	}
-	bool c_lui(const Reg& rd, uint32_t imm)
-	{
-		if (rd == x0 || rd == x2 || imm == 0 || (32 <= imm && imm < (1<<20)-32)) return false;
-		uint32_t v = (3<<13) | (rd.getIdx()<<7) | 1 | local::get5_z5_4to0_z2(imm);
-		append2B(v);
-		return true;
-	}
-	bool c_addi(const Reg& rd, const Reg& rs, uint32_t imm)
-	{
-		uint32_t dIdx = rd.getIdx();
-		if (imm == 0 && c_mv(rd, rs, 0)) return true;
-		if (c_addi_inner(rd, rs, imm, 0)) return true;
-		if (c_addi16sp(rd, rs, imm)) return true;
-		// c.addi4spn(rd, imm) = c.addi(rd, x2, imm)
-		if (rs != sp || !isValiCidx(dIdx) || imm == 0 || (imm % 4) != 0 || imm >= 1024) return false;
-		uint32_t v = ((dIdx-8)<<2) | local::get5to4_9to6_2_3_z5(imm);
-		append2B(v);
-		return true;
-	}
-	uint32_t creg2(uint32_t a, uint32_t b) { return ((a-8)<<7) | ((b-8)<<2); }
-	// c_lw, c_sw
-	bool c_lsw(const Reg& rd, const Reg& rs, int imm, uint32_t funct3)
-	{
-		uint32_t dIdx = rd.getIdx();
-		uint32_t sIdx = rs.getIdx();
-		if (!isValiCidx(dIdx) || !isValiCidx(sIdx) || (imm % 4) != 0 || imm < 0 || imm >= (1 << 7)) return false;
-		uint32_t v = (funct3<<13) | creg2(sIdx, dIdx) | local::get5to3_z3_2_6_z5(imm);
-		append2B(v);
-		return true;
-	}
-	// c_ld, c_sd
-	bool c_lsd(const Reg& rd, const Reg& rs, int imm, uint32_t funct3)
-	{
-		uint32_t dIdx = rd.getIdx();
-		uint32_t sIdx = rs.getIdx();
-		if (!isValiCidx(dIdx) || !isValiCidx(sIdx) || (imm % 8) != 0 || imm < 0 || imm >= (1 << 8)) return false;
-		uint32_t v = (funct3<<13) | creg2(sIdx, dIdx) | local::get5to3_z3_7_6_z5(imm);
-		append2B(v);
-		return true;
-	}
-	// c_srli, c_srai, c_andi
-	bool c_srli(const Reg& rd, const Reg& rs, int imm, uint32_t funct2, bool allowImm0 = false)
-	{
-		uint32_t dIdx = rd.getIdx();
-		uint32_t sIdx = rs.getIdx();
-		if (dIdx != sIdx || !isValiCidx(dIdx) || (!allowImm0 && imm == 0) || imm >= (1 << 6)) return false;
-		uint32_t v = (4<<13) | (funct2<<10) | ((dIdx-8)<<7) | local::get5_z5_4to0_z2(imm) | 1;
-		append2B(v);
-		return true;
-	}
-	// rd = rs1
-	// c_sub, c_xor, c_or, c_and, c_subw
-	bool c_noimm(const Reg& rd, const Reg& rs1, const Reg& rs2, uint32_t funct3, uint32_t funct2)
-	{
-		uint32_t dIdx = rd.getIdx();
-		uint32_t sIdx = rs2.getIdx();
-		if (rd.getIdx() != rs1.getIdx() || !isValiCidx(dIdx) || !isValiCidx(sIdx)) return false;
-		uint32_t v = (funct3<<10) | ((dIdx-8)<<7) | (funct2<<5) | ((sIdx-8)<<2) | 1;
-		append2B(v);
-		return true;
-	}
-	// c_lwsp, c_flwsp
-	bool c_lwsp(const Reg& rd, const Reg& addr, int imm, uint32_t funct3)
-	{
-		uint32_t idx = rd.getIdx();
-		if (addr != sp || (imm % 4) != 0 || (imm >> 8)) return false;
-		uint32_t v = (funct3<<13) | (idx<<7) | local::get5_z5_4to2_7to6_z2(imm) | 2;
-		append2B(v);
-		return true;
-	}
-	// c_ldsp
-	bool c_ldsp(const Reg& rd, const Reg& addr, int imm, uint32_t funct3)
-	{
-		uint32_t idx = rd.getIdx();
-		if (addr != sp || (imm % 8) != 0 || (imm >> 9)) return false;
-		uint32_t v = (funct3<<13) | (idx<<7) | local::get5_z5_4to3_8to6_z2(imm) | 2;
-		append2B(v);
-		return true;
-	}
-	// c.mv, c.add
-	bool c_mv(const Reg& rd, const Reg& rs, uint32_t funct1)
-	{
-		if (rd == x0 || rs == x0) return false;
-		uint32_t v = (4<<13) | (funct1<<12) | (rd.getIdx()<<7) | (rs.getIdx()<<2) | 2;
-		append2B(v);
-		return true;
-	}
-	bool c_swsp(const Reg& rs, const Reg& addr, int imm, uint32_t funct3)
-	{
-		if (addr != sp || (imm % 4) != 0 || (imm >> 8)) return false;
-		uint32_t v = (funct3<<13) | (rs.getIdx()<<2) | local::get5to2_7to6_z7(imm) | 2;
-		append2B(v);
-		return true;
-	}
-	bool c_sdsp(const Reg& rs, const Reg& addr, int imm, uint32_t funct3)
-	{
-		if (addr != sp || (imm % 8) != 0 || (imm >> 9)) return false;
-		uint32_t v = (funct3<<13) | (rs.getIdx()<<2) | local::get5to3_8to6_z7(imm) | 2;
-		append2B(v);
-		return true;
-	}
-public:
-	void L(Label& label) { labelMgr_.defineClabel(label); }
-	Label L() { Label label; L(label); return label; }
-	/*
-		assign src to dst
-		require
-		dst : does not used by L()
-		src : used by L()
-	*/
-	void assignL(Label& dst, const Label& src) { labelMgr_.assign(dst, src); }
-	/*
-		put the absolute address of label to buffer
-		@note the put size is 4(32-bit), 8(64-bit)
-	*/
-	void putL(const Label &label)
-	{
-		Jmp jmp(getCurr());
-		opJmp(label, jmp);
-	}
-
-	// constructor
-	CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = DontSetProtectRWE, Allocator *allocator = 0)
-		: CodeArray(maxSize, userPtr, allocator)
-		, XLEN_(64)
-		, isRV32_(false)
-		, supportRVC_(false)
-	{
-		labelMgr_.set(this);
-	}
-	void reset()
-	{
-		ClearError();
-		resetSize();
-		labelMgr_.reset();
-		labelMgr_.set(this);
-		XLEN_ = 64;
-		isRV32_ = false;
-		supportRVC_ = false;
-	}
-	void setRV32(bool on = true)
-	{
-		isRV32_ = on;
-		XLEN_ = on ? 32 : 64;
-	}
-	void supportRVC(bool on = true)
-	{
-		supportRVC_ = on;
-	}
-	bool hasUndefinedLabel() const { return labelMgr_.hasUndefClabel(); }
-	static inline void clearCache(void *p, size_t n)
-	{
-#ifdef _WIN32
-		FlushInstructionCache(GetCurrentProcess(), begin, n);
-#elif defined(__APPLE__)
-		sys_icache_invalidate(begin, n);
-#else
-		__builtin___clear_cache((char *)p, (char *)p + n);
-#endif
-	}
-	/*
-		MUST call ready() to complete generating code if you use AutoGrow mode.
-		It is not necessary for the other mode if hasUndefinedLabel() is true.
-	*/
-	void ready(ProtectMode mode = PROTECT_RWE)
-	{
-		if (hasUndefinedLabel()) XBYAK_RISCV_THROW(ERR_LABEL_IS_NOT_FOUND)
-		if (useProtect()) setProtectMode(mode);
-		clearCache(top_, size_);
-	}
-	// set read/exec
-	void readyRE() { return ready(PROTECT_RE); }
-
-	void align(size_t x)
-	{
-		if (x == 1) return;
-		if (x < 4 || (x & (x - 1))) XBYAK_RISCV_THROW(ERR_BAD_ALIGN)
-		size_t remain = size_t(getCurr()) % x;
-		if (remain % 4) XBYAK_RISCV_THROW(ERR_INTERNAL)
-		if (remain) {
-			for (size_t i = 0; i < (x - remain) / 4; i++) {
-				nop();
-			}
-		}
-	}
-
-#include "xbyak_riscv_mnemonic.hpp"
-#if defined(XBYAK_RISCV_V) && XBYAK_RISCV_V == 1
-#include "xbyak_riscv_v.hpp"
-#endif
-};
-
-#ifdef _MSC_VER
-	#pragma warning(pop)
-#endif
-} // Xbyak_riscv
-
diff --git a/third_party/xbyak_riscv/xbyak_riscv_csr.hpp b/third_party/xbyak_riscv/xbyak_riscv_csr.hpp
deleted file mode 100644
index 5f04ed441a1..00000000000
--- a/third_party/xbyak_riscv/xbyak_riscv_csr.hpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/******************************************************************************
-* Copyright (C), 2023, KNS Group LLC (YADRO)
-*
-* Licensed under the 3-Clause BSD License
-* You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/
-*******************************************************************************/
-
-#pragma once
-namespace Xbyak_riscv {
-
-// Control and Status Register
-enum class CSR : uint32_t {
-    // FP CSRs
-    fflags = 0x001, // Floating-Point Accrued Exceptions
-    frm    = 0x002, // Floating-Point Dynamic Rounding Mode
-    fcsr   = 0x003, // Floating-Point Control and Status register
-    // vector CSRs
-    vstart = 0x008, // Vector start position
-    vxsat  = 0x009, // Fixed-Point Saturate Flag
-    vxrm   = 0x00A, // Fixed-Point Rounding Mode
-    vcsr   = 0x00F, // Vector control and status register
-    vl     = 0xC20, // Vector length
-    vtype  = 0xC21, // Vector data type register
-    vlenb  = 0xC22, // VLEN/8 (vector register length in bytes)
-};
-
-
-// Selected Element Width
-enum class SEW : uint32_t {
-    e8  = 0x0,
-    e16 = 0x1,
-    e32 = 0x2,
-    e64 = 0x3
-};
-
-// Vector Length Multiplier
-enum class LMUL : uint32_t {
-    mf8 = 0x5,
-    mf4 = 0x6,
-    mf2 = 0x7,
-    m1  = 0x0,
-    m2  = 0x1,
-    m4  = 0x2,
-    m8  = 0x3
-};
-
-// Vector Mask Agnostic
-enum class VMA : uint32_t {
-    mu = 0, // undisturbed
-    ma = 1, // agnostic
-};
-
-// Vector Tail Agnostic
-enum class VTA : uint32_t {
-    tu = 0, // undisturbed
-    ta = 1, // agnostic
-};
-
-enum class VectorAddressingMode : uint32_t {
-    unitStride       = 0x0,
-    indexedUnordered = 0x1,
-    strided          = 0x2,
-    indexedOrdered   = 0x3
-    // other encodings are reserved
-};
-
-enum class UnitStrideVectorAddressingModeLoad : uint32_t {
-    load              = 0x0, // unit-stride load
-    wholeRegisterLoad = 0x8, // unit-stride, whole register load
-    maskLoad          = 0xb, // unit-stride, mask load, EEW=8
-    faultOnlyFirst    = 0x10  // unit-stride fault-only-first
-    // other encodings are reserved
-};
-
-enum class UnitStrideVectorAddressingModeStore : uint32_t {
-    store              = 0x0, // unit-stride store
-    wholeRegisterStore = 0x8, // unit-stride, whole register store
-    maskStore          = 0xb  // unit-stride, mask store, EEW=8
-    // other encodings are reserved
-};
-
-enum class WidthEncoding : uint32_t {
-    e8  = 0x0, // Vector 8-bit  element
-    e16 = 0x5, // Vector 16-bit element
-    e32 = 0x6, // Vector 32-bit element
-    e64 = 0x7, // Vector 64-bit element
-};
-
-enum class VM : uint32_t {
-    unmasked = 1,
-    masked = 0
-};
-
-enum class RM : uint32_t {
-    rne = 0x0, // Round to Nearest, ties to Even
-    rtz = 0x1, // Round towards Zero
-    rdn = 0x2, // Round Down (towards -infinity)
-    rup = 0x3, // Round Up (towards + infinity)
-    rmm = 0x4, // Round to Nearest, ties to Max Magnitude
-    dyn = 0x7  // In instruction’s rm field, selects dynamic rounding mode;
-               // In Rounding Mode register, reserved.
-};
-
-enum class FFlags : uint32_t {
-    NV = 0x01, // Invalid Operation
-    DZ = 0x02, // Divide by Zero
-    OF = 0x04, // Overflow
-    UF = 0x08, // Underflow
-    NX = 0x10  // Inexact
-};
-
-} // Xbyak_riscv
diff --git a/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp b/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp
deleted file mode 100644
index b050d46cc75..00000000000
--- a/third_party/xbyak_riscv/xbyak_riscv_mnemonic.hpp
+++ /dev/null
@@ -1,231 +0,0 @@
-const char *getVersionString() const { return "1.01"; }
-void add(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && rd == rs1 && c_mv(rd, rs2, 1)) return; Rtype(0x33, 0, 0x0, rd, rs1, rs2); }
-void sub(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 0)) return; Rtype(0x33, 0, 0x20, rd, rs1, rs2); }
-void sll(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 1, 0x0, rd, rs1, rs2); }
-void slt(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 2, 0x0, rd, rs1, rs2); }
-void sltu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 3, 0x0, rd, rs1, rs2); }
-void xor_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 1)) return; Rtype(0x33, 4, 0x0, rd, rs1, rs2); }
-void srl(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x0, rd, rs1, rs2); }
-void sra(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x20, rd, rs1, rs2); }
-void or_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 2)) return; Rtype(0x33, 6, 0x0, rd, rs1, rs2); }
-void and_(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x23, 3)) return; Rtype(0x33, 7, 0x0, rd, rs1, rs2); }
-void addw(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x27, 1)) return; Rtype(0x3b, 0, 0x0, rd, rs1, rs2); }
-void subw(const Reg& rd, const Reg& rs1, const Reg& rs2) { if (supportRVC_ && c_noimm(rd, rs1, rs2, 0x27, 0)) return; Rtype(0x3b, 0, 0x20, rd, rs1, rs2); }
-void sllw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 1, 0x0, rd, rs1, rs2); }
-void srlw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 5, 0x0, rd, rs1, rs2); }
-void sraw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 5, 0x20, rd, rs1, rs2); }
-void mul(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 0, 0x1, rd, rs1, rs2); }
-void mulh(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 1, 0x1, rd, rs1, rs2); }
-void mulhsu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 2, 0x1, rd, rs1, rs2); }
-void mulhu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 3, 0x1, rd, rs1, rs2); }
-void div(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 4, 0x1, rd, rs1, rs2); }
-void divu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 5, 0x1, rd, rs1, rs2); }
-void rem(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 6, 0x1, rd, rs1, rs2); }
-void remu(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x33, 7, 0x1, rd, rs1, rs2); }
-void mulw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 0, 0x1, rd, rs1, rs2); }
-void divw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 4, 0x1, rd, rs1, rs2); }
-void remw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 6, 0x1, rd, rs1, rs2); }
-void remuw(const Reg& rd, const Reg& rs1, const Reg& rs2) { Rtype(0x3b, 7, 0x1, rd, rs1, rs2); }
-void addi(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_addi(rd, rs1, imm)) return; Itype(0x13, 0, rd, rs1, imm); }
-void slti(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 2, rd, rs1, imm); }
-void sltiu(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 3, rd, rs1, imm); }
-void xori(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 4, rd, rs1, imm); }
-void ori(const Reg& rd, const Reg& rs1, int imm) { Itype(0x13, 6, rd, rs1, imm); }
-void andi(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_srli(rd, rs1, imm, 2, true)) return; Itype(0x13, 7, rd, rs1, imm); }
-void addiw(const Reg& rd, const Reg& rs1, int imm) { if (supportRVC_ && c_addi_inner(rd, rs1, imm, 1)) return; Itype(0x1b, 0, rd, rs1, imm); }
-// load-op rd, imm(addr); rd = addr[imm];
-void jalr(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x67, 0, rd, addr, imm); }
-void lb(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 0, rd, addr, imm); }
-void lh(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 1, rd, addr, imm); }
-void lw(const Reg& rd, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_lwsp(rd, addr, imm, 2) || c_lsw(rd, addr, imm, 2))) return; Itype(0x3, 2, rd, addr, imm); }
-void lbu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 4, rd, addr, imm); }
-void lhu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 5, rd, addr, imm); }
-void lwu(const Reg& rd, const Reg& addr, int imm = 0) { Itype(0x3, 6, rd, addr, imm); }
-void ld(const Reg& rd, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_ldsp(rd, addr, imm, 3) || c_lsd(rd, addr, imm, 3))) return; Itype(0x3, 3, rd, addr, imm); }
-void auipc(const Reg& rd, uint32_t imm) { Utype(0x17, rd, imm); }
-void lui(const Reg& rd, uint32_t imm) { if (supportRVC_ && c_lui(rd, imm)) return; Utype(0x37, rd, imm); }
-void slli(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && rd == rs1 && shamt != 0 && c_li(rd, shamt, 0, 2)) return; opShift(0x0, 1, 0x13, rd, rs1, shamt); }
-void srli(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && c_srli(rd, rs1, shamt, 0)) return; opShift(0x0, 5, 0x13, rd, rs1, shamt); }
-void srai(const Reg& rd, const Reg& rs1, uint32_t shamt) { if (supportRVC_ && c_srli(rd, rs1, shamt, 1)) return; opShift(0x20, 5, 0x13, rd, rs1, shamt); }
-void slliw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x0, 1, 0x1b, rd, rs1, shamt, 5); }
-void srliw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x0, 5, 0x1b, rd, rs1, shamt, 5); }
-void sraiw(const Reg& rd, const Reg& rs1, uint32_t shamt) { opShift(0x20, 5, 0x1b, rd, rs1, shamt, 5); }
-void fence_rw_rw() { append4B(0x330000f); }
-void fence_tso() { append4B(0x8330000f); }
-void fence_rw_w() { append4B(0x310000f); }
-void fence_r_rw() { append4B(0x230000f); }
-void fence_r_r() { append4B(0x220000f); }
-void fence_w_w() { append4B(0x110000f); }
-void fence_i() { append4B(0x100f); }
-void ecall() { append4B(0x73); }
-void ebreak() { if (supportRVC_) append2B(0x9002); else append4B(0x00100073); }
-// store-op rs, imm(addr) ; addr[imm] = rs;
-void sb(const Reg& rs, const Reg& addr, int imm = 0) { Stype(0x23, 0, addr, rs, imm); }
-void sh(const Reg& rs, const Reg& addr, int imm = 0) { Stype(0x23, 1, addr, rs, imm); }
-void sw(const Reg& rs, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_swsp(rs, addr, imm, 6) || c_lsw(rs, addr, imm, 6))) return; Stype(0x23, 2, addr, rs, imm); }
-void sd(const Reg& rs, const Reg& addr, int imm = 0) { if (supportRVC_ && (c_sdsp(rs, addr, imm, 7) || c_lsd(rs, addr, imm, 7))) return; Stype(0x23, 3, addr, rs, imm); }
-void beq(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 0, rs1, rs2); opJmp(label, jmp); }
-void bne(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 1, rs1, rs2); opJmp(label, jmp); }
-void blt(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 4, rs1, rs2); opJmp(label, jmp); }
-void bge(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 5, rs1, rs2); opJmp(label, jmp); }
-void bltu(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 6, rs1, rs2); opJmp(label, jmp); }
-void bgeu(const Reg& rs1, const Reg& rs2, const Label& label) { Jmp jmp(getCurr(), 0x63, 7, rs1, rs2); opJmp(label, jmp); }
-void beqz(const Reg& rs, const Label& label) { beq(rs, x0, label); }
-void bnez(const Reg& rs, const Label& label) { bne(rs, x0, label); }
-void blez(const Reg& rs, const Label& label) { bge(x0, rs, label); }
-void bgez(const Reg& rs, const Label& label) { bge(rs, x0, label); }
-void bltz(const Reg& rs, const Label& label) { blt(rs, x0, label); }
-void bgtz(const Reg& rs, const Label& label) { blt(x0, rs, label); }
-void bgt(const Reg& rs, const Reg& rt, const Label& label) { blt(rt, rs, label); }
-void ble(const Reg& rs, const Reg& rt, const Label& label) { bge(rt, rs, label); }
-void bgtu(const Reg& rs, const Reg& rt, const Label& label) { bltu(rt, rs, label); }
-void bleu(const Reg& rs, const Reg& rt, const Label& label) { bgeu(rt, rs, label); }
-// amos**, rd, rs2, (addr)
-void sc_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x3, 2, flag); }
-void sc_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x3, 3, flag); }
-void amoswap_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1, 2, flag); }
-void amoswap_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1, 3, flag); }
-void amoadd_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x0, 2, flag); }
-void amoadd_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x0, 3, flag); }
-void amoxor_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x4, 2, flag); }
-void amoxor_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x4, 3, flag); }
-void amoand_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0xc, 2, flag); }
-void amoand_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0xc, 3, flag); }
-void amoor_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x8, 2, flag); }
-void amoor_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x8, 3, flag); }
-void amomin_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x10, 2, flag); }
-void amomin_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x10, 3, flag); }
-void amomax_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x14, 2, flag); }
-void amomax_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x14, 3, flag); }
-void amominu_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x18, 2, flag); }
-void amominu_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x18, 3, flag); }
-void amomaxu_w(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1c, 2, flag); }
-void amomaxu_d(const Reg& rd, const Reg& rs2, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, rs2, addr, 0x1c, 3, flag); }
-void csrrw(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x1073, csr, rs1, rd); }
-void csrrs(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x2073, csr, rs1, rd); }
-void csrrc(const Reg& rd, CSR csr, const Reg& rs1) { opCSR(0x3073, csr, rs1, rd); }
-void csrrwi(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x5073, csr, imm, rd); }
-void csrrsi(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x6073, csr, imm, rd); }
-void csrrci(const Reg& rd, CSR csr, uint32_t imm) { opCSR(0x7073, csr, imm, rd); }
-void fadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x53, rs2, rs1, rm, rd); }
-void fclass_s(const Reg& rd, const FReg& rs1) { opFP(0xe0001053, 0, rs1, 0, rd); }
-void fcvt_s_w(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0000053, 0, rs1, rm, rd); }
-void fcvt_s_wu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0100053, 0, rs1, rm, rd); }
-void fcvt_w_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0000053, 0, rs1, rm, rd); }
-void fcvt_wu_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0100053, 0, rs1, rm, rd); }
-void fdiv_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x18000053, rs2, rs1, rm, rd); }
-void feq_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0002053, rs2, rs1, 0, rd); }
-void fle_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0000053, rs2, rs1, 0, rd); }
-void flt_s(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa0001053, rs2, rs1, 0, rd); }
-void fmax_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x28001053, rs2, rs1, 0, rd); }
-void fmin_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x28000053, rs2, rs1, 0, rd); }
-void fmul_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x10000053, rs2, rs1, rm, rd); }
-void fmv_w_x(const FReg& rd, const Reg& rs1) { opFP(0xf0000053, 0, rs1, 0, rd); }
-void fmv_x_w(const Reg& rd, const FReg& rs1) { opFP(0xe0000053, 0, rs1, 0, rd); }
-void fsgnj_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20000053, rs2, rs1, 0, rd); }
-void fsgnjn_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20001053, rs2, rs1, 0, rd); }
-void fsgnjx_s(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x20002053, rs2, rs1, 0, rd); }
-void fsqrt_s(const FReg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0x58000053, 0, rs1, rm, rd); }
-void fsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x8000053, rs2, rs1, rm, rd); }
-void fcvt_l_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0200053, 0, rs1, rm, rd); }
-void fcvt_lu_s(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc0300053, 0, rs1, rm, rd); }
-void fcvt_s_l(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0200053, 0, rs1, rm, rd); }
-void fcvt_s_lu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd0300053, 0, rs1, rm, rd); }
-void fadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x4000053, rs2, rs1, rm, rd); }
-void fclass_h(const Reg& rd, const FReg& rs1) { opFP(0xe4001053, 0, rs1, 0, rd); }
-void fcvt_h_s(const Reg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0x44000053, 0, rs1, rm, rd); }
-void fcvt_h_w(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4000053, 0, rs1, rm, rd); }
-void fcvt_h_wu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4100053, 0, rs1, rm, rd); }
-void fcvt_s_h(const Reg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0x40200053, 0, rs1, rm, rd); }
-void fcvt_w_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4000053, 0, rs1, rm, rd); }
-void fcvt_wu_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4100053, 0, rs1, rm, rd); }
-void fdiv_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x1c000053, rs2, rs1, rm, rd); }
-void feq_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4002053, rs2, rs1, 0, rd); }
-void fle_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4000053, rs2, rs1, 0, rd); }
-void flt_h(const Reg& rd, const FReg& rs1, const FReg& rs2) { opFP(0xa4001053, rs2, rs1, 0, rd); }
-void fmax_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x2c001053, rs2, rs1, 0, rd); }
-void fmin_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x2c000053, rs2, rs1, 0, rd); }
-void fmul_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0x14000053, rs2, rs1, rm, rd); }
-void fmv_h_x(const FReg& rd, const Reg& rs1) { opFP(0xf4000053, 0, rs1, 0, rd); }
-void fmv_x_h(const Reg& rd, const FReg& rs1) { opFP(0xe4000053, 0, rs1, 0, rd); }
-void fsgnj_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24000053, rs2, rs1, 0, rd); }
-void fsgnjn_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24001053, rs2, rs1, 0, rd); }
-void fsgnjx_h(const FReg& rd, const FReg& rs1, const FReg& rs2) { opFP(0x24002053, rs2, rs1, 0, rd); }
-void fsqrt_h(const FReg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0x5c000053, 0, rs1, rm, rd); }
-void fsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, RM rm=RM::dyn) { opFP(0xc000053, rs2, rs1, rm, rd); }
-void fcvt_h_l(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4200053, 0, rs1, rm, rd); }
-void fcvt_h_lu(const FReg& rd, const Reg& rs1, RM rm=RM::dyn) { opFP(0xd4300053, 0, rs1, rm, rd); }
-void fcvt_l_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4200053, 0, rs1, rm, rd); }
-void fcvt_lu_h(const Reg& rd, const FReg& rs1, RM rm=RM::dyn) { opFP(0xc4300053, 0, rs1, rm, rd); }
-
-void fmadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x43, rs3, rs2, rs1, rm, rd); }
-void fmsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x47, rs3, rs2, rs1, rm, rd); }
-void fnmsub_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4b, rs3, rs2, rs1, rm, rd); }
-void fnmadd_s(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4f, rs3, rs2, rs1, rm, rd); }
-
-void fmadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4000043, rs3, rs2, rs1, rm, rd); }
-void fmsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x4000047, rs3, rs2, rs1, rm, rd); }
-void fnmsub_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x400004b, rs3, rs2, rs1, rm, rd); }
-void fnmadd_h(const FReg& rd, const FReg& rs1, const FReg& rs2, const FReg& rs3, RM rm=RM::dyn) { opR4(0x400004f, rs3, rs2, rs1, rm, rd); }
-
-
-void flq(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x4007, imm12, rs1, rd); }
-void fsq(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x4027, imm12, rs2, rs1); }
-void fld(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x3007, imm12, rs1, rd); }
-void fsd(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x3027, imm12, rs2, rs1); }
-void flw(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x2007, imm12, rs1, rd); }
-void fsw(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x2027, imm12, rs2, rs1); }
-void flh(const FReg& rd, const Reg& rs1, int32_t imm12 = 0) { opLoadFP(0x1007, imm12, rs1, rd); }
-void fsh(const FReg& rs2, const Reg& rs1, int32_t imm12 = 0) { opStoreFP(0x1027, imm12, rs2, rs1); }
-
-
-void nop() { if (supportRVC_) { append2B(0x0001); return; } addi(x0, x0, 0); }
-void li(const Reg& rd, uint32_t imm)
-{
-	if (imm && (imm & local::mask(12)) == 0) { // lower 12 bits of imm are zero
-		lui(rd, uint32_t(imm >> 12));
-		return;
-	}
-	int H, L;
-	if (!local::split32bit(&H, &L, imm)) {
-		addi(rd, zero, imm);
-		return;
-	}
-	lui(rd, H);
-	if (isRV32_) {
-		addi(rd, rd, L);
-	} else {
-		addiw(rd, rd, L);
-	}
-}
-void mv(const Reg& rd, const Reg& rs) { addi(rd, rs, 0); }
-void not_(const Reg& rd, const Reg& rs) { xori(rd, rs, -1); }
-void neg(const Reg& rd, const Reg& rs) { sub(rd, x0, rs); }
-void negw(const Reg& rd, const Reg& rs) { subw(rd, x0, rs); }
-void sext_b(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 8); srai(rd, rd, XLEN_ - 8); }
-void sext_h(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 16); srai(rd, rd, XLEN_ - 16); }
-void sext_w(const Reg& rd, const Reg& rs) { addiw(rd, rs, 0); }
-void zext_b(const Reg& rd, const Reg& rs) { andi(rd, rs, 255); }
-void zext_h(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 16); srli(rd, rd, XLEN_ - 16); }
-void zext_w(const Reg& rd, const Reg& rs) { slli(rd, rs, XLEN_ - 32); srli(rd, rd, XLEN_ - 32); }
-void seqz(const Reg& rd, const Reg& rs) { sltiu(rd, rs, 1); }
-void snez(const Reg& rd, const Reg& rs) { sltu(rd, x0, rs); }
-void sltz(const Reg& rd, const Reg& rs) { slt(rd, rs, x0); }
-void sgtz(const Reg& rd, const Reg& rs) { slt(rd, x0, rs); }
-void fence() { append4B(0x0ff0000f); }
-void j_(const Label& label) { jal(x0, label); }
-void jal(const Reg& rd, const Label& label) { Jmp jmp(getCurr(), 0x6f, rd); opJmp(label, jmp); }
-void jr(const Reg& rs) { jalr(x0, rs, 0); }
-void jalr(const Reg& rs) { jalr(x1, rs, 0); }
-void ret() { jalr(x0, x1); }
-// lr rd, (addr)
-void lr_w(const Reg& rd, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, 0, addr, 2, 2, flag); }
-void lr_d(const Reg& rd, const Reg& addr, uint32_t flag = 0) { opAtomic(rd, 0, addr, 2, 3, flag); }
-void csrr(const Reg& rd, CSR csr) { csrrs(rd, csr, x0); }
-void csrw(CSR csr, const Reg& rs) { csrrw(x0, csr, rs); }
-void csrs(CSR csr, const Reg& rs) { csrrs(x0, csr, rs); }
-void csrc(CSR csr, const Reg& rs) { csrrc(x0, csr, rs); }
-void csrwi(CSR csr, uint32_t imm) { csrrwi(x0, csr, imm); }
-void csrsi(CSR csr, uint32_t imm) { csrrsi(x0, csr, imm); }
-void csrci(CSR csr, uint32_t imm) { csrrci(x0, csr, imm); }
-
diff --git a/third_party/xbyak_riscv/xbyak_riscv_util.hpp b/third_party/xbyak_riscv/xbyak_riscv_util.hpp
deleted file mode 100644
index 6fdeab13b0e..00000000000
--- a/third_party/xbyak_riscv/xbyak_riscv_util.hpp
+++ /dev/null
@@ -1,271 +0,0 @@
-/******************************************************************************
-* Copyright (C), 2023, KNS Group LLC (YADRO)
-*
-* Licensed under the 3-Clause BSD License
-* You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/
-*******************************************************************************/
-
-#pragma once
-
-#include <climits>
-#include <cstddef>
-#include <cstdint>
-#include "xbyak_riscv_csr.hpp"
-#include "xbyak_riscv.hpp"
-
-#if defined(__linux__) && defined(__riscv)
-#include <sys/auxv.h>
-#include <sys/prctl.h>
-#include <sys/utsname.h>
-#include <asm/hwcap.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-
-namespace Xbyak_riscv {
-
-// Legacy HWCAP constants
-#ifndef COMPAT_HWCAP_ISA_I
-#define COMPAT_HWCAP_ISA_I  (1U << ('I' - 'A'))
-#endif
-
-#ifndef COMPAT_HWCAP_ISA_M
-#define COMPAT_HWCAP_ISA_M  (1U << ('M' - 'A'))
-#endif
-
-#ifndef COMPAT_HWCAP_ISA_A
-#define COMPAT_HWCAP_ISA_A  (1U << ('A' - 'A'))
-#endif
-
-#ifndef COMPAT_HWCAP_ISA_F
-#define COMPAT_HWCAP_ISA_F  (1U << ('F' - 'A'))
-#endif
-
-#ifndef COMPAT_HWCAP_ISA_D
-#define COMPAT_HWCAP_ISA_D  (1U << ('D' - 'A'))
-#endif
-
-#ifndef COMPAT_HWCAP_ISA_C
-#define COMPAT_HWCAP_ISA_C  (1U << ('C' - 'A'))
-#endif
-
-#ifndef COMPAT_HWCAP_ISA_V
-#define COMPAT_HWCAP_ISA_V  (1U << ('V' - 'A'))
-#endif
-
-#if defined(__linux__) && defined(__riscv)
-// Definitions for riscv_hwprobe (Linux 6.4+)
-#ifndef __NR_riscv_hwprobe
-#define __NR_riscv_hwprobe 258
-#endif
-
-#ifndef RISCV_HWPROBE_KEY_IMA_EXT_0
-#define RISCV_HWPROBE_KEY_IMA_EXT_0 4
-#endif
-
-#ifndef RISCV_HWPROBE_IMA_V
-#define RISCV_HWPROBE_IMA_V (1ULL << 2)
-#endif
-
-#ifndef RISCV_HWPROBE_EXT_ZVBB
-#define RISCV_HWPROBE_EXT_ZVBB (1ULL << 17)
-#endif
-
-#ifndef RISCV_HWPROBE_EXT_ZVBC
-#define RISCV_HWPROBE_EXT_ZVBC (1ULL << 18)
-#endif
-
-#ifndef RISCV_HWPROBE_EXT_ZVKG
-#define RISCV_HWPROBE_EXT_ZVKG (1ULL << 20)
-#endif
-
-#ifndef RISCV_HWPROBE_EXT_ZVFH
-#define RISCV_HWPROBE_EXT_ZVFH (1ULL << 30)
-#endif
-
-struct riscv_hwprobe {
-    int64_t key;
-    uint64_t value;
-};
-#endif
-
-enum class RISCVExtension : uint64_t {
-    // 0-25: Legacy single-letter map (matches HWCAP for convenience)
-    I = COMPAT_HWCAP_ISA_I,
-    M = COMPAT_HWCAP_ISA_M,
-    A = COMPAT_HWCAP_ISA_A,
-    F = COMPAT_HWCAP_ISA_F,
-    D = COMPAT_HWCAP_ISA_D,
-    C = COMPAT_HWCAP_ISA_C,
-    V = COMPAT_HWCAP_ISA_V,
-
-    // 26+: Extended Z-extensions
-    // Adding new extensions here is safe and conflict-free
-    Zvfh = 1ULL << 26,
-    Zvbb = 1ULL << 27,
-    Zvbc = 1ULL << 28,
-    Zvkg = 1ULL << 29
-};
-
-template <CSR csr>
-struct CSRReader : public CodeGenerator {
-    // Buffer capacity exactly for 2 instructions.
-    static constexpr size_t capacity = 8;
-
-    CSRReader() : CodeGenerator(capacity) {
-        csrrs(a0, csr, x0);
-        ret();
-    }
-};
-
-/**
- * Class that detects information about a RISC-V CPU.
- */
-class CPU final {
-public:
-    static const CPU& getInstance() {
-        static const CPU cpu;
-        return cpu;
-    }
-
-    CPU() {
-        hwcapFeatures = 0;
-        xlen = sizeof(void*) * 8; // Fallback if sysconf fails
-
-#if defined(__linux__) && defined(__riscv)
-        // Set hwcapFeatures with AT_HWCAP value from
-        // the Linux auxiliary vector to check for base extensions support.
-        hwcapFeatures = getauxval(AT_HWCAP) & (
-            COMPAT_HWCAP_ISA_I |
-            COMPAT_HWCAP_ISA_M |
-            COMPAT_HWCAP_ISA_A |
-            COMPAT_HWCAP_ISA_F |
-            COMPAT_HWCAP_ISA_D |
-            COMPAT_HWCAP_ISA_C |
-            COMPAT_HWCAP_ISA_V
-        );
-
-        // Try to use riscv_hwprobe to detect Z-extensions
-        struct riscv_hwprobe requests[] = {
-            {RISCV_HWPROBE_KEY_IMA_EXT_0, 0}
-        };
-
-        int ret = syscall(__NR_riscv_hwprobe, &requests, sizeof(requests) / sizeof(requests[0]), 0, NULL, 0);
-
-        if (ret == 0) {
-            uint64_t v = requests[0].value;
-            // Update V support from hwprobe if present
-            if (v & RISCV_HWPROBE_IMA_V) hwcapFeatures |= static_cast<uint64_t>(RISCVExtension::V);
-
-            // Detect Z-extensions using the table
-            const struct {
-                RISCVExtension id;
-                uint64_t hwprobe_bit; // Bit in RISCV_HWPROBE_KEY_IMA_EXT_0
-            } table[] = {
-                { RISCVExtension::Zvfh, RISCV_HWPROBE_EXT_ZVFH },
-                { RISCVExtension::Zvbb, RISCV_HWPROBE_EXT_ZVBB },
-                { RISCVExtension::Zvbc, RISCV_HWPROBE_EXT_ZVBC },
-                { RISCVExtension::Zvkg, RISCV_HWPROBE_EXT_ZVKG }
-            };
-            for (const auto& entry : table) {
-                if (v & entry.hwprobe_bit) {
-                    hwcapFeatures |= static_cast<uint64_t>(entry.id);
-                }
-            }
-        }
-
-        // Set xlen, number of cores, cache info
-        xlen = sysconf(_SC_LONG_BIT);
-        numCores = sysconf(_SC_NPROCESSORS_ONLN);
-
-        dataCacheSize_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE);
-        dataCacheSize_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE);
-        dataCacheSize_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE);
-        dataCacheSize_[3] = sysconf(_SC_LEVEL4_CACHE_SIZE);
-
-        dataCacheLineSize_[0] = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
-        dataCacheLineSize_[1] = sysconf(_SC_LEVEL2_CACHE_LINESIZE);
-        dataCacheLineSize_[2] = sysconf(_SC_LEVEL3_CACHE_LINESIZE);
-        dataCacheLineSize_[3] = sysconf(_SC_LEVEL4_CACHE_LINESIZE);
-#endif
-
-        // Set vlen
-        if(hasExtension(RISCVExtension::V)) {
-            CSRReader<CSR::vlenb> csrReaderGenerator;
-            csrReaderGenerator.ready();
-            const auto csrReader = csrReaderGenerator.getCode<uint32_t (*)()>();
-            vlen = csrReader() * 8 /* bit */;
-        }
-
-        // Set flen (bit)
-        if (hasExtension(RISCVExtension::D)) {
-            flen = 64;
-        } else if (hasExtension(RISCVExtension::F)) {
-            flen = 32;
-        }
-    }
-
-    /**
-     * Checks if a particular RISC-V extension is available.
-     *
-     * @param extension The extension to check.
-     */
-    bool hasExtension(RISCVExtension extension) const {
-        return (hwcapFeatures & static_cast<uint64_t>(extension)) != 0;
-    }
-
-    /**
-     * Get vector register width in bits
-    */
-    uint32_t getVlen() const {
-        return vlen;
-    }
-
-    /**
-     * Get general purpose register width in bits
-    */
-    uint32_t getXlen() const {
-        return xlen;
-    };
-
-    /**
-     * Get floating-point register width in bits
-    */
-    uint32_t getFlen() const {
-        return flen;
-    }
-
-    uint32_t getNumCores() const {
-        return numCores;
-    }
-
-    /**
-     * Get data cache size in bytes
-     * @param lvl Cache level 1..4
-    */
-    uint32_t getDataCacheSize(uint32_t lvl) const {
-        if (lvl == 0 || lvl > maxNumberCacheLevels) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER);
-        return dataCacheSize_[lvl - 1];
-    }
-
-    /**
-     * Get data cache line size in bytes
-     * @param lvl Cache level 1..4
-    */
-    uint32_t getDataCacheLineSize(uint32_t lvl) const {
-        if (lvl == 0 || lvl > maxNumberCacheLevels) XBYAK_RISCV_THROW(ERR_BAD_PARAMETER);
-        return dataCacheLineSize_[lvl - 1];
-    }
-
-private:
-    uint64_t hwcapFeatures = 0;
-    static constexpr size_t maxNumberCacheLevels = 4;
-    uint32_t dataCacheSize_[maxNumberCacheLevels] = {0, 0, 0, 0};
-    uint32_t dataCacheLineSize_[maxNumberCacheLevels] = {0, 0, 0, 0};
-    uint32_t numCores = 0;
-    uint32_t xlen = 0;
-    uint32_t vlen = 0;
-    uint32_t flen = 0;
-};
-
-} // Xbyak_riscv
diff --git a/third_party/xbyak_riscv/xbyak_riscv_v.hpp b/third_party/xbyak_riscv/xbyak_riscv_v.hpp
deleted file mode 100644
index 7bff4daf391..00000000000
--- a/third_party/xbyak_riscv/xbyak_riscv_v.hpp
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
-	Copyright (C), 2023, MITSUNARI Shigeo
-	Copyright (C), 2023, KNS Group LLC (YADRO)
-	Licensed under the 3-Clause BSD License
-	You may obtain a copy of the License at https://opensource.org/license/bsd-3-clause/
-*/
-void vaadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x24002057, vm, vs2, vs1, vd); }
-void vaadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x24006057, vm, vs2, rs1, vd); }
-void vaaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x20002057, vm, vs2, vs1, vd); }
-void vaaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x20006057, vm, vs2, rs1, vd); }
-void vadc_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x40003057, 0, vs2, simm5, vd); }
-void vadc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x40000057, 0, vs2, vs1, vd); }
-void vadc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x40004057, 0, vs2, rs1, vd); }
-void vadd_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x3057, vm, vs2, simm5, vd); }
-void vadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x57, vm, vs2, vs1, vd); }
-void vadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x4057, vm, vs2, rs1, vd); }
-void vand_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x24003057, vm, vs2, simm5, vd); }
-void vand_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x24000057, vm, vs2, vs1, vd); }
-void vand_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x24004057, vm, vs2, rs1, vd); }
-void vasub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x2c002057, vm, vs2, vs1, vd); }
-void vasub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x2c006057, vm, vs2, rs1, vd); }
-void vasubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x28002057, vm, vs2, vs1, vd); }
-void vasubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x28006057, vm, vs2, rs1, vd); }
-void vcompress_vm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opMVV(0x5e002057, 0, vs2, vs1, vd); }
-void vcpop_m(const Reg& rd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x40082057, vm, vs2, 0, rd); }
-void vdiv_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x84002057, vm, vs2, vs1, vd); }
-void vdiv_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x84006057, vm, vs2, rs1, vd); }
-void vdivu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x80002057, vm, vs2, vs1, vd); }
-void vdivu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x80006057, vm, vs2, rs1, vd); }
-void vfadd_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x5057, vm, vs2, rs1, vd); }
-void vfadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x1057, vm, vs2, vs1, vd); }
-void vfclass_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c081057, vm, vs2, 0, vd); }
-void vfcvt_f_x_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48019057, vm, vs2, 0, vd); }
-void vfcvt_f_xu_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48011057, vm, vs2, 0, vd); }
-void vfcvt_rtz_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48039057, vm, vs2, 0, vd); }
-void vfcvt_rtz_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48031057, vm, vs2, 0, vd); }
-void vfcvt_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48009057, vm, vs2, 0, vd); }
-void vfcvt_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48001057, vm, vs2, 0, vd); }
-void vfdiv_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x80005057, vm, vs2, rs1, vd); }
-void vfdiv_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x80001057, vm, vs2, vs1, vd); }
-void vfirst_m(const Reg& rd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4008a057, vm, vs2, 0, rd); }
-void vfmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb0005057, vm, vs2, rs1, vd); }
-void vfmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb0001057, vm, vs2, vs1, vd); }
-void vfmadd_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa0005057, vm, vs2, rs1, vd); }
-void vfmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa0001057, vm, vs2, vs1, vd); }
-void vfmax_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x18005057, vm, vs2, rs1, vd); }
-void vfmax_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x18001057, vm, vs2, vs1, vd); }
-void vfmerge_vfm(const VReg& vd, const VReg& vs2, const FReg& rs1) { opFVF(0x5c005057, 0, vs2, rs1, vd); }
-void vfmin_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x10005057, vm, vs2, rs1, vd); }
-void vfmin_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x10001057, vm, vs2, vs1, vd); }
-void vfmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb8005057, vm, vs2, rs1, vd); }
-void vfmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb8001057, vm, vs2, vs1, vd); }
-void vfmsub_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa8005057, vm, vs2, rs1, vd); }
-void vfmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa8001057, vm, vs2, vs1, vd); }
-void vfmul_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x90005057, vm, vs2, rs1, vd); }
-void vfmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x90001057, vm, vs2, vs1, vd); }
-void vfmv_f_s(const FReg& rd, const VReg& vs2) { opFVV(0x42001057, 0, vs2, 0, rd); }
-void vfmv_s_f(const VReg& vd, const FReg& rs1) { opFVF(0x42005057, 0, 0, rs1, vd); }
-void vfmv_v_f(const VReg& vd, const FReg& rs1) { opFVF(0x5e005057, 0, 0, rs1, vd); }
-void vfncvt_f_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480a1057, vm, vs2, 0, vd); }
-void vfncvt_f_x_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48099057, vm, vs2, 0, vd); }
-void vfncvt_f_xu_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48091057, vm, vs2, 0, vd); }
-void vfncvt_rod_f_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480a9057, vm, vs2, 0, vd); }
-void vfncvt_rtz_x_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480b9057, vm, vs2, 0, vd); }
-void vfncvt_rtz_xu_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x480b1057, vm, vs2, 0, vd); }
-void vfncvt_x_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48089057, vm, vs2, 0, vd); }
-void vfncvt_xu_f_w(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48081057, vm, vs2, 0, vd); }
-void vfnmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xb4005057, vm, vs2, rs1, vd); }
-void vfnmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xb4001057, vm, vs2, vs1, vd); }
-void vfnmadd_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xa4005057, vm, vs2, rs1, vd); }
-void vfnmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xa4001057, vm, vs2, vs1, vd); }
-void vfnmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xbc005057, vm, vs2, rs1, vd); }
-void vfnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xbc001057, vm, vs2, vs1, vd); }
-void vfnmsub_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xac005057, vm, vs2, rs1, vd); }
-void vfnmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xac001057, vm, vs2, vs1, vd); }
-void vfrdiv_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x84005057, vm, vs2, rs1, vd); }
-void vfrec7_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c029057, vm, vs2, 0, vd); }
-void vfredmax_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x1c001057, vm, vs2, vs1, vd); }
-void vfredmin_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x14001057, vm, vs2, vs1, vd); }
-void vfredosum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc001057, vm, vs2, vs1, vd); }
-void vfredusum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x4001057, vm, vs2, vs1, vd); }
-void vfrsqrt7_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c021057, vm, vs2, 0, vd); }
-void vfrsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x9c005057, vm, vs2, rs1, vd); }
-void vfsgnj_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x20005057, vm, vs2, rs1, vd); }
-void vfsgnj_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x20001057, vm, vs2, vs1, vd); }
-void vfsgnjn_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x24005057, vm, vs2, rs1, vd); }
-void vfsgnjn_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x24001057, vm, vs2, vs1, vd); }
-void vfsgnjx_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x28005057, vm, vs2, rs1, vd); }
-void vfsgnjx_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x28001057, vm, vs2, vs1, vd); }
-void vfslide1down_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x3c005057, vm, vs2, rs1, vd); }
-void vfslide1up_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x38005057, vm, vs2, rs1, vd); }
-void vfsqrt_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x4c001057, vm, vs2, 0, vd); }
-void vfsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x8005057, vm, vs2, rs1, vd); }
-void vfsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x8001057, vm, vs2, vs1, vd); }
-void vfwadd_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xc0005057, vm, vs2, rs1, vd); }
-void vfwadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc0001057, vm, vs2, vs1, vd); }
-void vfwadd_wf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xd0005057, vm, vs2, rs1, vd); }
-void vfwadd_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xd0001057, vm, vs2, vs1, vd); }
-void vfwcvt_f_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48061057, vm, vs2, 0, vd); }
-void vfwcvt_f_x_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48059057, vm, vs2, 0, vd); }
-void vfwcvt_f_xu_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48051057, vm, vs2, 0, vd); }
-void vfwcvt_rtz_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48079057, vm, vs2, 0, vd); }
-void vfwcvt_rtz_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48071057, vm, vs2, 0, vd); }
-void vfwcvt_x_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48049057, vm, vs2, 0, vd); }
-void vfwcvt_xu_f_v(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0x48041057, vm, vs2, 0, vd); }
-void vfwmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xf0005057, vm, vs2, rs1, vd); }
-void vfwmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf0001057, vm, vs2, vs1, vd); }
-void vfwmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xf8005057, vm, vs2, rs1, vd); }
-void vfwmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf8001057, vm, vs2, vs1, vd); }
-void vfwmul_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xe0005057, vm, vs2, rs1, vd); }
-void vfwmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xe0001057, vm, vs2, vs1, vd); }
-void vfwnmacc_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xf4005057, vm, vs2, rs1, vd); }
-void vfwnmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xf4001057, vm, vs2, vs1, vd); }
-void vfwnmsac_vf(const VReg& vd, const FReg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opFVF(0xfc005057, vm, vs2, rs1, vd); }
-void vfwnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opFVV(0xfc001057, vm, vs2, vs1, vd); }
-void vfwredosum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xcc001057, vm, vs2, vs1, vd); }
-void vfwredusum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc4001057, vm, vs2, vs1, vd); }
-void vfwsub_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xc8005057, vm, vs2, rs1, vd); }
-void vfwsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xc8001057, vm, vs2, vs1, vd); }
-void vfwsub_wf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0xd8005057, vm, vs2, rs1, vd); }
-void vfwsub_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0xd8001057, vm, vs2, vs1, vd); }
-void vid_v(const VReg& vd, VM vm=VM::unmasked) { opMVV(0x5008a057, vm, 0, 0, vd); }
-void viota_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x50082057, vm, vs2, 0, vd); }
-void vl1re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); }
-void vl1re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); }
-void vl1re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); }
-void vl1re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); }
-void vl2re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); }
-void vl2re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); }
-void vl2re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); }
-void vl2re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); }
-void vl4re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); }
-void vl4re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); }
-void vl4re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); }
-void vl4re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); }
-void vl8re16_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2805007, 0, 0, rs1, vd); }
-void vl8re32_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2806007, 0, 0, rs1, vd); }
-void vl8re64_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2807007, 0, 0, rs1, vd); }
-void vl8re8_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2800007, 0, 0, rs1, vd); }
-void vlseg1e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10007007, vm, 0, rs1, vd); }
-void vlseg2e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30007007, vm, 0, rs1, vd); }
-void vlseg3e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50007007, vm, 0, rs1, vd); }
-void vlseg4e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70007007, vm, 0, rs1, vd); }
-void vlseg5e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90007007, vm, 0, rs1, vd); }
-void vlseg6e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0007007, vm, 0, rs1, vd); }
-void vlseg7e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0007007, vm, 0, rs1, vd); }
-void vlseg8e1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0007007, vm, 0, rs1, vd); }
-void vle1024_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10007007, vm, 0, rs1, vd); }
-void vlseg1e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11007007, vm, 0, rs1, vd); }
-void vlseg2e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31007007, vm, 0, rs1, vd); }
-void vlseg3e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51007007, vm, 0, rs1, vd); }
-void vlseg4e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71007007, vm, 0, rs1, vd); }
-void vlseg5e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91007007, vm, 0, rs1, vd); }
-void vlseg6e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1007007, vm, 0, rs1, vd); }
-void vlseg7e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1007007, vm, 0, rs1, vd); }
-void vlseg8e1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1007007, vm, 0, rs1, vd); }
-void vle1024ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11007007, vm, 0, rs1, vd); }
-void vlseg1e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10000007, vm, 0, rs1, vd); }
-void vlseg2e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30000007, vm, 0, rs1, vd); }
-void vlseg3e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50000007, vm, 0, rs1, vd); }
-void vlseg4e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70000007, vm, 0, rs1, vd); }
-void vlseg5e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90000007, vm, 0, rs1, vd); }
-void vlseg6e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0000007, vm, 0, rs1, vd); }
-void vlseg7e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0000007, vm, 0, rs1, vd); }
-void vlseg8e128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0000007, vm, 0, rs1, vd); }
-void vle128_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10000007, vm, 0, rs1, vd); }
-void vlseg1e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11000007, vm, 0, rs1, vd); }
-void vlseg2e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31000007, vm, 0, rs1, vd); }
-void vlseg3e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51000007, vm, 0, rs1, vd); }
-void vlseg4e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71000007, vm, 0, rs1, vd); }
-void vlseg5e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91000007, vm, 0, rs1, vd); }
-void vlseg6e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1000007, vm, 0, rs1, vd); }
-void vlseg7e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1000007, vm, 0, rs1, vd); }
-void vlseg8e128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1000007, vm, 0, rs1, vd); }
-void vle128ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11000007, vm, 0, rs1, vd); }
-void vlseg1e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x5007, vm, 0, rs1, vd); }
-void vlseg2e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20005007, vm, 0, rs1, vd); }
-void vlseg3e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40005007, vm, 0, rs1, vd); }
-void vlseg4e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60005007, vm, 0, rs1, vd); }
-void vlseg5e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80005007, vm, 0, rs1, vd); }
-void vlseg6e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0005007, vm, 0, rs1, vd); }
-void vlseg7e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0005007, vm, 0, rs1, vd); }
-void vlseg8e16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0005007, vm, 0, rs1, vd); }
-void vle16_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x5007, vm, 0, rs1, vd); }
-void vlseg1e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1005007, vm, 0, rs1, vd); }
-void vlseg2e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21005007, vm, 0, rs1, vd); }
-void vlseg3e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41005007, vm, 0, rs1, vd); }
-void vlseg4e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61005007, vm, 0, rs1, vd); }
-void vlseg5e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81005007, vm, 0, rs1, vd); }
-void vlseg6e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1005007, vm, 0, rs1, vd); }
-void vlseg7e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1005007, vm, 0, rs1, vd); }
-void vlseg8e16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1005007, vm, 0, rs1, vd); }
-void vle16ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1005007, vm, 0, rs1, vd); }
-void vlseg1e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10005007, vm, 0, rs1, vd); }
-void vlseg2e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30005007, vm, 0, rs1, vd); }
-void vlseg3e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50005007, vm, 0, rs1, vd); }
-void vlseg4e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70005007, vm, 0, rs1, vd); }
-void vlseg5e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90005007, vm, 0, rs1, vd); }
-void vlseg6e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0005007, vm, 0, rs1, vd); }
-void vlseg7e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0005007, vm, 0, rs1, vd); }
-void vlseg8e256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0005007, vm, 0, rs1, vd); }
-void vle256_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10005007, vm, 0, rs1, vd); }
-void vlseg1e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11005007, vm, 0, rs1, vd); }
-void vlseg2e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31005007, vm, 0, rs1, vd); }
-void vlseg3e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51005007, vm, 0, rs1, vd); }
-void vlseg4e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71005007, vm, 0, rs1, vd); }
-void vlseg5e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91005007, vm, 0, rs1, vd); }
-void vlseg6e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1005007, vm, 0, rs1, vd); }
-void vlseg7e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1005007, vm, 0, rs1, vd); }
-void vlseg8e256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1005007, vm, 0, rs1, vd); }
-void vle256ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11005007, vm, 0, rs1, vd); }
-void vlseg1e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x6007, vm, 0, rs1, vd); }
-void vlseg2e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20006007, vm, 0, rs1, vd); }
-void vlseg3e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40006007, vm, 0, rs1, vd); }
-void vlseg4e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60006007, vm, 0, rs1, vd); }
-void vlseg5e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80006007, vm, 0, rs1, vd); }
-void vlseg6e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0006007, vm, 0, rs1, vd); }
-void vlseg7e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0006007, vm, 0, rs1, vd); }
-void vlseg8e32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0006007, vm, 0, rs1, vd); }
-void vle32_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x6007, vm, 0, rs1, vd); }
-void vlseg1e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1006007, vm, 0, rs1, vd); }
-void vlseg2e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21006007, vm, 0, rs1, vd); }
-void vlseg3e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41006007, vm, 0, rs1, vd); }
-void vlseg4e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61006007, vm, 0, rs1, vd); }
-void vlseg5e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81006007, vm, 0, rs1, vd); }
-void vlseg6e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1006007, vm, 0, rs1, vd); }
-void vlseg7e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1006007, vm, 0, rs1, vd); }
-void vlseg8e32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1006007, vm, 0, rs1, vd); }
-void vle32ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1006007, vm, 0, rs1, vd); }
-void vlseg1e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10006007, vm, 0, rs1, vd); }
-void vlseg2e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x30006007, vm, 0, rs1, vd); }
-void vlseg3e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x50006007, vm, 0, rs1, vd); }
-void vlseg4e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x70006007, vm, 0, rs1, vd); }
-void vlseg5e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x90006007, vm, 0, rs1, vd); }
-void vlseg6e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb0006007, vm, 0, rs1, vd); }
-void vlseg7e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd0006007, vm, 0, rs1, vd); }
-void vlseg8e512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf0006007, vm, 0, rs1, vd); }
-void vle512_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x10006007, vm, 0, rs1, vd); }
-void vlseg1e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11006007, vm, 0, rs1, vd); }
-void vlseg2e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x31006007, vm, 0, rs1, vd); }
-void vlseg3e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x51006007, vm, 0, rs1, vd); }
-void vlseg4e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x71006007, vm, 0, rs1, vd); }
-void vlseg5e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x91006007, vm, 0, rs1, vd); }
-void vlseg6e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xb1006007, vm, 0, rs1, vd); }
-void vlseg7e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xd1006007, vm, 0, rs1, vd); }
-void vlseg8e512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xf1006007, vm, 0, rs1, vd); }
-void vle512ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x11006007, vm, 0, rs1, vd); }
-void vlseg1e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7007, vm, 0, rs1, vd); }
-void vlseg2e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20007007, vm, 0, rs1, vd); }
-void vlseg3e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40007007, vm, 0, rs1, vd); }
-void vlseg4e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60007007, vm, 0, rs1, vd); }
-void vlseg5e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80007007, vm, 0, rs1, vd); }
-void vlseg6e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0007007, vm, 0, rs1, vd); }
-void vlseg7e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0007007, vm, 0, rs1, vd); }
-void vlseg8e64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0007007, vm, 0, rs1, vd); }
-void vle64_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7007, vm, 0, rs1, vd); }
-void vlseg1e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1007007, vm, 0, rs1, vd); }
-void vlseg2e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21007007, vm, 0, rs1, vd); }
-void vlseg3e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41007007, vm, 0, rs1, vd); }
-void vlseg4e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61007007, vm, 0, rs1, vd); }
-void vlseg5e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81007007, vm, 0, rs1, vd); }
-void vlseg6e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1007007, vm, 0, rs1, vd); }
-void vlseg7e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1007007, vm, 0, rs1, vd); }
-void vlseg8e64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1007007, vm, 0, rs1, vd); }
-void vle64ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1007007, vm, 0, rs1, vd); }
-void vlseg1e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7, vm, 0, rs1, vd); }
-void vlseg2e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x20000007, vm, 0, rs1, vd); }
-void vlseg3e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x40000007, vm, 0, rs1, vd); }
-void vlseg4e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x60000007, vm, 0, rs1, vd); }
-void vlseg5e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x80000007, vm, 0, rs1, vd); }
-void vlseg6e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa0000007, vm, 0, rs1, vd); }
-void vlseg7e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc0000007, vm, 0, rs1, vd); }
-void vlseg8e8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe0000007, vm, 0, rs1, vd); }
-void vle8_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x7, vm, 0, rs1, vd); }
-void vlseg1e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1000007, vm, 0, rs1, vd); }
-void vlseg2e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x21000007, vm, 0, rs1, vd); }
-void vlseg3e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x41000007, vm, 0, rs1, vd); }
-void vlseg4e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x61000007, vm, 0, rs1, vd); }
-void vlseg5e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x81000007, vm, 0, rs1, vd); }
-void vlseg6e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xa1000007, vm, 0, rs1, vd); }
-void vlseg7e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xc1000007, vm, 0, rs1, vd); }
-void vlseg8e8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0xe1000007, vm, 0, rs1, vd); }
-void vle8ff_v(const VReg& vd, const Reg& rs1, VM vm=VM::unmasked) { opVectorLoad(0x1000007, vm, 0, rs1, vd); }
-void vlm_v(const VReg& vd, const Reg& rs1) { opVectorLoad(0x2b00007, 0, 0, rs1, vd); }
-void vloxei1024_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c007007, vm, vs2, rs1, vd); }
-void vloxei128_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c000007, vm, vs2, rs1, vd); }
-void vloxei16_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc005007, vm, vs2, rs1, vd); }
-void vloxei256_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c005007, vm, vs2, rs1, vd); }
-void vloxei32_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc006007, vm, vs2, rs1, vd); }
-void vloxei512_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x1c006007, vm, vs2, rs1, vd); }
-void vloxei64_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc007007, vm, vs2, rs1, vd); }
-void vloxei8_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0xc000007, vm, vs2, rs1, vd); }
-void vlsseg1e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18007007, vm, rs2, rs1, vd); }
-void vlsseg2e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38007007, vm, rs2, rs1, vd); }
-void vlsseg3e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58007007, vm, rs2, rs1, vd); }
-void vlsseg4e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78007007, vm, rs2, rs1, vd); }
-void vlsseg5e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98007007, vm, rs2, rs1, vd); }
-void vlsseg6e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8007007, vm, rs2, rs1, vd); }
-void vlsseg7e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8007007, vm, rs2, rs1, vd); }
-void vlsseg8e1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8007007, vm, rs2, rs1, vd); }
-void vlse1024_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18007007, vm, rs2, rs1, vd); }
-void vlsseg1e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18000007, vm, rs2, rs1, vd); }
-void vlsseg2e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38000007, vm, rs2, rs1, vd); }
-void vlsseg3e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58000007, vm, rs2, rs1, vd); }
-void vlsseg4e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78000007, vm, rs2, rs1, vd); }
-void vlsseg5e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98000007, vm, rs2, rs1, vd); }
-void vlsseg6e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8000007, vm, rs2, rs1, vd); }
-void vlsseg7e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8000007, vm, rs2, rs1, vd); }
-void vlsseg8e128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8000007, vm, rs2, rs1, vd); }
-void vlse128_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18000007, vm, rs2, rs1, vd); }
-void vlsseg1e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8005007, vm, rs2, rs1, vd); }
-void vlsseg2e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28005007, vm, rs2, rs1, vd); }
-void vlsseg3e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48005007, vm, rs2, rs1, vd); }
-void vlsseg4e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68005007, vm, rs2, rs1, vd); }
-void vlsseg5e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88005007, vm, rs2, rs1, vd); }
-void vlsseg6e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8005007, vm, rs2, rs1, vd); }
-void vlsseg7e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8005007, vm, rs2, rs1, vd); }
-void vlsseg8e16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8005007, vm, rs2, rs1, vd); }
-void vlse16_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8005007, vm, rs2, rs1, vd); }
-void vlsseg1e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18005007, vm, rs2, rs1, vd); }
-void vlsseg2e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38005007, vm, rs2, rs1, vd); }
-void vlsseg3e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58005007, vm, rs2, rs1, vd); }
-void vlsseg4e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78005007, vm, rs2, rs1, vd); }
-void vlsseg5e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98005007, vm, rs2, rs1, vd); }
-void vlsseg6e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8005007, vm, rs2, rs1, vd); }
-void vlsseg7e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8005007, vm, rs2, rs1, vd); }
-void vlsseg8e256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8005007, vm, rs2, rs1, vd); }
-void vlse256_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18005007, vm, rs2, rs1, vd); }
-void vlsseg1e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8006007, vm, rs2, rs1, vd); }
-void vlsseg2e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28006007, vm, rs2, rs1, vd); }
-void vlsseg3e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48006007, vm, rs2, rs1, vd); }
-void vlsseg4e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68006007, vm, rs2, rs1, vd); }
-void vlsseg5e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88006007, vm, rs2, rs1, vd); }
-void vlsseg6e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8006007, vm, rs2, rs1, vd); }
-void vlsseg7e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8006007, vm, rs2, rs1, vd); }
-void vlsseg8e32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8006007, vm, rs2, rs1, vd); }
-void vlse32_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8006007, vm, rs2, rs1, vd); }
-void vlsseg1e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18006007, vm, rs2, rs1, vd); }
-void vlsseg2e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x38006007, vm, rs2, rs1, vd); }
-void vlsseg3e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x58006007, vm, rs2, rs1, vd); }
-void vlsseg4e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x78006007, vm, rs2, rs1, vd); }
-void vlsseg5e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x98006007, vm, rs2, rs1, vd); }
-void vlsseg6e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xb8006007, vm, rs2, rs1, vd); }
-void vlsseg7e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xd8006007, vm, rs2, rs1, vd); }
-void vlsseg8e512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xf8006007, vm, rs2, rs1, vd); }
-void vlse512_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x18006007, vm, rs2, rs1, vd); }
-void vlsseg1e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8007007, vm, rs2, rs1, vd); }
-void vlsseg2e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28007007, vm, rs2, rs1, vd); }
-void vlsseg3e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48007007, vm, rs2, rs1, vd); }
-void vlsseg4e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68007007, vm, rs2, rs1, vd); }
-void vlsseg5e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88007007, vm, rs2, rs1, vd); }
-void vlsseg6e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8007007, vm, rs2, rs1, vd); }
-void vlsseg7e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8007007, vm, rs2, rs1, vd); }
-void vlsseg8e64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8007007, vm, rs2, rs1, vd); }
-void vlse64_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8007007, vm, rs2, rs1, vd); }
-void vlsseg1e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8000007, vm, rs2, rs1, vd); }
-void vlsseg2e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x28000007, vm, rs2, rs1, vd); }
-void vlsseg3e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x48000007, vm, rs2, rs1, vd); }
-void vlsseg4e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x68000007, vm, rs2, rs1, vd); }
-void vlsseg5e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x88000007, vm, rs2, rs1, vd); }
-void vlsseg6e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xa8000007, vm, rs2, rs1, vd); }
-void vlsseg7e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xc8000007, vm, rs2, rs1, vd); }
-void vlsseg8e8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0xe8000007, vm, rs2, rs1, vd); }
-void vlse8_v(const VReg& vd, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorLoad(0x8000007, vm, rs2, rs1, vd); }
-void vluxei1024_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14007007, vm, vs2, rs1, vd); }
-void vluxei128_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14000007, vm, vs2, rs1, vd); }
-void vluxei16_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4005007, vm, vs2, rs1, vd); }
-void vluxei256_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14005007, vm, vs2, rs1, vd); }
-void vluxei32_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4006007, vm, vs2, rs1, vd); }
-void vluxei512_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x14006007, vm, vs2, rs1, vd); }
-void vluxei64_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4007007, vm, vs2, rs1, vd); }
-void vluxei8_v(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorLoad(0x4000007, vm, vs2, rs1, vd); }
-void vmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xb4002057, vm, vs2, vs1, vd); }
-void vmacc_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xb4006057, vm, vs2, rs1, vd); }
-void vmadc_vi(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x46003057, 0, vs2, simm5, vd); }
-void vmadc_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x44003057, 0, vs2, simm5, vd); }
-void vmadc_vv(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x46000057, 0, vs2, vs1, vd); }
-void vmadc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x44000057, 0, vs2, vs1, vd); }
-void vmadc_vx(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x46004057, 0, vs2, rs1, vd); }
-void vmadc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x44004057, 0, vs2, rs1, vd); }
-void vmadd_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xa4002057, vm, vs2, vs1, vd); }
-void vmadd_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xa4006057, vm, vs2, rs1, vd); }
-void vmand_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x64002057, vm, vs2, vs1, vd); }
-void vmandn_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x60002057, vm, vs2, vs1, vd); }
-void vmax_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x1c000057, vm, vs2, vs1, vd); }
-void vmax_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x1c004057, vm, vs2, rs1, vd); }
-void vmaxu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x18000057, vm, vs2, vs1, vd); }
-void vmaxu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x18004057, vm, vs2, rs1, vd); }
-void vmerge_vim(const VReg& vd, const VReg& vs2, int32_t simm5) { opIVI(0x5c003057, 0, vs2, simm5, vd); }
-void vmerge_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x5c000057, 0, vs2, vs1, vd); }
-void vmerge_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x5c004057, 0, vs2, rs1, vd); }
-void vmfeq_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x60005057, vm, vs2, rs1, vd); }
-void vmfeq_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x60001057, vm, vs2, vs1, vd); }
-void vmfge_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x7c005057, vm, vs2, rs1, vd); }
-void vmfgt_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x74005057, vm, vs2, rs1, vd); }
-void vmfle_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x64005057, vm, vs2, rs1, vd); }
-void vmfle_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x64001057, vm, vs2, vs1, vd); }
-void vmflt_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x6c005057, vm, vs2, rs1, vd); }
-void vmflt_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x6c001057, vm, vs2, vs1, vd); }
-void vmfne_vf(const VReg& vd, const VReg& vs2, const FReg& rs1, VM vm=VM::unmasked) { opFVF(0x70005057, vm, vs2, rs1, vd); }
-void vmfne_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opFVV(0x70001057, vm, vs2, vs1, vd); }
-void vmin_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x14000057, vm, vs2, vs1, vd); }
-void vmin_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x14004057, vm, vs2, rs1, vd); }
-void vminu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x10000057, vm, vs2, vs1, vd); }
-void vminu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x10004057, vm, vs2, rs1, vd); }
-void vmnand_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x74002057, vm, vs2, vs1, vd); }
-void vmnor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x78002057, vm, vs2, vs1, vd); }
-void vmor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x68002057, vm, vs2, vs1, vd); }
-void vmorn_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x70002057, vm, vs2, vs1, vd); }
-void vmsbc_vv(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x4e000057, 0, vs2, vs1, vd); }
-void vmsbc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x4c000057, 0, vs2, vs1, vd); }
-void vmsbc_vx(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x4e004057, 0, vs2, rs1, vd); }
-void vmsbc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x4c004057, 0, vs2, rs1, vd); }
-void vmsbf_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x5000a057, vm, vs2, 0, vd); }
-void vmseq_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x60003057, vm, vs2, simm5, vd); }
-void vmseq_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x60000057, vm, vs2, vs1, vd); }
-void vmseq_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x60004057, vm, vs2, rs1, vd); }
-void vmsgt_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x7c003057, vm, vs2, simm5, vd); }
-void vmsgt_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x7c004057, vm, vs2, rs1, vd); }
-void vmsgtu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x78003057, vm, vs2, simm5, vd); }
-void vmsgtu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x78004057, vm, vs2, rs1, vd); }
-void vmsif_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x5001a057, vm, vs2, 0, vd); }
-void vmsle_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x74003057, vm, vs2, simm5, vd); }
-void vmsle_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x74000057, vm, vs2, vs1, vd); }
-void vmsle_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x74004057, vm, vs2, rs1, vd); }
-void vmsleu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x70003057, vm, vs2, simm5, vd); }
-void vmsleu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x70000057, vm, vs2, vs1, vd); }
-void vmsleu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x70004057, vm, vs2, rs1, vd); }
-void vmslt_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x6c000057, vm, vs2, vs1, vd); }
-void vmslt_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x6c004057, vm, vs2, rs1, vd); }
-void vmsltu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x68000057, vm, vs2, vs1, vd); }
-void vmsltu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x68004057, vm, vs2, rs1, vd); }
-void vmsne_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x64003057, vm, vs2, simm5, vd); }
-void vmsne_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x64000057, vm, vs2, vs1, vd); }
-void vmsne_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x64004057, vm, vs2, rs1, vd); }
-void vmsof_m(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x50012057, vm, vs2, 0, vd); }
-void vmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x94002057, vm, vs2, vs1, vd); }
-void vmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x94006057, vm, vs2, rs1, vd); }
-void vmulh_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x9c002057, vm, vs2, vs1, vd); }
-void vmulh_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x9c006057, vm, vs2, rs1, vd); }
-void vmulhsu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x98002057, vm, vs2, vs1, vd); }
-void vmulhsu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x98006057, vm, vs2, rs1, vd); }
-void vmulhu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x90002057, vm, vs2, vs1, vd); }
-void vmulhu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x90006057, vm, vs2, rs1, vd); }
-void vmv1r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e003057, 0, vs2, 0, vd); }
-void vmv2r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e00b057, 0, vs2, 0, vd); }
-void vmv4r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e01b057, 0, vs2, 0, vd); }
-void vmv8r_v(const VReg& vd, const VReg& vs2) { opIVI(0x9e03b057, 0, vs2, 0, vd); }
-void vmv_s_x(const VReg& vd, const Reg& rs1) { opMVX(0x42006057, 0, 0, rs1, vd); }
-void vmv_v_i(const VReg& vd, int32_t simm5) { opIVI(0x5e003057, 0, 0, simm5, vd); }
-void vmv_v_v(const VReg& vd, const VReg& vs1) { opIVV(0x5e000057, 0, 0, vs1, vd); }
-void vmv_v_x(const VReg& vd, const Reg& rs1) { opIVX(0x5e004057, 0, 0, rs1, vd); }
-void vmv_x_s(const Reg& rd, const VReg& vs2) { opMVV(0x42002057, 0, vs2, 0, rd); }
-void vmxnor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x7c002057, vm, vs2, vs1, vd); }
-void vmxor_mm(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x6c002057, vm, vs2, vs1, vd); }
-void vnclip_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xbc003057, vm, vs2, simm5, vd); }
-void vnclip_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xbc000057, vm, vs2, vs1, vd); }
-void vnclip_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xbc004057, vm, vs2, rs1, vd); }
-void vnclipu_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb8003057, vm, vs2, simm5, vd); }
-void vnclipu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb8000057, vm, vs2, vs1, vd); }
-void vnclipu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb8004057, vm, vs2, rs1, vd); }
-void vnmsac_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xbc002057, vm, vs2, vs1, vd); }
-void vnmsac_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xbc006057, vm, vs2, rs1, vd); }
-void vnmsub_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xac002057, vm, vs2, vs1, vd); }
-void vnmsub_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xac006057, vm, vs2, rs1, vd); }
-void vnsra_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb4003057, vm, vs2, simm5, vd); }
-void vnsra_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb4000057, vm, vs2, vs1, vd); }
-void vnsra_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb4004057, vm, vs2, rs1, vd); }
-void vnsrl_wi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xb0003057, vm, vs2, simm5, vd); }
-void vnsrl_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xb0000057, vm, vs2, vs1, vd); }
-void vnsrl_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xb0004057, vm, vs2, rs1, vd); }
-void vor_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x28003057, vm, vs2, simm5, vd); }
-void vor_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x28000057, vm, vs2, vs1, vd); }
-void vor_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x28004057, vm, vs2, rs1, vd); }
-void vredand_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x4002057, vm, vs2, vs1, vd); }
-void vredmax_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x1c002057, vm, vs2, vs1, vd); }
-void vredmaxu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x18002057, vm, vs2, vs1, vd); }
-void vredmin_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x14002057, vm, vs2, vs1, vd); }
-void vredminu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x10002057, vm, vs2, vs1, vd); }
-void vredor_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x8002057, vm, vs2, vs1, vd); }
-void vredsum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x2057, vm, vs2, vs1, vd); }
-void vredxor_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc002057, vm, vs2, vs1, vd); }
-void vrem_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x8c002057, vm, vs2, vs1, vd); }
-void vrem_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x8c006057, vm, vs2, rs1, vd); }
-void vremu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0x88002057, vm, vs2, vs1, vd); }
-void vremu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x88006057, vm, vs2, rs1, vd); }
-void vrgather_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x30003057, vm, vs2, simm5, vd); }
-void vrgather_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x30000057, vm, vs2, vs1, vd); }
-void vrgather_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x30004057, vm, vs2, rs1, vd); }
-void vrgatherei16_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x38000057, vm, vs2, vs1, vd); }
-void vrsub_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xc003057, vm, vs2, simm5, vd); }
-void vrsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xc004057, vm, vs2, rs1, vd); }
-void vs1r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); }
-void vs2r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); }
-void vs4r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); }
-void vs8r_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2800027, 0, 0, rs1, vs3); }
-void vsadd_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x84003057, vm, vs2, simm5, vd); }
-void vsadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x84000057, vm, vs2, vs1, vd); }
-void vsadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x84004057, vm, vs2, rs1, vd); }
-void vsaddu_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x80003057, vm, vs2, simm5, vd); }
-void vsaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x80000057, vm, vs2, vs1, vd); }
-void vsaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x80004057, vm, vs2, rs1, vd); }
-void vsbc_vvm(const VReg& vd, const VReg& vs2, const VReg& vs1) { opIVV(0x48000057, 0, vs2, vs1, vd); }
-void vsbc_vxm(const VReg& vd, const VReg& vs2, const Reg& rs1) { opIVX(0x48004057, 0, vs2, rs1, vd); }
-void vsseg1e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10007027, vm, 0, rs1, vs3); }
-void vsseg2e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30007027, vm, 0, rs1, vs3); }
-void vsseg3e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50007027, vm, 0, rs1, vs3); }
-void vsseg4e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70007027, vm, 0, rs1, vs3); }
-void vsseg5e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90007027, vm, 0, rs1, vs3); }
-void vsseg6e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0007027, vm, 0, rs1, vs3); }
-void vsseg7e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0007027, vm, 0, rs1, vs3); }
-void vsseg8e1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0007027, vm, 0, rs1, vs3); }
-void vse1024_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10007027, vm, 0, rs1, vs3); }
-void vsseg1e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10000027, vm, 0, rs1, vs3); }
-void vsseg2e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30000027, vm, 0, rs1, vs3); }
-void vsseg3e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50000027, vm, 0, rs1, vs3); }
-void vsseg4e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70000027, vm, 0, rs1, vs3); }
-void vsseg5e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90000027, vm, 0, rs1, vs3); }
-void vsseg6e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0000027, vm, 0, rs1, vs3); }
-void vsseg7e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0000027, vm, 0, rs1, vs3); }
-void vsseg8e128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0000027, vm, 0, rs1, vs3); }
-void vse128_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10000027, vm, 0, rs1, vs3); }
-void vsseg1e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x5027, vm, 0, rs1, vs3); }
-void vsseg2e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20005027, vm, 0, rs1, vs3); }
-void vsseg3e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40005027, vm, 0, rs1, vs3); }
-void vsseg4e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60005027, vm, 0, rs1, vs3); }
-void vsseg5e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80005027, vm, 0, rs1, vs3); }
-void vsseg6e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0005027, vm, 0, rs1, vs3); }
-void vsseg7e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0005027, vm, 0, rs1, vs3); }
-void vsseg8e16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0005027, vm, 0, rs1, vs3); }
-void vse16_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x5027, vm, 0, rs1, vs3); }
-void vsseg1e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10005027, vm, 0, rs1, vs3); }
-void vsseg2e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30005027, vm, 0, rs1, vs3); }
-void vsseg3e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50005027, vm, 0, rs1, vs3); }
-void vsseg4e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70005027, vm, 0, rs1, vs3); }
-void vsseg5e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90005027, vm, 0, rs1, vs3); }
-void vsseg6e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0005027, vm, 0, rs1, vs3); }
-void vsseg7e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0005027, vm, 0, rs1, vs3); }
-void vsseg8e256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0005027, vm, 0, rs1, vs3); }
-void vse256_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10005027, vm, 0, rs1, vs3); }
-void vsseg1e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x6027, vm, 0, rs1, vs3); }
-void vsseg2e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20006027, vm, 0, rs1, vs3); }
-void vsseg3e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40006027, vm, 0, rs1, vs3); }
-void vsseg4e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60006027, vm, 0, rs1, vs3); }
-void vsseg5e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80006027, vm, 0, rs1, vs3); }
-void vsseg6e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0006027, vm, 0, rs1, vs3); }
-void vsseg7e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0006027, vm, 0, rs1, vs3); }
-void vsseg8e32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0006027, vm, 0, rs1, vs3); }
-void vse32_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x6027, vm, 0, rs1, vs3); }
-void vsseg1e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10006027, vm, 0, rs1, vs3); }
-void vsseg2e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x30006027, vm, 0, rs1, vs3); }
-void vsseg3e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x50006027, vm, 0, rs1, vs3); }
-void vsseg4e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x70006027, vm, 0, rs1, vs3); }
-void vsseg5e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x90006027, vm, 0, rs1, vs3); }
-void vsseg6e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xb0006027, vm, 0, rs1, vs3); }
-void vsseg7e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xd0006027, vm, 0, rs1, vs3); }
-void vsseg8e512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xf0006027, vm, 0, rs1, vs3); }
-void vse512_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x10006027, vm, 0, rs1, vs3); }
-void vsseg1e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x7027, vm, 0, rs1, vs3); }
-void vsseg2e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20007027, vm, 0, rs1, vs3); }
-void vsseg3e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40007027, vm, 0, rs1, vs3); }
-void vsseg4e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60007027, vm, 0, rs1, vs3); }
-void vsseg5e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80007027, vm, 0, rs1, vs3); }
-void vsseg6e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0007027, vm, 0, rs1, vs3); }
-void vsseg7e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0007027, vm, 0, rs1, vs3); }
-void vsseg8e64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0007027, vm, 0, rs1, vs3); }
-void vse64_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x7027, vm, 0, rs1, vs3); }
-void vsseg1e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x27, vm, 0, rs1, vs3); }
-void vsseg2e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x20000027, vm, 0, rs1, vs3); }
-void vsseg3e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x40000027, vm, 0, rs1, vs3); }
-void vsseg4e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x60000027, vm, 0, rs1, vs3); }
-void vsseg5e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x80000027, vm, 0, rs1, vs3); }
-void vsseg6e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xa0000027, vm, 0, rs1, vs3); }
-void vsseg7e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xc0000027, vm, 0, rs1, vs3); }
-void vsseg8e8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0xe0000027, vm, 0, rs1, vs3); }
-void vse8_v(VReg vs3, const Reg& rs1, VM vm=VM::unmasked) { opVectorStore(0x27, vm, 0, rs1, vs3); }
-void vsext_vf2(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4803a057, vm, vs2, 0, vd); }
-void vsext_vf4(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4802a057, vm, vs2, 0, vd); }
-void vsext_vf8(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x4801a057, vm, vs2, 0, vd); }
-void vslide1down_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x3c006057, vm, vs2, rs1, vd); }
-void vslide1up_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0x38006057, vm, vs2, rs1, vd); }
-void vslidedown_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x3c003057, vm, vs2, simm5, vd); }
-void vslidedown_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x3c004057, vm, vs2, rs1, vd); }
-void vslideup_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x38003057, vm, vs2, simm5, vd); }
-void vslideup_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x38004057, vm, vs2, rs1, vd); }
-void vsll_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x94003057, vm, vs2, simm5, vd); }
-void vsll_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x94000057, vm, vs2, vs1, vd); }
-void vsll_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x94004057, vm, vs2, rs1, vd); }
-void vsm_v(VReg vs3, const Reg& rs1) { opVectorStore(0x2b00027, 0, 0, rs1, vs3); }
-void vsmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x9c000057, vm, vs2, vs1, vd); }
-void vsmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x9c004057, vm, vs2, rs1, vd); }
-void vsoxei1024_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c007027, vm, vs2, rs1, vs3); }
-void vsoxei128_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c000027, vm, vs2, rs1, vs3); }
-void vsoxei16_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc005027, vm, vs2, rs1, vs3); }
-void vsoxei256_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c005027, vm, vs2, rs1, vs3); }
-void vsoxei32_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc006027, vm, vs2, rs1, vs3); }
-void vsoxei512_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x1c006027, vm, vs2, rs1, vs3); }
-void vsoxei64_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc007027, vm, vs2, rs1, vs3); }
-void vsoxei8_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0xc000027, vm, vs2, rs1, vs3); }
-void vsra_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa4003057, vm, vs2, simm5, vd); }
-void vsra_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa4000057, vm, vs2, vs1, vd); }
-void vsra_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa4004057, vm, vs2, rs1, vd); }
-void vsrl_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa0003057, vm, vs2, simm5, vd); }
-void vsrl_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa0000057, vm, vs2, vs1, vd); }
-void vsrl_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa0004057, vm, vs2, rs1, vd); }
-void vssseg1e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18007027, vm, rs2, rs1, vs3); }
-void vssseg2e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38007027, vm, rs2, rs1, vs3); }
-void vssseg3e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58007027, vm, rs2, rs1, vs3); }
-void vssseg4e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78007027, vm, rs2, rs1, vs3); }
-void vssseg5e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98007027, vm, rs2, rs1, vs3); }
-void vssseg6e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8007027, vm, rs2, rs1, vs3); }
-void vssseg7e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8007027, vm, rs2, rs1, vs3); }
-void vssseg8e1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8007027, vm, rs2, rs1, vs3); }
-void vsse1024_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18007027, vm, rs2, rs1, vs3); }
-void vssseg1e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18000027, vm, rs2, rs1, vs3); }
-void vssseg2e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38000027, vm, rs2, rs1, vs3); }
-void vssseg3e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58000027, vm, rs2, rs1, vs3); }
-void vssseg4e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78000027, vm, rs2, rs1, vs3); }
-void vssseg5e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98000027, vm, rs2, rs1, vs3); }
-void vssseg6e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8000027, vm, rs2, rs1, vs3); }
-void vssseg7e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8000027, vm, rs2, rs1, vs3); }
-void vssseg8e128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8000027, vm, rs2, rs1, vs3); }
-void vsse128_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18000027, vm, rs2, rs1, vs3); }
-void vssseg1e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8005027, vm, rs2, rs1, vs3); }
-void vssseg2e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28005027, vm, rs2, rs1, vs3); }
-void vssseg3e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48005027, vm, rs2, rs1, vs3); }
-void vssseg4e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68005027, vm, rs2, rs1, vs3); }
-void vssseg5e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88005027, vm, rs2, rs1, vs3); }
-void vssseg6e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8005027, vm, rs2, rs1, vs3); }
-void vssseg7e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8005027, vm, rs2, rs1, vs3); }
-void vssseg8e16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8005027, vm, rs2, rs1, vs3); }
-void vsse16_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8005027, vm, rs2, rs1, vs3); }
-void vssseg1e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18005027, vm, rs2, rs1, vs3); }
-void vssseg2e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38005027, vm, rs2, rs1, vs3); }
-void vssseg3e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58005027, vm, rs2, rs1, vs3); }
-void vssseg4e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78005027, vm, rs2, rs1, vs3); }
-void vssseg5e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98005027, vm, rs2, rs1, vs3); }
-void vssseg6e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8005027, vm, rs2, rs1, vs3); }
-void vssseg7e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8005027, vm, rs2, rs1, vs3); }
-void vssseg8e256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8005027, vm, rs2, rs1, vs3); }
-void vsse256_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18005027, vm, rs2, rs1, vs3); }
-void vssseg1e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8006027, vm, rs2, rs1, vs3); }
-void vssseg2e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28006027, vm, rs2, rs1, vs3); }
-void vssseg3e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48006027, vm, rs2, rs1, vs3); }
-void vssseg4e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68006027, vm, rs2, rs1, vs3); }
-void vssseg5e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88006027, vm, rs2, rs1, vs3); }
-void vssseg6e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8006027, vm, rs2, rs1, vs3); }
-void vssseg7e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8006027, vm, rs2, rs1, vs3); }
-void vssseg8e32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8006027, vm, rs2, rs1, vs3); }
-void vsse32_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8006027, vm, rs2, rs1, vs3); }
-void vssseg1e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18006027, vm, rs2, rs1, vs3); }
-void vssseg2e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x38006027, vm, rs2, rs1, vs3); }
-void vssseg3e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x58006027, vm, rs2, rs1, vs3); }
-void vssseg4e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x78006027, vm, rs2, rs1, vs3); }
-void vssseg5e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x98006027, vm, rs2, rs1, vs3); }
-void vssseg6e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xb8006027, vm, rs2, rs1, vs3); }
-void vssseg7e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xd8006027, vm, rs2, rs1, vs3); }
-void vssseg8e512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xf8006027, vm, rs2, rs1, vs3); }
-void vsse512_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x18006027, vm, rs2, rs1, vs3); }
-void vssseg1e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8007027, vm, rs2, rs1, vs3); }
-void vssseg2e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28007027, vm, rs2, rs1, vs3); }
-void vssseg3e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48007027, vm, rs2, rs1, vs3); }
-void vssseg4e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68007027, vm, rs2, rs1, vs3); }
-void vssseg5e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88007027, vm, rs2, rs1, vs3); }
-void vssseg6e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8007027, vm, rs2, rs1, vs3); }
-void vssseg7e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8007027, vm, rs2, rs1, vs3); }
-void vssseg8e64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8007027, vm, rs2, rs1, vs3); }
-void vsse64_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8007027, vm, rs2, rs1, vs3); }
-void vssseg1e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8000027, vm, rs2, rs1, vs3); }
-void vssseg2e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x28000027, vm, rs2, rs1, vs3); }
-void vssseg3e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x48000027, vm, rs2, rs1, vs3); }
-void vssseg4e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x68000027, vm, rs2, rs1, vs3); }
-void vssseg5e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x88000027, vm, rs2, rs1, vs3); }
-void vssseg6e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xa8000027, vm, rs2, rs1, vs3); }
-void vssseg7e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xc8000027, vm, rs2, rs1, vs3); }
-void vssseg8e8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0xe8000027, vm, rs2, rs1, vs3); }
-void vsse8_v(VReg vs3, const Reg& rs1, const Reg& rs2, VM vm=VM::unmasked) { opVectorStore(0x8000027, vm, rs2, rs1, vs3); }
-void vssra_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xac003057, vm, vs2, simm5, vd); }
-void vssra_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xac000057, vm, vs2, vs1, vd); }
-void vssra_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xac004057, vm, vs2, rs1, vd); }
-void vssrl_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0xa8003057, vm, vs2, simm5, vd); }
-void vssrl_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xa8000057, vm, vs2, vs1, vd); }
-void vssrl_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0xa8004057, vm, vs2, rs1, vd); }
-void vssub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x8c000057, vm, vs2, vs1, vd); }
-void vssub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x8c004057, vm, vs2, rs1, vd); }
-void vssubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x88000057, vm, vs2, vs1, vd); }
-void vssubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x88004057, vm, vs2, rs1, vd); }
-void vsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x8000057, vm, vs2, vs1, vd); }
-void vsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x8004057, vm, vs2, rs1, vd); }
-void vsuxei1024_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14007027, vm, vs2, rs1, vs3); }
-void vsuxei128_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14000027, vm, vs2, rs1, vs3); }
-void vsuxei16_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4005027, vm, vs2, rs1, vs3); }
-void vsuxei256_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14005027, vm, vs2, rs1, vs3); }
-void vsuxei32_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4006027, vm, vs2, rs1, vs3); }
-void vsuxei512_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x14006027, vm, vs2, rs1, vs3); }
-void vsuxei64_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4007027, vm, vs2, rs1, vs3); }
-void vsuxei8_v(VReg vs3, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opVectorStore(0x4000027, vm, vs2, rs1, vs3); }
-void vwadd_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc4002057, vm, vs2, vs1, vd); }
-void vwadd_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc4006057, vm, vs2, rs1, vd); }
-void vwadd_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd4002057, vm, vs2, vs1, vd); }
-void vwadd_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd4006057, vm, vs2, rs1, vd); }
-void vwaddu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc0002057, vm, vs2, vs1, vd); }
-void vwaddu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc0006057, vm, vs2, rs1, vd); }
-void vwaddu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd0002057, vm, vs2, vs1, vd); }
-void vwaddu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd0006057, vm, vs2, rs1, vd); }
-void vwmacc_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xf4002057, vm, vs2, vs1, vd); }
-void vwmacc_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf4006057, vm, vs2, rs1, vd); }
-void vwmaccsu_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xfc002057, vm, vs2, vs1, vd); }
-void vwmaccsu_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xfc006057, vm, vs2, rs1, vd); }
-void vwmaccu_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0xf0002057, vm, vs2, vs1, vd); }
-void vwmaccu_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf0006057, vm, vs2, rs1, vd); }
-void vwmaccus_vx(const VReg& vd, const Reg& rs1, const VReg& vs2, VM vm=VM::unmasked) { opMVX(0xf8006057, vm, vs2, rs1, vd); }
-void vwmul_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xec002057, vm, vs2, vs1, vd); }
-void vwmul_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xec006057, vm, vs2, rs1, vd); }
-void vwmulsu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xe8002057, vm, vs2, vs1, vd); }
-void vwmulsu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xe8006057, vm, vs2, rs1, vd); }
-void vwmulu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xe0002057, vm, vs2, vs1, vd); }
-void vwmulu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xe0006057, vm, vs2, rs1, vd); }
-void vwredsum_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xc4000057, vm, vs2, vs1, vd); }
-void vwredsumu_vs(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0xc0000057, vm, vs2, vs1, vd); }
-void vwsub_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xcc002057, vm, vs2, vs1, vd); }
-void vwsub_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xcc006057, vm, vs2, rs1, vd); }
-void vwsub_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xdc002057, vm, vs2, vs1, vd); }
-void vwsub_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xdc006057, vm, vs2, rs1, vd); }
-void vwsubu_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xc8002057, vm, vs2, vs1, vd); }
-void vwsubu_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xc8006057, vm, vs2, rs1, vd); }
-void vwsubu_wv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opMVV(0xd8002057, vm, vs2, vs1, vd); }
-void vwsubu_wx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opMVX(0xd8006057, vm, vs2, rs1, vd); }
-void vxor_vi(const VReg& vd, const VReg& vs2, int32_t simm5, VM vm=VM::unmasked) { opIVI(0x2c003057, vm, vs2, simm5, vd); }
-void vxor_vv(const VReg& vd, const VReg& vs2, const VReg& vs1, VM vm=VM::unmasked) { opIVV(0x2c000057, vm, vs2, vs1, vd); }
-void vxor_vx(const VReg& vd, const VReg& vs2, const Reg& rs1, VM vm=VM::unmasked) { opIVX(0x2c004057, vm, vs2, rs1, vd); }
-void vzext_vf2(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48032057, vm, vs2, 0, vd); }
-void vzext_vf4(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48022057, vm, vs2, 0, vd); }
-void vzext_vf8(const VReg& vd, const VReg& vs2, VM vm=VM::unmasked) { opMVV(0x48012057, vm, vs2, 0, vd); }
-
-void vsetivli(const Reg& rd, uint32_t uimm, SEW sew, LMUL lmul=LMUL::m1, VTA vta=VTA::tu, VMA vma=VMA::mu) {
-    uint32_t zimm = (static_cast<uint32_t>(vma)<<7) |
-                    (static_cast<uint32_t>(vta)<<6) |
-                    (static_cast<uint32_t>(sew)<<3) |
-                    (static_cast<uint32_t>(lmul));
-    uint32_t v = (0x3<<30) | (zimm<<20) | (uimm<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57);
-    append4B(v);
-}
-
-void vsetvli(const Reg& rd, const Reg& rs1, SEW sew, LMUL lmul=LMUL::m1, VTA vta=VTA::tu, VMA vma=VMA::mu) {
-    uint32_t zimm = (static_cast<uint32_t>(vma)<<7) |
-                    (static_cast<uint32_t>(vta)<<6) |
-                    (static_cast<uint32_t>(sew)<<3) |
-                    (static_cast<uint32_t>(lmul));
-    uint32_t v = (0x0<<31) | (zimm<<20) | (rs1.getIdx()<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57);
-    append4B(v);
-}
-
-void vsetvl(const Reg& rd, const Reg& rs1, const Reg& rs2) {
-    uint32_t v = (0x40<<25) | (rs2.getIdx()<<20) | (rs1.getIdx()<<15) | (0x7<<12) | (rd.getIdx()<<7) | (0x57);
-    append4B(v);
-}
-
-
-// Copy mask register
-void vmmv_m(const VReg& vd, const VReg& vs) { vmand_mm(vd, vs, vs); }
-// Clear mask register
-void vmclr_m(const VReg& vd) { vmxor_mm(vd, vd, vd); }
-// Set mask register
-void vmset_m(const VReg& vd) { vmxnor_mm(vd, vd, vd); }
-// Invert bits
-void vmnot_m(const VReg& vd, const VReg& vs) { vmnand_mm(vd, vs, vs); }
-
-
-// vector compare pseudoinstructions
-void vmfgt_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { vmflt_vv(vd, vs2, vs1, vm); }
-void vmfge_vv(const VReg& vd, const VReg& vs1, const VReg& vs2, VM vm=VM::unmasked) { vmfle_vv(vd, vs2, vs1, vm); }
-
-// sign-related pseudoinstructions
-void vfabs_v(const VReg& vd, const VReg& vs, VM vm=VM::unmasked) { vfsgnjx_vv(vd, vs, vs, vm); }
-void vfneg_v(const VReg& vd, const VReg& vs, VM vm=VM::unmasked) { vfsgnjn_vv(vd, vs, vs, vm); }

From 9b4e111e3b0f26e84fa2d4b0a3adb5cad60fcbbe Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Tue, 28 Apr 2026 21:29:22 +0000
Subject: [PATCH 12/13] [CPU][RV64] Restore ref impls, clean up comments, and
 edit impl selection

---
 src/cpu/cpu_convolution_list.cpp            |   6 +-
 src/cpu/rv64/rvv_gemm_convolution.cpp       |  16 +-
 src/cpu/rv64/rvv_gemm_convolution.hpp       |  10 +-
 src/cpu/rv64/rvv_gemm_convolution_utils.cpp | 462 ++++++++++----------
 4 files changed, 251 insertions(+), 243 deletions(-)

diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp
index c8f41b8e947..6a0756b4e95 100644
--- a/src/cpu/cpu_convolution_list.cpp
+++ b/src/cpu/cpu_convolution_list.cpp
@@ -182,9 +182,9 @@ const std::map<pk_dt_impl_key_t, std::vector<impl_list_item_t>> &impl_list_map()
             CPU_INSTANCE_RV64GCV(riscv_gemm_convolution_fwd_t)
             
 
-            // CPU_INSTANCE(gemm_convolution_fwd_t)
-            // CPU_INSTANCE(ref_convolution_fwd_t)
-            // CPU_INSTANCE(ref_fused_convolution_fwd_t)
+            CPU_INSTANCE(gemm_convolution_fwd_t)
+            CPU_INSTANCE(ref_convolution_fwd_t)
+            CPU_INSTANCE(ref_fused_convolution_fwd_t)
             nullptr,
         }},
         {{forward, f32, f16, f32}, {
diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp
index dfb271575d0..f4ea4196da9 100644
--- a/src/cpu/rv64/rvv_gemm_convolution.cpp
+++ b/src/cpu/rv64/rvv_gemm_convolution.cpp
@@ -48,7 +48,7 @@ static void apply_bias_eltwise_rvv_nspc(
         bool with_eltwise,
         const ref_post_ops_t *post_ops,
         const exec_ctx_t &ctx,
-        const memory_desc_t *dst_md, // Changed to pointer to memory_desc_t
+        const memory_desc_t *dst_md,
         const conv_gemm_conf_t &jcp,
         size_t g, size_t os_offset_factor) {
     
@@ -350,7 +350,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
             jcp.os_block == jcp.os && jcp.ic_block == jcp.ic
                      && jcp.os_nb_block == 1));
 
-    status_t st = status::success;
+    std::atomic<status_t> st(status::success);
     parallel(jcp.nthr, [&](const int ithr, const int nthr) {
         data_t *_col = col + (ptrdiff_t)ithr * jcp.im2col_sz;
 
@@ -547,7 +547,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
                         status_t st_thr
                                 = inner_ker(spatial, curr, prev, step, end);
                         if (st_thr != status::success) {
-                            st = st_thr;
+                            status_t expected = status::success;
+                            st.compare_exchange_strong(expected, st_thr);
                             return;
                         }
                     }
@@ -562,13 +563,16 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
                         status_t st_thr
                                 = inner_ker(spatial, curr, prev, step, end);
                         if (st_thr != status::success) {
-                            st = st_thr;
+                            status_t expected = status::success;
+                            st.compare_exchange_strong(expected, st_thr);
                             return;
                         }
                     }
             }
-        else
-            st = status::unimplemented;
+        else {
+            status_t expected = status::success;
+            st.compare_exchange_strong(expected, status::unimplemented);
+        }
     });
 
     return st;
diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp
index 19f4289920c..1545afb6912 100644
--- a/src/cpu/rv64/rvv_gemm_convolution.hpp
+++ b/src/cpu/rv64/rvv_gemm_convolution.hpp
@@ -38,7 +38,7 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
     struct pd_t : public cpu_convolution_fwd_pd_t {
         using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t;
 
-        DECLARE_COMMON_PD_T(GEMM_IMPL_STR, riscv_gemm_convolution_fwd_t,
+        DECLARE_COMMON_PD_T("gemm:any", riscv_gemm_convolution_fwd_t,
                 USE_GLOBAL_SCRATCHPAD);
 
         status_t init(engine_t *engine) {
@@ -68,8 +68,7 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
 
             // TODO: make `init_conf` assign initialized object to `jcp_`
             jcp_ = conv_gemm_conf_t();
-
-            return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad,
+            return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad,
                     *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_,
                     dnnl_get_max_threads());
         }
@@ -114,7 +113,6 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
         : primitive_t(apd), post_ops_(nullptr) {}
 
     status_t init(engine_t *engine) override {
-        std::cout << "GEMM INIT" << std::endl;
         const auto &jcp = pd()->jcp_;
 
         if (jcp.with_eltwise || jcp.with_binary) {
@@ -122,16 +120,12 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
             CHECK(post_ops_->init(pd()->dst_md()));
         }
 
-        std::cout << "GEMM SUCCESS" << std::endl;
         return status::success;
     }
 
     using data_t = typename prec_traits_t<data_type::f32>::type;
 
     status_t execute(const exec_ctx_t &ctx) const override {
-        fprintf(stderr, "[RVV EXECUTE] Layer executed!\n");
-        fflush(stderr);
-
         bool is_nspc = pd()->jcp_.is_nspc;
         return is_nspc ? execute_forward_nspc(ctx) : execute_forward_ncsp(ctx);
     }
diff --git a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp
index 2ce81d0a738..bcfb62b2990 100644
--- a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp
+++ b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp
@@ -13,7 +13,6 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-
 #include "cpu/rv64/rvv_gemm_convolution_utils.hpp"
 #include "common/bfloat16.hpp"
 #include "common/c_types_map.hpp"
@@ -21,14 +20,16 @@
 #include "common/type_helpers.hpp"
 #include "common/utils.hpp"
 #include "cpu/scale_utils.hpp"
-
 #include "cpu/platform.hpp"
 
+#ifdef DNNL_RISCV_USE_RVV_INTRINSICS
+#include <riscv_vector.h>
+#endif
+
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace rv64 {
-
 using namespace dnnl::impl::status;
 using namespace dnnl::impl::utils;
 using namespace prop_kind;
@@ -48,13 +49,12 @@ namespace jit_gemm_convolution_utils {
 template <typename data_type_t>
 void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im,
         data_type_t *col, dim_t od, int spatial_step, int spatial_block) {
-    using data_t =
-            typename conditional<data_traits_t<data_type_t>::data_type == bf16,
-                    uint16_t, data_type_t>::type;
+    using data_t = typename conditional<data_traits_t<data_type_t>::data_type
+                        == bf16,
+                uint16_t, data_type_t>::type;
     const data_t *__restrict _im
             = reinterpret_cast<const data_t *__restrict>(im);
     data_t *__restrict _col = reinterpret_cast<data_t *__restrict>(col);
-
     const size_t OHW = spatial_block;
     const size_t im_step = jcp.ih * jcp.iw * jcp.id;
     const size_t col_step = jcp.ks * OHW;
@@ -97,7 +97,8 @@ void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im,
                     col_ += jcp.kw * OHW;
                 }
             } else {
-                const data_t *__restrict im_ = im_loc + id * jcp.ih * jcp.iw;
+                const data_t *__restrict im_
+                        = im_loc + id * jcp.ih * jcp.iw;
                 dim_t ih_ = -jcp.t_pad;
                 for (dim_t kh = 0; kh < jcp.kh; ++kh) {
                     dim_t ih = ih_;
@@ -211,17 +212,18 @@ void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im,
 
     // zero padding is handled outside im2col
     const bool outer_padding = jcp.os_nb_block == 1;
-    if (outer_padding)
+    if (outer_padding) {
         parallel_nd(jcp.ic, compute_im2col_outer_padding);
-    else
+    } else {
         parallel_nd(jcp.ic, compute_im2col_padding);
+    }
 }
 
-template void im2col_3d(const conv_gemm_conf_t &jcp, const float *im,
+template void im2col_3d<float>(const conv_gemm_conf_t &jcp, const float *im,
         float *col, dim_t od, int spatial_step, int spatial_block);
-
-template void im2col_3d(const conv_gemm_conf_t &jcp, const bfloat16_t *im,
-        bfloat16_t *col, dim_t od, int spatial_step, int spatial_block);
+template void im2col_3d<bfloat16_t>(const conv_gemm_conf_t &jcp,
+        const bfloat16_t *im, bfloat16_t *col, dim_t od, int spatial_step,
+        int spatial_block);
 
 /* imtr[ic][od][oh][ow] <-- im[id][ih][iw][ic]*/
 template <typename T>
@@ -231,7 +233,8 @@ void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im,
     const dim_t ic_stride = jcp.id * jcp.ih * jcp.iw;
     const dim_t IC = jcp.ngroups * jcp.ic;
     const dim_t IHW = jcp.ih * jcp.iw;
-    constexpr dim_t ic_block = platform::get_cache_line_size();
+    const dim_t ic_block = nstl::max<dim_t>(
+            1, platform::get_cache_line_size() / sizeof(T));
     const dim_t nb_ic = jcp.ic / ic_block;
     const dim_t ic_blocked = nb_ic * ic_block;
     parallel_nd(jcp.id, jcp.ih, [&](dim_t id, dim_t ih) {
@@ -255,15 +258,15 @@ void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im,
     });
 }
 
-template void transpose_dt(const conv_gemm_conf_t &jcp,
+template void transpose_dt<int8_t>(const conv_gemm_conf_t &jcp,
         const int8_t *__restrict im, int8_t *__restrict imtr);
-template void transpose_dt(const conv_gemm_conf_t &jcp,
+template void transpose_dt<uint8_t>(const conv_gemm_conf_t &jcp,
         const uint8_t *__restrict im, uint8_t *__restrict imtr);
-template void transpose_dt(const conv_gemm_conf_t &jcp,
+template void transpose_dt<char>(const conv_gemm_conf_t &jcp,
         const char *__restrict im, char *__restrict imtr);
-template void transpose_dt(const conv_gemm_conf_t &jcp,
+template void transpose_dt<float>(const conv_gemm_conf_t &jcp,
         const float *__restrict im, float *__restrict imtr);
-template void transpose_dt(const conv_gemm_conf_t &jcp,
+template void transpose_dt<bfloat16_t>(const conv_gemm_conf_t &jcp,
         const bfloat16_t *__restrict im, bfloat16_t *__restrict imtr);
 
 /* col[kd][kh][kw][g][ic][od][oh][ow] <-- im2col_dt_3d(im[id][ih][iw][g][ic]) */
@@ -282,7 +285,6 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
     const im_dt *__restrict imtr
             = reinterpret_cast<const im_dt *__restrict>(_imtr);
     col_dt *__restrict col = reinterpret_cast<col_dt *__restrict>(_col);
-
     col_dt shift = static_cast<col_dt>(jcp.signed_input ? 128 : 0);
     const dim_t dd = 1 + jcp.dilate_d;
     const dim_t dh = 1 + jcp.dilate_h;
@@ -303,89 +305,100 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
     if (sd == 1 && sh == 1 && sw == 1 && dd == 1 && dh == 1 && dw == 1)
         parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
                 [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
-            col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s
-                    + kw * col_kw_s + ic * col_ic_s;
-            const dim_t id = od - fp + kd;
-            if (id < 0 || id >= jcp.id) {
-                for (ptrdiff_t i = 0; i < OHW; i++)
-                    col_loc[i] = shift;
-                return;
-            }
-            const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW;
-            const dim_t oh_start = saturate(dim_t(0), jcp.oh, tp - kh);
-            const dim_t oh_end = saturate(dim_t(0), jcp.oh, jcp.ih + tp - kh);
-            const dim_t ow_start = saturate(dim_t(0), jcp.ow, lp - kw);
-            const dim_t ow_end = saturate(dim_t(0), jcp.ow, jcp.iw + lp - kw);
-            for (dim_t oh = oh_start, ih = oh_start - tp + kh; oh < oh_end;
-                    oh++, ih++) {
-                col_dt *__restrict col_h = col_loc + oh * jcp.ow;
-                const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
-                for (dim_t ow = ow_start, iw = ow_start - lp + kw; ow < ow_end;
-                        ow++, iw++) {
-                    col_h[ow] = imtr_h[iw];
-                }
-            }
-        });
+                    col_dt *__restrict col_loc = col + kd * col_kd_s
+                            + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s;
+                    const dim_t id = od - fp + kd;
+                    if (id < 0 || id >= jcp.id) {
+                        for (ptrdiff_t i = 0; i < OHW; i++)
+                            col_loc[i] = shift;
+                        return;
+                    }
+                    const im_dt *__restrict imtr_loc
+                            = imtr + (ic * jcp.id + id) * IHW;
+                    const dim_t oh_start
+                            = saturate(dim_t(0), jcp.oh, tp - kh);
+                    const dim_t oh_end
+                            = saturate(dim_t(0), jcp.oh, jcp.ih + tp - kh);
+                    const dim_t ow_start
+                            = saturate(dim_t(0), jcp.ow, lp - kw);
+                    const dim_t ow_end
+                            = saturate(dim_t(0), jcp.ow, jcp.iw + lp - kw);
+                    for (dim_t oh = oh_start, ih = oh_start - tp + kh;
+                            oh < oh_end; oh++, ih++) {
+                        col_dt *__restrict col_h = col_loc + oh * jcp.ow;
+                        const im_dt *__restrict imtr_h
+                                = imtr_loc + ih * jcp.iw;
+                        for (dim_t ow = ow_start, iw = ow_start - lp + kw;
+                                ow < ow_end; ow++, iw++) {
+                            col_h[ow] = imtr_h[iw];
+                        }
+                    }
+                });
     else if (sd == 2 && sh == 2 && sw == 2 && dd == 1 && dh == 1 && dw == 1)
         parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
                 [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
-            col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s
-                    + kw * col_kw_s + ic * col_ic_s;
-            const dim_t id = od * 2 - fp + kd;
-            if (id < 0 || id >= jcp.id) {
-                for (ptrdiff_t i = 0; i < OHW; i++)
-                    col_loc[i] = shift;
-                return;
-            }
-            const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW;
-            const dim_t oh_start
-                    = saturate(dim_t(0), jcp.oh, div_up(tp - kh, 2));
-            const dim_t oh_end
-                    = saturate(dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh, 2));
-            const dim_t ow_start
-                    = saturate(dim_t(0), jcp.ow, div_up(lp - kw, 2));
-            const dim_t ow_end
-                    = saturate(dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw, 2));
-            for (dim_t oh = oh_start, ih = oh_start * 2 - tp + kh; oh < oh_end;
-                    ++oh, ih += 2) {
-                col_dt *__restrict col_h = col_loc + oh * jcp.ow;
-                const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
-                for (dim_t ow = ow_start, iw = ow_start * 2 - lp + kw;
-                        ow < ow_end; ++ow, iw += 2) {
-                    col_h[ow] = imtr_h[iw];
-                }
-            }
-        });
+                    col_dt *__restrict col_loc = col + kd * col_kd_s
+                            + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s;
+                    const dim_t id = od * 2 - fp + kd;
+                    if (id < 0 || id >= jcp.id) {
+                        for (ptrdiff_t i = 0; i < OHW; i++)
+                            col_loc[i] = shift;
+                        return;
+                    }
+                    const im_dt *__restrict imtr_loc
+                            = imtr + (ic * jcp.id + id) * IHW;
+                    const dim_t oh_start
+                            = saturate(dim_t(0), jcp.oh, div_up(tp - kh, 2));
+                    const dim_t oh_end = saturate(
+                            dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh, 2));
+                    const dim_t ow_start
+                            = saturate(dim_t(0), jcp.ow, div_up(lp - kw, 2));
+                    const dim_t ow_end = saturate(
+                            dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw, 2));
+                    for (dim_t oh = oh_start, ih = oh_start * 2 - tp + kh;
+                            oh < oh_end; ++oh, ih += 2) {
+                        col_dt *__restrict col_h = col_loc + oh * jcp.ow;
+                        const im_dt *__restrict imtr_h
+                                = imtr_loc + ih * jcp.iw;
+                        for (dim_t ow = ow_start, iw = ow_start * 2 - lp + kw;
+                                ow < ow_end; ++ow, iw += 2) {
+                            col_h[ow] = imtr_h[iw];
+                        }
+                    }
+                });
     else
         parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
                 [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
-            col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s
-                    + kw * col_kw_s + ic * col_ic_s;
-            const dim_t id = od * sd - fp + kd * dd;
-            if (id < 0 || id >= jcp.id) {
-                for (ptrdiff_t i = 0; i < OHW; i++)
-                    col_loc[i] = shift;
-                return;
-            }
-            const im_dt *__restrict imtr_loc = imtr + (ic * jcp.id + id) * IHW;
-            const dim_t oh_start
-                    = saturate(dim_t(0), jcp.oh, div_up(tp - kh * dh, sh));
-            const dim_t oh_end = saturate(
-                    dim_t(0), jcp.oh, div_up(jcp.ih + tp - kh * dh, sh));
-            const dim_t ow_start
-                    = saturate(dim_t(0), jcp.ow, div_up(lp - kw * dw, sw));
-            const dim_t ow_end = saturate(
-                    dim_t(0), jcp.ow, div_up(jcp.iw + lp - kw * dw, sw));
-            for (dim_t oh = oh_start, ih = oh_start * sh - tp + kh * dh;
-                    oh < oh_end; ++oh, ih += sh) {
-                col_dt *__restrict col_h = col_loc + oh * jcp.ow;
-                const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
-                for (dim_t ow = ow_start, iw = ow_start * sw - lp + kw * dw;
-                        ow < ow_end; ++ow, iw += sw) {
-                    col_h[ow] = imtr_h[iw];
-                }
-            }
-        });
+                    col_dt *__restrict col_loc = col + kd * col_kd_s
+                            + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s;
+                    const dim_t id = od * sd - fp + kd * dd;
+                    if (id < 0 || id >= jcp.id) {
+                        for (ptrdiff_t i = 0; i < OHW; i++)
+                            col_loc[i] = shift;
+                        return;
+                    }
+                    const im_dt *__restrict imtr_loc
+                            = imtr + (ic * jcp.id + id) * IHW;
+                    const dim_t oh_start = saturate(
+                            dim_t(0), jcp.oh, div_up(tp - kh * dh, sh));
+                    const dim_t oh_end = saturate(dim_t(0), jcp.oh,
+                            div_up(jcp.ih + tp - kh * dh, sh));
+                    const dim_t ow_start = saturate(
+                            dim_t(0), jcp.ow, div_up(lp - kw * dw, sw));
+                    const dim_t ow_end = saturate(dim_t(0), jcp.ow,
+                            div_up(jcp.iw + lp - kw * dw, sw));
+                    for (dim_t oh = oh_start, ih = oh_start * sh - tp + kh * dh;
+                            oh < oh_end; ++oh, ih += sh) {
+                        col_dt *__restrict col_h = col_loc + oh * jcp.ow;
+                        const im_dt *__restrict imtr_h
+                                = imtr_loc + ih * jcp.iw;
+                        for (dim_t ow = ow_start,
+                                  iw = ow_start * sw - lp + kw * dw;
+                                ow < ow_end; ++ow, iw += sw) {
+                            col_h[ow] = imtr_h[iw];
+                        }
+                    }
+                });
 }
 
 template void im2col_dt_3d<int8_t, uint8_t>(const conv_gemm_conf_t &jcp,
@@ -500,58 +513,62 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im,
         if (sw == 1)
             parallel_nd(cb, jcp.kh, jcp.kw, oh_range,
                     [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) {
-                const dim_t oh = ohr + oh_begin;
-                const dim_t ih = oh * sh - tp + kh * dh;
-                const dim_t ow_start = (oh == first_oh) ? first_ow : 0;
-                const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
-                data_t *__restrict col_oh = _col + ic * col_step
-                        + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss;
-                const data_t *__restrict im_
-                        = _im + (ic + cs) * im_step + ih * jcp.iw;
-                const dim_t iw_shift = kw * dw - lp;
-                if (ih < 0 || ih >= jcp.ih)
-                    for (dim_t ow = ow_start; ow < ow_end; ow++)
-                        col_oh[ow] = zero_val;
-                else
-                    for (dim_t ow = ow_start; ow < ow_end; ow++) {
-                        const dim_t iw = ow + iw_shift;
-                        if (iw < 0 || iw >= jcp.iw)
-                            col_oh[ow] = zero_val;
+                        const dim_t oh = ohr + oh_begin;
+                        const dim_t ih = oh * sh - tp + kh * dh;
+                        const dim_t ow_start
+                                = (oh == first_oh) ? first_ow : 0;
+                        const dim_t ow_end
+                                = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                        data_t *__restrict col_oh = _col + ic * col_step
+                                + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss;
+                        const data_t *__restrict im_ = _im + (ic + cs) * im_step
+                                + ih * jcp.iw;
+                        const dim_t iw_shift = kw * dw - lp;
+                        if (ih < 0 || ih >= jcp.ih)
+                            for (dim_t ow = ow_start; ow < ow_end; ow++)
+                                col_oh[ow] = zero_val;
                         else
-                            col_oh[ow] = im_[iw];
-                    }
-            });
+                            for (dim_t ow = ow_start; ow < ow_end; ow++) {
+                                const dim_t iw = ow + iw_shift;
+                                if (iw < 0 || iw >= jcp.iw)
+                                    col_oh[ow] = zero_val;
+                                else
+                                    col_oh[ow] = im_[iw];
+                            }
+                    });
         else
             parallel_nd(cb, jcp.kh, jcp.kw, oh_range,
                     [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) {
-                const dim_t oh = ohr + oh_begin;
-                const dim_t ih = oh * sh - tp + kh * dh;
-                const dim_t ow_start = (oh == first_oh) ? first_ow : 0;
-                const dim_t ow_end = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
-                data_t *__restrict col_oh = _col + ic * col_step
-                        + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss;
-                const data_t *__restrict im_ = _im + (ic + cs) * im_step;
-                if (ih < 0 || ih >= jcp.ih)
-                    for (dim_t ow = ow_start; ow < ow_end; ow++)
-                        col_oh[ow] = zero_val;
-                else
-                    for (dim_t ow = ow_start; ow < ow_end; ow++) {
-                        const dim_t iw = ow * sw - lp + kw * dw;
-                        if (iw < 0 || iw >= jcp.iw)
-                            col_oh[ow] = zero_val;
-                        else {
-                            const ptrdiff_t im_idx = ih * jcp.iw + iw;
-                            col_oh[ow] = im_[im_idx];
-                        }
-                    }
-            });
+                        const dim_t oh = ohr + oh_begin;
+                        const dim_t ih = oh * sh - tp + kh * dh;
+                        const dim_t ow_start
+                                = (oh == first_oh) ? first_ow : 0;
+                        const dim_t ow_end
+                                = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
+                        data_t *__restrict col_oh = _col + ic * col_step
+                                + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss;
+                        const data_t *__restrict im_ = _im + (ic + cs) * im_step;
+                        if (ih < 0 || ih >= jcp.ih)
+                            for (dim_t ow = ow_start; ow < ow_end; ow++)
+                                col_oh[ow] = zero_val;
+                        else
+                            for (dim_t ow = ow_start; ow < ow_end; ow++) {
+                                const dim_t iw = ow * sw - lp + kw * dw;
+                                if (iw < 0 || iw >= jcp.iw)
+                                    col_oh[ow] = zero_val;
+                                else {
+                                    const ptrdiff_t im_idx = ih * jcp.iw + iw;
+                                    col_oh[ow] = im_[im_idx];
+                                }
+                            }
+                    });
     }
 }
 
-template void im2col(const conv_gemm_conf_t &jcp, const float *__restrict im,
-        float *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb);
-
-template void im2col(const conv_gemm_conf_t &jcp,
+template void im2col<float>(const conv_gemm_conf_t &jcp,
+        const float *__restrict im, float *__restrict col, dim_t hs,
+        dim_t hb, dim_t ws, dim_t wb);
+template void im2col<bfloat16_t>(const conv_gemm_conf_t &jcp,
         const bfloat16_t *__restrict im, bfloat16_t *__restrict col, dim_t hs,
         dim_t hb, dim_t ws, dim_t wb);
 
@@ -560,19 +577,16 @@ template <typename orig_im_dt, typename orig_col_dt>
 void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im,
         void *__restrict _imtr, orig_col_dt *__restrict _col, dim_t hs,
         dim_t hb, dim_t ws, dim_t wb) {
-    // For performance reasons, use uint16_t as a proxy for bfloat16_t
-    using im_dt =
-            typename utils::conditional<data_traits_t<orig_im_dt>::data_type
-                            == bf16,
-                    uint16_t, orig_im_dt>::type;
-    using col_dt =
-            typename utils::conditional<data_traits_t<orig_col_dt>::data_type
-                            == bf16,
-                    uint16_t, orig_col_dt>::type;
-    const im_dt *__restrict im = reinterpret_cast<const im_dt *__restrict>(_im);
+    using im_dt = typename utils::conditional<
+            data_traits_t<orig_im_dt>::data_type == bf16, uint16_t,
+            orig_im_dt>::type;
+    using col_dt = typename utils::conditional<
+            data_traits_t<orig_col_dt>::data_type == bf16, uint16_t,
+            orig_col_dt>::type;
+    const im_dt *__restrict im
+            = reinterpret_cast<const im_dt *__restrict>(_im);
     im_dt *__restrict imtr = reinterpret_cast<im_dt *__restrict>(_imtr);
     col_dt *__restrict col = reinterpret_cast<col_dt *__restrict>(_col);
-
     col_dt shift = static_cast<col_dt>(jcp.signed_input ? 128 : 0);
     const dim_t dh = 1 + jcp.dilate_h;
     const dim_t dw = 1 + jcp.dilate_w;
@@ -655,32 +669,34 @@ void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im,
     } else {
         parallel_nd(jcp.kh, jcp.kw, jcp.ic, hb,
                 [&](dim_t kh, dim_t kw, dim_t ic, dim_t oh) {
-            const dim_t hp = tp - kh * dh;
-            const dim_t ih = (oh + hs) * sh - hp;
-            const ptrdiff_t col_idx_base
-                    = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh) * wb;
-            if (ih < 0 || ih >= jcp.ih)
-                for (dim_t ow = 0; ow < wb; ow++)
-                    col[col_idx_base + ow] = shift;
-            else {
-                const dim_t wp = lp - kw * dw;
-                const dim_t ow_start
-                        = saturate(dim_t(0), wb, div_up(wp, sw) - ws);
-                const dim_t ow_end
-                        = saturate(dim_t(0), wb, div_up(jcp.iw + wp, sw) - ws);
-                for (dim_t ow = 0; ow < ow_start; ow++)
-                    col[col_idx_base + ow] = shift;
-                const dim_t iw_base = ws * sw - wp;
-                const ptrdiff_t im_idx_base = ih * im_ih_stride + ic;
-                for (dim_t ow = ow_start; ow < ow_end; ow++) {
-                    const dim_t iw = iw_base + ow * sw;
-                    const ptrdiff_t im_idx = im_idx_base + iw * im_iw_stride;
-                    col[col_idx_base + ow] = im[im_idx] + shift;
-                }
-                for (dim_t ow = ow_end; ow < wb; ow++)
-                    col[col_idx_base + ow] = shift;
-            }
-        });
+                    const dim_t hp = tp - kh * dh;
+                    const dim_t ih = (oh + hs) * sh - hp;
+                    const ptrdiff_t col_idx_base
+                            = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh)
+                                    * wb;
+                    if (ih < 0 || ih >= jcp.ih)
+                        for (dim_t ow = 0; ow < wb; ow++)
+                            col[col_idx_base + ow] = shift;
+                    else {
+                        const dim_t wp = lp - kw * dw;
+                        const dim_t ow_start = saturate(
+                                dim_t(0), wb, div_up(wp, sw) - ws);
+                        const dim_t ow_end = saturate(dim_t(0), wb,
+                                div_up(jcp.iw + wp, sw) - ws);
+                        for (dim_t ow = 0; ow < ow_start; ow++)
+                            col[col_idx_base + ow] = shift;
+                        const dim_t iw_base = ws * sw - wp;
+                        const ptrdiff_t im_idx_base = ih * im_ih_stride + ic;
+                        for (dim_t ow = ow_start; ow < ow_end; ow++) {
+                            const dim_t iw = iw_base + ow * sw;
+                            const ptrdiff_t im_idx
+                                    = im_idx_base + iw * im_iw_stride;
+                            col[col_idx_base + ow] = im[im_idx] + shift;
+                        }
+                        for (dim_t ow = ow_end; ow < wb; ow++)
+                            col[col_idx_base + ow] = shift;
+                    }
+                });
     }
 }
 
@@ -693,7 +709,6 @@ template void im2col_dt<uint8_t, uint8_t>(const conv_gemm_conf_t &jcp,
 template void im2col_dt<float, float>(const conv_gemm_conf_t &jcp,
         const void *__restrict im, void *__restrict imtr, float *__restrict col,
         dim_t hs, dim_t hb, dim_t ws, dim_t wb);
-
 template void im2col_dt<bfloat16_t, bfloat16_t>(const conv_gemm_conf_t &jcp,
         const void *__restrict im, void *__restrict imtr,
         bfloat16_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb);
@@ -1116,10 +1131,12 @@ status_t init_conf(conv_gemm_conf_t &jcp,
     const bool is_bwd_w = jcp.prop_kind == backward_weights;
     const bool is_fwd = !is_bwd_d && !is_bwd_w;
 
-    const auto dst_max_size
-            = static_cast<size_t>(jcp.iw) * jcp.ih * jcp.id * jcp.ic * 4;
-    const auto src_max_size
-            = static_cast<size_t>(jcp.ow) * jcp.oh * jcp.od * jcp.oc * 4;
+    const auto dst_max_size = static_cast<size_t>(jcp.iw)
+            * static_cast<size_t>(jcp.ih) * static_cast<size_t>(jcp.id)
+            * static_cast<size_t>(jcp.ic) * 4;
+    const auto src_max_size = static_cast<size_t>(jcp.ow)
+            * static_cast<size_t>(jcp.oh) * static_cast<size_t>(jcp.od)
+            * static_cast<size_t>(jcp.oc) * 4;
     VDISPATCH_CONV_IC(dst_max_size <= INT_MAX && src_max_size <= INT_MAX,
             VERBOSE_UNSUPPORTED_FEATURE,
             "dst/scr size > INT_MAX is not supported");
@@ -1195,7 +1212,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
     // to the number of threads and multiplied by a heuristic coefficient (15)
     const size_t zp_src_pad_comp_size = zp_src_with_padding
             ? (jcp.oc * jcp.ngroups * jcp.zp.src_pad_comp.d
-                      * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w)
+                       * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w)
             : 0u;
     const size_t zp_src_comp_size = jcp.zp.src_is_common
             ? utils::rnd_up(jcp.oc * jcp.ngroups,
@@ -1247,6 +1264,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                 // memory for transposition
                 row_size += ic * iw;
 
+                if (row_size == 0) row_size = 1;
                 h_block = nstl::max(
                         dim_t(1), nstl::min(oh, div_up(dim_t(L2), row_size)));
                 if (h_block == 1) {
@@ -1489,8 +1507,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             //  64K - this is heuristic gemm size per thread threshold.
             constexpr size_t gemm_thrld = 64 * 1024;
             if (!jcp.outer_threading && !is_3d) {
-                bool is_depthwise
-                        = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
+                bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
                 const size_t outer_work = jcp.ngroups * jcp.mb;
                 const float outer_thr_eff
                         = (float)outer_work / rnd_up(outer_work, max_threads);
@@ -1498,9 +1515,8 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                         = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w);
                 const float inner_thr_eff
                         = (float)inner_work / rnd_up(inner_work, max_threads);
-                jcp.outer_threading
-                        = (is_depthwise
-                                  || (jcp.is / max_threads < 64 && jcp.mb != 1))
+                jcp.outer_threading = (is_depthwise
+                                              || (jcp.is / max_threads < 64 && jcp.mb != 1))
                         && (outer_thr_eff / inner_thr_eff >= 1.f
                                 || (static_cast<size_t>(jcp.os) * jcp.ic
                                            * jcp.oc)
@@ -1524,7 +1540,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                     gemm_col_datatype_size);
             if (is_bf16_conv && jcp.with_bias
                     && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                            cd.bias_desc.data_type)) {
+                           cd.bias_desc.data_type)) {
                 scratchpad.book<float>(
                         key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc);
             }
@@ -1540,17 +1556,17 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             // gemm implementation which we cannot control
             bool is_blocking_applicable = true && !is_3d
                     && (!jcp.im2col_sz
-                            // spatial is small
-                            || spatial >= max_threads * simd_w
-                            // inner threading work is greater then outer
-                            // threading work
-                            || jcp.os < jcp.mb * jcp.ngroups * jcp.od
-                            // im2col is big
-                            || (sw == 1 && K <= 0.05 * jcp.oc))
+                               // spatial is small
+                               || spatial >= max_threads * simd_w
+                               // inner threading work is greater then outer
+                               // threading work
+                               || jcp.os < jcp.mb * jcp.ngroups * jcp.od
+                               // im2col is big
+                               || (sw == 1 && K <= 0.05 * jcp.oc))
                     // heuristic condition
                     && (jcp.im2col_sz
-                            || (jcp.ic / jcp.oc < 42
-                                    && jcp.ic * jcp.oc * jcp.is < 1024));
+                               || (jcp.ic / jcp.oc < 42
+                                          && jcp.ic * jcp.oc * jcp.is < 1024));
 
             if (is_blocking_applicable) {
                 const dim_t min_oc_block = 8;
@@ -1565,9 +1581,8 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                         + ic_disb_k + reg_osb_disb_k + thr_mem_eff_k
                         + gemm_eff_k + gemm_calc_eff_k;
 
-                auto calc_max_icb
-                        = [=](dim_t nthr_oc, dim_t ocb, dim_t osb,
-                                  dim_t oc_per_thr, dim_t os_per_thr) {
+                auto calc_max_icb = [=](dim_t nthr_oc, dim_t ocb, dim_t osb,
+                                            dim_t oc_per_thr, dim_t os_per_thr) {
                     const dim_t block_out_size = ocb * osb;
                     // TODO: need more precise calculation if stride more than
                     // kernel size
@@ -1906,11 +1921,10 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             if (jcp.im2col_sz)
                 jcp.im2col_sz = (ptrdiff_t)jcp.ic_block * jcp.ks * jcp.os_block;
         } else if (jcp.is_nspc && is_bwd_d) {
-            jcp.im2col_sz
-                    = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
-                              jcp.od == jcp.id, jcp.stride_w == 1,
-                              jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
-                              !jcp.signed_input)
+            jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                                      jcp.od == jcp.id, jcp.stride_w == 1,
+                                      jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
+                                      !jcp.signed_input)
                     ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os * jcp.od
                     : 0;
 
@@ -1924,7 +1938,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                     = (float)inner_work / rnd_up(inner_work, max_threads);
             jcp.outer_threading = !is_3d
                     && (is_depthwise
-                            || (jcp.is / max_threads < 64 && jcp.mb != 1))
+                               || (jcp.is / max_threads < 64 && jcp.mb != 1))
                     && (outer_thr_eff / inner_thr_eff >= 1.f
                             || (static_cast<size_t>(jcp.is) * jcp.ic * jcp.oc)
                                             / max_threads
@@ -1950,11 +1964,10 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                             || (jcp.is * jcp.ic * jcp.oc) / max_threads
                                     < gemm_thrld);
         } else if (jcp.is_nspc && is_bwd_w) {
-            jcp.im2col_sz
-                    = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
-                              jcp.od == jcp.id, jcp.stride_w == 1,
-                              jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
-                              !jcp.signed_input)
+            jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                                      jcp.od == jcp.id, jcp.stride_w == 1,
+                                      jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
+                                      !jcp.signed_input)
                     ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os
                     : 0;
             const size_t gemm_col_datatype_size
@@ -1970,7 +1983,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                 thr_mem_estimate += sizeof(float) * weights_d.size();
                 if (jcp.with_bias
                         && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                                cd.bias_desc.data_type))
+                               cd.bias_desc.data_type))
                     thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc;
             }
             const bool outer_threading_mem_ok
@@ -1997,7 +2010,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             }
             if ((is_bf16_conv) && jcp.with_bias
                     && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                            cd.bias_desc.data_type))
+                           cd.bias_desc.data_type))
                 scratchpad.book<float>(
                         key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc);
         } else if (!jcp.is_nspc && is_bwd_w) {
@@ -2009,7 +2022,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                 thr_mem_estimate += sizeof(float) * weights_d.size();
                 if (jcp.with_bias
                         && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                                cd.bias_desc.data_type))
+                               cd.bias_desc.data_type))
                     thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc;
             }
             const size_t gemm_col_datatype_size
@@ -2018,8 +2031,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             thr_mem_estimate += gemm_col_datatype_size * max_threads * jcp.ic
                     * jcp.ks * simd_w;
 
-            const bool outer_threading_mem_ok
-                    = thr_mem_estimate < scratchpad_limit;
+            const bool outer_threading_mem_ok = thr_mem_estimate < scratchpad_limit;
             jcp.outer_threading = outer_threading_mem_ok
                     && jcp.os / max_threads < 256
                     && (jcp.mb != 1 || jcp.ngroups > 2);
@@ -2050,7 +2062,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                         key_conv_int_dat_in_acc_dt, conv_acc_buffer_size);
                 if ((is_fwd || is_bwd_w) && jcp.with_bias
                         && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                                cd.bias_desc.data_type))
+                               cd.bias_desc.data_type))
                     scratchpad.book<float>(key_conv_bias_bf16_convert_wsp,
                             jcp.ngroups * jcp.oc);
             }
@@ -2065,8 +2077,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                 VDISPATCH_CONV_IC(scratchpad_limit >= scratchpad.size(),
                         VERBOSE_SCRATCHPAD_LIMIT);
 
-                const size_t available_mem
-                        = scratchpad_limit - scratchpad.size();
+                const size_t available_mem = scratchpad_limit - scratchpad.size();
                 if (available_mem
                         < gemm_col_memory_sz * gemm_col_datatype_size) {
                     // Required memory in this scenario overflows the
@@ -2107,8 +2118,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
         if (size) scratchpad.book<int32_t>(key_conv_gemm_zp_src_comp, size);
     }
 
-    VDISPATCH_CONV_IC(
-            scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT);
+    VDISPATCH_CONV_IC(scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT);
 
     return status::success;
 }
@@ -2182,4 +2192,4 @@ bool padding_exists(const conv_gemm_conf_t &jcp) noexcept {
 } // namespace rv64
 } // namespace cpu
 } // namespace impl
-} // namespace dnnl
+} // namespace dnnl
\ No newline at end of file

From 6791f0ea3e785912e79a31546133afaddfa018f7 Mon Sep 17 00:00:00 2001
From: StrelkovKM <strelkovkm96@outlook.com>
Date: Tue, 28 Apr 2026 21:43:45 +0000
Subject: [PATCH 13/13] [CPU][RV64] Clang-formatted

---
 src/cpu/rv64/rvv_gemm_convolution.cpp       | 115 +++++++--------
 src/cpu/rv64/rvv_gemm_convolution.hpp       |   2 +-
 src/cpu/rv64/rvv_gemm_convolution_utils.cpp | 146 ++++++++++----------
 3 files changed, 133 insertions(+), 130 deletions(-)

diff --git a/src/cpu/rv64/rvv_gemm_convolution.cpp b/src/cpu/rv64/rvv_gemm_convolution.cpp
index f4ea4196da9..eda2217cd77 100644
--- a/src/cpu/rv64/rvv_gemm_convolution.cpp
+++ b/src/cpu/rv64/rvv_gemm_convolution.cpp
@@ -11,12 +11,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 *******************************************************************************/
 #include <atomic>
-#include <riscv_vector.h>
 #include "common/c_types_map.hpp"
 #include "common/dnnl_thread.hpp"
 #include "common/type_helpers.hpp"
 #include "common/utils.hpp"
 #include "cpu/rv64/rvv_gemm_convolution.hpp"
+#include <riscv_vector.h>
 
 namespace dnnl {
 namespace impl {
@@ -33,25 +33,19 @@ struct im_pos_t {
     dim_t n, g, od, sp, ic, oc;
     bool do_im2col(const im_pos_t &prev) const {
         return true
-            && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp
-                    || ic != prev.ic);
+                && (n != prev.n || g != prev.g || od != prev.od || sp != prev.sp
+                        || ic != prev.ic);
     }
 };
 
 // Helper function to apply bias and eltwise using RVV in NSPC layout
 // Using float explicitly as data_t is float in this specialization
-static void apply_bias_eltwise_rvv_nspc(
-        const float *__restrict bia_arr,
-        float *__restrict dst_arr,
-        size_t start_oc, size_t end_oc,
-        bool with_bias,
-        bool with_eltwise,
-        const ref_post_ops_t *post_ops,
-        const exec_ctx_t &ctx,
-        const memory_desc_t *dst_md,
-        const conv_gemm_conf_t &jcp,
-        size_t g, size_t os_offset_factor) {
-    
+static void apply_bias_eltwise_rvv_nspc(const float *__restrict bia_arr,
+        float *__restrict dst_arr, size_t start_oc, size_t end_oc,
+        bool with_bias, bool with_eltwise, const ref_post_ops_t *post_ops,
+        const exec_ctx_t &ctx, const memory_desc_t *dst_md,
+        const conv_gemm_conf_t &jcp, size_t g, size_t os_offset_factor) {
+
     size_t n_elems = end_oc - start_oc + 1;
     if (n_elems == 0) return;
 
@@ -63,7 +57,7 @@ static void apply_bias_eltwise_rvv_nspc(
     float eltwise_alpha = 0.0f;
     float eltwise_scale = 1.0f;
     bool is_fast_relu = false;
-    
+
     if (with_eltwise && jcp.post_ops.len() == 1) {
         const auto &eltwise = jcp.post_ops.entry_.back().eltwise;
         if (eltwise.alg == alg_kind::eltwise_relu) {
@@ -75,7 +69,7 @@ static void apply_bias_eltwise_rvv_nspc(
 
     while (oc < n_elems) {
         size_t vl = __riscv_vsetvl_e32m1(n_elems - oc);
-        
+
         vfloat32m1_t v_dst = __riscv_vle32_v_f32m1(d_ptr + oc, vl);
 
         // 1. Add Bias
@@ -92,9 +86,10 @@ static void apply_bias_eltwise_rvv_nspc(
             } else {
                 // Leaky ReLU-like
                 vbool32_t mask = __riscv_vmflt_vf_f32m1_b32(v_dst, 0.0f, vl);
-                v_dst = __riscv_vfmul_vf_f32m1_m(mask, v_dst, eltwise_alpha, vl);
+                v_dst = __riscv_vfmul_vf_f32m1_m(
+                        mask, v_dst, eltwise_alpha, vl);
             }
-            
+
             if (eltwise_scale != 1.0f) {
                 v_dst = __riscv_vfmul_vf_f32m1(v_dst, eltwise_scale, vl);
             }
@@ -102,7 +97,7 @@ static void apply_bias_eltwise_rvv_nspc(
             oc += vl;
         } else {
             // If not fast relu, break to handle scalarly or generic post-ops
-            break; 
+            break;
         }
     }
 
@@ -111,19 +106,17 @@ static void apply_bias_eltwise_rvv_nspc(
         for (size_t i = oc; i < n_elems; ++i) {
             size_t cur_oc = start_oc + i;
             float *dst_val = dst_arr + cur_oc;
-            
-            if (with_bias) {
-                *dst_val += bia_arr[cur_oc];
-            }
-            
+
+            if (with_bias) { *dst_val += bia_arr[cur_oc]; }
+
             if (with_eltwise || jcp.with_binary) {
-                 ref_post_ops_t::args_t args;
-                 args.ctx = &ctx;
-                 args.dst_md = dst_md; // Use the passed pointer
-                 // Calculate offset correctly
-                 // Note: l_offset calculation might need adjustment based on exact memory layout expectations of post_ops
-                 args.l_offset = (g * jcp.oc + cur_oc) * (jcp.os * jcp.od);
-                 post_ops->execute(*dst_val, args);
+                ref_post_ops_t::args_t args;
+                args.ctx = &ctx;
+                args.dst_md = dst_md; // Use the passed pointer
+                // Calculate offset correctly
+                // Note: l_offset calculation might need adjustment based on exact memory layout expectations of post_ops
+                args.l_offset = (g * jcp.oc + cur_oc) * (jcp.os * jcp.od);
+                post_ops->execute(*dst_val, args);
             }
         }
     }
@@ -181,7 +174,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
 
     assert(IMPLICATION(is_problem_3d,
             jcp.oh_block == jcp.oh && jcp.ow_block == jcp.ow
-                     && jcp.ic_block == jcp.ic));
+                    && jcp.ic_block == jcp.ic));
     assert(IMPLICATION(jcp.ow_block != jcp.ow, jcp.oh_block == 1));
 
     const dim_t nb_oh = div_up(jcp.oh, jcp.oh_block);
@@ -198,7 +191,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
         const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
         ptrdiff_t i = 0;
 
-        for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; i += (ptrdiff_t)vlmax) {
+        for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax;
+                i += (ptrdiff_t)vlmax) {
             __riscv_vse32_v_f32m1(col + i, v_zero, vlmax);
         }
         if (i < (ptrdiff_t)total_sz) {
@@ -220,16 +214,16 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
 
         const int h_step = nstl::min(jcp.oh_block, jcp.oh - oh);
         const int w_step = nstl::min(jcp.ow_block, jcp.ow - ow);
-        
+
         if (jcp.im2col_sz && is_problem_3d) {
-             jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr);
+            jit_gemm_convolution_utils::transpose_dt(jcp, src, imtr);
         }
 
         for (int od = 0; od < jcp.od; od++) {
             data_t *__restrict dst = dst_base + n * dst_mb_stride
                     + g * dst_g_stride
                     + ((od * jcp.oh + oh) * jcp.ow + ow) * dst_os_stride;
-            
+
             if (jcp.im2col_sz) {
                 if (is_problem_3d)
                     jit_gemm_convolution_utils::im2col_dt_3d<data_t, data_t>(
@@ -250,7 +244,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
             const float beta = jcp.with_sum ? 1.0f : 0.0f;
             const data_t *__restrict src_od
                     = src + od * jcp.oh * jcp.ow * jcp.ngroups * jcp.ic;
-            
+
             status_t st = extended_sgemm("N ", BT, &M, &N, &K, &onef, wei, &LDA,
                     jcp.im2col_sz ? col : (data_t *)src_od, &LDB, &beta, dst,
                     &LDC);
@@ -260,7 +254,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
                 // NOTE: Keeping parallel(0, ...) as requested
                 parallel(0, [&](int ithr_inner, int nthr_inner) {
                     dim_t start_inner, end_inner;
-                    balance211(N * jcp.oc, nthr_inner, ithr_inner, start_inner, end_inner);
+                    balance211(N * jcp.oc, nthr_inner, ithr_inner, start_inner,
+                            end_inner);
 
                     const size_t first_oc = start_inner % jcp.oc;
                     const size_t last_oc = (end_inner - 1) % jcp.oc;
@@ -278,30 +273,35 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_thr_nspc(
 
                         // Check if we can use optimized RVV path
                         bool has_binary = jcp.with_binary;
-                        bool has_complex_eltwise = jcp.with_eltwise && !(jcp.post_ops.len() == 1 && jcp.post_ops.entry_.back().eltwise.alg == alg_kind::eltwise_relu);
-                        
+                        bool has_complex_eltwise = jcp.with_eltwise
+                                && !(jcp.post_ops.len() == 1
+                                        && jcp.post_ops.entry_.back()
+                                                        .eltwise.alg
+                                                == alg_kind::eltwise_relu);
+
                         if (!has_binary && !has_complex_eltwise) {
-                             apply_bias_eltwise_rvv_nspc(
-                                (const float*)bia_arr, (float*)dst_arr, start_oc, end_oc,
-                                jcp.with_bias, jcp.with_eltwise,
-                                post_ops_ptr, ctx, dst_md_ptr, jcp, g, 0);
+                            apply_bias_eltwise_rvv_nspc((const float *)bia_arr,
+                                    (float *)dst_arr, start_oc, end_oc,
+                                    jcp.with_bias, jcp.with_eltwise,
+                                    post_ops_ptr, ctx, dst_md_ptr, jcp, g, 0);
                         } else {
                             // Fallback to original scalar logic for complex cases
                             if (jcp.with_bias) {
                                 size_t n_elems = end_oc - start_oc + 1;
                                 if (n_elems > 0) {
                                     // Scalar bias add
-                                    for(size_t k=0; k<n_elems; ++k) {
-                                        dst_arr[start_oc + k] += bia_arr[start_oc + k];
+                                    for (size_t k = 0; k < n_elems; ++k) {
+                                        dst_arr[start_oc + k]
+                                                += bia_arr[start_oc + k];
                                     }
                                 }
                             }
-                            
+
                             if (jcp.with_eltwise || jcp.with_binary) {
                                 ref_post_ops_t::args_t args;
                                 args.ctx = &ctx;
                                 args.dst_md = dst_md_ptr;
-                                
+
                                 for (size_t oc = start_oc; oc <= end_oc; oc++) {
                                     args.l_offset = (g * jcp.oc + oc)
                                             * (jcp.os * jcp.od);
@@ -348,7 +348,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
 
     assert(IMPLICATION(is_problem_3d,
             jcp.os_block == jcp.os && jcp.ic_block == jcp.ic
-                     && jcp.os_nb_block == 1));
+                    && jcp.os_nb_block == 1));
 
     std::atomic<status_t> st(status::success);
     parallel(jcp.nthr, [&](const int ithr, const int nthr) {
@@ -363,7 +363,8 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
             const size_t vlmax = __riscv_vsetvlmax_e32m1();
             const vfloat32m1_t v_zero = __riscv_vfmv_v_f_f32m1(0.0f, vlmax);
             ptrdiff_t i = 0;
-            for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax; i += (ptrdiff_t)vlmax) {
+            for (; i <= (ptrdiff_t)total_sz - (ptrdiff_t)vlmax;
+                    i += (ptrdiff_t)vlmax) {
                 __riscv_vse32_v_f32m1(_col + i, v_zero, vlmax);
             }
             if (i < (ptrdiff_t)total_sz) {
@@ -371,7 +372,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
                 __riscv_vse32_v_f32m1(_col + i, v_zero, vl);
             }
         }
-        
+
         auto inner_ker = [&](int spatial, const im_pos_t &curr, im_pos_t &prev,
                                  im_pos_t &step, const im_pos_t &end) {
             const data_t *_src
@@ -396,7 +397,7 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
             const data_t one = 1.0;
 
             const dim_t M = jcp.os * jcp.od;
-            const dim_t m = step.sp ;
+            const dim_t m = step.sp;
             const dim_t LDA = jcp.im2col_sz ? m : M;
             data_t *_dst = dst + curr.n * dst_mb_stride + curr.g * dst_g_stride
                     + curr.oc * M + curr.od * jcp.os + curr.sp;
@@ -442,11 +443,11 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
                                                 v_d, b, vl); // Add bias
 
                                         v_d = __riscv_vfmax_vf_f32m1(
-                                                 v_d, 0.0f, vl);
+                                                v_d, 0.0f, vl);
 
                                         if (eltwise.scale != 1.0f) {
                                             v_d = __riscv_vfmul_vf_f32m1(
-                                                     v_d, eltwise.scale, vl);
+                                                    v_d, eltwise.scale, vl);
                                         }
 
                                         __riscv_vse32_v_f32m1(d_ + oS, v_d, vl);
@@ -463,10 +464,10 @@ status_t riscv_gemm_convolution_fwd_t::execute_forward_ncsp(
                                         v_d = __riscv_vfadd_vf_f32m1(
                                                 v_d, b, vl); // Add bias
                                         vbool32_t mask
-                                                 = __riscv_vmflt_vf_f32m1_b32(
+                                                = __riscv_vmflt_vf_f32m1_b32(
                                                         v_d, 0.0f, vl);
                                         v_d = __riscv_vfmul_vf_f32m1_m(
-                                                 mask, v_d, eltwise.alpha, vl);
+                                                mask, v_d, eltwise.alpha, vl);
                                         v_d = __riscv_vfmul_vf_f32m1(
                                                 v_d, eltwise.scale, vl);
                                         __riscv_vse32_v_f32m1(d_ + oS, v_d, vl);
diff --git a/src/cpu/rv64/rvv_gemm_convolution.hpp b/src/cpu/rv64/rvv_gemm_convolution.hpp
index 1545afb6912..310bdcd72b1 100644
--- a/src/cpu/rv64/rvv_gemm_convolution.hpp
+++ b/src/cpu/rv64/rvv_gemm_convolution.hpp
@@ -68,7 +68,7 @@ struct riscv_gemm_convolution_fwd_t : public primitive_t {
 
             // TODO: make `init_conf` assign initialized object to `jcp_`
             jcp_ = conv_gemm_conf_t();
-                return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad,
+            return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad,
                     *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_,
                     dnnl_get_max_threads());
         }
diff --git a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp
index bcfb62b2990..615c7051ce7 100644
--- a/src/cpu/rv64/rvv_gemm_convolution_utils.cpp
+++ b/src/cpu/rv64/rvv_gemm_convolution_utils.cpp
@@ -19,8 +19,8 @@
 #include "common/dnnl_thread.hpp"
 #include "common/type_helpers.hpp"
 #include "common/utils.hpp"
-#include "cpu/scale_utils.hpp"
 #include "cpu/platform.hpp"
+#include "cpu/scale_utils.hpp"
 
 #ifdef DNNL_RISCV_USE_RVV_INTRINSICS
 #include <riscv_vector.h>
@@ -49,9 +49,9 @@ namespace jit_gemm_convolution_utils {
 template <typename data_type_t>
 void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im,
         data_type_t *col, dim_t od, int spatial_step, int spatial_block) {
-    using data_t = typename conditional<data_traits_t<data_type_t>::data_type
-                        == bf16,
-                uint16_t, data_type_t>::type;
+    using data_t =
+            typename conditional<data_traits_t<data_type_t>::data_type == bf16,
+                    uint16_t, data_type_t>::type;
     const data_t *__restrict _im
             = reinterpret_cast<const data_t *__restrict>(im);
     data_t *__restrict _col = reinterpret_cast<data_t *__restrict>(col);
@@ -97,8 +97,7 @@ void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im,
                     col_ += jcp.kw * OHW;
                 }
             } else {
-                const data_t *__restrict im_
-                        = im_loc + id * jcp.ih * jcp.iw;
+                const data_t *__restrict im_ = im_loc + id * jcp.ih * jcp.iw;
                 dim_t ih_ = -jcp.t_pad;
                 for (dim_t kh = 0; kh < jcp.kh; ++kh) {
                     dim_t ih = ih_;
@@ -233,8 +232,8 @@ void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im,
     const dim_t ic_stride = jcp.id * jcp.ih * jcp.iw;
     const dim_t IC = jcp.ngroups * jcp.ic;
     const dim_t IHW = jcp.ih * jcp.iw;
-    const dim_t ic_block = nstl::max<dim_t>(
-            1, platform::get_cache_line_size() / sizeof(T));
+    const dim_t ic_block
+            = nstl::max<dim_t>(1, platform::get_cache_line_size() / sizeof(T));
     const dim_t nb_ic = jcp.ic / ic_block;
     const dim_t ic_blocked = nb_ic * ic_block;
     parallel_nd(jcp.id, jcp.ih, [&](dim_t id, dim_t ih) {
@@ -315,19 +314,16 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
                     }
                     const im_dt *__restrict imtr_loc
                             = imtr + (ic * jcp.id + id) * IHW;
-                    const dim_t oh_start
-                            = saturate(dim_t(0), jcp.oh, tp - kh);
+                    const dim_t oh_start = saturate(dim_t(0), jcp.oh, tp - kh);
                     const dim_t oh_end
                             = saturate(dim_t(0), jcp.oh, jcp.ih + tp - kh);
-                    const dim_t ow_start
-                            = saturate(dim_t(0), jcp.ow, lp - kw);
+                    const dim_t ow_start = saturate(dim_t(0), jcp.ow, lp - kw);
                     const dim_t ow_end
                             = saturate(dim_t(0), jcp.ow, jcp.iw + lp - kw);
                     for (dim_t oh = oh_start, ih = oh_start - tp + kh;
                             oh < oh_end; oh++, ih++) {
                         col_dt *__restrict col_h = col_loc + oh * jcp.ow;
-                        const im_dt *__restrict imtr_h
-                                = imtr_loc + ih * jcp.iw;
+                        const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
                         for (dim_t ow = ow_start, iw = ow_start - lp + kw;
                                 ow < ow_end; ow++, iw++) {
                             col_h[ow] = imtr_h[iw];
@@ -358,8 +354,7 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
                     for (dim_t oh = oh_start, ih = oh_start * 2 - tp + kh;
                             oh < oh_end; ++oh, ih += 2) {
                         col_dt *__restrict col_h = col_loc + oh * jcp.ow;
-                        const im_dt *__restrict imtr_h
-                                = imtr_loc + ih * jcp.iw;
+                        const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
                         for (dim_t ow = ow_start, iw = ow_start * 2 - lp + kw;
                                 ow < ow_end; ++ow, iw += 2) {
                             col_h[ow] = imtr_h[iw];
@@ -390,10 +385,9 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
                     for (dim_t oh = oh_start, ih = oh_start * sh - tp + kh * dh;
                             oh < oh_end; ++oh, ih += sh) {
                         col_dt *__restrict col_h = col_loc + oh * jcp.ow;
-                        const im_dt *__restrict imtr_h
-                                = imtr_loc + ih * jcp.iw;
+                        const im_dt *__restrict imtr_h = imtr_loc + ih * jcp.iw;
                         for (dim_t ow = ow_start,
-                                  iw = ow_start * sw - lp + kw * dw;
+                                   iw = ow_start * sw - lp + kw * dw;
                                 ow < ow_end; ++ow, iw += sw) {
                             col_h[ow] = imtr_h[iw];
                         }
@@ -515,14 +509,13 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im,
                     [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) {
                         const dim_t oh = ohr + oh_begin;
                         const dim_t ih = oh * sh - tp + kh * dh;
-                        const dim_t ow_start
-                                = (oh == first_oh) ? first_ow : 0;
+                        const dim_t ow_start = (oh == first_oh) ? first_ow : 0;
                         const dim_t ow_end
                                 = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
                         data_t *__restrict col_oh = _col + ic * col_step
                                 + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss;
-                        const data_t *__restrict im_ = _im + (ic + cs) * im_step
-                                + ih * jcp.iw;
+                        const data_t *__restrict im_
+                                = _im + (ic + cs) * im_step + ih * jcp.iw;
                         const dim_t iw_shift = kw * dw - lp;
                         if (ih < 0 || ih >= jcp.ih)
                             for (dim_t ow = ow_start; ow < ow_end; ow++)
@@ -541,13 +534,13 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im,
                     [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) {
                         const dim_t oh = ohr + oh_begin;
                         const dim_t ih = oh * sh - tp + kh * dh;
-                        const dim_t ow_start
-                                = (oh == first_oh) ? first_ow : 0;
+                        const dim_t ow_start = (oh == first_oh) ? first_ow : 0;
                         const dim_t ow_end
                                 = (oh == last_oh) ? (last_ow + 1) : jcp.ow;
                         data_t *__restrict col_oh = _col + ic * col_step
                                 + (kh * jcp.kw + kw) * sb + oh * jcp.ow - ss;
-                        const data_t *__restrict im_ = _im + (ic + cs) * im_step;
+                        const data_t *__restrict im_
+                                = _im + (ic + cs) * im_step;
                         if (ih < 0 || ih >= jcp.ih)
                             for (dim_t ow = ow_start; ow < ow_end; ow++)
                                 col_oh[ow] = zero_val;
@@ -566,8 +559,8 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im,
 }
 
 template void im2col<float>(const conv_gemm_conf_t &jcp,
-        const float *__restrict im, float *__restrict col, dim_t hs,
-        dim_t hb, dim_t ws, dim_t wb);
+        const float *__restrict im, float *__restrict col, dim_t hs, dim_t hb,
+        dim_t ws, dim_t wb);
 template void im2col<bfloat16_t>(const conv_gemm_conf_t &jcp,
         const bfloat16_t *__restrict im, bfloat16_t *__restrict col, dim_t hs,
         dim_t hb, dim_t ws, dim_t wb);
@@ -577,14 +570,15 @@ template <typename orig_im_dt, typename orig_col_dt>
 void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im,
         void *__restrict _imtr, orig_col_dt *__restrict _col, dim_t hs,
         dim_t hb, dim_t ws, dim_t wb) {
-    using im_dt = typename utils::conditional<
-            data_traits_t<orig_im_dt>::data_type == bf16, uint16_t,
-            orig_im_dt>::type;
-    using col_dt = typename utils::conditional<
-            data_traits_t<orig_col_dt>::data_type == bf16, uint16_t,
-            orig_col_dt>::type;
-    const im_dt *__restrict im
-            = reinterpret_cast<const im_dt *__restrict>(_im);
+    using im_dt =
+            typename utils::conditional<data_traits_t<orig_im_dt>::data_type
+                            == bf16,
+                    uint16_t, orig_im_dt>::type;
+    using col_dt =
+            typename utils::conditional<data_traits_t<orig_col_dt>::data_type
+                            == bf16,
+                    uint16_t, orig_col_dt>::type;
+    const im_dt *__restrict im = reinterpret_cast<const im_dt *__restrict>(_im);
     im_dt *__restrict imtr = reinterpret_cast<im_dt *__restrict>(_imtr);
     col_dt *__restrict col = reinterpret_cast<col_dt *__restrict>(_col);
     col_dt shift = static_cast<col_dt>(jcp.signed_input ? 128 : 0);
@@ -673,16 +667,16 @@ void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im,
                     const dim_t ih = (oh + hs) * sh - hp;
                     const ptrdiff_t col_idx_base
                             = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh)
-                                    * wb;
+                            * wb;
                     if (ih < 0 || ih >= jcp.ih)
                         for (dim_t ow = 0; ow < wb; ow++)
                             col[col_idx_base + ow] = shift;
                     else {
                         const dim_t wp = lp - kw * dw;
-                        const dim_t ow_start = saturate(
-                                dim_t(0), wb, div_up(wp, sw) - ws);
-                        const dim_t ow_end = saturate(dim_t(0), wb,
-                                div_up(jcp.iw + wp, sw) - ws);
+                        const dim_t ow_start
+                                = saturate(dim_t(0), wb, div_up(wp, sw) - ws);
+                        const dim_t ow_end = saturate(
+                                dim_t(0), wb, div_up(jcp.iw + wp, sw) - ws);
                         for (dim_t ow = 0; ow < ow_start; ow++)
                             col[col_idx_base + ow] = shift;
                         const dim_t iw_base = ws * sw - wp;
@@ -1212,7 +1206,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
     // to the number of threads and multiplied by a heuristic coefficient (15)
     const size_t zp_src_pad_comp_size = zp_src_with_padding
             ? (jcp.oc * jcp.ngroups * jcp.zp.src_pad_comp.d
-                       * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w)
+                      * jcp.zp.src_pad_comp.h * jcp.zp.src_pad_comp.w)
             : 0u;
     const size_t zp_src_comp_size = jcp.zp.src_is_common
             ? utils::rnd_up(jcp.oc * jcp.ngroups,
@@ -1507,7 +1501,8 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             //  64K - this is heuristic gemm size per thread threshold.
             constexpr size_t gemm_thrld = 64 * 1024;
             if (!jcp.outer_threading && !is_3d) {
-                bool is_depthwise = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
+                bool is_depthwise
+                        = jcp.ic == 1 && jcp.oc == 1 && jcp.ngroups != 1;
                 const size_t outer_work = jcp.ngroups * jcp.mb;
                 const float outer_thr_eff
                         = (float)outer_work / rnd_up(outer_work, max_threads);
@@ -1515,8 +1510,9 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                         = div_up(jcp.is, simd_w) * div_up(jcp.ic, simd_w);
                 const float inner_thr_eff
                         = (float)inner_work / rnd_up(inner_work, max_threads);
-                jcp.outer_threading = (is_depthwise
-                                              || (jcp.is / max_threads < 64 && jcp.mb != 1))
+                jcp.outer_threading
+                        = (is_depthwise
+                                  || (jcp.is / max_threads < 64 && jcp.mb != 1))
                         && (outer_thr_eff / inner_thr_eff >= 1.f
                                 || (static_cast<size_t>(jcp.os) * jcp.ic
                                            * jcp.oc)
@@ -1540,7 +1536,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                     gemm_col_datatype_size);
             if (is_bf16_conv && jcp.with_bias
                     && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                           cd.bias_desc.data_type)) {
+                            cd.bias_desc.data_type)) {
                 scratchpad.book<float>(
                         key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc);
             }
@@ -1556,17 +1552,17 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             // gemm implementation which we cannot control
             bool is_blocking_applicable = true && !is_3d
                     && (!jcp.im2col_sz
-                               // spatial is small
-                               || spatial >= max_threads * simd_w
-                               // inner threading work is greater then outer
-                               // threading work
-                               || jcp.os < jcp.mb * jcp.ngroups * jcp.od
-                               // im2col is big
-                               || (sw == 1 && K <= 0.05 * jcp.oc))
+                            // spatial is small
+                            || spatial >= max_threads * simd_w
+                            // inner threading work is greater than outer
+                            // threading work
+                            || jcp.os < jcp.mb * jcp.ngroups * jcp.od
+                            // im2col is big
+                            || (sw == 1 && K <= 0.05 * jcp.oc))
                     // heuristic condition
                     && (jcp.im2col_sz
-                               || (jcp.ic / jcp.oc < 42
-                                          && jcp.ic * jcp.oc * jcp.is < 1024));
+                            || (jcp.ic / jcp.oc < 42
+                                    && jcp.ic * jcp.oc * jcp.is < 1024));
 
             if (is_blocking_applicable) {
                 const dim_t min_oc_block = 8;
@@ -1582,7 +1578,8 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                         + gemm_eff_k + gemm_calc_eff_k;
 
                 auto calc_max_icb = [=](dim_t nthr_oc, dim_t ocb, dim_t osb,
-                                            dim_t oc_per_thr, dim_t os_per_thr) {
+                                            dim_t oc_per_thr,
+                                            dim_t os_per_thr) {
                     const dim_t block_out_size = ocb * osb;
                     // TODO: need more precise calculation if stride more than
                     // kernel size
@@ -1921,10 +1918,11 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             if (jcp.im2col_sz)
                 jcp.im2col_sz = (ptrdiff_t)jcp.ic_block * jcp.ks * jcp.os_block;
         } else if (jcp.is_nspc && is_bwd_d) {
-            jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
-                                      jcp.od == jcp.id, jcp.stride_w == 1,
-                                      jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
-                                      !jcp.signed_input)
+            jcp.im2col_sz
+                    = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                              jcp.od == jcp.id, jcp.stride_w == 1,
+                              jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
+                              !jcp.signed_input)
                     ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os * jcp.od
                     : 0;
 
@@ -1938,7 +1936,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                     = (float)inner_work / rnd_up(inner_work, max_threads);
             jcp.outer_threading = !is_3d
                     && (is_depthwise
-                               || (jcp.is / max_threads < 64 && jcp.mb != 1))
+                            || (jcp.is / max_threads < 64 && jcp.mb != 1))
                     && (outer_thr_eff / inner_thr_eff >= 1.f
                             || (static_cast<size_t>(jcp.is) * jcp.ic * jcp.oc)
                                             / max_threads
@@ -1964,10 +1962,11 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                             || (jcp.is * jcp.ic * jcp.oc) / max_threads
                                     < gemm_thrld);
         } else if (jcp.is_nspc && is_bwd_w) {
-            jcp.im2col_sz = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
-                                      jcp.od == jcp.id, jcp.stride_w == 1,
-                                      jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
-                                      !jcp.signed_input)
+            jcp.im2col_sz
+                    = !everyone_is(true, jcp.ow == jcp.iw, jcp.oh == jcp.ih,
+                              jcp.od == jcp.id, jcp.stride_w == 1,
+                              jcp.stride_h == 1, jcp.stride_d == 1, jcp.ks == 1,
+                              !jcp.signed_input)
                     ? (ptrdiff_t)jcp.ic * jcp.ks * jcp.os
                     : 0;
             const size_t gemm_col_datatype_size
@@ -1983,7 +1982,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                 thr_mem_estimate += sizeof(float) * weights_d.size();
                 if (jcp.with_bias
                         && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                               cd.bias_desc.data_type))
+                                cd.bias_desc.data_type))
                     thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc;
             }
             const bool outer_threading_mem_ok
@@ -2010,7 +2009,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             }
             if ((is_bf16_conv) && jcp.with_bias
                     && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                           cd.bias_desc.data_type))
+                            cd.bias_desc.data_type))
                 scratchpad.book<float>(
                         key_conv_bias_bf16_convert_wsp, jcp.ngroups * jcp.oc);
         } else if (!jcp.is_nspc && is_bwd_w) {
@@ -2022,7 +2021,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                 thr_mem_estimate += sizeof(float) * weights_d.size();
                 if (jcp.with_bias
                         && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                               cd.bias_desc.data_type))
+                                cd.bias_desc.data_type))
                     thr_mem_estimate += sizeof(float) * jcp.ngroups * jcp.oc;
             }
             const size_t gemm_col_datatype_size
@@ -2031,7 +2030,8 @@ status_t init_conf(conv_gemm_conf_t &jcp,
             thr_mem_estimate += gemm_col_datatype_size * max_threads * jcp.ic
                     * jcp.ks * simd_w;
 
-            const bool outer_threading_mem_ok = thr_mem_estimate < scratchpad_limit;
+            const bool outer_threading_mem_ok
+                    = thr_mem_estimate < scratchpad_limit;
             jcp.outer_threading = outer_threading_mem_ok
                     && jcp.os / max_threads < 256
                     && (jcp.mb != 1 || jcp.ngroups > 2);
@@ -2062,7 +2062,7 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                         key_conv_int_dat_in_acc_dt, conv_acc_buffer_size);
                 if ((is_fwd || is_bwd_w) && jcp.with_bias
                         && one_of(data_type::bf16, cd.diff_bias_desc.data_type,
-                               cd.bias_desc.data_type))
+                                cd.bias_desc.data_type))
                     scratchpad.book<float>(key_conv_bias_bf16_convert_wsp,
                             jcp.ngroups * jcp.oc);
             }
@@ -2077,7 +2077,8 @@ status_t init_conf(conv_gemm_conf_t &jcp,
                 VDISPATCH_CONV_IC(scratchpad_limit >= scratchpad.size(),
                         VERBOSE_SCRATCHPAD_LIMIT);
 
-                const size_t available_mem = scratchpad_limit - scratchpad.size();
+                const size_t available_mem
+                        = scratchpad_limit - scratchpad.size();
                 if (available_mem
                         < gemm_col_memory_sz * gemm_col_datatype_size) {
                     // Required memory in this scenario overflows the
@@ -2118,7 +2119,8 @@ status_t init_conf(conv_gemm_conf_t &jcp,
         if (size) scratchpad.book<int32_t>(key_conv_gemm_zp_src_comp, size);
     }
 
-    VDISPATCH_CONV_IC(scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT);
+    VDISPATCH_CONV_IC(
+            scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT);
 
     return status::success;
 }