From 28316f8ed20036fe8521a08bb7684d8c2576fdd1 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 13 Feb 2026 17:37:33 -0800 Subject: [PATCH 01/15] hexagon: refactor set/get/sum-rows ops to use local context --- ggml/src/ggml-hexagon/htp/get-rows-ops.c | 33 ++++++---- ggml/src/ggml-hexagon/htp/htp-ops.h | 6 -- ggml/src/ggml-hexagon/htp/set-rows-ops.c | 53 ++++++++------- ggml/src/ggml-hexagon/htp/sum-rows-ops.c | 83 ++++++++++++++---------- 4 files changed, 96 insertions(+), 79 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/get-rows-ops.c b/ggml/src/ggml-hexagon/htp/get-rows-ops.c index a657cd2dc..bf24bbda7 100644 --- a/ggml/src/ggml-hexagon/htp/get-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/get-rows-ops.c @@ -15,6 +15,13 @@ #include "htp-ops.h" #include "hvx-utils.h" +struct get_rows_context { + struct htp_ops_context * octx; + uint32_t src1_nrows_per_thread; + struct fastdiv_values get_rows_div_ne10; + struct fastdiv_values get_rows_div_ne10_ne11; +}; + #define get_rows_preamble \ const uint32_t ne00 = octx->src0.ne[0]; \ const uint32_t ne01 = octx->src0.ne[1]; \ @@ -39,20 +46,22 @@ \ const uint32_t nr = ne10 * ne11 * ne12; -static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) { +static void get_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) { + struct get_rows_context * grctx = (struct get_rows_context *)data; + struct htp_ops_context * octx = grctx->octx; get_rows_preamble; // parallelize by src1 elements (which correspond to dst rows) - const uint32_t dr = octx->src1_nrows_per_thread; + const uint32_t dr = grctx->src1_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? 
(ir0 + dr) : nr; const bool is_i32 = (octx->src1.type == HTP_TYPE_I32); for (uint32_t i = ir0; i < ir1; ++i) { - const uint32_t i12 = fastdiv(i, &octx->get_rows_div_ne10_ne11); + const uint32_t i12 = fastdiv(i, &grctx->get_rows_div_ne10_ne11); const uint32_t rem = i - i12 * ne11 * ne10; - const uint32_t i11 = fastdiv(rem, &octx->get_rows_div_ne10); + const uint32_t i11 = fastdiv(rem, &grctx->get_rows_div_ne10); const uint32_t i10 = rem - i11 * ne10; const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; @@ -68,12 +77,6 @@ static int get_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const uintptr_t dst_ptr = octx->dst.data + i10*nb1 + i11*nb2 + i12*nb3; hvx_copy_f32_uu((uint8_t *)dst_ptr, (const uint8_t *)src0_ptr, ne00); } - - return HTP_STATUS_OK; -} - -static void get_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) { - get_rows_thread_f32_f32((struct htp_ops_context *) data, n, i); } int op_get_rows(struct htp_ops_context * octx) { @@ -95,12 +98,14 @@ int op_get_rows(struct htp_ops_context * octx) { return HTP_STATUS_OK; } - octx->get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]); - octx->get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]); + struct get_rows_context grctx; + grctx.octx = octx; + grctx.get_rows_div_ne10 = init_fastdiv_values(octx->src1.ne[0]); + grctx.get_rows_div_ne10_ne11 = init_fastdiv_values(octx->src1.ne[0] * octx->src1.ne[1]); const uint32_t n_jobs = MIN(nr, octx->n_threads); - octx->src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; + grctx.src1_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, get_rows_work_f32_f32, octx, n_jobs); + worker_pool_run_func(octx->ctx->worker_pool, get_rows_thread_f32_f32, &grctx, n_jobs); return HTP_STATUS_OK; } diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index f1ad24dbf..c6c27f7f4 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h 
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -64,12 +64,6 @@ struct htp_ops_context { struct fastdiv_values broadcast_rv2; struct fastdiv_values broadcast_rv3; - struct fastdiv_values set_rows_div_ne12; // fastdiv values for ne12 - struct fastdiv_values set_rows_div_ne11; // fastdiv values for ne11 - - struct fastdiv_values get_rows_div_ne10; // fastdiv values for ne10 - struct fastdiv_values get_rows_div_ne10_ne11; // fastdiv values for ne10 * ne11 - uint32_t flags; }; diff --git a/ggml/src/ggml-hexagon/htp/set-rows-ops.c b/ggml/src/ggml-hexagon/htp/set-rows-ops.c index 904484da9..2fd6c9077 100644 --- a/ggml/src/ggml-hexagon/htp/set-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/set-rows-ops.c @@ -43,11 +43,21 @@ \ const uint32_t nr = ne01; -static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, const int ith) { +struct htp_set_rows_context { + struct htp_ops_context * octx; + struct fastdiv_values div_ne12; + struct fastdiv_values div_ne11; + uint32_t src0_nrows_per_thread; +}; + +static void set_rows_thread_f32_f32(unsigned int nth, unsigned int ith, void *data) { + struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data; + struct htp_ops_context * octx = srctx->octx; + set_rows_preamble; // parallelize by rows of src0 - const uint32_t dr = octx->src0_nrows_per_thread; + const uint32_t dr = srctx->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? 
(ir0 + dr) : nr; @@ -56,8 +66,8 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, for (uint32_t i03 = 0; i03 < ne03; ++i03) { for (uint32_t i02 = 0; i02 < ne02; ++i02) { for (uint32_t i = ir0; i < ir1; ++i) { - const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12); - const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11); + const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12); + const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11); const uint32_t i10 = i; const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; @@ -76,15 +86,16 @@ static int set_rows_thread_f32_f32(struct htp_ops_context * octx, const int nth, } } } - - return HTP_STATUS_OK; } -static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, const int ith) { +static void set_rows_thread_f16_f32(unsigned int nth, unsigned int ith, void *data) { + struct htp_set_rows_context * srctx = (struct htp_set_rows_context *)data; + struct htp_ops_context * octx = srctx->octx; + set_rows_preamble; // parallelize by rows of src0 - const uint32_t dr = octx->src0_nrows_per_thread; + const uint32_t dr = srctx->src0_nrows_per_thread; const uint32_t ir0 = dr * ith; const uint32_t ir1 = (ir0 + dr < nr) ? 
(ir0 + dr) : nr; @@ -93,8 +104,8 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, for (uint32_t i03 = 0; i03 < ne03; ++i03) { for (uint32_t i02 = 0; i02 < ne02; ++i02) { for (uint32_t i = ir0; i < ir1; ++i) { - const uint32_t i12 = fastmodulo(i03, ne12, &octx->set_rows_div_ne12); - const uint32_t i11 = fastmodulo(i02, ne11, &octx->set_rows_div_ne11); + const uint32_t i12 = fastmodulo(i03, ne12, &srctx->div_ne12); + const uint32_t i11 = fastmodulo(i02, ne11, &srctx->div_ne11); const uint32_t i10 = i; const uintptr_t src1_addr = octx->src1.data + i10*nb10 + i11*nb11 + i12*nb12; @@ -112,16 +123,6 @@ static int set_rows_thread_f16_f32(struct htp_ops_context * octx, const int nth, } } } - - return HTP_STATUS_OK; -} - -static void set_rows_work_f16_f32(unsigned int n, unsigned int i, void *data) { - set_rows_thread_f16_f32((struct htp_ops_context *) data, n, i); -} - -static void set_rows_work_f32_f32(unsigned int n, unsigned int i, void *data) { - set_rows_thread_f32_f32((struct htp_ops_context *) data, n, i); } int op_set_rows(struct htp_ops_context * octx) { @@ -143,18 +144,20 @@ int op_set_rows(struct htp_ops_context * octx) { return HTP_STATUS_OK; } - octx->set_rows_div_ne12 = init_fastdiv_values(ne12); - octx->set_rows_div_ne11 = init_fastdiv_values(ne11); + struct htp_set_rows_context srctx; + srctx.octx = octx; + srctx.div_ne12 = init_fastdiv_values(ne12); + srctx.div_ne11 = init_fastdiv_values(ne11); const uint32_t n_jobs = MIN(nr, octx->n_threads); - octx->src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; + srctx.src0_nrows_per_thread = (nr + n_jobs - 1) / n_jobs; switch(octx->dst.type) { case HTP_TYPE_F32: - worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f32_f32, octx, n_jobs); + worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f32_f32, &srctx, n_jobs); break; case HTP_TYPE_F16: - worker_pool_run_func(octx->ctx->worker_pool, set_rows_work_f16_f32, octx, n_jobs); + 
worker_pool_run_func(octx->ctx->worker_pool, set_rows_thread_f16_f32, &srctx, n_jobs); break; default: return HTP_STATUS_NO_SUPPORT; diff --git a/ggml/src/ggml-hexagon/htp/sum-rows-ops.c b/ggml/src/ggml-hexagon/htp/sum-rows-ops.c index 62e45da2b..04fa72182 100644 --- a/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +++ b/ggml/src/ggml-hexagon/htp/sum-rows-ops.c @@ -17,7 +17,6 @@ #include "htp-msg.h" #include "htp-ops.h" - #define sum_rows_preamble \ struct htp_tensor *src0 = &octx->src0;\ struct htp_tensor *dst = &octx->dst; \ @@ -42,53 +41,54 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; \ -static int sum_rows_thread_f32(struct htp_ops_context * octx, const int nth, const int ith) { - sum_rows_preamble; +struct sum_rows_context { + const uint8_t * src_data; + uint8_t * dst_data; + uint32_t ne00; + size_t src_stride; + size_t dst_stride; + uint32_t rows_per_thread; + uint32_t total_rows; + bool opt_path; +}; - const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread; - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; +static void sum_rows_thread_f32(unsigned int nth, unsigned int ith, void *data) { + const struct sum_rows_context * smctx = (const struct sum_rows_context *) data; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t rows_per_thread = smctx->rows_per_thread; + const uint32_t total_rows = smctx->total_rows; - const uint32_t src0_start_row = src0_nrows_per_thread * ith; - const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + const uint32_t start_row = rows_per_thread * ith; + const uint32_t end_row = MIN(start_row + rows_per_thread, total_rows); - // no work for this thread - if (src0_start_row >= src0_end_row) { - return HTP_STATUS_OK; + if (start_row >= end_row) { + return; } - int opt_path = 0; - if ((0 == hex_is_aligned((void *) src0->data, VLEN)) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } + const size_t src_stride = smctx->src_stride; 
+ const size_t dst_stride = smctx->dst_stride; + const uint32_t ne00 = smctx->ne00; + const bool opt_path = smctx->opt_path; - const uint8_t * restrict data_src = (const uint8_t *) src0->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + const float * restrict src_th = (const float *) (smctx->src_data + (start_row * src_stride)); + float * restrict dst_th = (float *) (smctx->dst_data + (start_row * dst_stride)); - const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size)); - float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size)); + // Calculate actual number of rows for this thread + const uint32_t n_rows = end_row - start_row; - for (uint32_t ir = 0; ir < src0_nrows_per_thread; ir++) { - const float * restrict src_local = src_th + (ir * ne00); + for (uint32_t ir = 0; ir < n_rows; ir++) { + const float * restrict src_local = src_th + (ir * (src_stride / sizeof(float))); - if (ir + 1 < src0_nrows_per_thread) { - hex_l2fetch(src_local + ne00, src0_row_size, src0_row_size, 1); + if (ir + 1 < n_rows) { + hex_l2fetch(src_local + (src_stride / sizeof(float)), src_stride, src_stride, 1); } - if (1 == opt_path) { + if (opt_path) { dst_th[ir] = hvx_reduce_sum_f32_a((const uint8_t *) src_local, ne00); } else { dst_th[ir] = hvx_reduce_sum_f32((const uint8_t *) src_local, ne00); } } - - return HTP_STATUS_OK; -} - -static void sum_rows_work_f32(unsigned int n, unsigned int i, void *data) { - sum_rows_thread_f32((struct htp_ops_context *) data, n, i); } int op_sum_rows(struct htp_ops_context * octx) { @@ -106,10 +106,25 @@ int op_sum_rows(struct htp_ops_context * octx) { const uint32_t src0_nrows = ne01 * ne02 * ne03; uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + uint32_t rows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, sum_rows_work_f32, octx, n_jobs); + bool opt_path = false; + if ((0 == 
hex_is_aligned((void *) src0->data, VLEN)) && !(nb01 & (VLEN - 1))) { + opt_path = true; + } + + struct sum_rows_context smctx = { + .src_data = (const uint8_t *) src0->data, + .dst_data = (uint8_t *) dst->data, + .ne00 = ne00, + .src_stride = nb01, + .dst_stride = nb1, + .rows_per_thread = rows_per_thread, + .total_rows = src0_nrows, + .opt_path = opt_path, + }; + + worker_pool_run_func(octx->ctx->worker_pool, sum_rows_thread_f32, &smctx, n_jobs); return HTP_STATUS_OK; } - From 272e4ae06575dd83d6543f1003ffc7ac29fb8c4b Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 13 Feb 2026 20:49:07 -0800 Subject: [PATCH 02/15] hexagon: refactor ROPE and Softmax Ops to use local context Improves performance a bit by precomputing things and saving in the context. --- ggml/src/ggml-hexagon/htp/rope-ops.c | 255 ++++++++++++------------ ggml/src/ggml-hexagon/htp/softmax-ops.c | 242 ++++++++++++---------- 2 files changed, 264 insertions(+), 233 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 943ca5c95..728bc0e4d 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -10,6 +10,7 @@ #include "hex-dma.h" #include "hvx-utils.h" +#include "hex-fastdiv.h" #define GGML_COMMON_DECL_C #include "ggml-common.h" @@ -42,7 +43,7 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -struct rope_th_ctx { +struct htp_rope_context { int32_t n_dims; int32_t mode; int32_t n_ctx_orig; @@ -57,6 +58,11 @@ struct rope_th_ctx { float theta_scale; float corr_dims[2]; + uint32_t src0_nrows_per_thread; + struct fastdiv_values fastdiv_ne01; + struct fastdiv_values fastdiv_ne02; + size_t spad_stride; + struct htp_ops_context * octx; }; @@ -117,31 +123,39 @@ static void rope_corr_dims(int n_dims, dims[1] = MIN(n_dims - 1, end); } -static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context * octx) { - memset(rope_ctx, 0, sizeof(struct rope_th_ctx)); +static void 
init_rope_ctx(struct htp_rope_context * rctx, struct htp_ops_context * octx) { + memset(rctx, 0, sizeof(struct htp_rope_context)); const int32_t * op_params = &octx->op_params[0]; - rope_ctx->n_dims = ((const int32_t *) op_params)[1]; - rope_ctx->mode = ((const int32_t *) op_params)[2]; - rope_ctx->n_ctx_orig = ((const int32_t *) op_params)[4]; + rctx->n_dims = ((const int32_t *) op_params)[1]; + rctx->mode = ((const int32_t *) op_params)[2]; + rctx->n_ctx_orig = ((const int32_t *) op_params)[4]; + + memcpy(&rctx->freq_base, (int32_t *) op_params + 5, sizeof(float)); + memcpy(&rctx->freq_scale, (int32_t *) op_params + 6, sizeof(float)); + memcpy(&rctx->ext_factor, (int32_t *) op_params + 7, sizeof(float)); + memcpy(&rctx->attn_factor, (int32_t *) op_params + 8, sizeof(float)); + memcpy(&rctx->beta_fast, (int32_t *) op_params + 9, sizeof(float)); + memcpy(&rctx->beta_slow, (int32_t *) op_params + 10, sizeof(float)); + memcpy(&rctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4); + + rctx->theta_scale = powf(rctx->freq_base, -2.0f / rctx->n_dims); - memcpy(&rope_ctx->freq_base, (int32_t *) op_params + 5, sizeof(float)); - memcpy(&rope_ctx->freq_scale, (int32_t *) op_params + 6, sizeof(float)); - memcpy(&rope_ctx->ext_factor, (int32_t *) op_params + 7, sizeof(float)); - memcpy(&rope_ctx->attn_factor, (int32_t *) op_params + 8, sizeof(float)); - memcpy(&rope_ctx->beta_fast, (int32_t *) op_params + 9, sizeof(float)); - memcpy(&rope_ctx->beta_slow, (int32_t *) op_params + 10, sizeof(float)); - memcpy(&rope_ctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4); + rope_corr_dims(rctx->n_dims, rctx->n_ctx_orig, rctx->freq_base, rctx->beta_fast, + rctx->beta_slow, rctx->corr_dims); - rope_ctx->theta_scale = powf(rope_ctx->freq_base, -2.0f / rope_ctx->n_dims); + rctx->octx = octx; - rope_corr_dims(rope_ctx->n_dims, rope_ctx->n_ctx_orig, rope_ctx->freq_base, rope_ctx->beta_fast, - rope_ctx->beta_slow, rope_ctx->corr_dims); + // Initialize fastdiv values + const 
uint32_t ne01 = octx->src0.ne[1]; + const uint32_t ne02 = octx->src0.ne[2]; - rope_ctx->octx = octx; - FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rope_ctx->n_dims, - rope_ctx->ext_factor, rope_ctx->theta_scale, rope_ctx->attn_factor); + if (ne01 > 0) rctx->fastdiv_ne01 = init_fastdiv_values(ne01); + if (ne02 > 0) rctx->fastdiv_ne02 = init_fastdiv_values(ne02); + + FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rctx->n_dims, + rctx->ext_factor, rctx->theta_scale, rctx->attn_factor); } static void hvx_calc_rope_neox_f32(const float * restrict src0, @@ -248,144 +262,135 @@ static void hvx_calc_rope_f32(const float * restrict src0, } } -static void rope_hex_f32(struct rope_th_ctx * rope_ctx, - const uint32_t ir0, - const uint32_t ir1, - int nth, - int ith, - const int opt_path) { - struct htp_ops_context * octx = rope_ctx->octx; +static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { + struct htp_rope_context * rctx = (struct htp_rope_context *) data; + struct htp_ops_context * octx = rctx->octx; const struct htp_tensor * src0 = &octx->src0; const struct htp_tensor * src1 = &octx->src1; const struct htp_tensor * src2 = &octx->src2; struct htp_tensor * dst = &octx->dst; - const int32_t mode = rope_ctx->mode; - const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; - htp_rope_preamble; - const int32_t * pos = (const int32_t *) src1->data; + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows_per_thread = rctx->src0_nrows_per_thread; - float * wp0 = (float *) (octx->src0_spad.data + (ith * nb01)); + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); - const float * freq_factors = NULL; - if (src2 != NULL) { - freq_factors = (const float *) src2->data; + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; } - const 
uint32_t i1_end = MIN(ir1, ne1); - const int32_t half_dims = rope_ctx->n_dims / 2; - const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float); - for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch - for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len - const int32_t p = pos[i2]; + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); - rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, - rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); + const int32_t mode = rctx->mode; + const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; + const int32_t half_dims = rctx->n_dims / 2; + const size_t remain_bytes = (ne0 - rctx->n_dims) * sizeof(float); - for (uint32_t i1 = ir0; i1 < i1_end; i1++) { // attn-heads - const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); - float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); + int is_aligned = 1; + int opt_path = 0; + if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) || + (0 == hex_is_aligned((void *) dst->data, VLEN))) { + FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n"); + is_aligned = 0; + } + if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { + opt_path = 1; + } - const float * src_loc = src; - float * dst_data_loc = dst_data; + float * wp0 = (float *) (octx->src0_spad.data + (ith * rctx->spad_stride)); - if (1 == opt_path) { - if (is_neox) { - hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); - } else { - hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); - } + const int32_t * pos = (const int32_t *) src1->data; - src_loc += rope_ctx->n_dims; - dst_data_loc += rope_ctx->n_dims; - } else { - for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) { - const float cos_theta = wp0[i0 + 0]; - const float sin_theta = wp0[i0 + 1]; + const float * freq_factors = NULL; + if (src2 != NULL) { + 
freq_factors = (const float *) src2->data; + } - if (is_neox) { - const float x0 = src_loc[0]; - const float x1 = src_loc[half_dims]; + uint32_t prev_i2 = (uint32_t)-1; - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta; + for (uint32_t r = src0_start_row; r < src0_end_row; ++r) { + // Calculate indices from flat row index r + // idx = i3 * (ne02 * ne01) + i2 * ne01 + i1 + // i1 = idx % ne01 + // i2 = (idx / ne01) % ne02 + // i3 = (idx / ne01) / ne02 - src_loc += 1; - dst_data_loc += 1; - } else { - const float x0 = src_loc[0]; - const float x1 = src_loc[1]; + uint32_t i1 = fastmodulo(r, ne01, &rctx->fastdiv_ne01); + uint32_t r_div_ne01 = fastdiv(r, &rctx->fastdiv_ne01); + uint32_t i2 = fastmodulo(r_div_ne01, ne02, &rctx->fastdiv_ne02); + uint32_t i3 = fastdiv(r_div_ne01, &rctx->fastdiv_ne02); - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta; + if (i2 != prev_i2) { + const int32_t p = pos[i2]; - src_loc += 2; - dst_data_loc += 2; - } - } + rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, + rctx->attn_factor, wp0, rctx->theta_scale); + prev_i2 = i2; + } - src_loc += (is_neox ? half_dims : 0); - dst_data_loc += (is_neox ? 
half_dims : 0); - } + const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); + float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); - // TODO: use simd to speed up the remaining elements copy - memcpy(dst_data_loc, src_loc, remain_bytes); + const float * src_loc = src; + float * dst_data_loc = dst_data; + + if (1 == opt_path) { + if (is_neox) { + hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rctx->n_dims, wp0); + } else { + hvx_calc_rope_f32(src_loc, dst_data_loc, rctx->n_dims, wp0); } - } - } -} -static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int ith) { - struct htp_ops_context * octx = rope_ctx->octx; + src_loc += rctx->n_dims; + dst_data_loc += rctx->n_dims; + } else { + for (uint32_t i0 = 0; i0 < rctx->n_dims; i0 += 2) { + const float cos_theta = wp0[i0 + 0]; + const float sin_theta = wp0[i0 + 1]; - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - struct htp_tensor * dst = &octx->dst; + if (is_neox) { + const float x0 = src_loc[0]; + const float x1 = src_loc[half_dims]; - htp_rope_preamble; + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows - const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread; + src_loc += 1; + dst_data_loc += 1; + } else { + const float x0 = src_loc[0]; + const float x1 = src_loc[1]; - const uint32_t src0_start_row = src0_nrows_per_thread * ith; - const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta; - // no work for this thread - if (src0_start_row >= src0_end_row) { - return; - } + src_loc += 2; + dst_data_loc += 2; + } + } - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + src_loc += (is_neox ? 
half_dims : 0); + dst_data_loc += (is_neox ? half_dims : 0); + } - int is_aligned = 1; - int opt_path = 0; - if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) || - (0 == hex_is_aligned((void *) dst->data, VLEN))) { - FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n"); - is_aligned = 0; - } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; + // TODO: use simd to speed up the remaining elements copy + if (remain_bytes > 0) { + memcpy(dst_data_loc, src_loc, remain_bytes); + } } - rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path); - t2 = HAP_perf_get_qtimer_count(); FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { - struct rope_th_ctx * rope_ctx = (struct rope_th_ctx *) data; - - rope_job_f32_per_thread(rope_ctx, n, i); -} - static int execute_op_rope_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; @@ -394,17 +399,12 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { const struct htp_tensor * src2 = &octx->src2; struct htp_tensor * dst = &octx->dst; - worker_callback_t op_func; - const char * op_type = NULL; - - struct rope_th_ctx rope_ctx; + struct htp_rope_context rctx; + const char * op_type = "rope-f32"; switch (octx->op) { case HTP_OP_ROPE: - op_func = rope_job_dispatcher_f32; - op_type = "rope-f32"; - - init_rope_ctx(&rope_ctx, octx); + init_rope_ctx(&rctx, octx); break; default: @@ -424,6 +424,9 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; + // Use stride for calculating offset + rctx.spad_stride = hex_round_up(src0_row_size, 128); + size_t spad_size = 
octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; if (src2->ne[0]) { @@ -456,8 +459,8 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, op_func, &rope_ctx, n_jobs); + rctx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + worker_pool_run_func(octx->ctx->worker_pool, rope_job_f32, &rctx, n_jobs); } return err; diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c index e91a16d94..58278079f 100644 --- a/ggml/src/ggml-hexagon/htp/softmax-ops.c +++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c @@ -10,6 +10,7 @@ #include "hex-dma.h" #include "hvx-utils.h" +#include "hex-fastdiv.h" #define GGML_COMMON_DECL_C #include "ggml-common.h" @@ -48,7 +49,7 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -struct softmax_th_ctx { +struct htp_softmax_context { bool use_f16; bool use_src1; uint32_t n_head; @@ -59,28 +60,48 @@ struct softmax_th_ctx { float m0; float m1; + uint32_t src0_nrows_per_thread; + struct fastdiv_values fastdiv_ne01; + struct fastdiv_values fastdiv_ne02; + struct fastdiv_values fastdiv_ne12; // For mask broadcasting + struct fastdiv_values fastdiv_ne13; // For mask broadcasting + size_t spad_stride; + struct htp_ops_context * octx; }; -static void init_softmax_ctx(struct softmax_th_ctx * softmax_ctx, struct htp_ops_context * octx) { +static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_context * octx) { const struct htp_tensor * src0 = &octx->src0; const struct htp_tensor * src1 = &octx->src1; - memset(softmax_ctx, 0, sizeof(struct softmax_th_ctx)); + memset(smctx, 0, sizeof(struct htp_softmax_context)); + + memcpy(&smctx->scale, (float *) octx->op_params, sizeof(float)); + memcpy(&smctx->max_bias, (float *) octx->op_params 
+ 1, sizeof(float)); + + smctx->n_head = src0->ne[2]; + smctx->n_head_log2 = 1u << (uint32_t) floor(log2(smctx->n_head)); + + smctx->m0 = powf(2.0f, -(smctx->max_bias) / smctx->n_head_log2); + smctx->m1 = powf(2.0f, -(smctx->max_bias / 2.0f) / smctx->n_head_log2); - memcpy(&softmax_ctx->scale, (float *) octx->op_params, sizeof(float)); - memcpy(&softmax_ctx->max_bias, (float *) octx->op_params + 1, sizeof(float)); + smctx->use_src1 = (src1->ne[0] != 0); + smctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16); - softmax_ctx->n_head = src0->ne[2]; - softmax_ctx->n_head_log2 = 1u << (uint32_t) floor(log2(softmax_ctx->n_head)); + smctx->octx = octx; - softmax_ctx->m0 = powf(2.0f, -(softmax_ctx->max_bias) / softmax_ctx->n_head_log2); - softmax_ctx->m1 = powf(2.0f, -(softmax_ctx->max_bias / 2.0f) / softmax_ctx->n_head_log2); + // Initialize fastdiv values + const uint32_t ne01 = src0->ne[1]; + const uint32_t ne02 = src0->ne[2]; - softmax_ctx->use_src1 = (src1->ne[0] != 0); - softmax_ctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16); + if (ne01 > 0) smctx->fastdiv_ne01 = init_fastdiv_values(ne01); + if (ne02 > 0) smctx->fastdiv_ne02 = init_fastdiv_values(ne02); - softmax_ctx->octx = octx; + const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; + const uint32_t ne13 = (src1->ne[0]) ? 
src1->ne[3] : 1; + + if (ne12 > 0) smctx->fastdiv_ne12 = init_fastdiv_values(ne12); + if (ne13 > 0) smctx->fastdiv_ne13 = init_fastdiv_values(ne13); } static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src, @@ -183,83 +204,9 @@ static float hvx_softmax_f32(const uint8_t * restrict src, return sum; } -static void softmax_htp_f32(int nth, int ith, struct softmax_th_ctx * softmax_ctx, int opt_path) { - struct htp_ops_context * octx = softmax_ctx->octx; - - const struct htp_tensor * src0 = &octx->src0; - const struct htp_tensor * src1 = &octx->src1; - const struct htp_tensor * dst = &octx->dst; - - htp_softmax_preamble3; - - uint8_t * src0_spad_data = octx->src0_spad.data + (ith * nb01); - uint8_t * src1_spad_data = octx->src1_spad.data + (ith * nb01); - uint8_t * dst_spad_data = octx->dst_spad.data + (ith * nb1); - - float * wp0 = (float *) src0_spad_data; - float * wp1 = (float *) src1_spad_data; - float * wp2 = (float *) dst_spad_data; - - for (uint32_t i03 = 0; i03 < ne03; i03++) { - for (uint32_t i02 = 0; i02 < ne02; i02++) { - for (uint32_t i01 = ith; i01 < ne01; i01 += nth) { - const uint32_t i11 = i01; - const uint32_t i12 = i02 % ne12; - const uint32_t i13 = i03 % ne13; - - // ALiBi - const uint32_t h = i02; // head - - const float slope = (softmax_ctx->max_bias > 0.0f) ? - h < softmax_ctx->n_head_log2 ? - powf(softmax_ctx->m0, h + 1) : - powf(softmax_ctx->m1, 2 * (h - softmax_ctx->n_head_log2) + 1) : - 1.0f; - - float * sp = (float *) ((char *) octx->src0.data + i01 * nb01 + i02 * nb02 + i03 * nb03); - float * dp = (float *) ((char *) octx->dst.data + i01 * nb1 + i02 * nb2 + i03 * nb3); - - // broadcast the mask across rows - __fp16 * mp_f16 = (softmax_ctx->use_src1) ? - (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : - NULL; - float * mp_f32 = (softmax_ctx->use_src1) ? 
- (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : - NULL; - - if ((1 == opt_path) && (mp_f32) && !(softmax_ctx->use_f16)) { - hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, softmax_ctx->scale, - (const uint8_t *) mp_f32, slope); - } else { - hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, softmax_ctx->scale); - if (mp_f32) { - if (softmax_ctx->use_f16) { - for (int i = 0; i < ne00; ++i) { - wp0[i] += slope * (float) mp_f16[i]; - } - } else { - for (int i = 0; i < ne00; ++i) { - wp0[i] += slope * mp_f32[i]; - } - } - } - } - - if (1 == opt_path) { - hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); - } else { - float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00); - float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); - sum = sum > 0.0 ? (1.0 / sum) : 1; - hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum); - } - } - } - } -} - -static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int nth, int ith) { - struct htp_ops_context * octx = softmax_ctx->octx; +static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { + struct htp_softmax_context * smctx = (struct htp_softmax_context *) data; + struct htp_ops_context * octx = smctx->octx; const struct htp_tensor * src0 = &octx->src0; const struct htp_tensor * src1 = &octx->src1; @@ -268,7 +215,7 @@ static void softmax_job_f32_per_thread(struct softmax_th_ctx * softmax_ctx, int htp_softmax_preamble3; const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows - const uint32_t src0_nrows_per_thread = octx->src0_nrows_per_thread; + const uint32_t src0_nrows_per_thread = smctx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -291,20 +238,103 @@ static void softmax_job_f32_per_thread(struct 
softmax_th_ctx * softmax_ctx, int opt_path = 1; } - softmax_htp_f32(nth, ith, softmax_ctx, opt_path); + uint8_t * src0_spad_data = octx->src0_spad.data + (ith * smctx->spad_stride); + uint8_t * src1_spad_data = octx->src1_spad.data + (ith * smctx->spad_stride); + uint8_t * dst_spad_data = octx->dst_spad.data + (ith * smctx->spad_stride); + + float * wp0 = (float *) src0_spad_data; + float * wp1 = (float *) src1_spad_data; + float * wp2 = (float *) dst_spad_data; + + uint32_t prev_i2 = (uint32_t)-1; + float slope = 1.0f; + + for (uint32_t r = src0_start_row; r < src0_end_row; ++r) { + uint32_t i1 = fastmodulo(r, ne01, &smctx->fastdiv_ne01); + uint32_t r_div_ne01 = fastdiv(r, &smctx->fastdiv_ne01); + uint32_t i2 = fastmodulo(r_div_ne01, ne02, &smctx->fastdiv_ne02); + uint32_t i3 = fastdiv(r_div_ne01, &smctx->fastdiv_ne02); + + // Map to original logic indices + // i01 = i1 + // i02 = i2 + // i03 = i3 + + const uint32_t i11 = i1; + // const uint32_t i12 = i2 % ne12; + // const uint32_t i13 = i3 % ne13; + + uint32_t i12, i13; + if (ne12 == ne02) { + i12 = i2; + } else { + i12 = fastmodulo(i2, ne12, &smctx->fastdiv_ne12); + } + + if (ne13 == ne03) { + i13 = i3; + } else { + i13 = fastmodulo(i3, ne13, &smctx->fastdiv_ne13); + } + + // ALiBi + if (i2 != prev_i2) { + const uint32_t h = i2; // head + + slope = (smctx->max_bias > 0.0f) ? + h < smctx->n_head_log2 ? + powf(smctx->m0, h + 1) : + powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) : + 1.0f; + prev_i2 = i2; + } + + float * sp = (float *) ((char *) octx->src0.data + i1 * nb01 + i2 * nb02 + i3 * nb03); + float * dp = (float *) ((char *) octx->dst.data + i1 * nb1 + i2 * nb2 + i3 * nb3); + + // broadcast the mask across rows + __fp16 * mp_f16 = (smctx->use_src1) ? + (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : + NULL; + float * mp_f32 = (smctx->use_src1) ? 
+ (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : + NULL; + + if ((1 == opt_path) && (mp_f32) && !(smctx->use_f16)) { + hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale, + (const uint8_t *) mp_f32, slope); + } else { + hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale); + if (mp_f32) { + if (smctx->use_f16) { + for (int i = 0; i < ne00; ++i) { + wp0[i] += slope * (float) mp_f16[i]; + } + } else { + for (int i = 0; i < ne00; ++i) { + wp0[i] += slope * mp_f32[i]; + } + } + } + } + + if (1 == opt_path) { + hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); + } else { + float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00); + float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); + sum = sum > 0.0 ? (1.0 / sum) : 1; + hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum); + } + } t2 = HAP_perf_get_qtimer_count(); FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - softmax_ctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, + smctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void softmax_job_dispatcher_f32(unsigned int n, unsigned int i, void * p_data) { - struct softmax_th_ctx * p_softmax_ctx = (struct softmax_th_ctx *) p_data; - softmax_job_f32_per_thread(p_softmax_ctx, n, i); -} - static int execute_op_softmax_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; @@ -312,17 +342,12 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { const struct htp_tensor * src1 = &octx->src1; struct htp_tensor * dst = &octx->dst; - worker_callback_t op_func; - const char * op_type = NULL; - - struct softmax_th_ctx softmax_ctx; + struct 
htp_softmax_context smctx; + const char * op_type = "softmax-f32"; switch (octx->op) { case HTP_OP_SOFTMAX: - op_func = softmax_job_dispatcher_f32; - op_type = "softmax-f32"; - - init_softmax_ctx(&softmax_ctx, octx); + init_softmax_ctx(&smctx, octx); break; default: @@ -342,6 +367,9 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; + // Use stride for calculating offset + smctx.spad_stride = hex_round_up(src0_row_size, 128); + size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; if (src1->ne[0]) { @@ -371,8 +399,8 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, op_func, &softmax_ctx, n_jobs); + smctx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + worker_pool_run_func(octx->ctx->worker_pool, softmax_job_f32, &smctx, n_jobs); } return err; From 3cd81b1465e3c89772a39d0d215a04df1c6f73af Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 14 Feb 2026 18:19:54 -0800 Subject: [PATCH 03/15] hexagon: refactor activation ops to use local context struct --- ggml/src/ggml-hexagon/htp/act-ops.c | 436 ++++++++++++++-------------- 1 file changed, 214 insertions(+), 222 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 950d836ad..21bd4050a 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -69,27 +69,45 @@ const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0, - const struct htp_tensor * src1, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad 
* src0_spad, - struct htp_spad * src1_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +struct htp_act_context { + struct htp_ops_context * octx; + + // Precomputed values + const uint8_t * data_src0; + const uint8_t * data_src1; + uint8_t * data_dst; + + size_t src0_row_size; + size_t src1_row_size; + size_t dst_row_size; + + size_t src0_row_size_aligned; + size_t src1_row_size_aligned; + size_t dst_row_size_aligned; + + size_t src0_spad_half_size; + size_t src1_spad_half_size; + size_t dst_spad_half_size; + + uint32_t block; + uint32_t src0_nrows; + uint32_t src0_nrows_per_thread; + int nc; +}; + +static void glu_swiglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * src1 = &actx->octx->src1; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble3; - size_t src0_row_size = nb01; - size_t src1_row_size = nb11; - size_t dst_row_size = nb1; - - - - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + size_t src0_row_size = actx->src0_row_size; + size_t src1_row_size = actx->src1_row_size; + size_t dst_row_size = actx->dst_row_size; + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -101,43 +119,34 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0, uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; - const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; - - const bool src1_valid = src1->ne[0]; - const int nc = (src1_valid) ? 
ne00 : ne00 / 2; - if (!src1_valid) { - const int32_t swapped = op_params[1]; - data_src1 = data_src0; - src1_row_size = src0_row_size; + const uint8_t * restrict data_src0 = actx->data_src0; + const uint8_t * restrict data_src1 = actx->data_src1; + uint8_t * restrict data_dst = actx->data_dst; - const size_t nc_in_bytes = nc * SIZEOF_FP32; - data_src0 += swapped ? nc_in_bytes : 0; - data_src1 += swapped ? 0 : nc_in_bytes; - } + const int nc = actx->nc; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t src1_row_size_aligned = actx->src1_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); - uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread); + uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t src1_spad_half_size = src1_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t src1_spad_half_size = actx->src1_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; - const int BLOCK = src0_spad_half_size / 
src0_row_size_aligned; // How many rows can we process in one block + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "swiglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); @@ -196,27 +205,22 @@ static void glu_swiglu_f32_per_thread(const struct htp_tensor * src0, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0, - const struct htp_tensor * src1, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * src1_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +static void glu_swiglu_oai_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * src1 = &actx->octx->src1; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble3; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - size_t src0_row_size = nb01; - size_t src1_row_size = nb11; - size_t dst_row_size = nb1; + size_t src0_row_size = actx->src0_row_size; + size_t src1_row_size = actx->src1_row_size; + size_t dst_row_size = actx->dst_row_size; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const uint32_t 
src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -226,45 +230,36 @@ static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0, return; } - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; - const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; - - const bool src1_valid = src1->ne[0]; - const int nc = (src1_valid) ? ne00 : ne00 / 2; - if (!src1_valid) { - const int32_t swapped = op_params[1]; - data_src1 = data_src0; - src1_row_size = src0_row_size; + const uint8_t * restrict data_src0 = actx->data_src0; + const uint8_t * restrict data_src1 = actx->data_src1; + uint8_t * restrict data_dst = actx->data_dst; - const size_t nc_in_bytes = nc * SIZEOF_FP32; - data_src0 += swapped ? nc_in_bytes : 0; - data_src1 += swapped ? 0 : nc_in_bytes; - } + const int nc = actx->nc; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t src1_row_size_aligned = actx->src1_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); - uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread); + uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); - 
// While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t src1_spad_half_size = src1_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t src1_spad_half_size = actx->src1_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "swiglu-oai-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least " "%zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } - const float alpha = ((const float *) (op_params))[2]; - const float limit = ((const float *) (op_params))[3]; + const float alpha = ((const float *) (actx->octx->op_params))[2]; + const float limit = ((const float *) (actx->octx->op_params))[3]; + + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { @@ -335,26 +330,22 @@ static void glu_swiglu_oai_f32_per_thread(const struct htp_tensor * src0, } -static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +static void unary_gelu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * dst = 
&actx->octx->dst; htp_act_preamble2; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t src0_row_size = actx->src0_row_size; + const size_t dst_row_size = actx->dst_row_size; + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - const uint32_t src0_nrows = ne01 * ne02 * ne03; + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -364,25 +355,29 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, return; } - const uint8_t * data_src0 = (const uint8_t *) src0->data; - uint8_t * data_dst = (uint8_t *) dst->data; + const uint8_t * data_src0 = actx->data_src0; + uint8_t * data_dst = actx->data_dst; - uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + // nc/ne0 matches. 
+ const int ne0_val = actx->nc; // == dst->ne[0] - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; + uint8_t * src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); + + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; // In gelu = x*sigmoid(x*1.702) - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); @@ -408,9 +403,9 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // gelu = x * sigmoid(1.702 * x) // current implementation - hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0); - hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); - hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); + hvx_mul_scalar_f32((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (float) 1.702, ne0_val); + 
hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val); + hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val); } dma_queue_push_vtcm_to_ddr(dma_queue, @@ -435,34 +430,23 @@ static void unary_gelu_f32_per_thread(const struct htp_tensor * src0, ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_gelu_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - unary_gelu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, - octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - - -static void unary_silu_f32_per_thread(const struct htp_tensor * src0, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +static void unary_silu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble2; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t src0_row_size = actx->src0_row_size; + const size_t dst_row_size = actx->dst_row_size; + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - const uint32_t src0_nrows = ne01 * ne02 * ne03; + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const 
uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -472,24 +456,27 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0, return; } - const uint8_t * data_src0 = (const uint8_t *) src0->data; - uint8_t * data_dst = (uint8_t *) dst->data; + const uint8_t * data_src0 = actx->data_src0; + uint8_t * data_dst = actx->data_dst; - uint8_t * src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + const int ne0_val = actx->nc; // == dst->ne[0] - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; + uint8_t * src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; + + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "silu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); @@ -515,8 +502,8 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0, 
float* dst_spad_ptr = dst_spad + ib * (dst_row_size_aligned / sizeof(float)); // silu = x * sigmoid(x) - hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0); - hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0); + hvx_sigmoid_f32_aa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, ne0_val); + hvx_mul_f32_aaa((uint8_t *) dst_spad_ptr, (const uint8_t *) src0_spad_ptr, (const uint8_t *) dst_spad_ptr, ne0_val); } dma_queue_push_vtcm_to_ddr(dma_queue, @@ -544,27 +531,22 @@ static void unary_silu_f32_per_thread(const struct htp_tensor * src0, static const float GELU_COEF_A = 0.044715f; static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; -static void glu_geglu_f32_per_thread(const struct htp_tensor * src0, - const struct htp_tensor * src1, - struct htp_tensor * dst, - const int32_t * op_params, - struct htp_spad * src0_spad, - struct htp_spad * src1_spad, - struct htp_spad * dst_spad, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread, - dma_queue * dma_queue) { +static void glu_geglu_f32_per_thread(unsigned int nth, unsigned int ith, void * data) { + struct htp_act_context * actx = (struct htp_act_context *) data; + const struct htp_tensor * src0 = &actx->octx->src0; + const struct htp_tensor * src1 = &actx->octx->src1; + const struct htp_tensor * dst = &actx->octx->dst; htp_act_preamble3; - size_t src0_row_size = nb01; - size_t src1_row_size = nb11; - size_t dst_row_size = nb1; + size_t src0_row_size = actx->src0_row_size; + size_t src1_row_size = actx->src1_row_size; + size_t dst_row_size = actx->dst_row_size; uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows = actx->src0_nrows; + const uint32_t src0_nrows_per_thread = actx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = 
MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -574,43 +556,34 @@ static void glu_geglu_f32_per_thread(const struct htp_tensor * src0, return; } - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; - const uint8_t * restrict data_src1 = (const uint8_t *) src1->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + const uint8_t * restrict data_src0 = actx->data_src0; + const uint8_t * restrict data_src1 = actx->data_src1; + uint8_t * restrict data_dst = actx->data_dst; - const bool src1_valid = src1->ne[0]; - const int nc = (src1_valid) ? ne00 : ne00 / 2; - if (!src1_valid) { - const int32_t swapped = op_params[1]; - data_src1 = data_src0; - src1_row_size = src0_row_size; - - const size_t nc_in_bytes = nc * SIZEOF_FP32; - data_src0 += swapped ? nc_in_bytes : 0; - data_src1 += swapped ? 0 : nc_in_bytes; - } + const int nc = actx->nc; - const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); - const size_t src1_row_size_aligned = hex_round_up(src1_row_size, VLEN); - const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t src0_row_size_aligned = actx->src0_row_size_aligned; + const size_t src1_row_size_aligned = actx->src1_row_size_aligned; + const size_t dst_row_size_aligned = actx->dst_row_size_aligned; - uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_spad->size_per_thread); - uint8_t * restrict src1_spad_data = src1_spad->data + (ith * src1_spad->size_per_thread); - uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + uint8_t * restrict src0_spad_data = actx->octx->src0_spad.data + (ith * actx->octx->src0_spad.size_per_thread); + uint8_t * restrict src1_spad_data = actx->octx->src1_spad.data + (ith * actx->octx->src1_spad.size_per_thread); + uint8_t * restrict dst_spad_data = actx->octx->dst_spad.data + (ith * actx->octx->dst_spad.size_per_thread); - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for 
src0 - size_t src0_spad_half_size = src0_spad->size_per_thread / 2; - size_t src1_spad_half_size = src1_spad->size_per_thread / 2; - size_t dst_spad_half_size = dst_spad->size_per_thread / 2; + size_t src0_spad_half_size = actx->src0_spad_half_size; + size_t src1_spad_half_size = actx->src1_spad_half_size; + size_t dst_spad_half_size = actx->dst_spad_half_size; - const int BLOCK = src0_spad_half_size / src0_row_size_aligned; // How many rows can we process in one block + const int BLOCK = actx->block; if (BLOCK == 0) { FARF(ERROR, "geglu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", - src0_spad->size_per_thread, src0_row_size_aligned); + actx->octx->src0_spad.size_per_thread, src0_row_size_aligned); return; } + dma_queue * dma_queue = actx->octx->ctx->dma[ith]; + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); @@ -678,33 +651,7 @@ static void glu_geglu_f32_per_thread(const struct htp_tensor * src0, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_silu_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - unary_silu_f32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, - octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - -static void glu_swiglu_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - glu_swiglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, - &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - -static void glu_swiglu_oai_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = 
(struct htp_ops_context *) data; - glu_swiglu_oai_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, - &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - -static void glu_geglu_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - glu_geglu_f32_per_thread(&octx->src0, &octx->src1, &octx->dst, octx->op_params, &octx->src0_spad, - &octx->src1_spad, &octx->dst_spad, n, i, octx->src0_nrows_per_thread, octx->ctx->dma[i]); -} - static int execute_op_activations_f32(struct htp_ops_context * octx) { - int err = HTP_STATUS_OK; - const struct htp_tensor * src0 = &octx->src0; const struct htp_tensor * src1 = &octx->src1; struct htp_tensor * dst = &octx->dst; @@ -719,26 +666,26 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) { switch (octx->op) { case HTP_OP_UNARY_SILU: - act_op_func = unary_silu_f32; + act_op_func = (worker_callback_t)unary_silu_f32_per_thread; op_type = "silu-f32"; break; case HTP_OP_GLU_SWIGLU: - act_op_func = glu_swiglu_f32; + act_op_func = (worker_callback_t)glu_swiglu_f32_per_thread; op_type = "swiglu-f32"; break; case HTP_OP_GLU_SWIGLU_OAI: - act_op_func = glu_swiglu_oai_f32; + act_op_func = (worker_callback_t)glu_swiglu_oai_f32_per_thread; op_type = "swiglu-oai-f32"; break; case HTP_OP_UNARY_GELU: - act_op_func = unary_gelu_f32; + act_op_func = (worker_callback_t)unary_gelu_f32_per_thread; op_type = "gelu-f32"; break; case HTP_OP_GLU_GEGLU: - act_op_func = glu_geglu_f32; + act_op_func = (worker_callback_t)glu_geglu_f32_per_thread; op_type = "geglu-f32"; break; default: @@ -797,13 +744,58 @@ static int execute_op_activations_f32(struct htp_ops_context * octx) { octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); } - if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { - uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / 
n_jobs; - worker_pool_run_func(octx->ctx->worker_pool, act_op_func, octx, n_jobs); + if ((octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { + return HTP_STATUS_OK; } - return err; + uint32_t n_jobs = MIN(n_threads, src0_nrows); + + // Prepare context + struct htp_act_context actx; + actx.octx = octx; + + actx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + + actx.src0_row_size = src0_row_size; + actx.src1_row_size = src1_row_size; + actx.dst_row_size = dst_row_size; + + actx.src0_row_size_aligned = src0_row_size_aligned; + actx.src1_row_size_aligned = src1_row_size_aligned; + actx.dst_row_size_aligned = dst_row_size_aligned; + + actx.src0_spad_half_size = octx->src0_spad.size_per_thread / 2; + actx.src1_spad_half_size = octx->src1_spad.size_per_thread / 2; + actx.dst_spad_half_size = octx->dst_spad.size_per_thread / 2; + + actx.block = actx.src0_spad_half_size / actx.src0_row_size_aligned; + actx.src0_nrows = src0_nrows; + + actx.nc = dst->ne[0]; + + // Pointers and GLU logic + const uint8_t * data_src0 = (const uint8_t *) src0->data; + const uint8_t * data_src1 = (const uint8_t *) src1->data; + + if (!src1_valid && (octx->op == HTP_OP_GLU_SWIGLU || octx->op == HTP_OP_GLU_SWIGLU_OAI || octx->op == HTP_OP_GLU_GEGLU)) { + const int32_t swapped = octx->op_params[1]; + data_src1 = data_src0; + actx.src1_row_size = actx.src0_row_size; + + size_t nc_in_bytes = actx.nc * SIZEOF_FP32; + if (swapped) { + data_src0 += nc_in_bytes; + } else { + data_src1 += nc_in_bytes; + } + } + + actx.data_src0 = data_src0; + actx.data_src1 = data_src1; + actx.data_dst = (uint8_t *) dst->data; + + worker_pool_run_func(octx->ctx->worker_pool, act_op_func, &actx, n_jobs); + return HTP_STATUS_OK; } int op_activations(struct htp_ops_context * octx) { From 1f72f12845553dcaf8db21fd6080c5038c733215 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 16 Feb 2026 13:56:36 -0800 Subject: [PATCH 04/15] hexagon: refactor unary ops to use local context struct and DMA/VTCM --- 
ggml/src/ggml-hexagon/htp/unary-ops.c | 330 +++++++++++++++----------- 1 file changed, 187 insertions(+), 143 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index ce879bf03..04da6d014 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -17,6 +17,28 @@ #include "htp-msg.h" #include "htp-ops.h" +struct htp_unary_context { + struct htp_ops_context * octx; + + // Precomputed values + const uint8_t * data_src0; + uint8_t * data_dst; + + size_t src0_row_size; + size_t dst_row_size; + + size_t src0_row_size_aligned; + size_t dst_row_size_aligned; + + size_t src0_spad_half_size; + size_t dst_spad_half_size; + + uint32_t block; + uint32_t src0_nrows; + uint32_t src0_nrows_per_thread; + uint32_t nc; +}; + #define htp_unary_preamble \ const uint32_t ne00 = src->ne[0]; \ const uint32_t ne01 = src->ne[1]; \ @@ -75,128 +97,95 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src, } } -static void scale_htp_f32(const float * restrict src, - float * restrict dst, - uint8_t * restrict spad, - const uint32_t num_rows, - const uint32_t row_elems, - const size_t row_size, - int32_t * op_params, - int opt_path) { +static void scale_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { float scale = 0.f; float bias = 0.f; memcpy(&scale, &op_params[0], sizeof(float)); memcpy(&bias, &op_params[1], sizeof(float)); for (uint32_t ir = 0; ir < num_rows; ir++) { - const float * restrict src_local = src + (ir * row_elems); - float * restrict dst_local = dst + (ir * row_elems); - - if (ir + 1 < num_rows) { - hex_l2fetch(src_local + row_elems, row_size, row_size, 1); - } + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); hvx_scale_offset_f32((uint8_t *) 
dst_local, (const uint8_t *) src_local, row_elems, scale, bias); } } -static void rms_norm_htp_f32(const float * restrict src, - float * restrict dst, - uint8_t * restrict spad, - const uint32_t num_rows, - const uint32_t row_elems, - const size_t row_size, - int32_t * op_params, - int opt_path) { +static void rms_norm_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { float epsilon = 0.f; memcpy(&epsilon, op_params, sizeof(float)); for (uint32_t ir = 0; ir < num_rows; ir++) { - const float * restrict src_local = src + (ir * row_elems); - float * restrict dst_local = dst + (ir * row_elems); + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); - if (ir + 1 < num_rows) { - hex_l2fetch(src_local + row_elems, row_size, row_size, 1); - } - - if (1 == opt_path) { - hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon); - } else { - float sum = hvx_sum_of_squares_f32((const uint8_t *) src_local, row_elems); - - const float mean = sum / row_elems; - const float scale = 1.0f / sqrtf(mean + epsilon); - - hvx_scale_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale); - } + hvx_fast_rms_norm_f32((const uint8_t *) src_local, (uint8_t *) dst_local, spad, row_elems, epsilon); } } -static void sqr_htp_f32(const float * restrict src, - float * restrict dst, - uint8_t * restrict spad, - const uint32_t num_rows, - const uint32_t row_elems, - const size_t row_size, - int32_t * op_params, - int opt_path) { +static void sqr_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { for (uint32_t ir = 0; ir < num_rows; ir++) { - const float * restrict src_local = src + 
(ir * row_elems); - float * restrict dst_local = dst + (ir * row_elems); - - if (ir + 1 < num_rows) { - hex_l2fetch(src_local + row_elems, row_size, row_size, 1); - } + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); - if (1 == opt_path) { - hvx_sqr_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); - } else { - hvx_sqr_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); - } + hvx_sqr_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); } } -static void sqrt_htp_f32(const float * restrict src, - float * restrict dst, - uint8_t * restrict spad, - const uint32_t num_rows, - const uint32_t row_elems, - const size_t row_size, - int32_t * op_params, - int opt_path) { +static void sqrt_f32(const float * restrict src, + float * restrict dst, + uint8_t * restrict spad, + const uint32_t num_rows, + const uint32_t row_elems, + const size_t row_size, + int32_t * op_params) { for (uint32_t ir = 0; ir < num_rows; ir++) { - const float * restrict src_local = src + (ir * row_elems); - float * restrict dst_local = dst + (ir * row_elems); + const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); + uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); - if (ir + 1 < num_rows) { - hex_l2fetch(src_local + row_elems, row_size, row_size, 1); - } - - if (1 == opt_path) { - hvx_sqrt_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); - } else { - hvx_sqrt_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); - } + hvx_sqrt_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems); } } -static void unary_job_f32_per_thread(const struct htp_tensor * src, - struct htp_tensor * dst, - uint8_t * spad, - int htp_op, - int32_t * op_params, - uint32_t nth, - uint32_t ith, - uint32_t src0_nrows_per_thread) { +static void unary_job_f32_per_thread(unsigned int nth, unsigned 
int ith, void * data) { + const struct htp_unary_context * uctx = (const struct htp_unary_context *) data; + struct htp_ops_context * octx = uctx->octx; + const struct htp_tensor * src = &octx->src0; + const struct htp_tensor * dst = &octx->dst; + htp_unary_preamble; - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; + int htp_op = octx->op; + int32_t * op_params = octx->op_params; + uint32_t src0_nrows_per_thread = uctx->src0_nrows_per_thread; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const size_t src0_row_size = uctx->src0_row_size; + const size_t dst_row_size = uctx->dst_row_size; + const size_t src0_row_size_aligned = uctx->src0_row_size_aligned; + const size_t dst_row_size_aligned = uctx->dst_row_size_aligned; + + const uint32_t src0_nrows = uctx->src0_nrows; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); @@ -208,79 +197,105 @@ static void unary_job_f32_per_thread(const struct htp_tensor * src, uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - int is_aligned = 1; - int opt_path = 0; - if ((0 == hex_is_aligned((void *) src->data, VLEN)) || (0 == hex_is_aligned((void *) dst->data, VLEN))) { - is_aligned = 0; - } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; + const uint8_t * restrict data_src = uctx->data_src0; + uint8_t * restrict data_dst = uctx->data_dst; + + uint8_t * src0_spad_data = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); + uint8_t * dst_spad_data = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); + + size_t src0_spad_half_size = uctx->src0_spad_half_size; + size_t dst_spad_half_size = uctx->dst_spad_half_size; + + const int BLOCK = uctx->block; + if (BLOCK == 0) { + FARF(ERROR, "unary-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", + octx->src0_spad.size_per_thread, src0_row_size_aligned); + return; } - 
const uint8_t * restrict data_src = (const uint8_t *) src->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + dma_queue * dma_queue = octx->ctx->dma[ith]; - const float * restrict src_th = (float *) (data_src + (src0_start_row * src0_row_size)); - float * restrict dst_th = (float *) (data_dst + (src0_start_row * dst_row_size)); - uint8_t * restrict spad_th = (uint8_t *) spad + (ith * nb01); + // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 + for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { + const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); - switch (htp_op) { - case HTP_OP_RMS_NORM: - rms_norm_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); - break; - case HTP_OP_SCALE: - scale_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); - break; - case HTP_OP_SQR: - sqr_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); - break; - case HTP_OP_SQRT: - sqrt_htp_f32(src_th, dst_th, spad_th, src0_end_row - src0_start_row, ne0, nb1, op_params, opt_path); - break; + // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...) 
+ dma_queue_push_vtcm_to_ddr(dma_queue, + dma_make_ptr(data_dst, dst_spad_data + (spad_idx * dst_spad_half_size)), + dst_row_size, dst_row_size_aligned, 0); - default: - break; + dma_queue_push_ddr_to_vtcm(dma_queue, + dma_make_ptr(src0_spad_data + (spad_idx * src0_spad_half_size), data_src + (ir * src0_row_size)), + src0_row_size_aligned, src0_row_size, block_size); } + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); + + float * dst_spad = (float *) dma_queue_pop(dma_queue).src; + float * src0_spad = (float *) dma_queue_pop(dma_queue).dst; + + // Process block in VTCM + switch (htp_op) { + case HTP_OP_RMS_NORM: + rms_norm_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; + case HTP_OP_SCALE: + scale_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; + case HTP_OP_SQR: + sqr_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; + case HTP_OP_SQRT: + sqrt_f32(src0_spad, dst_spad, NULL, block_size, ne0, src0_row_size_aligned, op_params); + break; + default: + break; + } + + dma_queue_push_vtcm_to_ddr(dma_queue, + dma_make_ptr(data_dst + (ir * dst_row_size), dst_spad), + dst_row_size, dst_row_size_aligned, block_size); + + // prefetch N+2 loop iteration if any + const uint32_t pref_block = (ir + BLOCK * 2); + if (pref_block < src0_end_row) { + const uint32_t pref_block_size = MIN(BLOCK, src0_end_row - pref_block); + dma_queue_push_ddr_to_vtcm(dma_queue, + dma_make_ptr(src0_spad, data_src + (pref_block * src0_row_size)), + src0_row_size_aligned, src0_row_size, pref_block_size); + } + } + + dma_queue_flush(dma_queue); + t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "unary-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, src->ne[0], + FARF(HIGH, "unary-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, src->ne[0], 
src->ne[1], src->ne[2], src->ne[3], src0_start_row, src0_end_row, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -static void unary_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { - struct htp_ops_context * octx = (struct htp_ops_context *) data; - - unary_job_f32_per_thread(&octx->src0, &octx->dst, octx->src0_spad.data, octx->op, octx->op_params, n, i, - octx->src0_nrows_per_thread); -} - static int execute_op_unary_f32(struct htp_ops_context * octx) { int err = HTP_STATUS_OK; const struct htp_tensor * src0 = &octx->src0; struct htp_tensor * dst = &octx->dst; - worker_callback_t unary_op_func; - const char * op_type = NULL; + const char * op_type = NULL; switch (octx->op) { case HTP_OP_RMS_NORM: - unary_op_func = unary_job_dispatcher_f32; - op_type = "rmsnorm-f32"; + op_type = "rmsnorm-f32"; break; case HTP_OP_SCALE: - unary_op_func = unary_job_dispatcher_f32; - op_type = "scale-f32"; + op_type = "scale-f32"; break; case HTP_OP_SQR: - unary_op_func = unary_job_dispatcher_f32; - op_type = "sqr-f32"; + op_type = "sqr-f32"; break; case HTP_OP_SQRT: - unary_op_func = unary_job_dispatcher_f32; - op_type = "sqrt-f32"; + op_type = "sqrt-f32"; break; default: @@ -294,32 +309,61 @@ static int execute_op_unary_f32(struct htp_ops_context * octx) { const size_t src0_row_size = src0->nb[1]; const size_t dst_row_size = dst->nb[1]; - // VTCM scratchpads for all tensors - octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); - size_t spad_size = octx->src0_spad.size + octx->dst_spad.size; + // VTCM scratchpads for all tensors + // N rows per thread, padded to HVX vector size + // Double buffering requires 2x size per buffer - FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : 
src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); + size_t spad_size_per_row = 2 * (src0_row_size_aligned + dst_row_size_aligned); + size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads * spad_size_per_row); // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { + if (vtcm_row_per_thread == 0) { FARF(ERROR, "unary-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, - spad_size); + spad_size_per_row * n_threads); return HTP_STATUS_VTCM_TOO_SMALL; } + octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread * 2; + octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread * 2; + + octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread; + octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread; + octx->src0_spad.data = octx->ctx->vtcm_base; octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + FARF(HIGH, "%s: (%ux%ux%ux%u) -> (%ux%ux%ux%u) : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", op_type, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); + if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); - octx->src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; + struct htp_unary_context uctx = { + .octx = octx, + .src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs, + .src0_nrows = src0_nrows, + + .data_src0 = (const uint8_t *)src0->data, + .data_dst = (uint8_t *)dst->data, + + .src0_row_size = src0_row_size, + .dst_row_size = dst_row_size, + + .src0_row_size_aligned = src0_row_size_aligned, + .dst_row_size_aligned = dst_row_size_aligned, + + 
.src0_spad_half_size = octx->src0_spad.size_per_thread / 2, + .dst_spad_half_size = octx->dst_spad.size_per_thread / 2, + + .block = (octx->src0_spad.size_per_thread / 2) / src0_row_size_aligned, + .nc = src0->ne[0], + }; - worker_pool_run_func(octx->ctx->worker_pool, unary_op_func, octx, n_jobs); + worker_pool_run_func(octx->ctx->worker_pool, unary_job_f32_per_thread, &uctx, n_jobs); } return err; From aca9a5f48ed08cf466d932c1befa8898a0807024 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 16 Feb 2026 14:13:00 -0800 Subject: [PATCH 05/15] hexagon: use aligned hvx_scale function --- ggml/src/ggml-hexagon/htp/unary-ops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/unary-ops.c b/ggml/src/ggml-hexagon/htp/unary-ops.c index 04da6d014..8cc1f582f 100644 --- a/ggml/src/ggml-hexagon/htp/unary-ops.c +++ b/ggml/src/ggml-hexagon/htp/unary-ops.c @@ -113,7 +113,7 @@ static void scale_f32(const float * restrict src, const uint8_t * restrict src_local = (const uint8_t *)src + (ir * row_size); uint8_t * restrict dst_local = (uint8_t *)dst + (ir * row_size); - hvx_scale_offset_f32((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale, bias); + hvx_scale_offset_f32_aa((uint8_t *) dst_local, (const uint8_t *) src_local, row_elems, scale, bias); } } @@ -215,7 +215,6 @@ static void unary_job_f32_per_thread(unsigned int nth, unsigned int ith, void * dma_queue * dma_queue = octx->ctx->dma[ith]; - // See discussion: https://github.com/ggml-org/llama.cpp/pull/18151#issuecomment-3678235379 for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { const uint32_t block_size = MIN(BLOCK, src0_end_row - ir); From b047dfe01f76a9bf4cd1841f142900c16fb7027b Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 17 Feb 2026 17:32:15 -0800 Subject: [PATCH 06/15] hexagon: remove unused fields from op_context --- ggml/src/ggml-hexagon/htp/htp-ops.h | 20 -------------------- 
1 file changed, 20 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index c6c27f7f4..127ab1d66 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -44,26 +44,6 @@ struct htp_ops_context { uint32_t src0_nrows_per_thread; uint32_t src1_nrows_per_thread; - struct fastdiv_values src0_div1; // fastdiv values for ne1 - struct fastdiv_values src0_div2; // fastdiv values for ne2 - struct fastdiv_values src0_div3; // fastdiv values for ne3 - struct fastdiv_values src0_div21; // fastdiv values for ne2 * ne1 - - struct fastdiv_values src1_div1; // fastdiv values for ne1 - struct fastdiv_values src1_div2; // fastdiv values for ne2 - struct fastdiv_values src1_div3; // fastdiv values for ne3 - struct fastdiv_values src1_div21; // fastdiv values for ne2 * ne1 - - struct fastdiv_values src3_div1; // fastdiv values for ne1 - struct fastdiv_values src3_div2; // fastdiv values for ne2 - struct fastdiv_values src3_div3; // fastdiv values for ne3 - struct fastdiv_values src3_div21; // fastdiv values for ne2 * ne1 - - struct fastdiv_values broadcast_rk2; - struct fastdiv_values broadcast_rk3; - struct fastdiv_values broadcast_rv2; - struct fastdiv_values broadcast_rv3; - uint32_t flags; }; From 6184173be405d6472b3bfe1648d788db7f2355f6 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 17 Feb 2026 17:06:48 -0800 Subject: [PATCH 07/15] hexagon: rewrite ROPE to use DMA and VTCM scratchpad --- ggml/src/ggml-hexagon/htp/hex-dma.h | 4 +- ggml/src/ggml-hexagon/htp/rope-ops.c | 304 +++++++++++++-------------- 2 files changed, 146 insertions(+), 162 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hex-dma.h b/ggml/src/ggml-hexagon/htp/hex-dma.h index d1ddb0ecb..969a95713 100644 --- a/ggml/src/ggml-hexagon/htp/hex-dma.h +++ b/ggml/src/ggml-hexagon/htp/hex-dma.h @@ -102,7 +102,7 @@ static inline bool dma_queue_push(dma_queue * q, dmlink(q->tail, desc); q->tail = desc; - // FARF(ERROR, "dma-push: 
i %u len %u dst %p src %p\n", q->push_idx, len, dst, src); + // FARF(ERROR, "dma-push: i %u width %u nrows %d dst %p src %p\n", q->push_idx, width, nrows, dptr.dst, dptr.src); q->push_idx = (q->push_idx + 1) & q->idx_mask; return true; } @@ -144,7 +144,7 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) { dptr = q->dptr[q->pop_idx]; - // FARF(ERROR, "dma-pop: i %u dst %p\n", q->pop_idx, dst); + // FARF(ERROR, "dma-pop: i %u dst %p src %p\n", q->pop_idx, dptr.dst, dptr.src); q->pop_idx = (q->pop_idx + 1) & q->idx_mask; return dptr; } diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 728bc0e4d..998973947 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -64,6 +64,16 @@ struct htp_rope_context { size_t spad_stride; struct htp_ops_context * octx; + + size_t src0_row_size; + size_t dst_row_size; + size_t src0_row_size_aligned; + size_t dst_row_size_aligned; + size_t src0_spad_half_size; + size_t dst_spad_half_size; + size_t theta_cache_offset; + uint32_t block_size; + uint32_t src0_nrows; }; static float rope_yarn_ramp(const float low, const float high, const int i0) { @@ -132,18 +142,17 @@ static void init_rope_ctx(struct htp_rope_context * rctx, struct htp_ops_context rctx->mode = ((const int32_t *) op_params)[2]; rctx->n_ctx_orig = ((const int32_t *) op_params)[4]; - memcpy(&rctx->freq_base, (int32_t *) op_params + 5, sizeof(float)); - memcpy(&rctx->freq_scale, (int32_t *) op_params + 6, sizeof(float)); - memcpy(&rctx->ext_factor, (int32_t *) op_params + 7, sizeof(float)); - memcpy(&rctx->attn_factor, (int32_t *) op_params + 8, sizeof(float)); - memcpy(&rctx->beta_fast, (int32_t *) op_params + 9, sizeof(float)); - memcpy(&rctx->beta_slow, (int32_t *) op_params + 10, sizeof(float)); - memcpy(&rctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4); + memcpy(&rctx->freq_base, (int32_t *) op_params + 5, sizeof(float)); + memcpy(&rctx->freq_scale, (int32_t *) op_params + 
6, sizeof(float)); + memcpy(&rctx->ext_factor, (int32_t *) op_params + 7, sizeof(float)); + memcpy(&rctx->attn_factor, (int32_t *) op_params + 8, sizeof(float)); + memcpy(&rctx->beta_fast, (int32_t *) op_params + 9, sizeof(float)); + memcpy(&rctx->beta_slow, (int32_t *) op_params + 10, sizeof(float)); + memcpy(&rctx->sections, (int32_t *) op_params + 11, sizeof(int) * 4); rctx->theta_scale = powf(rctx->freq_base, -2.0f / rctx->n_dims); - rope_corr_dims(rctx->n_dims, rctx->n_ctx_orig, rctx->freq_base, rctx->beta_fast, - rctx->beta_slow, rctx->corr_dims); + rope_corr_dims(rctx->n_dims, rctx->n_ctx_orig, rctx->freq_base, rctx->beta_fast, rctx->beta_slow, rctx->corr_dims); rctx->octx = octx; @@ -154,34 +163,20 @@ static void init_rope_ctx(struct htp_rope_context * rctx, struct htp_ops_context if (ne01 > 0) rctx->fastdiv_ne01 = init_fastdiv_values(ne01); if (ne02 > 0) rctx->fastdiv_ne02 = init_fastdiv_values(ne02); - FARF(HIGH, "rope-f32 n_dims:%d, ext_factor:%.6f, theta_scale:%.6f, attn_factor:%.6f\n", rctx->n_dims, + const uint32_t ne0 = octx->dst.ne[0]; + + FARF(HIGH, "rope-f32 n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx->n_dims, ne0, rctx->ext_factor, rctx->theta_scale, rctx->attn_factor); } -static void hvx_calc_rope_neox_f32(const float * restrict src0, - float * restrict dst, - const int num_elems, - const float * restrict theta_cache) { - // for (int i = 0; i < num_elems; i += 2) { - //const float cos_theta = theta_cache[i + 0]; - //const float sin_theta = theta_cache[i + 1]; - - //const float x0 = src[0]; - //const float x1 = src[num_elems/2]; - - //dst[0] = x0*cos_theta - x1*sin_theta; - //dst[num_elems/2] = x0*sin_theta + x1*cos_theta; - - //src += 1; - //dst += 1; - // } - +static void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { const uint8_t * restrict src0_curr = (const uint8_t *) src0; const uint8_t * restrict theta_curr = (const uint8_t *) 
theta_cache; uint8_t * restrict dst_curr = (uint8_t *) dst; - int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once - int half_size = (sizeof(float) * (num_elems / 2)); + int step_of_1 = ne >> 6; // 6 because we process two vectors at once + int half_size = (sizeof(float) * (ne / 2)); + const int32_t half_dims = ne / 2; for (int i = 0; i < step_of_1; i++) { HVX_Vector v0 = *(HVX_Vector *) src0_curr; @@ -203,35 +198,28 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0, *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5); - src0_curr += VLEN; + src0_curr += VLEN; theta_curr += 2 * VLEN; - dst_curr += VLEN; + dst_curr += VLEN; } -} - -static void hvx_calc_rope_f32(const float * restrict src0, - float * restrict dst, - const int num_elems, - const float * restrict theta_cache) { - // for (int i = 0; i < num_elems; i += 2) { - //const float cos_theta = theta_cache[i + 0]; - //const float sin_theta = theta_cache[i + 1]; - - //const float x0 = src[0]; - //const float x1 = src[1]; - - //dst[0] = x0*cos_theta - x1*sin_theta; - //dst[1] = x0*sin_theta + x1*cos_theta; - //src += 2; - //dst += 2; - // } + int processed = (ne >> 6) << 6; // multiples of 64 + for (int i = processed; i < ne; i += 2) { + const float cos_theta = theta_cache[i+0]; + const float sin_theta = theta_cache[i+1]; + float x0 = src0[i/2]; + float x1 = src0[i/2 + half_dims]; + dst[i/2] = x0 * cos_theta - x1 * sin_theta; + dst[i/2 + half_dims] = x0 * sin_theta + x1 * cos_theta; + } +} +static void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { const uint8_t * restrict src0_curr = (const uint8_t *) src0; const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache; uint8_t * restrict dst_curr = (uint8_t *) dst; - int step_of_1 = num_elems >> 6; // 6 because we process two vectors at once + int step_of_1 = ne >> 6; // 6 because we 
process two vectors at once for (int i = 0; i < step_of_1; i++) { HVX_Vector v0 = *(HVX_Vector *) src0_curr; @@ -256,9 +244,19 @@ static void hvx_calc_rope_f32(const float * restrict src0, *(HVX_Vector *) dst_curr = Q6_V_lo_W(vstore); *(HVX_Vector *) (dst_curr + VLEN) = Q6_V_hi_W(vstore); - src0_curr += 2 * VLEN; + src0_curr += 2 * VLEN; theta_curr += 2 * VLEN; - dst_curr += 2 * VLEN; + dst_curr += 2 * VLEN; + } + + int processed = (ne >> 6) << 6; + for (int i = processed; i < ne; i += 2) { + const float cos_theta = theta_cache[i+0]; + const float sin_theta = theta_cache[i+1]; + float x0 = src0[i+0]; + float x1 = src0[i+1]; + dst[i+0] = x0 * cos_theta - x1 * sin_theta; + dst[i+1] = x0 * sin_theta + x1 * cos_theta; } } @@ -273,7 +271,7 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { htp_rope_preamble; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows = rctx->src0_nrows; const uint32_t src0_nrows_per_thread = rctx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; @@ -289,105 +287,88 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { const int32_t mode = rctx->mode; const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; - const int32_t half_dims = rctx->n_dims / 2; - const size_t remain_bytes = (ne0 - rctx->n_dims) * sizeof(float); - - int is_aligned = 1; - int opt_path = 0; - if ((0 == hex_is_aligned((void *) src0->data, VLEN)) || (0 == hex_is_aligned((void *) src1->data, VLEN)) || - (0 == hex_is_aligned((void *) dst->data, VLEN))) { - FARF(HIGH, "rope-f32: unaligned addresses in rope op, possibly slower execution\n"); - is_aligned = 0; - } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } - float * wp0 = (float *) (octx->src0_spad.data + (ith * rctx->spad_stride)); + // VTCM setup + uint8_t * src0_spad_base = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); + float * theta_cache = (float *) 
(src0_spad_base); + src0_spad_base = src0_spad_base + rctx->theta_cache_offset; + uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); + + size_t src0_spad_half_size = rctx->src0_spad_half_size; + size_t dst_spad_half_size = rctx->dst_spad_half_size; + const int BLOCK = rctx->block_size; + + dma_queue * dma_queue = octx->ctx->dma[ith]; const int32_t * pos = (const int32_t *) src1->data; + const float * freq_factors = (src2 && src2->data) ? (const float *) src2->data : NULL; - const float * freq_factors = NULL; - if (src2 != NULL) { - freq_factors = (const float *) src2->data; - } + for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { + // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...) + dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void *) dst->data, dst_spad_base + (spad_idx * dst_spad_half_size)), 0, 0, 0); - uint32_t prev_i2 = (uint32_t)-1; + uint32_t i1 = fastmodulo(ir, ne01, &rctx->fastdiv_ne01); + uint32_t r_div_ne01 = fastdiv(ir, &rctx->fastdiv_ne01); + uint32_t i2 = fastmodulo(r_div_ne01, ne02, &rctx->fastdiv_ne02); + uint32_t i3 = fastdiv(r_div_ne01, &rctx->fastdiv_ne02); + const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01; - for (uint32_t r = src0_start_row; r < src0_end_row; ++r) { - // Calculate indices from flat row index r - // idx = i3 * (ne02 * ne01) + i2 * ne01 + i1 - // i1 = idx % ne01 - // i2 = (idx / ne01) % ne02 - // i3 = (idx / ne01) / ne02 + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad_base + (spad_idx * src0_spad_half_size), src_addr), + rctx->src0_row_size_aligned, rctx->src0_row_size, 1); + } - uint32_t i1 = fastmodulo(r, ne01, &rctx->fastdiv_ne01); - uint32_t r_div_ne01 = fastdiv(r, &rctx->fastdiv_ne01); + uint32_t prev_i2 = (uint32_t) -1; + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + // Process block (single row) + // Recalculate 
indices for params + uint32_t i1 = fastmodulo(ir, ne01, &rctx->fastdiv_ne01); + uint32_t r_div_ne01 = fastdiv(ir, &rctx->fastdiv_ne01); uint32_t i2 = fastmodulo(r_div_ne01, ne02, &rctx->fastdiv_ne02); uint32_t i3 = fastdiv(r_div_ne01, &rctx->fastdiv_ne02); if (i2 != prev_i2) { const int32_t p = pos[i2]; - - rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, - rctx->attn_factor, wp0, rctx->theta_scale); + rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale); prev_i2 = i2; } - const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); - float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); - - const float * src_loc = src; - float * dst_data_loc = dst_data; + float * dst_spad = (float *) dma_queue_pop(dma_queue).src; + float * src0_spad = (float *) dma_queue_pop(dma_queue).dst; - if (1 == opt_path) { - if (is_neox) { - hvx_calc_rope_neox_f32(src_loc, dst_data_loc, rctx->n_dims, wp0); - } else { - hvx_calc_rope_f32(src_loc, dst_data_loc, rctx->n_dims, wp0); - } - - src_loc += rctx->n_dims; - dst_data_loc += rctx->n_dims; + if (is_neox) { + hvx_rope_neox_f32_aa(dst_spad, src0_spad, rctx->n_dims, theta_cache); } else { - for (uint32_t i0 = 0; i0 < rctx->n_dims; i0 += 2) { - const float cos_theta = wp0[i0 + 0]; - const float sin_theta = wp0[i0 + 1]; - - if (is_neox) { - const float x0 = src_loc[0]; - const float x1 = src_loc[half_dims]; - - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta; - - src_loc += 1; - dst_data_loc += 1; - } else { - const float x0 = src_loc[0]; - const float x1 = src_loc[1]; - - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[1] = x0 * sin_theta + x1 * cos_theta; - - src_loc += 2; - dst_data_loc += 2; - } - } + hvx_rope_f32_aa(dst_spad, src0_spad, rctx->n_dims, theta_cache); + } - 
src_loc += (is_neox ? half_dims : 0); - dst_data_loc += (is_neox ? half_dims : 0); + // fill the remain channels with data from src tensor + if (rctx->n_dims < ne0) { + hvx_copy_f32_uu((uint8_t *)(dst_spad + rctx->n_dims), (uint8_t *)(src0_spad + rctx->n_dims), ne0 - rctx->n_dims); } - // TODO: use simd to speed up the remaining elements copy - if (remain_bytes > 0) { - memcpy(dst_data_loc, src_loc, remain_bytes); + uint8_t * dst_addr = (uint8_t *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1; + dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(dst_addr, dst_spad), rctx->dst_row_size, rctx->dst_row_size_aligned, 1); + + // Prefetch next + const uint32_t pref_block = (ir + BLOCK * 2); + if (pref_block < src0_end_row) { + // Re-calculate src ptr for prefetch + uint32_t pi1 = fastmodulo(pref_block, ne01, &rctx->fastdiv_ne01); + uint32_t pr_div_ne01 = fastdiv(pref_block, &rctx->fastdiv_ne01); + uint32_t pi2 = fastmodulo(pr_div_ne01, ne02, &rctx->fastdiv_ne02); + uint32_t pi3 = fastdiv(pr_div_ne01, &rctx->fastdiv_ne02); + const uint8_t * psrc_addr = (const uint8_t *) src0->data + pi3 * nb03 + pi2 * nb02 + pi1 * nb01; + + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, psrc_addr), // reusing src0_spad buffer + rctx->src0_row_size_aligned, rctx->src0_row_size, 1); } } + dma_queue_flush(dma_queue); + t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, + FARF(HIGH, "rope-f32: %d/%d: (%u:%u) usec %u\n", ith, nth, src0_start_row, src0_end_row, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } @@ -415,47 +396,50 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { const uint32_t n_threads = octx->n_threads; const size_t src0_row_size = src0->nb[1]; - const size_t src1_row_size = src0_row_size; const size_t dst_row_size = dst->nb[1]; - // VTCM scratchpads for all tensors - // N rows per thread, padded to HVX vector size - octx->dst_spad.size = 
hex_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; - - // Use stride for calculating offset - rctx.spad_stride = hex_round_up(src0_row_size, 128); - - size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; - - if (src2->ne[0]) { - FARF(HIGH, - "%s: %ux%ux%ux%u (x %ux%ux%ux%u x %ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u " - "dst-spad-size %u\n", - op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], - src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], - dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); - } else { - FARF(HIGH, - "%s: %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", - op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], - src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, - octx->dst_spad.size); - } + // Aligned row sizes for VTCM + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN); + const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128); - // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { - FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, - spad_size); + // Calculate spad sizes per thread + size_t src0_spad_per_thread = theta_cache_size_aligned + 2 * src0_row_size_aligned; + size_t dst_spad_per_thread = 2 * dst_row_size_aligned; + size_t spad_per_thread = src0_spad_per_thread + dst_spad_per_thread; + + // Check if we fit in VTCM + size_t total_vtcm_needed = 
spad_per_thread * n_threads; + if (octx->ctx->vtcm_size < total_vtcm_needed) { + FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, total_vtcm_needed); return HTP_STATUS_VTCM_TOO_SMALL; } + // Assign sizes + octx->src0_spad.size_per_thread = src0_spad_per_thread; + octx->dst_spad.size_per_thread = dst_spad_per_thread; + octx->src0_spad.size = n_threads * src0_spad_per_thread; + octx->dst_spad.size = n_threads * dst_spad_per_thread; + octx->src1_spad.size = 0; + + // Assign pointers octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + octx->src1_spad.data = NULL; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + + // Fill context + rctx.src0_row_size = src0_row_size; + rctx.dst_row_size = dst_row_size; + rctx.src0_row_size_aligned = src0_row_size_aligned; + rctx.dst_row_size_aligned = dst_row_size_aligned; + rctx.theta_cache_offset = theta_cache_size_aligned; + + rctx.block_size = 1; + rctx.src0_spad_half_size = src0_row_size_aligned; + rctx.dst_spad_half_size = dst_row_size_aligned; uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + rctx.src0_nrows = src0_nrows; if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); From 1475229dc7628f79753d52b8db78fcb985e63836 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 17 Feb 2026 18:12:53 -0800 Subject: [PATCH 08/15] hex-rope: keep N rows in scratchpad (instead of just two) --- ggml/src/ggml-hexagon/htp/rope-ops.c | 30 ++++++++++++---------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 998973947..1b961be7c 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -22,6 +22,8 @@ #define HTP_ROPE_TYPE_NORMAL 0 
#define HTP_ROPE_TYPE_NEOX 2 +#define HTP_ROPE_SPAD_NROWS 4 + #define htp_rope_preamble \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -72,7 +74,6 @@ struct htp_rope_context { size_t src0_spad_half_size; size_t dst_spad_half_size; size_t theta_cache_offset; - uint32_t block_size; uint32_t src0_nrows; }; @@ -297,15 +298,13 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { size_t src0_spad_half_size = rctx->src0_spad_half_size; size_t dst_spad_half_size = rctx->dst_spad_half_size; - const int BLOCK = rctx->block_size; - dma_queue * dma_queue = octx->ctx->dma[ith]; const int32_t * pos = (const int32_t *) src1->data; const float * freq_factors = (src2 && src2->data) ? (const float *) src2->data : NULL; - for (uint32_t ir = src0_start_row, spad_idx = 0; ir < src0_end_row && spad_idx < 2; ir += BLOCK, spad_idx++) { + for (uint32_t ir = src0_start_row, is = 0; ir < src0_end_row && is < HTP_ROPE_SPAD_NROWS; ir++, is++) { // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...) 
- dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void *) dst->data, dst_spad_base + (spad_idx * dst_spad_half_size)), 0, 0, 0); + dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void *) dst->data, dst_spad_base + is * rctx->dst_row_size_aligned), 0, 0, 0); uint32_t i1 = fastmodulo(ir, ne01, &rctx->fastdiv_ne01); uint32_t r_div_ne01 = fastdiv(ir, &rctx->fastdiv_ne01); @@ -313,14 +312,12 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { uint32_t i3 = fastdiv(r_div_ne01, &rctx->fastdiv_ne02); const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01; - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad_base + (spad_idx * src0_spad_half_size), src_addr), + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad_base + is * rctx->src0_row_size_aligned, src_addr), rctx->src0_row_size_aligned, rctx->src0_row_size, 1); } uint32_t prev_i2 = (uint32_t) -1; - for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { - // Process block (single row) - // Recalculate indices for params + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { uint32_t i1 = fastmodulo(ir, ne01, &rctx->fastdiv_ne01); uint32_t r_div_ne01 = fastdiv(ir, &rctx->fastdiv_ne01); uint32_t i2 = fastmodulo(r_div_ne01, ne02, &rctx->fastdiv_ne02); @@ -349,12 +346,12 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { uint8_t * dst_addr = (uint8_t *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1; dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(dst_addr, dst_spad), rctx->dst_row_size, rctx->dst_row_size_aligned, 1); - // Prefetch next - const uint32_t pref_block = (ir + BLOCK * 2); - if (pref_block < src0_end_row) { + // prefetch next row + const uint32_t pr = (ir + HTP_ROPE_SPAD_NROWS); + if (pr < src0_end_row) { // Re-calculate src ptr for prefetch - uint32_t pi1 = fastmodulo(pref_block, ne01, &rctx->fastdiv_ne01); - uint32_t pr_div_ne01 = fastdiv(pref_block, &rctx->fastdiv_ne01); 
+ uint32_t pi1 = fastmodulo(pr, ne01, &rctx->fastdiv_ne01); + uint32_t pr_div_ne01 = fastdiv(pr, &rctx->fastdiv_ne01); uint32_t pi2 = fastmodulo(pr_div_ne01, ne02, &rctx->fastdiv_ne02); uint32_t pi3 = fastdiv(pr_div_ne01, &rctx->fastdiv_ne02); const uint8_t * psrc_addr = (const uint8_t *) src0->data + pi3 * nb03 + pi2 * nb02 + pi1 * nb01; @@ -404,8 +401,8 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { const size_t theta_cache_size_aligned = hex_round_up(src0->ne[0] * sizeof(float), 128); // Calculate spad sizes per thread - size_t src0_spad_per_thread = theta_cache_size_aligned + 2 * src0_row_size_aligned; - size_t dst_spad_per_thread = 2 * dst_row_size_aligned; + size_t src0_spad_per_thread = theta_cache_size_aligned + HTP_ROPE_SPAD_NROWS * src0_row_size_aligned; + size_t dst_spad_per_thread = HTP_ROPE_SPAD_NROWS * dst_row_size_aligned; size_t spad_per_thread = src0_spad_per_thread + dst_spad_per_thread; // Check if we fit in VTCM @@ -434,7 +431,6 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { rctx.dst_row_size_aligned = dst_row_size_aligned; rctx.theta_cache_offset = theta_cache_size_aligned; - rctx.block_size = 1; rctx.src0_spad_half_size = src0_row_size_aligned; rctx.dst_spad_half_size = dst_row_size_aligned; From a732df1a71270316cdbd47b0d732268a7c9d8112 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 17 Feb 2026 21:42:03 -0800 Subject: [PATCH 09/15] hex-rope: introduce rowidx cache --- ggml/src/ggml-hexagon/htp/rope-ops.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 1b961be7c..0cd443a7b 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -22,7 +22,7 @@ #define HTP_ROPE_TYPE_NORMAL 0 #define HTP_ROPE_TYPE_NEOX 2 -#define HTP_ROPE_SPAD_NROWS 4 +#define HTP_ROPE_SPAD_NROWS 8 #define htp_rope_preamble \ const uint32_t ne00 = src0->ne[0]; \ @@ 
-261,6 +261,13 @@ static void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, u } } +struct rope_rowidx_cache { + uint32_t i1; + uint32_t i2; + uint32_t i3; + uint32_t pad; +}; + static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { struct htp_rope_context * rctx = (struct htp_rope_context *) data; struct htp_ops_context * octx = rctx->octx; @@ -302,6 +309,8 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { const int32_t * pos = (const int32_t *) src1->data; const float * freq_factors = (src2 && src2->data) ? (const float *) src2->data : NULL; + struct rope_rowidx_cache rowidx_cache[HTP_ROPE_SPAD_NROWS]; + for (uint32_t ir = src0_start_row, is = 0; ir < src0_end_row && is < HTP_ROPE_SPAD_NROWS; ir++, is++) { // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...) dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void *) dst->data, dst_spad_base + is * rctx->dst_row_size_aligned), 0, 0, 0); @@ -314,14 +323,17 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad_base + is * rctx->src0_row_size_aligned, src_addr), rctx->src0_row_size_aligned, rctx->src0_row_size, 1); + + rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i1 = i1; + rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i2 = i2; + rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i3 = i3; } uint32_t prev_i2 = (uint32_t) -1; for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { - uint32_t i1 = fastmodulo(ir, ne01, &rctx->fastdiv_ne01); - uint32_t r_div_ne01 = fastdiv(ir, &rctx->fastdiv_ne01); - uint32_t i2 = fastmodulo(r_div_ne01, ne02, &rctx->fastdiv_ne02); - uint32_t i3 = fastdiv(r_div_ne01, &rctx->fastdiv_ne02); + uint32_t i1 = rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i1; + uint32_t i2 = rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i2; + uint32_t i3 = rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i3; if (i2 != prev_i2) { const int32_t p = pos[i2]; @@ -358,6 +370,10 @@ static 
void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, psrc_addr), // reusing src0_spad buffer rctx->src0_row_size_aligned, rctx->src0_row_size, 1); + + rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i1 = pi1; + rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i2 = pi2; + rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i3 = pi3; } } From 50e83dc3dc76419b58c764be8f0f7085054c115c Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 17 Feb 2026 23:02:29 -0800 Subject: [PATCH 10/15] hex-rope: remove unused fields --- ggml/src/ggml-hexagon/htp/rope-ops.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 0cd443a7b..35d07e2ce 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -71,8 +71,6 @@ struct htp_rope_context { size_t dst_row_size; size_t src0_row_size_aligned; size_t dst_row_size_aligned; - size_t src0_spad_half_size; - size_t dst_spad_half_size; size_t theta_cache_offset; uint32_t src0_nrows; }; @@ -302,12 +300,9 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { src0_spad_base = src0_spad_base + rctx->theta_cache_offset; uint8_t * dst_spad_base = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); - size_t src0_spad_half_size = rctx->src0_spad_half_size; - size_t dst_spad_half_size = rctx->dst_spad_half_size; - dma_queue * dma_queue = octx->ctx->dma[ith]; const int32_t * pos = (const int32_t *) src1->data; - const float * freq_factors = (src2 && src2->data) ? (const float *) src2->data : NULL; + const float * freq_factors = src2->data ? 
(const float *) src2->data : NULL; struct rope_rowidx_cache rowidx_cache[HTP_ROPE_SPAD_NROWS]; @@ -447,9 +442,6 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { rctx.dst_row_size_aligned = dst_row_size_aligned; rctx.theta_cache_offset = theta_cache_size_aligned; - rctx.src0_spad_half_size = src0_row_size_aligned; - rctx.dst_spad_half_size = dst_row_size_aligned; - uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; rctx.src0_nrows = src0_nrows; From 2cf88f98d6151264296f06984f98536eee631955 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Wed, 18 Feb 2026 21:47:08 -0800 Subject: [PATCH 11/15] hex-rope: rewrite dma prefetch logic to allow for multi-row fetch/compute also removes the need for fastdiv. --- ggml/src/ggml-hexagon/htp/hex-dma.h | 12 ++ ggml/src/ggml-hexagon/htp/rope-ops.c | 174 ++++++++++++++++----------- 2 files changed, 113 insertions(+), 73 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hex-dma.h b/ggml/src/ggml-hexagon/htp/hex-dma.h index 969a95713..d197999fa 100644 --- a/ggml/src/ggml-hexagon/htp/hex-dma.h +++ b/ggml/src/ggml-hexagon/htp/hex-dma.h @@ -149,6 +149,18 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) { return dptr; } +static inline bool dma_queue_empty(dma_queue * q) { + return q->push_idx == q->pop_idx; +} + +static inline uint32_t dma_queue_depth(dma_queue * q) { + return (q->push_idx - q->pop_idx) & q->idx_mask; +} + +static inline uint32_t dma_queue_capacity(dma_queue * q) { + return q->capacity; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 35d07e2ce..ac9315774 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -22,7 +22,8 @@ #define HTP_ROPE_TYPE_NORMAL 0 #define HTP_ROPE_TYPE_NEOX 2 -#define HTP_ROPE_SPAD_NROWS 8 +#define HTP_ROPE_SPAD_NROWS 16 +#define HTP_ROPE_SPAD_BLOCK (HTP_ROPE_SPAD_NROWS/2) #define htp_rope_preamble \ const uint32_t ne00 = 
src0->ne[0]; \ @@ -61,8 +62,6 @@ struct htp_rope_context { float corr_dims[2]; uint32_t src0_nrows_per_thread; - struct fastdiv_values fastdiv_ne01; - struct fastdiv_values fastdiv_ne02; size_t spad_stride; struct htp_ops_context * octx; @@ -155,13 +154,6 @@ static void init_rope_ctx(struct htp_rope_context * rctx, struct htp_ops_context rctx->octx = octx; - // Initialize fastdiv values - const uint32_t ne01 = octx->src0.ne[1]; - const uint32_t ne02 = octx->src0.ne[2]; - - if (ne01 > 0) rctx->fastdiv_ne01 = init_fastdiv_values(ne01); - if (ne02 > 0) rctx->fastdiv_ne02 = init_fastdiv_values(ne02); - const uint32_t ne0 = octx->dst.ne[0]; FARF(HIGH, "rope-f32 n-dims %d ne0 %u ext-factor %.6f theta-scale %.6f attn-factor %.6f\n", rctx->n_dims, ne0, @@ -259,12 +251,37 @@ static void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, u } } -struct rope_rowidx_cache { - uint32_t i1; - uint32_t i2; - uint32_t i3; - uint32_t pad; -}; +static void inline rope_basic_f32(struct htp_rope_context * rctx, uint8_t * restrict dst, uint8_t * restrict src, + uint32_t nr, uint32_t ne0, const float * restrict theta_cache) { + #pragma unroll(4) + for (uint32_t i = 0; i < nr; i++) { + float * d = (float *) (dst + i * rctx->dst_row_size_aligned); + float * s = (float *) (src + i * rctx->src0_row_size_aligned); + + hvx_rope_f32_aa(d, s, rctx->n_dims, theta_cache); + + // fill the remain channels with data from src tensor + if (rctx->n_dims < ne0) { + hvx_copy_f32_uu((uint8_t *)(d + rctx->n_dims), (uint8_t *)(s + rctx->n_dims), ne0 - rctx->n_dims); + } + } +} + +static void inline rope_neox_f32(struct htp_rope_context * rctx, uint8_t * restrict dst, uint8_t * restrict src, + uint32_t nr, uint32_t ne0, const float * restrict theta_cache) { + #pragma unroll(4) + for (uint32_t i = 0; i < nr; i++) { + float * d = (float *) (dst + i * rctx->dst_row_size_aligned); + float * s = (float *) (src + i * rctx->src0_row_size_aligned); + + hvx_rope_neox_f32_aa(d, s, rctx->n_dims, 
theta_cache); + + // fill the remain channels with data from src tensor + if (rctx->n_dims < ne0) { + hvx_copy_f32_uu((uint8_t *)(d + rctx->n_dims), (uint8_t *)(s + rctx->n_dims), ne0 - rctx->n_dims); + } + } +} static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { struct htp_rope_context * rctx = (struct htp_rope_context *) data; @@ -304,76 +321,87 @@ static void rope_job_f32(unsigned int nth, unsigned int ith, void * data) { const int32_t * pos = (const int32_t *) src1->data; const float * freq_factors = src2->data ? (const float *) src2->data : NULL; - struct rope_rowidx_cache rowidx_cache[HTP_ROPE_SPAD_NROWS]; + uint32_t ir = 0; + uint32_t prev_i2 = (uint32_t) -1; - for (uint32_t ir = src0_start_row, is = 0; ir < src0_end_row && is < HTP_ROPE_SPAD_NROWS; ir++, is++) { - // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...) - dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void *) dst->data, dst_spad_base + is * rctx->dst_row_size_aligned), 0, 0, 0); + for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch + for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len + for (uint32_t i1 = 0; i1 < ne1; ) { // attn-heads + if (ir < src0_start_row) { ir++; i1++; continue; } + if (ir >= src0_end_row) goto done; - uint32_t i1 = fastmodulo(ir, ne01, &rctx->fastdiv_ne01); - uint32_t r_div_ne01 = fastdiv(ir, &rctx->fastdiv_ne01); - uint32_t i2 = fastmodulo(r_div_ne01, ne02, &rctx->fastdiv_ne02); - uint32_t i3 = fastdiv(r_div_ne01, &rctx->fastdiv_ne02); - const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01; + // Rows in this block + const uint32_t nrows = MIN(src0_end_row - ir, ne1 - i1); - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad_base + is * rctx->src0_row_size_aligned, src_addr), - rctx->src0_row_size_aligned, rctx->src0_row_size, 1); + // Depth before prefetch + uint32_t dma_depth = dma_queue_depth(dma_queue); - rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i1 = i1; - rowidx_cache[ir % 
HTP_ROPE_SPAD_NROWS].i2 = i2; - rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i3 = i3; - } + // Prefetch loop + for (uint32_t pnr = 0, pr = 0; pr < nrows && pr < HTP_ROPE_SPAD_NROWS; pr += pnr) { + pnr = MIN(nrows - pr, HTP_ROPE_SPAD_BLOCK); - uint32_t prev_i2 = (uint32_t) -1; - for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) { - uint32_t i1 = rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i1; - uint32_t i2 = rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i2; - uint32_t i3 = rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i3; - - if (i2 != prev_i2) { - const int32_t p = pos[i2]; - rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale); - prev_i2 = i2; - } + uint32_t pi1 = i1 + pr; + uint32_t pir = ir + pr; - float * dst_spad = (float *) dma_queue_pop(dma_queue).src; - float * src0_spad = (float *) dma_queue_pop(dma_queue).dst; + // Dummy DMA transaction for sequencing (interleaving dst,src,dst,...) + dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void *) dst->data, dst_spad_base + pr * rctx->dst_row_size_aligned), 0, 0, 0); - if (is_neox) { - hvx_rope_neox_f32_aa(dst_spad, src0_spad, rctx->n_dims, theta_cache); - } else { - hvx_rope_f32_aa(dst_spad, src0_spad, rctx->n_dims, theta_cache); - } + const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + pi1 * nb01; + uint8_t * src_spad = src0_spad_base + pr * rctx->src0_row_size_aligned; + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src_spad, src_addr), + rctx->src0_row_size_aligned, rctx->src0_row_size, pnr); - // fill the remain channels with data from src tensor - if (rctx->n_dims < ne0) { - hvx_copy_f32_uu((uint8_t *)(dst_spad + rctx->n_dims), (uint8_t *)(src0_spad + rctx->n_dims), ne0 - rctx->n_dims); - } + // FARF(HIGH, "rope-prefetch %u: pr %u i1 %u i2 %u i3 %u src-spad %p src-addr %p npr %u", ith, pir, pi1, i2, i3, src_spad, src_addr, pnr); + } + + // Update theta cache + if (i2 != prev_i2) { + const int32_t 
p = pos[i2]; + rope_cache_init(p, rctx->freq_scale, freq_factors, rctx->corr_dims, ne0, rctx->ext_factor, rctx->attn_factor, theta_cache, rctx->theta_scale); + prev_i2 = i2; + } - uint8_t * dst_addr = (uint8_t *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1; - dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(dst_addr, dst_spad), rctx->dst_row_size, rctx->dst_row_size_aligned, 1); - - // prefetch next row - const uint32_t pr = (ir + HTP_ROPE_SPAD_NROWS); - if (pr < src0_end_row) { - // Re-calculate src ptr for prefetch - uint32_t pi1 = fastmodulo(pr, ne01, &rctx->fastdiv_ne01); - uint32_t pr_div_ne01 = fastdiv(pr, &rctx->fastdiv_ne01); - uint32_t pi2 = fastmodulo(pr_div_ne01, ne02, &rctx->fastdiv_ne02); - uint32_t pi3 = fastdiv(pr_div_ne01, &rctx->fastdiv_ne02); - const uint8_t * psrc_addr = (const uint8_t *) src0->data + pi3 * nb03 + pi2 * nb02 + pi1 * nb01; - - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src0_spad, psrc_addr), // reusing src0_spad buffer - rctx->src0_row_size_aligned, rctx->src0_row_size, 1); - - rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i1 = pi1; - rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i2 = pi2; - rowidx_cache[ir % HTP_ROPE_SPAD_NROWS].i3 = pi3; + // Flush DMA transactions from prev block (if any) + for (uint32_t d=0; d < dma_depth; d++) { dma_queue_pop(dma_queue); } + + // Compute loop + for (uint32_t cnr = 0, cr = 0; cr < nrows; cr += cnr, ir += cnr, i1 += cnr) { + // Number of rows to compute + cnr = MIN(nrows - cr, HTP_ROPE_SPAD_BLOCK); + + uint8_t * dst_spad = (uint8_t *) dma_queue_pop(dma_queue).src; + uint8_t * src_spad = (uint8_t *) dma_queue_pop(dma_queue).dst; + + // FARF(HIGH, "rope-process %u: ir %u i1 %u i2 %u i3 %u src-spad %p npr %u", ith, ir, i1, i2, i3, src_spad, cnr); + + if (is_neox) { + rope_neox_f32(rctx, dst_spad, src_spad, cnr, ne0, theta_cache); + } else { + rope_basic_f32(rctx, dst_spad, src_spad, cnr, ne0, theta_cache); + } + + uint8_t * dst_addr = (uint8_t *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1; + 
dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(dst_addr, dst_spad), rctx->dst_row_size, rctx->dst_row_size_aligned, cnr); + + // Prefetch more rows (if any) + if ((cr + HTP_ROPE_SPAD_NROWS) < nrows) { + uint32_t pnr = MIN(nrows - (cr + HTP_ROPE_SPAD_NROWS), HTP_ROPE_SPAD_BLOCK); + uint32_t pi1 = i1 + HTP_ROPE_SPAD_NROWS; + uint32_t pir = ir + HTP_ROPE_SPAD_NROWS; + + const uint8_t * src_addr = (const uint8_t *) src0->data + i3 * nb03 + i2 * nb02 + pi1 * nb01; + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(src_spad, src_addr), + rctx->src0_row_size_aligned, rctx->src0_row_size, pnr); + + // FARF(HIGH, "rope-prefetch %u: pr %u i1 %u i2 %u i3 %u src-spad %p src-addr %p npr %u", ith, pir, pi1, i2, i3, src_spad, src_addr, pnr); + } + } + } } } +done: dma_queue_flush(dma_queue); - t2 = HAP_perf_get_qtimer_count(); FARF(HIGH, "rope-f32: %d/%d: (%u:%u) usec %u\n", ith, nth, src0_start_row, src0_end_row, From 3cd2a54cd20669d437d6820d35b3f792283d389c Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 19 Feb 2026 10:52:55 -0800 Subject: [PATCH 12/15] hex-rope: minor formatting --- ggml/src/ggml-hexagon/htp/rope-ops.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ac9315774..e25bac20c 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -131,7 +131,7 @@ static void rope_corr_dims(int n_dims, dims[1] = MIN(n_dims - 1, end); } -static void init_rope_ctx(struct htp_rope_context * rctx, struct htp_ops_context * octx) { +static void rope_init_context(struct htp_rope_context * rctx, struct htp_ops_context * octx) { memset(rctx, 0, sizeof(struct htp_rope_context)); const int32_t * op_params = &octx->op_params[0]; @@ -160,7 +160,7 @@ static void init_rope_ctx(struct htp_rope_context * rctx, struct htp_ops_context rctx->ext_factor, rctx->theta_scale, rctx->attn_factor); } -static void hvx_rope_neox_f32_aa(float * 
restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { +static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { const uint8_t * restrict src0_curr = (const uint8_t *) src0; const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache; uint8_t * restrict dst_curr = (uint8_t *) dst; @@ -205,7 +205,7 @@ static void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict sr } } -static void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { +static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { const uint8_t * restrict src0_curr = (const uint8_t *) src0; const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache; uint8_t * restrict dst_curr = (uint8_t *) dst; @@ -421,7 +421,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { switch (octx->op) { case HTP_OP_ROPE: - init_rope_ctx(&rctx, octx); + rope_init_context(&rctx, octx); break; default: @@ -474,7 +474,7 @@ static int execute_op_rope_f32(struct htp_ops_context * octx) { rctx.src0_nrows = src0_nrows; if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { - uint32_t n_jobs = MIN(n_threads, src0_nrows); + uint32_t n_jobs = MIN(n_threads, src0_nrows); rctx.src0_nrows_per_thread = (src0_nrows + n_jobs - 1) / n_jobs; worker_pool_run_func(octx->ctx->worker_pool, rope_job_f32, &rctx, n_jobs); } From f9d5f9da9b58e6a4ebb367a4b2b7e2cb1d665848 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 19 Feb 2026 13:04:06 -0800 Subject: [PATCH 13/15] hex-rope: use indices and unroll the loops --- ggml/src/ggml-hexagon/htp/rope-ops.c | 71 +++++++++++++--------------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 
e25bac20c..0dd55da65 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -161,20 +161,22 @@ static void rope_init_context(struct htp_rope_context * rctx, struct htp_ops_con } static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { - const uint8_t * restrict src0_curr = (const uint8_t *) src0; - const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache; - uint8_t * restrict dst_curr = (uint8_t *) dst; + const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0; + const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache; + HVX_Vector * restrict vdst = (HVX_Vector *) dst; - int step_of_1 = ne >> 6; // 6 because we process two vectors at once - int half_size = (sizeof(float) * (ne / 2)); - const int32_t half_dims = ne / 2; + uint32_t nvec = (ne / (VLEN_FP32 * 2) * 2); // 2 vecs per loop, step of 2 - for (int i = 0; i < step_of_1; i++) { - HVX_Vector v0 = *(HVX_Vector *) src0_curr; - HVX_Vector v1 = *(HVX_Vector *) (src0_curr + half_size); + uint32_t he = ne / 2; // half_dims offset in elements + uint32_t hv = he / VLEN_FP32; // half_dims offset in vectors - HVX_Vector v2 = *(HVX_Vector *) theta_curr; - HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN); + #pragma unroll(2) + for (uint32_t i = 0; i < nvec; i += 2) { + HVX_Vector v0 = vsrc[i/2+0]; + HVX_Vector v1 = vsrc[i/2+hv]; + + HVX_Vector v2 = vtheta[i+0]; + HVX_Vector v3 = vtheta[i+1]; HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta @@ -186,38 +188,34 @@ static inline void hvx_rope_neox_f32_aa(float * restrict dst, const float * rest HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); - *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5); - - src0_curr += VLEN; - theta_curr += 2 
* VLEN; - dst_curr += VLEN; + vdst[i/2+0] = Q6_Vsf_equals_Vqf32(v4); + vdst[i/2+hv] = Q6_Vsf_equals_Vqf32(v5); } - int processed = (ne >> 6) << 6; // multiples of 64 - for (int i = processed; i < ne; i += 2) { + for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) { const float cos_theta = theta_cache[i+0]; const float sin_theta = theta_cache[i+1]; float x0 = src0[i/2]; - float x1 = src0[i/2 + half_dims]; - dst[i/2] = x0 * cos_theta - x1 * sin_theta; - dst[i/2 + half_dims] = x0 * sin_theta + x1 * cos_theta; + float x1 = src0[i/2 + he]; + dst[i/2] = x0 * cos_theta - x1 * sin_theta; + dst[i/2 + he] = x0 * sin_theta + x1 * cos_theta; } } static inline void hvx_rope_f32_aa(float * restrict dst, const float * restrict src0, uint32_t ne, const float * restrict theta_cache) { - const uint8_t * restrict src0_curr = (const uint8_t *) src0; - const uint8_t * restrict theta_curr = (const uint8_t *) theta_cache; - uint8_t * restrict dst_curr = (uint8_t *) dst; + const HVX_Vector * restrict vsrc = (const HVX_Vector *) src0; + const HVX_Vector * restrict vtheta = (const HVX_Vector *) theta_cache; + HVX_Vector * restrict vdst = (HVX_Vector *) dst; - int step_of_1 = ne >> 6; // 6 because we process two vectors at once + uint32_t nvec = (ne / (VLEN_FP32 * 2)) * 2; // 2 vecs per loop, step of two - for (int i = 0; i < step_of_1; i++) { - HVX_Vector v0 = *(HVX_Vector *) src0_curr; - HVX_Vector v1 = *(HVX_Vector *) (src0_curr + VLEN); + #pragma unroll(2) + for (uint32_t i = 0; i < nvec; i+=2) { + HVX_Vector v0 = vsrc[i+0]; + HVX_Vector v1 = vsrc[i+1]; - HVX_Vector v2 = *(HVX_Vector *) theta_curr; - HVX_Vector v3 = *(HVX_Vector *) (theta_curr + VLEN); + HVX_Vector v2 = vtheta[i+0]; + HVX_Vector v3 = vtheta[i+1]; HVX_VectorPair vx0_x1 = Q6_W_vdeal_VVR(v1, v0, -4); // vx0_x1[0] = x0, vx0_x1[1] = x1 HVX_VectorPair vcos_sin = Q6_W_vdeal_VVR(v3, v2, -4); // vcos_sin[0] = cos_theta, vcos_sin[1] = sin_theta @@ -232,16 +230,11 @@ static inline void hvx_rope_f32_aa(float * restrict dst, const 
float * restrict HVX_VectorPair vstore = Q6_W_vshuff_VVR(Q6_Vsf_equals_Vqf32(v5), Q6_Vsf_equals_Vqf32(v4), -4); - *(HVX_Vector *) dst_curr = Q6_V_lo_W(vstore); - *(HVX_Vector *) (dst_curr + VLEN) = Q6_V_hi_W(vstore); - - src0_curr += 2 * VLEN; - theta_curr += 2 * VLEN; - dst_curr += 2 * VLEN; + vdst[i+0] = Q6_V_lo_W(vstore); + vdst[i+1] = Q6_V_hi_W(vstore); } - int processed = (ne >> 6) << 6; - for (int i = processed; i < ne; i += 2) { + for (uint32_t i = nvec * VLEN_FP32; i < ne; i += 2) { const float cos_theta = theta_cache[i+0]; const float sin_theta = theta_cache[i+1]; float x0 = src0[i+0]; From d80dd8c1bff7d8cd41c0517646e1c36cec109901 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Thu, 19 Feb 2026 21:27:18 +0000 Subject: [PATCH 14/15] ggml-hexagon: Optimize Softmax ops using DMA and VTCM Refactored Softmax operations to use DMA and VTCM double buffering. Implemented double-buffered pipeline with prefetching. Added support for broadcasting src1 (mask) using zero-stride DMA. Added F16 fallback for src1 mask. Removed unaligned code paths as VTCM is always aligned. Updated context structure to manage VTCM buffers and strides. 
Co-authored-by: max-krasnyansky <1380796+max-krasnyansky@users.noreply.github.com> --- ggml/src/ggml-hexagon/htp/softmax-ops.c | 324 +++++++++++++++--------- 1 file changed, 204 insertions(+), 120 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c index 58278079f..fe52c91bd 100644 --- a/ggml/src/ggml-hexagon/htp/softmax-ops.c +++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c @@ -18,6 +18,9 @@ #include "htp-msg.h" #include "htp-ops.h" +#define HTP_SOFTMAX_SPAD_NROWS 16 +#define HTP_SOFTMAX_SPAD_BLOCK (HTP_SOFTMAX_SPAD_NROWS/2) + #define htp_softmax_preamble3 \ const uint32_t ne00 = src0->ne[0]; \ const uint32_t ne01 = src0->ne[1]; \ @@ -60,14 +63,19 @@ struct htp_softmax_context { float m0; float m1; - uint32_t src0_nrows_per_thread; - struct fastdiv_values fastdiv_ne01; - struct fastdiv_values fastdiv_ne02; - struct fastdiv_values fastdiv_ne12; // For mask broadcasting - struct fastdiv_values fastdiv_ne13; // For mask broadcasting - size_t spad_stride; - struct htp_ops_context * octx; + + size_t src0_row_size; + size_t dst_row_size; + size_t src0_row_size_aligned; + size_t dst_row_size_aligned; + size_t spad_pad_offset; + size_t spad_src1_offset; // only used if use_src1 + size_t src1_row_size; + size_t src1_row_size_aligned; + + uint32_t src0_nrows; + uint32_t src0_nrows_per_thread; }; static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_context * octx) { @@ -90,18 +98,6 @@ static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_ smctx->octx = octx; - // Initialize fastdiv values - const uint32_t ne01 = src0->ne[1]; - const uint32_t ne02 = src0->ne[2]; - - if (ne01 > 0) smctx->fastdiv_ne01 = init_fastdiv_values(ne01); - if (ne02 > 0) smctx->fastdiv_ne02 = init_fastdiv_values(ne02); - - const uint32_t ne12 = (src1->ne[0]) ? src1->ne[2] : 1; - const uint32_t ne13 = (src1->ne[0]) ? 
src1->ne[3] : 1; - - if (ne12 > 0) smctx->fastdiv_ne12 = init_fastdiv_values(ne12); - if (ne13 > 0) smctx->fastdiv_ne13 = init_fastdiv_values(ne13); } static void hvx_fast_softmax_prep_f32(const uint8_t * restrict src, @@ -214,7 +210,7 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { htp_softmax_preamble3; - const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows + const uint32_t src0_nrows = smctx->src0_nrows; const uint32_t src0_nrows_per_thread = smctx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; @@ -228,110 +224,174 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { uint64_t t1, t2; t1 = HAP_perf_get_qtimer_count(); - int is_aligned = 1; - int opt_path = 0; - if (!hex_is_aligned((void *) src0->data, VLEN) || !hex_is_aligned((void *) dst->data, VLEN)) { - is_aligned = 0; - FARF(HIGH, "softmax-f32: unaligned addresses in elementwise op, possibly slower execution\n"); - } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } - - uint8_t * src0_spad_data = octx->src0_spad.data + (ith * smctx->spad_stride); - uint8_t * src1_spad_data = octx->src1_spad.data + (ith * smctx->spad_stride); - uint8_t * dst_spad_data = octx->dst_spad.data + (ith * smctx->spad_stride); + uint8_t * src0_spad_base_ptr = octx->src0_spad.data + (ith * octx->src0_spad.size_per_thread); + uint8_t * dst_spad_base_ptr = octx->dst_spad.data + (ith * octx->dst_spad.size_per_thread); - float * wp0 = (float *) src0_spad_data; - float * wp1 = (float *) src1_spad_data; - float * wp2 = (float *) dst_spad_data; + dma_queue * dma_queue = octx->ctx->dma[ith]; uint32_t prev_i2 = (uint32_t)-1; float slope = 1.0f; - for (uint32_t r = src0_start_row; r < src0_end_row; ++r) { - uint32_t i1 = fastmodulo(r, ne01, &smctx->fastdiv_ne01); - uint32_t r_div_ne01 = fastdiv(r, &smctx->fastdiv_ne01); - uint32_t i2 = fastmodulo(r_div_ne01, ne02, &smctx->fastdiv_ne02); - uint32_t i3 = 
fastdiv(r_div_ne01, &smctx->fastdiv_ne02); - - // Map to original logic indices - // i01 = i1 - // i02 = i2 - // i03 = i3 - - const uint32_t i11 = i1; - // const uint32_t i12 = i2 % ne12; - // const uint32_t i13 = i3 % ne13; - - uint32_t i12, i13; - if (ne12 == ne02) { - i12 = i2; - } else { - i12 = fastmodulo(i2, ne12, &smctx->fastdiv_ne12); + const bool use_src1 = smctx->use_src1; + + // Initial indices + uint32_t cur_r = src0_start_row; + uint32_t cur_i3 = cur_r / (ne02 * ne01); + uint32_t rem = cur_r % (ne02 * ne01); + uint32_t cur_i2 = rem / ne01; + uint32_t cur_i1 = rem % ne01; + + // Prefetch loop variables + uint32_t pf_r = cur_r; + uint32_t pf_i1 = cur_i1; + uint32_t pf_i2 = cur_i2; + uint32_t pf_i3 = cur_i3; + uint32_t pf_rem = src0_end_row - cur_r; + + // Compute loop variables + uint32_t cm_r = cur_r; + uint32_t cm_i1 = cur_i1; + uint32_t cm_i2 = cur_i2; + uint32_t cm_i3 = cur_i3; + uint32_t cm_rem = src0_end_row - cur_r; + + while (cm_rem > 0) { + // PREFETCH + // Fill up to HTP_SOFTMAX_SPAD_NROWS if queue depth is low + while (pf_rem > 0 && dma_queue_depth(dma_queue) < (HTP_SOFTMAX_SPAD_NROWS / HTP_SOFTMAX_SPAD_BLOCK)) { + uint32_t block = MIN(pf_rem, HTP_SOFTMAX_SPAD_BLOCK); + uint32_t rows_in_i1 = ne01 - pf_i1; + block = MIN(block, rows_in_i1); + + uint32_t pf_slot = (pf_r - src0_start_row) % HTP_SOFTMAX_SPAD_NROWS; + uint32_t slots_avail = HTP_SOFTMAX_SPAD_NROWS - pf_slot; + block = MIN(block, slots_avail); + + uint8_t * s0_spad = src0_spad_base_ptr + pf_slot * smctx->src0_row_size_aligned; + uint8_t * d_spad = dst_spad_base_ptr + pf_slot * smctx->dst_row_size_aligned; + + const uint8_t * s0_addr = (const uint8_t *) src0->data + pf_i3 * nb03 + pf_i2 * nb02 + pf_i1 * nb01; + + uint8_t * s1_spad = NULL; + const uint8_t * s1_addr = NULL; + size_t s1_stride = 0; + + if (use_src1) { + uint32_t i12 = (ne12 == ne02) ? pf_i2 : (pf_i2 % ne12); + uint32_t i13 = (ne13 == ne03) ? pf_i3 : (pf_i3 % ne13); + uint32_t i11 = (ne11 == ne01) ? 
pf_i1 : (pf_i1 % ne11); + + s1_addr = (const uint8_t *) src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11; + s1_spad = src0_spad_base_ptr + smctx->spad_src1_offset + pf_slot * smctx->src1_row_size_aligned; + s1_stride = (ne11 == 1) ? 0 : nb11; + } + + // Push Dummy DST (to carry pointer) + dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr((void*)dst->data, d_spad), 0, 0, 0); + + // Push SRC0 + dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(s0_spad, s0_addr), + smctx->src0_row_size_aligned, smctx->src0_row_size, block); + + // Push SRC1 + if (use_src1) { + dma_queue_push(dma_queue, dma_make_ptr(s1_spad, s1_addr), + smctx->src1_row_size_aligned, // dst_stride + s1_stride, // src_stride + smctx->src1_row_size, // width + block); + } + + pf_r += block; + pf_i1 += block; + if (pf_i1 >= ne01) { + pf_i1 = 0; + pf_i2++; + if (pf_i2 >= ne02) { + pf_i2 = 0; + pf_i3++; + } + } + pf_rem -= block; } - if (ne13 == ne03) { - i13 = i3; - } else { - i13 = fastmodulo(i3, ne13, &smctx->fastdiv_ne13); + // COMPUTE + uint32_t cm_slot = (cm_r - src0_start_row) % HTP_SOFTMAX_SPAD_NROWS; + uint32_t c_block = MIN(cm_rem, HTP_SOFTMAX_SPAD_BLOCK); + uint32_t rows_in_i1 = ne01 - cm_i1; + c_block = MIN(c_block, rows_in_i1); + uint32_t slots_avail = HTP_SOFTMAX_SPAD_NROWS - cm_slot; + c_block = MIN(c_block, slots_avail); + + uint8_t * d_spad = (uint8_t *) dma_queue_pop(dma_queue).src; + uint8_t * s0_spad = (uint8_t *) dma_queue_pop(dma_queue).dst; + uint8_t * s1_spad = NULL; + if (use_src1) { + s1_spad = (uint8_t *) dma_queue_pop(dma_queue).dst; } - // ALiBi - if (i2 != prev_i2) { - const uint32_t h = i2; // head + uint8_t * p_spad = src0_spad_base_ptr + smctx->spad_pad_offset + cm_slot * smctx->src0_row_size_aligned; - slope = (smctx->max_bias > 0.0f) ? - h < smctx->n_head_log2 ? 
- powf(smctx->m0, h + 1) : - powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) : - 1.0f; - prev_i2 = i2; - } + for (uint32_t b = 0; b < c_block; ++b) { + uint32_t cur_i2_local = cm_i2; // constant for block - float * sp = (float *) ((char *) octx->src0.data + i1 * nb01 + i2 * nb02 + i3 * nb03); - float * dp = (float *) ((char *) octx->dst.data + i1 * nb1 + i2 * nb2 + i3 * nb3); - - // broadcast the mask across rows - __fp16 * mp_f16 = (smctx->use_src1) ? - (__fp16 *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : - NULL; - float * mp_f32 = (smctx->use_src1) ? - (float *) ((char *) octx->src1.data + i11 * nb11 + i12 * nb12 + i13 * nb13) : - NULL; - - if ((1 == opt_path) && (mp_f32) && !(smctx->use_f16)) { - hvx_fast_softmax_prep_f32((const uint8_t *) sp, (uint8_t *) wp0, ne00, smctx->scale, - (const uint8_t *) mp_f32, slope); - } else { - hvx_scale_f32((uint8_t *) wp0, (const uint8_t *) sp, ne00, smctx->scale); - if (mp_f32) { + // ALiBi + if (cur_i2_local != prev_i2) { + const uint32_t h = cur_i2_local; + slope = (smctx->max_bias > 0.0f) ? + h < smctx->n_head_log2 ? 
+ powf(smctx->m0, h + 1) : + powf(smctx->m1, 2 * (h - smctx->n_head_log2) + 1) : + 1.0f; + prev_i2 = cur_i2_local; + } + + uint8_t * row_s0 = s0_spad + b * smctx->src0_row_size_aligned; + uint8_t * row_d = d_spad + b * smctx->dst_row_size_aligned; + uint8_t * row_p = p_spad + b * smctx->src0_row_size_aligned; + + if (use_src1) { + uint8_t * row_s1 = s1_spad + b * smctx->src1_row_size_aligned; if (smctx->use_f16) { - for (int i = 0; i < ne00; ++i) { - wp0[i] += slope * (float) mp_f16[i]; + hvx_scale_f32(row_s0, row_s0, ne00, smctx->scale); + float * r_s0_f = (float *) row_s0; + __fp16 * r_s1_h = (__fp16 *) row_s1; + for (uint32_t i = 0; i < ne00; ++i) { + r_s0_f[i] += slope * (float) r_s1_h[i]; } } else { - for (int i = 0; i < ne00; ++i) { - wp0[i] += slope * mp_f32[i]; - } + hvx_fast_softmax_prep_f32(row_s0, row_s0, ne00, smctx->scale, row_s1, slope); } + } else { + hvx_scale_f32(row_s0, row_s0, ne00, smctx->scale); } + + hvx_fast_softmax_f32(row_s0, row_d, row_p, ne00); } - if (1 == opt_path) { - hvx_fast_softmax_f32((const uint8_t *) wp0, (uint8_t *) dp, (uint8_t *) wp1, ne00); - } else { - float max = hvx_reduce_max_f32((const uint8_t *) wp0, ne00); - float sum = hvx_softmax_f32((const uint8_t *) wp0, (uint8_t *) wp2, (uint8_t *) wp1, ne00, max); - sum = sum > 0.0 ? 
(1.0 / sum) : 1; - hvx_scale_f32((uint8_t *) dp, (const uint8_t *) wp2, ne00, sum); + uint8_t * dst_addr = (uint8_t *) dst->data + cm_i3 * nb3 + cm_i2 * nb2 + cm_i1 * nb1; + dma_queue_push_vtcm_to_ddr(dma_queue, dma_make_ptr(dst_addr, d_spad), + smctx->dst_row_size, smctx->dst_row_size_aligned, c_block); + + cm_r += c_block; + cm_i1 += c_block; + if (cm_i1 >= ne01) { + cm_i1 = 0; + cm_i2++; + if (cm_i2 >= ne02) { + cm_i2 = 0; + cm_i3++; + } } + cm_rem -= c_block; } + dma_queue_flush(dma_queue); + t2 = HAP_perf_get_qtimer_count(); - FARF(HIGH, "softmax-f32 %d/%d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - smctx->use_f16, opt_path, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, + FARF(HIGH, "softmax-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) x %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + smctx->use_f16, ne00, ne01, ne02, ne03, src0_start_row, src0_end_row, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } @@ -358,44 +418,68 @@ static int execute_op_softmax_f32(struct htp_ops_context * octx) { const uint32_t n_threads = octx->n_threads; const size_t src0_row_size = src0->nb[1]; - const size_t src1_row_size = src0_row_size; const size_t dst_row_size = dst->nb[1]; + const size_t src1_row_size = (src1->ne[0]) ? 
src1->nb[1] : 0; - // VTCM scratchpads for all tensors - // N rows per thread, padded to HVX vector size - octx->dst_spad.size = hex_round_up(dst_row_size, 128) * n_threads; - octx->src0_spad.size = hex_round_up(src0_row_size, 128) * n_threads; - octx->src1_spad.size = hex_round_up(src1_row_size, 128) * n_threads; + // Aligned row sizes + const size_t src0_row_size_aligned = hex_round_up(src0_row_size, 128); + const size_t dst_row_size_aligned = hex_round_up(dst_row_size, 128); + const size_t src1_row_size_aligned = hex_round_up(src1_row_size, 128); - // Use stride for calculating offset - smctx.spad_stride = hex_round_up(src0_row_size, 128); + // Calculate spad sizes per thread + // src0_spad includes: src0 rows, pad rows (intermediate), and src1 rows (if used) + size_t src0_spad_size = HTP_SOFTMAX_SPAD_NROWS * src0_row_size_aligned; + size_t pad_spad_size = HTP_SOFTMAX_SPAD_NROWS * src0_row_size_aligned; // same size as src0 + size_t src1_spad_size = (src1->ne[0]) ? (HTP_SOFTMAX_SPAD_NROWS * src1_row_size_aligned) : 0; - size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + size_t dst_spad_size = HTP_SOFTMAX_SPAD_NROWS * dst_row_size_aligned; + + size_t src0_total_per_thread = src0_spad_size + pad_spad_size + src1_spad_size; + size_t dst_total_per_thread = dst_spad_size; + + size_t total_vtcm = (src0_total_per_thread + dst_total_per_thread) * n_threads; if (src1->ne[0]) { FARF(HIGH, - "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u src1-spad-size %u dst-spad-size %u\n", + "%s: %ux%ux%ux%u x %ux%ux%ux%u -> %ux%ux%ux%u : vtcm needed %u\n", op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], - src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], octx->src0_spad.size, octx->src1_spad.size, - octx->dst_spad.size); + src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], total_vtcm); } else { - FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : src0-spad-size %u 
src1-spad-size %u dst-spad-size %u\n", op_type, + FARF(HIGH, "%s: %ux%ux%ux%u -> %ux%ux%ux%u : vtcm needed %u\n", op_type, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); + total_vtcm); } - // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { + if (octx->ctx->vtcm_size < total_vtcm) { FARF(ERROR, "%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, - spad_size); + total_vtcm); return HTP_STATUS_VTCM_TOO_SMALL; } + octx->src0_spad.size_per_thread = src0_total_per_thread; + octx->dst_spad.size_per_thread = dst_total_per_thread; + octx->src1_spad.size_per_thread = 0; // Packed into src0 + + octx->src0_spad.size = octx->src0_spad.size_per_thread * n_threads; + octx->dst_spad.size = octx->dst_spad.size_per_thread * n_threads; + octx->src1_spad.size = 0; + octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + octx->src1_spad.data = NULL; + octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + + smctx.src0_row_size = src0_row_size; + smctx.src0_row_size_aligned = src0_row_size_aligned; + smctx.dst_row_size = dst_row_size; + smctx.dst_row_size_aligned = dst_row_size_aligned; + smctx.src1_row_size = src1_row_size; + smctx.src1_row_size_aligned = src1_row_size_aligned; + + smctx.spad_pad_offset = src0_spad_size; + smctx.spad_src1_offset = src0_spad_size + pad_spad_size; uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + smctx.src0_nrows = src0_nrows; if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); From df96e3094de69d444b8a2024e833bb5ce843fa8c Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 20 Feb 
2026 00:10:20 +0000 Subject: [PATCH 15/15] ggml-hexagon: Optimize Softmax ops using DMA and VTCM with fastdiv Refactored Softmax operations to use DMA and VTCM double buffering. Implemented double-buffered pipeline with prefetching. Added support for broadcasting src1 (mask) using zero-stride DMA. Added F16 fallback for src1 mask. Removed unaligned code paths as VTCM is always aligned. Updated context structure to manage VTCM buffers and strides. Restored and utilized fastdiv for efficient index calculation in loops. Co-authored-by: max-krasnyansky <1380796+max-krasnyansky@users.noreply.github.com> --- ggml/src/ggml-hexagon/htp/softmax-ops.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/softmax-ops.c b/ggml/src/ggml-hexagon/htp/softmax-ops.c index fe52c91bd..a0da51be6 100644 --- a/ggml/src/ggml-hexagon/htp/softmax-ops.c +++ b/ggml/src/ggml-hexagon/htp/softmax-ops.c @@ -63,6 +63,10 @@ struct htp_softmax_context { float m0; float m1; + struct fastdiv_values fastdiv_ne11; + struct fastdiv_values fastdiv_ne12; + struct fastdiv_values fastdiv_ne13; + struct htp_ops_context * octx; size_t src0_row_size; @@ -96,6 +100,12 @@ static void init_softmax_ctx(struct htp_softmax_context * smctx, struct htp_ops_ smctx->use_src1 = (src1->ne[0] != 0); smctx->use_f16 = (src1->ne[0] != 0) && (src1->type == HTP_TYPE_F16); + if (smctx->use_src1) { + if (src1->ne[1] > 0) smctx->fastdiv_ne11 = init_fastdiv_values(src1->ne[1]); + if (src1->ne[2] > 0) smctx->fastdiv_ne12 = init_fastdiv_values(src1->ne[2]); + if (src1->ne[3] > 0) smctx->fastdiv_ne13 = init_fastdiv_values(src1->ne[3]); + } + smctx->octx = octx; } @@ -277,9 +287,9 @@ static void softmax_job_f32(unsigned int nth, unsigned int ith, void * data) { size_t s1_stride = 0; if (use_src1) { - uint32_t i12 = (ne12 == ne02) ? pf_i2 : (pf_i2 % ne12); - uint32_t i13 = (ne13 == ne03) ? pf_i3 : (pf_i3 % ne13); - uint32_t i11 = (ne11 == ne01) ? 
pf_i1 : (pf_i1 % ne11); + uint32_t i12 = (ne12 == ne02) ? pf_i2 : fastmodulo(pf_i2, ne12, &smctx->fastdiv_ne12); + uint32_t i13 = (ne13 == ne03) ? pf_i3 : fastmodulo(pf_i3, ne13, &smctx->fastdiv_ne13); + uint32_t i11 = (ne11 == ne01) ? pf_i1 : fastmodulo(pf_i1, ne11, &smctx->fastdiv_ne11); s1_addr = (const uint8_t *) src1->data + i13 * nb13 + i12 * nb12 + i11 * nb11; s1_spad = src0_spad_base_ptr + smctx->spad_src1_offset + pf_slot * smctx->src1_row_size_aligned;