From 407b408981abc4180f64b99f89b003168bf7b962 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 27 Nov 2025 12:54:46 +0800 Subject: [PATCH 01/23] fix test failure --- ggml/src/ggml-hexagon/htp/rope-ops.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 00419bcba6b..ba0ff2dbc61 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -151,9 +151,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context } static void hvx_calc_rope_neox_f32(const float * restrict src0, - float * restrict dst, - const int num_elems, - const float * restrict theta_cache) { + float * restrict dst, + const int num_elems, + const float * restrict theta_cache) { // for (int i = 0; i < num_elems; i += 2) { //const float cos_theta = theta_cache[i + 0]; //const float sin_theta = theta_cache[i + 1]; @@ -192,7 +192,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0, HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5); src0_curr += VLEN; @@ -259,7 +259,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, const uint32_t ir1, int nth, int ith, - int opt_path) { + const int opt_path) { struct htp_ops_context * octx = rope_ctx->octx; const struct htp_tensor * src0 = &octx->src0; @@ -267,8 +267,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, const struct htp_tensor * src2 = &octx->src2; struct htp_tensor * dst = &octx->dst; - const int32_t mode = rope_ctx->mode; - const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; + const int32_t mode = rope_ctx->mode; + const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; htp_rope_preamble; @@ -317,10 +317,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, if (is_neox) { const float x0 = src_loc[0]; - const float x1 = src_loc[rope_ctx->n_dims/2]; + const float x1 = src_loc[rope_ctx->n_dims / 2]; - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta; + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[rope_ctx->n_dims / 2] = x0 * sin_theta + x1 * cos_theta; src_loc += 1; dst_data_loc += 1; @@ -337,6 +337,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } } + src_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; + dst_data_loc += is_neox ? 
(rope_ctx->n_dims / 2) : 0; for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From 4ddb8a449cbe057d67b684b7c34192e69f743d30 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 01:15:25 +0800 Subject: [PATCH 02/23] fix: correct scaling calculations in rope_cache_init --- ggml/src/ggml-hexagon/htp/rope-ops.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ba0ff2dbc61..719efc09b7a 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -93,17 +93,18 @@ static void rope_cache_init(const float theta_base, // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; float theta2 = theta_interp; + float mscale2 = mscale; if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + mscale2 *= 1.0f + 0.1f * logf(1.0f / freq_scale); } - cache[i0 + 0] = cosf(theta2) * mscale; - cache[i0 + 1] = sinf(theta2) * mscale; + cache[i0 + 0] = cosf(theta2) * mscale2; + cache[i0 + 1] = sinf(theta2) * mscale2; theta *= theta_scale; } @@ -337,8 +338,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } } - src_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; - dst_data_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; + src_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); + dst_data_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From cfca78b0d01b76aad6b1a89bc08045c9eaaa23d0 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 01:17:41 +0800 Subject: [PATCH 03/23] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 719efc09b7a..ef24c80f82b 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { return (1 - MIN(1, MAX(0, y))); } -static void rope_cache_init(const float theta_base, - float freq_scale, - const float * freq_factors, - float * corr_dims, - uint32_t ne0, - float ext_factor, - float mscale, - float * cache, - float theta_scale) { +static void rope_cache_init(const float theta_base, + const float freq_scale, + const float * freq_factors, + float * corr_dims, + const uint32_t ne0, + const float ext_factor, + const float mscale, + float * cache, + const float theta_scale) { // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py float theta = theta_base; From e9a02fdba1246b747096cb9bbcbd67c7236ed2de Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 10:33:43 +0800 Subject: [PATCH 04/23] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ef24c80f82b..7519505ab0d 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,8 @@ static void 
rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - int ir = 0; - + int ir = 0; + const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -311,6 +311,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } else { hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); } + + src_loc += rope_ctx->n_dims; + dst_data_loc += rope_ctx->n_dims; } else { for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) { const float cos_theta = wp0[i0 + 0]; @@ -318,10 +321,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, if (is_neox) { const float x0 = src_loc[0]; - const float x1 = src_loc[rope_ctx->n_dims / 2]; + const float x1 = src_loc[half_dims]; - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[rope_ctx->n_dims / 2] = x0 * sin_theta + x1 * cos_theta; + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta; src_loc += 1; dst_data_loc += 1; @@ -336,10 +339,11 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, dst_data_loc += 2; } } + + src_loc += (is_neox ? half_dims : 0); + dst_data_loc += (is_neox ? half_dims : 0); } - src_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); - dst_data_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From e324bb0bd50898537e41810fcd41f82ce4a78e7c Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 10:57:27 +0800 Subject: [PATCH 05/23] fix: optimize element copying in rope_hex_f32 using memcpy --- ggml/src/ggml-hexagon/htp/rope-ops.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 7519505ab0d..0e71dcfae8e 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -344,13 +344,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, dst_data_loc += (is_neox ? 
half_dims : 0); } - for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { - dst_data_loc[0] = src_loc[0]; - dst_data_loc[1] = src_loc[1]; - - src_loc += 2; - dst_data_loc += 2; - } + // TODO: use simd to speed up the remaining elements copy + memcpy(dst_data_loc, src_loc, (ne0 - rope_ctx->n_dims) * sizeof(float)); } } } From 0121291d5358ee27bf4e8bd96106020f4b56857c Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 11:12:12 +0800 Subject: [PATCH 06/23] fix: optimize loop boundaries in rope_hex_f32 for better performance --- ggml/src/ggml-hexagon/htp/rope-ops.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 0e71dcfae8e..ddce8971204 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - int ir = 0; - const int32_t half_dims = rope_ctx->n_dims / 2; + const uint32_t i0_end = MIN(ir1, ne1); + const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -291,14 +291,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); - for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads - if (ir++ < ir0) { - continue; - } - if (ir > ir1) { - break; - } - + for (uint32_t i1 = ir0; i1 < i0_end; i1++) { // attn-heads const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); From 010039a15e0239de0685d70f0a144587efb3f4ee Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 11:22:22 +0800 Subject: [PATCH 07/23] rename --- ggml/src/ggml-hexagon/htp/rope-ops.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ddce8971204..dbb4df58b56 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -92,19 +92,19 @@ static void rope_cache_init(const float theta_base, // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; - float theta2 = theta_interp; - float mscale2 = mscale; + float theta_final = theta_interp; + float mscale_final = mscale; if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; // Get n-d magnitude scaling corrected for interpolation - mscale2 *= 1.0f + 0.1f * logf(1.0f / freq_scale); + mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); } - cache[i0 + 0] = cosf(theta2) * mscale2; - cache[i0 + 1] = sinf(theta2) * mscale2; + cache[i0 + 0] = cosf(theta_final) * mscale_final; + cache[i0 + 1] = sinf(theta_final) * mscale_final; theta *= theta_scale; } @@ -282,7 +282,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - const uint32_t i0_end = MIN(ir1, ne1); + const uint32_t i1_end = MIN(ir1, ne1); const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // 
batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len @@ -291,7 +291,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); - for (uint32_t i1 = ir0; i1 < i0_end; i1++) { // attn-heads + for (uint32_t i1 = ir0; i1 < i1_end; i1++) { // attn-heads const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); From a6ef41f404da2e581a618d1efbb9a9669dfeea6e Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 12:10:56 +0800 Subject: [PATCH 08/23] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index dbb4df58b56..a4399704fcb 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - const uint32_t i1_end = MIN(ir1, ne1); - const int32_t half_dims = rope_ctx->n_dims / 2; + const uint32_t i1_end = MIN(ir1, ne1); + const int32_t half_dims = rope_ctx->n_dims / 2; + const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float); for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -338,7 +339,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } // TODO: use simd to speed up the remaining elements copy - memcpy(dst_data_loc, src_loc, (ne0 - rope_ctx->n_dims) * sizeof(float)); + memcpy(dst_data_loc, src_loc, remain_bytes); } } } From b567413ac97e6152b027714659cf8da847d041bf Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 1 Dec 2025 00:13:12 +0800 Subject: [PATCH 09/23] feat: add profiling macros for performance measurement in operations --- ggml/src/ggml-hexagon/htp/ops-utils.h | 7 +++++++ ggml/src/ggml-hexagon/htp/rope-ops.c | 9 +++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h index af9c3305f61..5e5c48afda3 100644 --- a/ggml/src/ggml-hexagon/htp/ops-utils.h +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -146,4 +146,11 @@ static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) { } } +#define PROFILER_START(name) const uint64_t name##_start_cycles = HAP_perf_get_qtimer_count() +#define PROFILER_END(name, ...) 
\ + do { \ + const uint64_t name##_end_cycles = HAP_perf_get_qtimer_count(); \ + FARF(HIGH, __VA_ARGS__, (unsigned) HAP_perf_qtimer_count_to_us(name##_end_cycles - name##_start_cycles)); \ + } while (0) + #endif /* OPS_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index a4399704fcb..a48cbf43f23 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -365,8 +365,7 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int return; } - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(rope_job_f32); int is_aligned = 1; int opt_path = 0; @@ -381,10 +380,8 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path); - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(rope_job_f32, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, + src0_end_row); } static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { From 7c8f10160b1b257537515f18667192d623eb2482 Mon Sep 17 00:00:00 2001 From: chraac Date: Wed, 3 Dec 2025 22:01:01 +0800 Subject: [PATCH 10/23] refactor: replace manual timing with profiling macros in matmul operations --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 73 ++++++++++---------------- 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index c99b6a0d18e..62a2a210971 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1092,8 +1092,7 @@ static void matmul(struct htp_matmul_type * mt, uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; uint8_t * restrict src1_data = src1_spad->data; - volatile uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matmul); const uint8_t * restrict src0_row = (const uint8_t *) src0->data; @@ -1144,12 +1143,9 @@ static void matmul(struct htp_matmul_type * mt, } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], - src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matmul, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, + nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } // q8x4x2 src1 tensor is already in VTCM spad @@ -1190,8 +1186,7 @@ static void matvec(struct htp_matmul_type * mt, uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; uint8_t * src1_data = src1_spad->data; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matvec); float * tmp = (float *) spad_dst; @@ -1236,12 +1231,9 @@ static void matvec(struct htp_matmul_type * mt, hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u 
(%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], - src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matvec, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, + nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)] @@ -1267,8 +1259,7 @@ static void matmul_id(struct htp_matmul_type * mt, dma_queue * dma_queue) { htp_matmul_preamble; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matmul_id); const uint32_t src0_nrows = ne01; // src0 rows per expert const uint32_t src1_nrows = ne11; @@ -1373,12 +1364,11 @@ static void matmul_id(struct htp_matmul_type * mt, } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, - ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], - dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matmul_id, + "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", + mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } // q8x4 src1 tensor is already in VTCM spad @@ -1397,8 +1387,7 @@ static void matvec_id(struct htp_matmul_type * mt, dma_queue * dma_queue) { htp_matmul_preamble; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matvec_id); const uint32_t src0_nrows = ne01; // src0 rows per expert @@ -1473,12 +1462,11 @@ static void matvec_id(struct htp_matmul_type * mt, } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, - ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], - dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matvec_id, + "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", + mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } // *** matmul in fp16 @@ -1495,8 +1483,7 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0, dma_queue * dma_queue) { htp_matmul_preamble; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matmul_f16_f32); const size_t src0_row_size = sizeof(__fp16) * ne00; const size_t src1_row_size = sizeof(float) * ne10; @@ -1575,12 +1562,10 @@ static void matmul_f16_f32(struct 
htp_tensor * restrict src0, } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matmul_f16_f32, + "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } // *** dynamic quant @@ -1662,7 +1647,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, uint32_t nth, uint32_t ith, uint32_t nrows_per_thread) { - uint64_t t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(quantize_fp32_q8x4); const uint32_t ne0 = src->ne[0]; const uint32_t ne1 = src->ne[1]; @@ -1694,10 +1679,8 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, src_data += src_row_size; } - uint64_t t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, - ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(quantize_fp32_q8x4, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, + nth, nrows, ir_first, ir_last, src_row_size, dst_row_size); } static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) { From 3b0cef47e20c66f680e75dc321141efe634772a5 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 5 Dec 2025 10:11:46 +0800 Subject: [PATCH 11/23] Revert "refactor: replace manual timing with profiling macros in matmul operations" This reverts commit 7c8f10160b1b257537515f18667192d623eb2482. 
--- ggml/src/ggml-hexagon/htp/matmul-ops.c | 73 ++++++++++++++++---------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 62a2a210971..c99b6a0d18e 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1092,7 +1092,8 @@ static void matmul(struct htp_matmul_type * mt, uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; uint8_t * restrict src1_data = src1_spad->data; - PROFILER_START(matmul); + volatile uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); const uint8_t * restrict src0_row = (const uint8_t *) src0->data; @@ -1143,9 +1144,12 @@ static void matmul(struct htp_matmul_type * mt, } } - PROFILER_END(matmul, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, - nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], + src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // q8x4x2 src1 tensor is already in VTCM spad @@ -1186,7 +1190,8 @@ static void matvec(struct htp_matmul_type * mt, uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; uint8_t * src1_data = src1_spad->data; - PROFILER_START(matvec); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); float * tmp = (float *) spad_dst; @@ -1231,9 +1236,12 @@ static void matvec(struct htp_matmul_type * mt, hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); - PROFILER_END(matvec, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, - nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], + src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)] @@ -1259,7 +1267,8 @@ static void matmul_id(struct htp_matmul_type * mt, dma_queue * dma_queue) { htp_matmul_preamble; - PROFILER_START(matmul_id); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); const uint32_t src0_nrows = ne01; // src0 rows per expert const uint32_t src1_nrows = ne11; @@ -1364,11 +1373,12 @@ static void matmul_id(struct htp_matmul_type * mt, } } - PROFILER_END(matmul_id, - "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", - mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, - src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = 
HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, + ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], + dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // q8x4 src1 tensor is already in VTCM spad @@ -1387,7 +1397,8 @@ static void matvec_id(struct htp_matmul_type * mt, dma_queue * dma_queue) { htp_matmul_preamble; - PROFILER_START(matvec_id); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); const uint32_t src0_nrows = ne01; // src0 rows per expert @@ -1462,11 +1473,12 @@ static void matvec_id(struct htp_matmul_type * mt, } } - PROFILER_END(matvec_id, - "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", - mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, - src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, + ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], + dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // *** matmul in fp16 @@ -1483,7 +1495,8 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0, dma_queue * dma_queue) { htp_matmul_preamble; - PROFILER_START(matmul_f16_f32); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); const size_t src0_row_size = sizeof(__fp16) * ne00; const size_t src1_row_size = sizeof(float) * ne10; @@ -1562,10 +1575,12 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0, } } - PROFILER_END(matmul_f16_f32, - "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, - src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // *** dynamic quant @@ -1647,7 +1662,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, uint32_t nth, uint32_t ith, uint32_t nrows_per_thread) { - PROFILER_START(quantize_fp32_q8x4); + uint64_t t1 = HAP_perf_get_qtimer_count(); const uint32_t ne0 = src->ne[0]; const uint32_t ne1 = src->ne[1]; @@ -1679,8 +1694,10 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, src_data += src_row_size; } - PROFILER_END(quantize_fp32_q8x4, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, - nth, nrows, ir_first, ir_last, src_row_size, dst_row_size); + uint64_t t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) 
row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, + ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) { From 121e656b3184c69d248807de2ab4f0d7ff0e9c9e Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 5 Dec 2025 10:11:55 +0800 Subject: [PATCH 12/23] Revert "feat: add profiling macros for performance measurement in operations" This reverts commit b567413ac97e6152b027714659cf8da847d041bf. --- ggml/src/ggml-hexagon/htp/ops-utils.h | 7 ------- ggml/src/ggml-hexagon/htp/rope-ops.c | 9 ++++++--- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h index 5e5c48afda3..af9c3305f61 100644 --- a/ggml/src/ggml-hexagon/htp/ops-utils.h +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -146,11 +146,4 @@ static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) { } } -#define PROFILER_START(name) const uint64_t name##_start_cycles = HAP_perf_get_qtimer_count() -#define PROFILER_END(name, ...) \ - do { \ - const uint64_t name##_end_cycles = HAP_perf_get_qtimer_count(); \ - FARF(HIGH, __VA_ARGS__, (unsigned) HAP_perf_qtimer_count_to_us(name##_end_cycles - name##_start_cycles)); \ - } while (0) - #endif /* OPS_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index a48cbf43f23..a4399704fcb 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -365,7 +365,8 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int return; } - PROFILER_START(rope_job_f32); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); int is_aligned = 1; int opt_path = 0; @@ -380,8 +381,10 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path); - PROFILER_END(rope_job_f32, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, - src0_end_row); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { From 401fd3ee1a5f33a22bf6d3c1aec2283b7cbe6f84 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 5 Dec 2025 15:40:15 +0800 Subject: [PATCH 13/23] refactor: optimize vector operations in vec_dot_q4x4x2_q8x4x2_rx2 function --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 37 +++++++++++++++++++------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index c99b6a0d18e..418d8bfa36d 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -421,6 +421,9 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elemements + const HVX_VectorPred rd_mask = Q6_Q_vsetq_R(VLEN / 2); + r1_x_d -= VLEN / 2; // make sure r1 at the high half of the vector + uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); @@ -430,12 +433,19 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, 
vy_q)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + vy_d = Q6_Vh_vshuff_Vh(vy_d); + HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); + + vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); + r01_d = Q6_Vh_vshuff_Vh(r01_d); + + HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); @@ -453,12 +463,19 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + vy_d = Q6_Vh_vshuff_Vh(vy_d); + HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); + + vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); + r01_d = Q6_Vh_vshuff_Vh(r01_d); + + HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); From cf491f2c8fa7239876c63e05631b8e2881214c87 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 6 Dec 2025 00:02:18 +0800 Subject: [PATCH 14/23] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 418d8bfa36d..4fafe41c1f8 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -934,6 +934,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri // for some reason we need volatile here so that the compiler doesn't try anything funky volatile HVX_Vector rsum = Q6_V_vsplat_R(0); + const HVX_Vector kOne = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 uint32_t i = 0; @@ -941,7 +942,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), kOne); // mul by 1.0 HVX_Vector hi = 
Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); @@ -954,7 +955,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), kOne); // mul by 1.0 if (nv1 >= 32) { HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); From 3a01d8211b0e91bc3e261bfa87d9ed06842733c1 Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 7 Dec 2025 23:11:03 +0800 Subject: [PATCH 15/23] feat: enhance vec_dot_q4x4x2_q8x4x2_rx2 function with optimized data handling and processing --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 59 ++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 4fafe41c1f8..5c7dae7c436 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -393,11 +393,11 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const uint32_t qk = QK_Q4_0x4x2 * 4; - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_dblk_size = 8 * 4 * sizeof(uint16_t); // 32x __fp16 const uint32_t x_qblk_size = qk / 2; // int4 const uint32_t x_qrow_size = n / 2; // int4 (not padded) - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_dblk_size = 8 * 4 * sizeof(uint16_t); // 32x __fp16 const uint32_t y_qblk_size = qk; // int8 const uint32_t y_qrow_size = n; // int8 (not padded) @@ -421,10 +421,63 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elemements + uint32_t i = 0; + for (; i + 1 < nb; i += 2) { + HVX_Vector r00_ia; + HVX_Vector r10_ia; + { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); + + r00_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + r10_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + } + + HVX_Vector r01_ia; + HVX_Vector r11_ia; + { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + (i + 1) * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + (i + 1) * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + (i + 1) * x_qblk_size); + + r01_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + r11_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + } + + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + + vy_d = Q6_Vh_vshuff_Vh(vy_d); + r0_d = Q6_Vh_vshuff_Vh(r0_d); + r1_d = Q6_Vh_vshuff_Vh(r1_d); + + HVX_VectorPair r0_dd = Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d); + HVX_VectorPair r1_dd = Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d); + + HVX_Vector r00_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r0_dd)); + HVX_Vector r01_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r0_dd)); + + HVX_Vector r10_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r1_dd)); + HVX_Vector r11_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r1_dd)); + + HVX_Vector r00_fa = Q6_Vqf32_vmpy_VsfVsf(r00_ia, r00_dd); + HVX_Vector r01_fa = Q6_Vqf32_vmpy_VsfVsf(r01_ia, 
r01_dd); + + HVX_Vector r10_fa = Q6_Vqf32_vmpy_VsfVsf(r10_ia, r10_dd); + HVX_Vector r11_fa = Q6_Vqf32_vmpy_VsfVsf(r11_ia, r11_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r00_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r10_fa); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r01_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r11_fa); + } + const HVX_VectorPred rd_mask = Q6_Q_vsetq_R(VLEN / 2); r1_x_d -= VLEN / 2; // make sure r1 at the high half of the vector - uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); From 421d0313d98d314e61f1babbf3fa0f469d4c4a8b Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 8 Dec 2025 11:54:15 +0800 Subject: [PATCH 16/23] feat: add hvx_vec_load_d_and_mpy function for optimized data loading and multiplication --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 41 ++++++++++++++++---------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 5c7dae7c436..db9bc8821ef 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -307,6 +307,28 @@ static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y, return hvx_vec_rmpy_x8_n(x, y, 1024); } +static inline HVX_Vector_x2 hvx_vec_load_d_and_mpy_rx2(const uint8_t * restrict r0_x_d, + const uint8_t * restrict r1_x_d, + const uint8_t * restrict y_d, + const HVX_Vector rd_mask) { + HVX_Vector vy_d = *(const HVX_UVector *) y_d; + HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d; + HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d; + + vy_d = Q6_Vh_vshuff_Vh(vy_d); + HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); + + vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); + r01_d = Q6_Vh_vshuff_Vh(r01_d); + + HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); + + HVX_Vector_x2 r; + r.v[0] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); + r.v[1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); + return r; +} + static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx % 128 == 0); @@ -486,22 +508,11 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - vy_d = Q6_Vh_vshuff_Vh(vy_d); - HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); - - vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); - r01_d = Q6_Vh_vshuff_Vh(r01_d); + HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, + y_d + i * y_dblk_size, rd_mask); - HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r_dd.v[0]); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r_dd.v[1]); r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); From 
bd438609848e5e3b37f35753a0261c4ccec4bbe7 Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 8 Dec 2025 12:12:59 +0800 Subject: [PATCH 17/23] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index db9bc8821ef..96e8588e96e 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -527,24 +527,13 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - vy_d = Q6_Vh_vshuff_Vh(vy_d); - HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); - - vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); - r01_d = Q6_Vh_vshuff_Vh(r01_d); - - HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); + HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, + y_d + i * y_dblk_size, rd_mask); // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); + HVX_Vector r0_dd = Q6_V_vand_QV(bmask, r_dd.v[0]); + HVX_Vector r1_dd = Q6_V_vand_QV(bmask, r_dd.v[1]); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); @@ -554,8 +543,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); From b1974641a560c94388fe964e3a6778ac6e6934de Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 8 Dec 2025 12:26:07 +0800 Subject: [PATCH 18/23] feat: add hvx_vec_load_d_and_mpy_r2x2 function for optimized vector loading and multiplication --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 48 +++++++++++++++----------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 96e8588e96e..3fc5394bfb6 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -329,6 +329,28 @@ static inline HVX_Vector_x2 hvx_vec_load_d_and_mpy_rx2(const uint8_t * restrict return r; } +static inline HVX_Vector_x4 hvx_vec_load_d_and_mpy_r2x2(const uint8_t * restrict r0_x_d, + const uint8_t * restrict r1_x_d, + const uint8_t * restrict y_d) { + HVX_Vector vy_d = *(const HVX_UVector *) y_d; + HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d; + HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d; + + vy_d = Q6_Vh_vshuff_Vh(vy_d); + r0_d = Q6_Vh_vshuff_Vh(r0_d); + r1_d = Q6_Vh_vshuff_Vh(r1_d); + + HVX_VectorPair r0_dd = Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d); + HVX_VectorPair r1_dd = Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d); + + HVX_Vector_x4 r; + r.v[0] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r0_dd)); + r.v[1] = 
Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r0_dd)); + r.v[2] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r1_dd)); + r.v[3] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r1_dd)); + return r; +} + static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx % 128 == 0); @@ -467,28 +489,14 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, r11_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); } - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - vy_d = Q6_Vh_vshuff_Vh(vy_d); - r0_d = Q6_Vh_vshuff_Vh(r0_d); - r1_d = Q6_Vh_vshuff_Vh(r1_d); - - HVX_VectorPair r0_dd = Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d); - HVX_VectorPair r1_dd = Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d); - - HVX_Vector r00_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r0_dd)); - HVX_Vector r01_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r0_dd)); - - HVX_Vector r10_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r1_dd)); - HVX_Vector r11_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r1_dd)); + HVX_Vector_x4 r_dd = + hvx_vec_load_d_and_mpy_r2x2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, y_d + i * y_dblk_size); - HVX_Vector r00_fa = Q6_Vqf32_vmpy_VsfVsf(r00_ia, r00_dd); - HVX_Vector r01_fa = Q6_Vqf32_vmpy_VsfVsf(r01_ia, r01_dd); + HVX_Vector r00_fa = Q6_Vqf32_vmpy_VsfVsf(r00_ia, r_dd.v[0]); + HVX_Vector r01_fa = Q6_Vqf32_vmpy_VsfVsf(r01_ia, r_dd.v[1]); - HVX_Vector r10_fa = Q6_Vqf32_vmpy_VsfVsf(r10_ia, r10_dd); - HVX_Vector r11_fa = Q6_Vqf32_vmpy_VsfVsf(r11_ia, r11_dd); + HVX_Vector r10_fa = Q6_Vqf32_vmpy_VsfVsf(r10_ia, r_dd.v[2]); + HVX_Vector r11_fa = Q6_Vqf32_vmpy_VsfVsf(r11_ia, r_dd.v[3]); r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r00_fa); r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r10_fa); From 309d78298762c27a5a07f7bf34b48faa7ae0820f Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 8 Dec 2025 12:46:25 +0800 Subject: [PATCH 19/23] feat: optimize vec_dot functions with improved data handling and loading --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 35 +++++++++++--------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 3fc5394bfb6..03c3e50a89a 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -540,8 +540,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - HVX_Vector r0_dd = Q6_V_vand_QV(bmask, r_dd.v[0]); - HVX_Vector r1_dd = Q6_V_vand_QV(bmask, r_dd.v[1]); + HVX_Vector r0_dd = Q6_V_vand_QV(bmask, r_dd.v[0]); + HVX_Vector r1_dd = Q6_V_vand_QV(bmask, r_dd.v[1]); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); @@ -672,6 +672,9 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, const uint32_t nb = n / qk; // num full blocks int32_t nloe = n % qk; // num leftover elemements (must be signed) + const HVX_VectorPred rd_mask = Q6_Q_vsetq_R(VLEN / 2); + r1_x_d -= VLEN / 2; // make sure r1 at the high half of the vector + uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); @@ -681,15 +684,11 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); 
- HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, + y_d + i * y_dblk_size, rd_mask); - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r_dd.v[0]); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r_dd.v[1]); r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); @@ -704,17 +703,13 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, + y_d + i * y_dblk_size, rd_mask); // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); + HVX_Vector r0_dd = Q6_V_vand_QV(bmask, r_dd.v[0]); + HVX_Vector r1_dd = Q6_V_vand_QV(bmask, r_dd.v[1]); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); @@ -724,8 +719,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); From dbe93098513c212c6d48a89f12b4105fcd84f84c Mon Sep 17 00:00:00 2001 From: chraac Date: Tue, 9 Dec 2025 19:10:05 +0800 Subject: [PATCH 20/23] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 03c3e50a89a..7fabedf64ab 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -508,7 +508,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const HVX_VectorPred rd_mask = Q6_Q_vsetq_R(VLEN / 2); r1_x_d -= VLEN / 2; // make sure r1 at the high half of the vector - for (; i < nb; i++) { + if (i < nb) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); @@ -524,6 +524,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, r0_sum = 
Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + i++; } // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks @@ -757,8 +758,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, // Compute combined scale (fp32). // Apply scale to acc and accumulate into the row sum (qf32). - const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + const HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 + const HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + const HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); uint32_t i = 0; for (; i < nb; i++) { @@ -771,19 +776,16 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); - vy_d = Q6_Vsf_equals_Vqf32(vy_d); + vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); + vy_d = Q6_Vsf_equals_Vqf32(vy_d); // Convert rX_d scales from e8m0 to fp32 // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... // Left shift with zero fill to create FP32 // FIXME: might need to handle zero as a special case (see ggml-cpu code) - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + r0_d = Q6_V_vdelta_VV(r0_d, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + r0_d = Q6_Vw_vasl_VwR(r0_d, 23); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); @@ -811,11 +813,9 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... 
         // Left shift with zero fill to create FP32
         // FIXME: might need to handle zero as a special case (see ggml-cpu code)
-        HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0;
-        HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
-        r0_d = Q6_V_vdelta_VV(r0_d, expand);
-        r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
-        r0_d = Q6_Vw_vasl_VwR(r0_d, 23);
+        r0_d = Q6_V_vdelta_VV(r0_d, expand);
+        r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
+        r0_d = Q6_Vw_vasl_VwR(r0_d, 23);

         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
From 00d5fb31b83e82491d1d4ed9fd0858087f02355e Mon Sep 17 00:00:00 2001
From: chraac
Date: Tue, 9 Dec 2025 19:29:49 +0800
Subject: [PATCH 21/23] feat: add build information and update vector loading functions for optimization

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 32 +++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 7fabedf64ab..7eb9aedeb5b 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -307,13 +307,13 @@ static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y,
     return hvx_vec_rmpy_x8_n(x, y, 1024);
 }

-static inline HVX_Vector_x2 hvx_vec_load_d_and_mpy_rx2(const uint8_t * restrict r0_x_d,
+static inline HVX_Vector_x2 hvx_vec_load_and_mul_d_rx2(const uint8_t * restrict r0_x_d,
                                                        const uint8_t * restrict r1_x_d,
                                                        const uint8_t * restrict y_d,
                                                        const HVX_Vector rd_mask) {
-    HVX_Vector vy_d = *(const HVX_UVector *) y_d;
-    HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d;
-    HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d;
+    HVX_Vector vy_d = *(const HVX_Vector *) y_d;
+    HVX_Vector r0_d = *(const HVX_Vector *) r0_x_d;
+    HVX_Vector r1_d = *(const HVX_Vector *) r1_x_d;

     vy_d = Q6_Vh_vshuff_Vh(vy_d);
     HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d);
@@ -329,12 +329,12 @@ static inline HVX_Vector_x2 hvx_vec_load_d_and_mpy_rx2(const uint8_t * restrict
     return r;
 }

-static inline HVX_Vector_x4 hvx_vec_load_d_and_mpy_r2x2(const uint8_t * restrict r0_x_d,
+static inline HVX_Vector_x4 hvx_vec_load_and_mul_d_r2x2(const uint8_t * restrict r0_x_d,
                                                         const uint8_t * restrict r1_x_d,
                                                         const uint8_t * restrict y_d) {
-    HVX_Vector vy_d = *(const HVX_UVector *) y_d;
-    HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d;
-    HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d;
+    HVX_Vector vy_d = *(const HVX_Vector *) y_d;
+    HVX_Vector r0_d = *(const HVX_Vector *) r0_x_d;
+    HVX_Vector r1_d = *(const HVX_Vector *) r1_x_d;

     vy_d = Q6_Vh_vshuff_Vh(vy_d);
     r0_d = Q6_Vh_vshuff_Vh(r0_d);
@@ -490,7 +490,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
         }

         HVX_Vector_x4 r_dd =
-            hvx_vec_load_d_and_mpy_r2x2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, y_d + i * y_dblk_size);
+            hvx_vec_load_and_mul_d_r2x2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, y_d + i * y_dblk_size);

         HVX_Vector r00_fa = Q6_Vqf32_vmpy_VsfVsf(r00_ia, r_dd.v[0]);
         HVX_Vector r01_fa = Q6_Vqf32_vmpy_VsfVsf(r01_ia, r_dd.v[1]);
@@ -516,7 +516,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));

-        HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
+        HVX_Vector_x2 r_dd = hvx_vec_load_and_mul_d_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
                                                         y_d + i * y_dblk_size, rd_mask);

         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r_dd.v[0]);
@@ -536,7 +536,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
     HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
     HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));

-    HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
+    HVX_Vector_x2 r_dd = hvx_vec_load_and_mul_d_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
                                                     y_d + i * y_dblk_size, rd_mask);

     // Zero out unused scales
@@ -685,7 +685,7 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));

-        HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
+        HVX_Vector_x2 r_dd = hvx_vec_load_and_mul_d_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
                                                         y_d + i * y_dblk_size, rd_mask);

         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r_dd.v[0]);
@@ -704,7 +704,7 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
     HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
     HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));

-    HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
+    HVX_Vector_x2 r_dd = hvx_vec_load_and_mul_d_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
                                                     y_d + i * y_dblk_size, rd_mask);

     // Zero out unused scales
@@ -961,8 +961,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
     }

     // Convert into fp32 and reduce
-    r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
-    r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));
+    r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
+    r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));

     HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
     hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
@@ -2273,7 +2273,7 @@ int op_matmul_id(struct htp_ops_context * octx) {
             assert(i02 >= 0 && i02 < n_as);

-            MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) { id, iid1 };
+            MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping){ id, iid1 };
             matrix_row_counts[i02] += 1;
         }
     }
From b54ff1895d20c896d29fb708ef27cd1202d2d9b3 Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 10 Dec 2025 21:46:40 +0800
Subject: [PATCH 22/23] revert rope changes

---
 ggml/src/ggml-hexagon/htp/rope-ops.c | 78 +++++++++++++++-------------
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c
index a4399704fcb..00419bcba6b 100644
--- a/ggml/src/ggml-hexagon/htp/rope-ops.c
+++ b/ggml/src/ggml-hexagon/htp/rope-ops.c
@@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
     return (1 - MIN(1, MAX(0, y)));
 }

-static void rope_cache_init(const float theta_base,
-                            const float freq_scale,
-                            const float * freq_factors,
-                            float * corr_dims,
-                            const uint32_t ne0,
-                            const float ext_factor,
-                            const float mscale,
-                            float * cache,
-                            const float theta_scale) {
+static void rope_cache_init(const float theta_base,
+                            float freq_scale,
+                            const float * freq_factors,
+                            float * corr_dims,
+                            uint32_t ne0,
+                            float ext_factor,
+                            float mscale,
+                            float * cache,
+                            float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
     float theta = theta_base;

@@ -92,19 +92,18 @@ static void rope_cache_init(const float theta_base,
         // Get n-d rotational scaling corrected for extrapolation
         float theta_interp = freq_scale * theta_extrap;
-        float theta_final = theta_interp;
-        float mscale_final = mscale;
+        float theta2 = theta_interp;

         if (ext_factor != 0.0f) {
             float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-            theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

             // Get n-d magnitude scaling corrected for interpolation
-            mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
         }

-        cache[i0 + 0] = cosf(theta_final) * mscale_final;
-        cache[i0 + 1] = sinf(theta_final) * mscale_final;
+        cache[i0 + 0] = cosf(theta2) * mscale;
+        cache[i0 + 1] = sinf(theta2) * mscale;

         theta *= theta_scale;
     }
@@ -152,9 +151,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
 }

 static void hvx_calc_rope_neox_f32(const float * restrict src0,
-                                   float * restrict dst,
-                                   const int num_elems,
-                                   const float * restrict theta_cache) {
+                                   float * restrict dst,
+                                   const int num_elems,
+                                   const float * restrict theta_cache) {
     // for (int i = 0; i < num_elems; i += 2) {
     //const float cos_theta = theta_cache[i + 0];
     //const float sin_theta = theta_cache[i + 1];
@@ -193,7 +192,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
         HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
         HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);

-        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
+        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
         *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);

         src0_curr += VLEN;
@@ -260,7 +259,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                          const uint32_t ir1,
                          int nth,
                          int ith,
-                         const int opt_path) {
+                         int opt_path) {
     struct htp_ops_context * octx = rope_ctx->octx;

     const struct htp_tensor * src0 = &octx->src0;
@@ -268,8 +267,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
     const struct htp_tensor * src2 = &octx->src2;
     struct htp_tensor * dst = &octx->dst;

-    const int32_t mode = rope_ctx->mode;
-    const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
+    const int32_t mode = rope_ctx->mode;
+    const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;

     htp_rope_preamble;

@@ -282,9 +281,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
         freq_factors = (const float *) src2->data;
     }

-    const uint32_t i1_end = MIN(ir1, ne1);
-    const int32_t half_dims = rope_ctx->n_dims / 2;
-    const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
+    int ir = 0;
+
     for (uint32_t i3 = 0; i3 < ne3; i3++) {      // batch
         for (uint32_t i2 = 0; i2 < ne2; i2++) {  // seq-len
             const int32_t p = pos[i2];
@@ -292,7 +290,14 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
             rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
                             rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);

-            for (uint32_t i1 = ir0; i1 < i1_end; i1++) {  // attn-heads
+            for (uint32_t i1 = 0; i1 < ne1; i1++) {  // attn-heads
+                if (ir++ < ir0) {
+                    continue;
+                }
+                if (ir > ir1) {
+                    break;
+                }
+
                 const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
                 float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);

@@ -305,9 +310,6 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                     } else {
                         hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
                     }
-
-                    src_loc += rope_ctx->n_dims;
-                    dst_data_loc += rope_ctx->n_dims;
                 } else {
                     for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
                         const float cos_theta = wp0[i0 + 0];
@@ -315,10 +317,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,

                         if (is_neox) {
                             const float x0 = src_loc[0];
-                            const float x1 = src_loc[half_dims];
+                            const float x1 = src_loc[rope_ctx->n_dims/2];

-                            dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
-                            dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;
+                            dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
+                            dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;

                             src_loc += 1;
                             dst_data_loc += 1;
@@ -333,13 +335,15 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                             dst_data_loc += 2;
                         }
                     }
-
-                    src_loc += (is_neox ? half_dims : 0);
-                    dst_data_loc += (is_neox ? half_dims : 0);
                 }

-                // TODO: use simd to speed up the remaining elements copy
-                memcpy(dst_data_loc, src_loc, remain_bytes);
+                for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
+                    dst_data_loc[0] = src_loc[0];
+                    dst_data_loc[1] = src_loc[1];
+
+                    src_loc += 2;
+                    dst_data_loc += 2;
+                }
             }
         }
     }
From 09c48991df3b8d0cd6fd90e108184ac7ed61633d Mon Sep 17 00:00:00 2001
From: chraac
Date: Thu, 11 Dec 2025 11:43:49 +0800
Subject: [PATCH 23/23] fix: revert HVX_Vector back to HVX_UVector

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 7eb9aedeb5b..48bebc18ca0 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -311,9 +311,9 @@ static inline HVX_Vector_x2 hvx_vec_load_and_mul_d_rx2(const uint8_t * restrict
                                                        const uint8_t * restrict r1_x_d,
                                                        const uint8_t * restrict y_d,
                                                        const HVX_Vector rd_mask) {
-    HVX_Vector vy_d = *(const HVX_Vector *) y_d;
-    HVX_Vector r0_d = *(const HVX_Vector *) r0_x_d;
-    HVX_Vector r1_d = *(const HVX_Vector *) r1_x_d;
+    HVX_Vector vy_d = *(const HVX_UVector *) y_d;
+    HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d;
+    HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d;

     vy_d = Q6_Vh_vshuff_Vh(vy_d);
     HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d);
@@ -332,9 +332,9 @@ static inline HVX_Vector_x2 hvx_vec_load_and_mul_d_rx2(const uint8_t * restrict
 static inline HVX_Vector_x4 hvx_vec_load_and_mul_d_r2x2(const uint8_t * restrict r0_x_d,
                                                         const uint8_t * restrict r1_x_d,
                                                         const uint8_t * restrict y_d) {
-    HVX_Vector vy_d = *(const HVX_Vector *) y_d;
-    HVX_Vector r0_d = *(const HVX_Vector *) r0_x_d;
-    HVX_Vector r1_d = *(const HVX_Vector *) r1_x_d;
+    HVX_Vector vy_d = *(const HVX_UVector *) y_d;
+    HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d;
+    HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d;

     vy_d = Q6_Vh_vshuff_Vh(vy_d);
     r0_d = Q6_Vh_vshuff_Vh(r0_d);