From 407b408981abc4180f64b99f89b003168bf7b962 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 27 Nov 2025 12:54:46 +0800 Subject: [PATCH 01/23] fix test failure --- ggml/src/ggml-hexagon/htp/rope-ops.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 00419bcba6b..ba0ff2dbc61 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -151,9 +151,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context } static void hvx_calc_rope_neox_f32(const float * restrict src0, - float * restrict dst, - const int num_elems, - const float * restrict theta_cache) { + float * restrict dst, + const int num_elems, + const float * restrict theta_cache) { // for (int i = 0; i < num_elems; i += 2) { //const float cos_theta = theta_cache[i + 0]; //const float sin_theta = theta_cache[i + 1]; @@ -192,7 +192,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0, HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s); HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c); - *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); + *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4); *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5); src0_curr += VLEN; @@ -259,7 +259,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, const uint32_t ir1, int nth, int ith, - int opt_path) { + const int opt_path) { struct htp_ops_context * octx = rope_ctx->octx; const struct htp_tensor * src0 = &octx->src0; @@ -267,8 +267,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, const struct htp_tensor * src2 = &octx->src2; struct htp_tensor * dst = &octx->dst; - const int32_t mode = rope_ctx->mode; - const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; + const int32_t mode = rope_ctx->mode; + const bool is_neox = mode & HTP_ROPE_TYPE_NEOX; htp_rope_preamble; @@ -317,10 +317,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, if (is_neox) { const float x0 = src_loc[0]; - const float x1 = src_loc[rope_ctx->n_dims/2]; + const float x1 = src_loc[rope_ctx->n_dims / 2]; - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta; + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[rope_ctx->n_dims / 2] = x0 * sin_theta + x1 * cos_theta; src_loc += 1; dst_data_loc += 1; @@ -337,6 +337,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } } + src_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; + dst_data_loc += is_neox ? 
(rope_ctx->n_dims / 2) : 0; for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From 4ddb8a449cbe057d67b684b7c34192e69f743d30 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 01:15:25 +0800 Subject: [PATCH 02/23] fix: correct scaling calculations in rope_cache_init --- ggml/src/ggml-hexagon/htp/rope-ops.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ba0ff2dbc61..719efc09b7a 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -93,17 +93,18 @@ static void rope_cache_init(const float theta_base, // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; float theta2 = theta_interp; + float mscale2 = mscale; if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + mscale2 *= 1.0f + 0.1f * logf(1.0f / freq_scale); } - cache[i0 + 0] = cosf(theta2) * mscale; - cache[i0 + 1] = sinf(theta2) * mscale; + cache[i0 + 0] = cosf(theta2) * mscale2; + cache[i0 + 1] = sinf(theta2) * mscale2; theta *= theta_scale; } @@ -337,8 +338,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } } - src_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; - dst_data_loc += is_neox ? (rope_ctx->n_dims / 2) : 0; + src_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); + dst_data_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From cfca78b0d01b76aad6b1a89bc08045c9eaaa23d0 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 01:17:41 +0800 Subject: [PATCH 03/23] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 719efc09b7a..ef24c80f82b 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { return (1 - MIN(1, MAX(0, y))); } -static void rope_cache_init(const float theta_base, - float freq_scale, - const float * freq_factors, - float * corr_dims, - uint32_t ne0, - float ext_factor, - float mscale, - float * cache, - float theta_scale) { +static void rope_cache_init(const float theta_base, + const float freq_scale, + const float * freq_factors, + float * corr_dims, + const uint32_t ne0, + const float ext_factor, + const float mscale, + float * cache, + const float theta_scale) { // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py float theta = theta_base; From e9a02fdba1246b747096cb9bbcbd67c7236ed2de Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 10:33:43 +0800 Subject: [PATCH 04/23] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ef24c80f82b..7519505ab0d 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,8 @@ static void 
rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - int ir = 0; - + int ir = 0; + const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -311,6 +311,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } else { hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0); } + + src_loc += rope_ctx->n_dims; + dst_data_loc += rope_ctx->n_dims; } else { for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) { const float cos_theta = wp0[i0 + 0]; @@ -318,10 +321,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, if (is_neox) { const float x0 = src_loc[0]; - const float x1 = src_loc[rope_ctx->n_dims / 2]; + const float x1 = src_loc[half_dims]; - dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; - dst_data_loc[rope_ctx->n_dims / 2] = x0 * sin_theta + x1 * cos_theta; + dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta; + dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta; src_loc += 1; dst_data_loc += 1; @@ -336,10 +339,11 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, dst_data_loc += 2; } } + + src_loc += (is_neox ? half_dims : 0); + dst_data_loc += (is_neox ? half_dims : 0); } - src_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); - dst_data_loc += (is_neox ? (rope_ctx->n_dims / 2) : 0); for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { dst_data_loc[0] = src_loc[0]; dst_data_loc[1] = src_loc[1]; From e324bb0bd50898537e41810fcd41f82ce4a78e7c Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 10:57:27 +0800 Subject: [PATCH 05/23] fix: optimize element copying in rope_hex_f32 using memcpy --- ggml/src/ggml-hexagon/htp/rope-ops.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 7519505ab0d..0e71dcfae8e 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -344,13 +344,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, dst_data_loc += (is_neox ? 
half_dims : 0); } - for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) { - dst_data_loc[0] = src_loc[0]; - dst_data_loc[1] = src_loc[1]; - - src_loc += 2; - dst_data_loc += 2; - } + // TODO: use simd to speed up the remaining elements copy + memcpy(dst_data_loc, src_loc, (ne0 - rope_ctx->n_dims) * sizeof(float)); } } } From 0121291d5358ee27bf4e8bd96106020f4b56857c Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 11:12:12 +0800 Subject: [PATCH 06/23] fix: optimize loop boundaries in rope_hex_f32 for better performance --- ggml/src/ggml-hexagon/htp/rope-ops.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index 0e71dcfae8e..ddce8971204 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - int ir = 0; - const int32_t half_dims = rope_ctx->n_dims / 2; + const uint32_t i0_end = MIN(ir1, ne1); + const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -291,14 +291,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); - for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads - if (ir++ < ir0) { - continue; - } - if (ir > ir1) { - break; - } - + for (uint32_t i1 = ir0; i1 < i0_end; i1++) { // attn-heads const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); From 010039a15e0239de0685d70f0a144587efb3f4ee Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 11:22:22 +0800 Subject: [PATCH 07/23] rename --- ggml/src/ggml-hexagon/htp/rope-ops.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index ddce8971204..dbb4df58b56 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -92,19 +92,19 @@ static void rope_cache_init(const float theta_base, // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; - float theta2 = theta_interp; - float mscale2 = mscale; + float theta_final = theta_interp; + float mscale_final = mscale; if (ext_factor != 0.0f) { float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; // Get n-d magnitude scaling corrected for interpolation - mscale2 *= 1.0f + 0.1f * logf(1.0f / freq_scale); + mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale); } - cache[i0 + 0] = cosf(theta2) * mscale2; - cache[i0 + 1] = sinf(theta2) * mscale2; + cache[i0 + 0] = cosf(theta_final) * mscale_final; + cache[i0 + 1] = sinf(theta_final) * mscale_final; theta *= theta_scale; } @@ -282,7 +282,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - const uint32_t i0_end = MIN(ir1, ne1); + const uint32_t i1_end = MIN(ir1, ne1); const int32_t half_dims = rope_ctx->n_dims / 2; for (uint32_t i3 = 0; i3 < ne3; i3++) { // 
batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len @@ -291,7 +291,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor, rope_ctx->attn_factor, wp0, rope_ctx->theta_scale); - for (uint32_t i1 = ir0; i1 < i0_end; i1++) { // attn-heads + for (uint32_t i1 = ir0; i1 < i1_end; i1++) { // attn-heads const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01); float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1); From a6ef41f404da2e581a618d1efbb9a9669dfeea6e Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 28 Nov 2025 12:10:56 +0800 Subject: [PATCH 08/23] wip --- ggml/src/ggml-hexagon/htp/rope-ops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index dbb4df58b56..a4399704fcb 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -282,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, freq_factors = (const float *) src2->data; } - const uint32_t i1_end = MIN(ir1, ne1); - const int32_t half_dims = rope_ctx->n_dims / 2; + const uint32_t i1_end = MIN(ir1, ne1); + const int32_t half_dims = rope_ctx->n_dims / 2; + const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float); for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len const int32_t p = pos[i2]; @@ -338,7 +339,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx, } // TODO: use simd to speed up the remaining elements copy - memcpy(dst_data_loc, src_loc, (ne0 - rope_ctx->n_dims) * sizeof(float)); + memcpy(dst_data_loc, src_loc, remain_bytes); } } } From b567413ac97e6152b027714659cf8da847d041bf Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 1 Dec 2025 00:13:12 +0800 Subject: [PATCH 09/23] feat: add profiling macros for performance measurement in operations --- ggml/src/ggml-hexagon/htp/ops-utils.h | 7 +++++++ ggml/src/ggml-hexagon/htp/rope-ops.c | 9 +++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h index af9c3305f61..5e5c48afda3 100644 --- a/ggml/src/ggml-hexagon/htp/ops-utils.h +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -146,4 +146,11 @@ static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) { } } +#define PROFILER_START(name) const uint64_t name##_start_cycles = HAP_perf_get_qtimer_count() +#define PROFILER_END(name, ...) 
\ + do { \ + const uint64_t name##_end_cycles = HAP_perf_get_qtimer_count(); \ + FARF(HIGH, __VA_ARGS__, (unsigned) HAP_perf_qtimer_count_to_us(name##_end_cycles - name##_start_cycles)); \ + } while (0) + #endif /* OPS_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index a4399704fcb..a48cbf43f23 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -365,8 +365,7 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int return; } - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(rope_job_f32); int is_aligned = 1; int opt_path = 0; @@ -381,10 +380,8 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path); - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(rope_job_f32, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, + src0_end_row); } static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { From 7c8f10160b1b257537515f18667192d623eb2482 Mon Sep 17 00:00:00 2001 From: chraac Date: Wed, 3 Dec 2025 22:01:01 +0800 Subject: [PATCH 10/23] refactor: replace manual timing with profiling macros in matmul operations --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 73 ++++++++++---------------- 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index c99b6a0d18e..62a2a210971 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1092,8 +1092,7 @@ static void matmul(struct htp_matmul_type * mt, uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; uint8_t * restrict src1_data = src1_spad->data; - volatile uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matmul); const uint8_t * restrict src0_row = (const uint8_t *) src0->data; @@ -1144,12 +1143,9 @@ static void matmul(struct htp_matmul_type * mt, } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], - src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matmul, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, + nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } // q8x4x2 src1 tensor is already in VTCM spad @@ -1190,8 +1186,7 @@ static void matvec(struct htp_matmul_type * mt, uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; uint8_t * src1_data = src1_spad->data; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matvec); float * tmp = (float *) spad_dst; @@ -1236,12 +1231,9 @@ static void matvec(struct htp_matmul_type * mt, hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u 
(%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], - src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matvec, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, + nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)] @@ -1267,8 +1259,7 @@ static void matmul_id(struct htp_matmul_type * mt, dma_queue * dma_queue) { htp_matmul_preamble; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matmul_id); const uint32_t src0_nrows = ne01; // src0 rows per expert const uint32_t src1_nrows = ne11; @@ -1373,12 +1364,11 @@ static void matmul_id(struct htp_matmul_type * mt, } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, - ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], - dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matmul_id, + "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", + mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } // q8x4 src1 tensor is already in VTCM spad @@ -1397,8 +1387,7 @@ static void matvec_id(struct htp_matmul_type * mt, dma_queue * dma_queue) { htp_matmul_preamble; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matvec_id); const uint32_t src0_nrows = ne01; // src0 rows per expert @@ -1473,12 +1462,11 @@ static void matvec_id(struct htp_matmul_type * mt, } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, - ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], - dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matvec_id, + "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", + mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } // *** matmul in fp16 @@ -1495,8 +1483,7 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0, dma_queue * dma_queue) { htp_matmul_preamble; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(matmul_f16_f32); const size_t src0_row_size = sizeof(__fp16) * ne00; const size_t src1_row_size = sizeof(float) * ne10; @@ -1575,12 +1562,10 @@ static void matmul_f16_f32(struct 
htp_tensor * restrict src0, } } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(matmul_f16_f32, + "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); } // *** dynamic quant @@ -1662,7 +1647,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, uint32_t nth, uint32_t ith, uint32_t nrows_per_thread) { - uint64_t t1 = HAP_perf_get_qtimer_count(); + PROFILER_START(quantize_fp32_q8x4); const uint32_t ne0 = src->ne[0]; const uint32_t ne1 = src->ne[1]; @@ -1694,10 +1679,8 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, src_data += src_row_size; } - uint64_t t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, - ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + PROFILER_END(quantize_fp32_q8x4, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, + nth, nrows, ir_first, ir_last, src_row_size, dst_row_size); } static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) { From 3b0cef47e20c66f680e75dc321141efe634772a5 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 5 Dec 2025 10:11:46 +0800 Subject: [PATCH 11/23] Revert "refactor: replace manual timing with profiling macros in matmul operations" This reverts commit 7c8f10160b1b257537515f18667192d623eb2482. 
--- ggml/src/ggml-hexagon/htp/matmul-ops.c | 73 ++++++++++++++++---------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 62a2a210971..c99b6a0d18e 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1092,7 +1092,8 @@ static void matmul(struct htp_matmul_type * mt, uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; uint8_t * restrict src1_data = src1_spad->data; - PROFILER_START(matmul); + volatile uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); const uint8_t * restrict src0_row = (const uint8_t *) src0->data; @@ -1143,9 +1144,12 @@ static void matmul(struct htp_matmul_type * mt, } } - PROFILER_END(matmul, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, - nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], + src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // q8x4x2 src1 tensor is already in VTCM spad @@ -1186,7 +1190,8 @@ static void matvec(struct htp_matmul_type * mt, uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; uint8_t * src1_data = src1_spad->data; - PROFILER_START(matvec); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); float * tmp = (float *) spad_dst; @@ -1231,9 +1236,12 @@ static void matvec(struct htp_matmul_type * mt, hvx_copy_fp32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); - PROFILER_END(matvec, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, - nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mt->type, ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], + src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)] @@ -1259,7 +1267,8 @@ static void matmul_id(struct htp_matmul_type * mt, dma_queue * dma_queue) { htp_matmul_preamble; - PROFILER_START(matmul_id); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); const uint32_t src0_nrows = ne01; // src0 rows per expert const uint32_t src1_nrows = ne11; @@ -1364,11 +1373,12 @@ static void matmul_id(struct htp_matmul_type * mt, } } - PROFILER_END(matmul_id, - "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", - mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, - src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = 
HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, + ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], + dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // q8x4 src1 tensor is already in VTCM spad @@ -1387,7 +1397,8 @@ static void matvec_id(struct htp_matmul_type * mt, dma_queue * dma_queue) { htp_matmul_preamble; - PROFILER_START(matvec_id); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); const uint32_t src0_nrows = ne01; // src0 rows per expert @@ -1462,11 +1473,12 @@ static void matvec_id(struct htp_matmul_type * mt, } } - PROFILER_END(matvec_id, - "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", - mt->type, ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, - src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mt->type, + ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], + dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // *** matmul in fp16 @@ -1483,7 +1495,8 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0, dma_queue * dma_queue) { htp_matmul_preamble; - PROFILER_START(matmul_f16_f32); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); const size_t src0_row_size = sizeof(__fp16) * ne00; const size_t src1_row_size = sizeof(float) * ne10; @@ -1562,10 +1575,12 @@ static void matmul_f16_f32(struct htp_tensor * restrict src0, } } - PROFILER_END(matmul_f16_f32, - "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, - src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "matmul-f16-f32 %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0], + src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } // *** dynamic quant @@ -1647,7 +1662,7 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, uint32_t nth, uint32_t ith, uint32_t nrows_per_thread) { - PROFILER_START(quantize_fp32_q8x4); + uint64_t t1 = HAP_perf_get_qtimer_count(); const uint32_t ne0 = src->ne[0]; const uint32_t ne1 = src->ne[1]; @@ -1679,8 +1694,10 @@ static void quantize_fp32_q8x4x2(const struct htp_tensor * src, src_data += src_row_size; } - PROFILER_END(quantize_fp32_q8x4, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, - nth, nrows, ir_first, ir_last, src_row_size, dst_row_size); + uint64_t t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "quantize-fp32-q8x4: %u/%u : n-rows %u (%u:%u) 
row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, + ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } static void htp_quantize_fp32_q8x4x2(unsigned int n, unsigned int i, void * data) { From 121e656b3184c69d248807de2ab4f0d7ff0e9c9e Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 5 Dec 2025 10:11:55 +0800 Subject: [PATCH 12/23] Revert "feat: add profiling macros for performance measurement in operations" This reverts commit b567413ac97e6152b027714659cf8da847d041bf. --- ggml/src/ggml-hexagon/htp/ops-utils.h | 7 ------- ggml/src/ggml-hexagon/htp/rope-ops.c | 9 ++++++--- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/ops-utils.h b/ggml/src/ggml-hexagon/htp/ops-utils.h index 5e5c48afda3..af9c3305f61 100644 --- a/ggml/src/ggml-hexagon/htp/ops-utils.h +++ b/ggml/src/ggml-hexagon/htp/ops-utils.h @@ -146,11 +146,4 @@ static inline void htp_dump_f16(char * pref, const __fp16 * x, uint32_t n) { } } -#define PROFILER_START(name) const uint64_t name##_start_cycles = HAP_perf_get_qtimer_count() -#define PROFILER_END(name, ...) \ - do { \ - const uint64_t name##_end_cycles = HAP_perf_get_qtimer_count(); \ - FARF(HIGH, __VA_ARGS__, (unsigned) HAP_perf_qtimer_count_to_us(name##_end_cycles - name##_start_cycles)); \ - } while (0) - #endif /* OPS_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c index a48cbf43f23..a4399704fcb 100644 --- a/ggml/src/ggml-hexagon/htp/rope-ops.c +++ b/ggml/src/ggml-hexagon/htp/rope-ops.c @@ -365,7 +365,8 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int return; } - PROFILER_START(rope_job_f32); + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); int is_aligned = 1; int opt_path = 0; @@ -380,8 +381,10 @@ static void rope_job_f32_per_thread(struct rope_th_ctx * rope_ctx, int nth, int rope_hex_f32(rope_ctx, src0_start_row, src0_end_row, nth, ith, opt_path); - PROFILER_END(rope_job_f32, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, - src0_end_row); + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "rope-f32: %d/%d/%d: (%u:%u) usec %u\n", ith, nth, opt_path, src0_start_row, src0_end_row, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } static void rope_job_dispatcher_f32(unsigned int n, unsigned int i, void * data) { From 401fd3ee1a5f33a22bf6d3c1aec2283b7cbe6f84 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 5 Dec 2025 15:40:15 +0800 Subject: [PATCH 13/23] refactor: optimize vector operations in vec_dot_q4x4x2_q8x4x2_rx2 function --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 37 +++++++++++++++++++------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index c99b6a0d18e..418d8bfa36d 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -421,6 +421,9 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elemements + const HVX_VectorPred rd_mask = Q6_Q_vsetq_R(VLEN / 2); + r1_x_d -= VLEN / 2; // make sure r1 at the high half of the vector + uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); @@ -430,12 +433,19 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, 
vy_q)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + vy_d = Q6_Vh_vshuff_Vh(vy_d); + HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); + + vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); + r01_d = Q6_Vh_vshuff_Vh(r01_d); + + HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); @@ -453,12 +463,19 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + vy_d = Q6_Vh_vshuff_Vh(vy_d); + HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); + + vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); + r01_d = Q6_Vh_vshuff_Vh(r01_d); + + HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); + HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); + HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); From cf491f2c8fa7239876c63e05631b8e2881214c87 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 6 Dec 2025 00:02:18 +0800 Subject: [PATCH 14/23] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 418d8bfa36d..4fafe41c1f8 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -934,6 +934,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri // for some reason we need volatile here so that the compiler doesn't try anything funky volatile HVX_Vector rsum = Q6_V_vsplat_R(0); + const HVX_Vector kOne = Q6_Vh_vsplat_R(0x3C00); // 1.0 in fp16 uint32_t i = 0; @@ -941,7 +942,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), kOne); // mul by 1.0 HVX_Vector hi = 
Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); HVX_Vector lo = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_lo_W(xp)), Q6_V_lo_W(yp)); @@ -954,7 +955,7 @@ static void vec_dot_f16_f32(const int n, float * restrict s, const void * restri HVX_VectorPair yp = vy[i]; HVX_Vector x = vx[i]; - HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), Q6_Vh_vsplat_R(0x3C00)); // mul by 1.0 + HVX_VectorPair xp = Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(x), kOne); // mul by 1.0 if (nv1 >= 32) { HVX_Vector hi = Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(Q6_V_hi_W(xp)), Q6_V_hi_W(yp)); From 3a01d8211b0e91bc3e261bfa87d9ed06842733c1 Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 7 Dec 2025 23:11:03 +0800 Subject: [PATCH 15/23] feat: enhance vec_dot_q4x4x2_q8x4x2_rx2 function with optimized data handling and processing --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 59 ++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 4fafe41c1f8..5c7dae7c436 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -393,11 +393,11 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const uint32_t qk = QK_Q4_0x4x2 * 4; - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t x_dblk_size = 8 * 4 * sizeof(uint16_t); // 32x __fp16 const uint32_t x_qblk_size = qk / 2; // int4 const uint32_t x_qrow_size = n / 2; // int4 (not padded) - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 + const uint32_t y_dblk_size = 8 * 4 * sizeof(uint16_t); // 32x __fp16 const uint32_t y_qblk_size = qk; // int8 const uint32_t y_qrow_size = n; // int8 (not padded) @@ -421,10 +421,63 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const uint32_t nb = n / qk; // num full blocks const uint32_t nloe = n % qk; // num leftover elemements + uint32_t i = 0; + for (; i + 1 < nb; i += 2) { + HVX_Vector r00_ia; + HVX_Vector r10_ia; + { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); + + r00_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + r10_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + } + + HVX_Vector r01_ia; + HVX_Vector r11_ia; + { + HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + (i + 1) * y_qblk_size); + HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + (i + 1) * x_qblk_size); + HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + (i + 1) * x_qblk_size); + + r01_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); + r11_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); + } + + HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); + HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); + HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); + + vy_d = Q6_Vh_vshuff_Vh(vy_d); + r0_d = Q6_Vh_vshuff_Vh(r0_d); + r1_d = Q6_Vh_vshuff_Vh(r1_d); + + HVX_VectorPair r0_dd = Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d); + HVX_VectorPair r1_dd = Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d); + + HVX_Vector r00_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r0_dd)); + HVX_Vector r01_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r0_dd)); + + HVX_Vector r10_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r1_dd)); + HVX_Vector r11_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r1_dd)); + + HVX_Vector r00_fa = Q6_Vqf32_vmpy_VsfVsf(r00_ia, r00_dd); + HVX_Vector r01_fa = Q6_Vqf32_vmpy_VsfVsf(r01_ia, 
r01_dd); + + HVX_Vector r10_fa = Q6_Vqf32_vmpy_VsfVsf(r10_ia, r10_dd); + HVX_Vector r11_fa = Q6_Vqf32_vmpy_VsfVsf(r11_ia, r11_dd); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r00_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r10_fa); + + r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r01_fa); + r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r11_fa); + } + const HVX_VectorPred rd_mask = Q6_Q_vsetq_R(VLEN / 2); r1_x_d -= VLEN / 2; // make sure r1 at the high half of the vector - uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); From 421d0313d98d314e61f1babbf3fa0f469d4c4a8b Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 8 Dec 2025 11:54:15 +0800 Subject: [PATCH 16/23] feat: add hvx_vec_load_d_and_mpy function for optimized data loading and multiplication --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 41 ++++++++++++++++---------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 5c7dae7c436..db9bc8821ef 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -307,6 +307,28 @@ static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y, return hvx_vec_rmpy_x8_n(x, y, 1024); } +static inline HVX_Vector_x2 hvx_vec_load_d_and_mpy_rx2(const uint8_t * restrict r0_x_d, + const uint8_t * restrict r1_x_d, + const uint8_t * restrict y_d, + const HVX_Vector rd_mask) { + HVX_Vector vy_d = *(const HVX_UVector *) y_d; + HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d; + HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d; + + vy_d = Q6_Vh_vshuff_Vh(vy_d); + HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); + + vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); + r01_d = Q6_Vh_vshuff_Vh(r01_d); + + HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); + + HVX_Vector_x2 r; + r.v[0] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); + r.v[1] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); + return r; +} + static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx % 128 == 0); @@ -486,22 +508,11 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - vy_d = Q6_Vh_vshuff_Vh(vy_d); - HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); - - vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); - r01_d = Q6_Vh_vshuff_Vh(r01_d); + HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, + y_d + i * y_dblk_size, rd_mask); - HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r_dd.v[0]); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r_dd.v[1]); r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); From 
bd438609848e5e3b37f35753a0261c4ccec4bbe7 Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 8 Dec 2025 12:12:59 +0800 Subject: [PATCH 17/23] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index db9bc8821ef..96e8588e96e 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -527,24 +527,13 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - vy_d = Q6_Vh_vshuff_Vh(vy_d); - HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d); - - vy_d = Q6_Vh_vshuffe_VhVh(vy_d, vy_d); - r01_d = Q6_Vh_vshuff_Vh(r01_d); - - HVX_VectorPair r01_dd = Q6_Wqf32_vmpy_VhfVhf(r01_d, vy_d); - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r01_dd)); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r01_dd)); + HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, + y_d + i * y_dblk_size, rd_mask); // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); + HVX_Vector r0_dd = Q6_V_vand_QV(bmask, r_dd.v[0]); + HVX_Vector r1_dd = Q6_V_vand_QV(bmask, r_dd.v[1]); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); @@ -554,8 +543,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); From b1974641a560c94388fe964e3a6778ac6e6934de Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 8 Dec 2025 12:26:07 +0800 Subject: [PATCH 18/23] feat: add hvx_vec_load_d_and_mpy_r2x2 function for optimized vector loading and multiplication --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 48 +++++++++++++++----------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 96e8588e96e..3fc5394bfb6 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -329,6 +329,28 @@ static inline HVX_Vector_x2 hvx_vec_load_d_and_mpy_rx2(const uint8_t * restrict return r; } +static inline HVX_Vector_x4 hvx_vec_load_d_and_mpy_r2x2(const uint8_t * restrict r0_x_d, + const uint8_t * restrict r1_x_d, + const uint8_t * restrict y_d) { + HVX_Vector vy_d = *(const HVX_UVector *) y_d; + HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d; + HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d; + + vy_d = Q6_Vh_vshuff_Vh(vy_d); + r0_d = Q6_Vh_vshuff_Vh(r0_d); + r1_d = Q6_Vh_vshuff_Vh(r1_d); + + HVX_VectorPair r0_dd = Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d); + HVX_VectorPair r1_dd = Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d); + + HVX_Vector_x4 r; + r.v[0] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r0_dd)); + r.v[1] = 
Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r0_dd)); + r.v[2] = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r1_dd)); + r.v[3] = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r1_dd)); + return r; +} + static void vec_dot_q4x4x2_q8x4x2(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % 32 == 0); // min sub-block size assert((unsigned long) vx % 128 == 0); @@ -467,28 +489,14 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, r11_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); } - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - vy_d = Q6_Vh_vshuff_Vh(vy_d); - r0_d = Q6_Vh_vshuff_Vh(r0_d); - r1_d = Q6_Vh_vshuff_Vh(r1_d); - - HVX_VectorPair r0_dd = Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d); - HVX_VectorPair r1_dd = Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d); - - HVX_Vector r00_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r0_dd)); - HVX_Vector r01_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r0_dd)); - - HVX_Vector r10_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(r1_dd)); - HVX_Vector r11_dd = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(r1_dd)); + HVX_Vector_x4 r_dd = + hvx_vec_load_d_and_mpy_r2x2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, y_d + i * y_dblk_size); - HVX_Vector r00_fa = Q6_Vqf32_vmpy_VsfVsf(r00_ia, r00_dd); - HVX_Vector r01_fa = Q6_Vqf32_vmpy_VsfVsf(r01_ia, r01_dd); + HVX_Vector r00_fa = Q6_Vqf32_vmpy_VsfVsf(r00_ia, r_dd.v[0]); + HVX_Vector r01_fa = Q6_Vqf32_vmpy_VsfVsf(r01_ia, r_dd.v[1]); - HVX_Vector r10_fa = Q6_Vqf32_vmpy_VsfVsf(r10_ia, r10_dd); - HVX_Vector r11_fa = Q6_Vqf32_vmpy_VsfVsf(r11_ia, r11_dd); + HVX_Vector r10_fa = Q6_Vqf32_vmpy_VsfVsf(r10_ia, r_dd.v[2]); + HVX_Vector r11_fa = Q6_Vqf32_vmpy_VsfVsf(r11_ia, r_dd.v[3]); r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r00_fa); r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r10_fa); From 309d78298762c27a5a07f7bf34b48faa7ae0820f Mon Sep 17 00:00:00 2001 From: chraac Date: Mon, 8 Dec 2025 12:46:25 +0800 Subject: [PATCH 19/23] feat: optimize vec_dot functions with improved data handling and loading --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 35 +++++++++++--------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 3fc5394bfb6..03c3e50a89a 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -540,8 +540,8 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - HVX_Vector r0_dd = Q6_V_vand_QV(bmask, r_dd.v[0]); - HVX_Vector r1_dd = Q6_V_vand_QV(bmask, r_dd.v[1]); + HVX_Vector r0_dd = Q6_V_vand_QV(bmask, r_dd.v[0]); + HVX_Vector r1_dd = Q6_V_vand_QV(bmask, r_dd.v[1]); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); @@ -672,6 +672,9 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, const uint32_t nb = n / qk; // num full blocks int32_t nloe = n % qk; // num leftover elemements (must be signed) + const HVX_VectorPred rd_mask = Q6_Q_vsetq_R(VLEN / 2); + r1_x_d -= VLEN / 2; // make sure r1 at the high half of the vector + uint32_t i = 0; for (; i < nb; i++) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); @@ -681,15 +684,11 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); 
- HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, + y_d + i * y_dblk_size, rd_mask); - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); + HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r_dd.v[0]); + HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r_dd.v[1]); r0_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); @@ -704,17 +703,13 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe)); HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe)); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); + HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, + y_d + i * y_dblk_size, rd_mask); // Zero out unused scales HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); + HVX_Vector r0_dd = Q6_V_vand_QV(bmask, r_dd.v[0]); + HVX_Vector r1_dd = Q6_V_vand_QV(bmask, r_dd.v[1]); HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); @@ -724,8 +719,8 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n, } // Convert into fp32 and reduce - r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); - r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); + r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum)); + r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum)); HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4); hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0)); From dbe93098513c212c6d48a89f12b4105fcd84f84c Mon Sep 17 00:00:00 2001 From: chraac Date: Tue, 9 Dec 2025 19:10:05 +0800 Subject: [PATCH 20/23] wip --- ggml/src/ggml-hexagon/htp/matmul-ops.c | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 03c3e50a89a..7fabedf64ab 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -508,7 +508,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, const HVX_VectorPred rd_mask = Q6_Q_vsetq_R(VLEN / 2); r1_x_d -= VLEN / 2; // make sure r1 at the high half of the vector - for (; i < nb; i++) { + if (i < nb) { HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8(y_q + i * y_qblk_size); HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8(r0_x_q + i * x_qblk_size); HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8(r1_x_q + i * x_qblk_size); @@ -524,6 +524,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n, r0_sum = 
Q6_Vqf32_vadd_Vqf32Vqf32(r0_sum, r0_fa); r1_sum = Q6_Vqf32_vadd_Vqf32Vqf32(r1_sum, r1_fa); + i++; } // Process leftovers, we still load full 4x4x2 block but zero out unused scales/blocks @@ -757,8 +758,12 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, // Compute combined scale (fp32). // Apply scale to acc and accumulate into the row sum (qf32). - const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) + const uint32_t nb = n / qk; // num full blocks + int32_t nloe = n % qk; // num leftover elemements (must be signed) + + const HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 + const HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + const HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); uint32_t i = 0; for (; i < nb; i++) { @@ -771,19 +776,16 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); - vy_d = Q6_Vsf_equals_Vqf32(vy_d); + vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); + vy_d = Q6_Vsf_equals_Vqf32(vy_d); // Convert rX_d scales from e8m0 to fp32 // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... // Left shift with zero fill to create FP32 // FIXME: might need to handle zero as a special case (see ggml-cpu code) - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); + r0_d = Q6_V_vdelta_VV(r0_d, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + r0_d = Q6_Vw_vasl_VwR(r0_d, 23); HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); @@ -811,11 +813,9 @@ static void vec_dot_mxfp4x4x2_q8x4x2(const int n, // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... 
         // Left shift with zero fill to create FP32
         // FIXME: might need to handle zero as a special case (see ggml-cpu code)
-        HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0;
-        HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff);
-        r0_d = Q6_V_vdelta_VV(r0_d, expand);
-        r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
-        r0_d = Q6_Vw_vasl_VwR(r0_d, 23);
+        r0_d = Q6_V_vdelta_VV(r0_d, expand);
+        r0_d = Q6_V_vand_VV(r0_d, e8m0_mask);
+        r0_d = Q6_Vw_vasl_VwR(r0_d, 23);

         HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d));
From 00d5fb31b83e82491d1d4ed9fd0858087f02355e Mon Sep 17 00:00:00 2001
From: chraac
Date: Tue, 9 Dec 2025 19:29:49 +0800
Subject: [PATCH 21/23] feat: add build information and update vector loading functions for optimization

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 32 +++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 7fabedf64ab..7eb9aedeb5b 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -307,13 +307,13 @@ static inline HVX_Vector hvx_vec_rmpy_x8_nloe(HVX_Vector_x8 x, HVX_Vector_x8 y,
     return hvx_vec_rmpy_x8_n(x, y, 1024);
 }

-static inline HVX_Vector_x2 hvx_vec_load_d_and_mpy_rx2(const uint8_t * restrict r0_x_d,
+static inline HVX_Vector_x2 hvx_vec_load_and_mul_d_rx2(const uint8_t * restrict r0_x_d,
                                                        const uint8_t * restrict r1_x_d,
                                                        const uint8_t * restrict y_d,
                                                        const HVX_Vector rd_mask) {
-    HVX_Vector vy_d = *(const HVX_UVector *) y_d;
-    HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d;
-    HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d;
+    HVX_Vector vy_d = *(const HVX_Vector *) y_d;
+    HVX_Vector r0_d = *(const HVX_Vector *) r0_x_d;
+    HVX_Vector r1_d = *(const HVX_Vector *) r1_x_d;

     vy_d = Q6_Vh_vshuff_Vh(vy_d);
     HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d);
@@ -329,12 +329,12 @@ static inline HVX_Vector_x2 hvx_vec_load_d_and_mpy_rx2(const uint8_t * restrict
     return r;
 }

-static inline HVX_Vector_x4 hvx_vec_load_d_and_mpy_r2x2(const uint8_t * restrict r0_x_d,
+static inline HVX_Vector_x4 hvx_vec_load_and_mul_d_r2x2(const uint8_t * restrict r0_x_d,
                                                         const uint8_t * restrict r1_x_d,
                                                         const uint8_t * restrict y_d) {
-    HVX_Vector vy_d = *(const HVX_UVector *) y_d;
-    HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d;
-    HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d;
+    HVX_Vector vy_d = *(const HVX_Vector *) y_d;
+    HVX_Vector r0_d = *(const HVX_Vector *) r0_x_d;
+    HVX_Vector r1_d = *(const HVX_Vector *) r1_x_d;

     vy_d = Q6_Vh_vshuff_Vh(vy_d);
     r0_d = Q6_Vh_vshuff_Vh(r0_d);
@@ -490,7 +490,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
         }

         HVX_Vector_x4 r_dd =
-            hvx_vec_load_d_and_mpy_r2x2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, y_d + i * y_dblk_size);
+            hvx_vec_load_and_mul_d_r2x2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size, y_d + i * y_dblk_size);

         HVX_Vector r00_fa = Q6_Vqf32_vmpy_VsfVsf(r00_ia, r_dd.v[0]);
         HVX_Vector r01_fa = Q6_Vqf32_vmpy_VsfVsf(r01_ia, r_dd.v[1]);
@@ -516,7 +516,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));

-        HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
+        HVX_Vector_x2 r_dd = hvx_vec_load_and_mul_d_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
                                                         y_d + i * y_dblk_size, rd_mask);

         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r_dd.v[0]);
@@ -536,7 +536,7 @@ static void vec_dot_q4x4x2_q8x4x2_rx2(const int n,
     HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
     HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));

-    HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
+    HVX_Vector_x2 r_dd = hvx_vec_load_and_mul_d_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
                                                     y_d + i * y_dblk_size, rd_mask);

     // Zero out unused scales
@@ -685,7 +685,7 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
         HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q));
         HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q));

-        HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
+        HVX_Vector_x2 r_dd = hvx_vec_load_and_mul_d_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
                                                         y_d + i * y_dblk_size, rd_mask);

         HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r_dd.v[0]);
@@ -704,7 +704,7 @@ static void vec_dot_q8x4x2_q8x4x2_rx2(const int n,
     HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r0_q, vy_q, nloe));
     HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_nloe(r1_q, vy_q, nloe));

-    HVX_Vector_x2 r_dd = hvx_vec_load_d_and_mpy_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
+    HVX_Vector_x2 r_dd = hvx_vec_load_and_mul_d_rx2(r0_x_d + i * x_dblk_size, r1_x_d + i * x_dblk_size,
                                                     y_d + i * y_dblk_size, rd_mask);

     // Zero out unused scales
@@ -961,8 +961,8 @@ static void vec_dot_mxfp4x4x2_q8x4x2_rx2(const int n,
     }

     // Convert into fp32 and reduce
-    r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
-    r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));
+    r0_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r0_sum));
+    r1_sum = hvx_vec_fp32_reduce_sum(Q6_Vsf_equals_Vqf32(r1_sum));

     HVX_VectorPair p0 = Q6_W_vshuff_VVR(r1_sum, r0_sum, 4);
     hvx_vec_store_u(&s[0], 8, Q6_V_lo_W(p0));
@@ -2273,7 +2273,7 @@ int op_matmul_id(struct htp_ops_context * octx) {
             assert(i02 >= 0 && i02 < n_as);

-            MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) { id, iid1 };
+            MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping){ id, iid1 };
             matrix_row_counts[i02] += 1;
         }
     }
From b54ff1895d20c896d29fb708ef27cd1202d2d9b3 Mon Sep 17 00:00:00 2001
From: chraac
Date: Wed, 10 Dec 2025 21:46:40 +0800
Subject: [PATCH 22/23] revert rope changes

---
 ggml/src/ggml-hexagon/htp/rope-ops.c | 78 +++++++++++++++-------------
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c
index a4399704fcb..00419bcba6b 100644
--- a/ggml/src/ggml-hexagon/htp/rope-ops.c
+++ b/ggml/src/ggml-hexagon/htp/rope-ops.c
@@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
     return (1 - MIN(1, MAX(0, y)));
 }

-static void rope_cache_init(const float theta_base,
-                            const float freq_scale,
-                            const float * freq_factors,
-                            float * corr_dims,
-                            const uint32_t ne0,
-                            const float ext_factor,
-                            const float mscale,
-                            float * cache,
-                            const float theta_scale) {
+static void rope_cache_init(const float theta_base,
+                            float freq_scale,
+                            const float * freq_factors,
+                            float * corr_dims,
+                            uint32_t ne0,
+                            float ext_factor,
+                            float mscale,
+                            float * cache,
+                            float theta_scale) {
     // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
     float theta = theta_base;

@@ -92,19 +92,18 @@ static void rope_cache_init(const float theta_base,
         // Get n-d rotational scaling corrected for extrapolation
         float theta_interp = freq_scale * theta_extrap;
-        float theta_final = theta_interp;
-        float mscale_final = mscale;
+        float theta2 = theta_interp;

         if (ext_factor != 0.0f) {
             float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-            theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+            theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

             // Get n-d magnitude scaling corrected for interpolation
-            mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+            mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
         }

-        cache[i0 + 0] = cosf(theta_final) * mscale_final;
-        cache[i0 + 1] = sinf(theta_final) * mscale_final;
+        cache[i0 + 0] = cosf(theta2) * mscale;
+        cache[i0 + 1] = sinf(theta2) * mscale;

         theta *= theta_scale;
     }
@@ -152,9 +151,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
 }

 static void hvx_calc_rope_neox_f32(const float * restrict src0,
-                                   float * restrict dst,
-                                   const int num_elems,
-                                   const float * restrict theta_cache) {
+                                   float * restrict dst,
+                                   const int num_elems,
+                                   const float * restrict theta_cache) {
     // for (int i = 0; i < num_elems; i += 2) {
     //const float cos_theta = theta_cache[i + 0];
     //const float sin_theta = theta_cache[i + 1];
@@ -193,7 +192,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
         HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
         HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);

-        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
+        *(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
         *(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);

         src0_curr += VLEN;
@@ -260,7 +259,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                          const uint32_t ir1,
                          int nth,
                          int ith,
-                         const int opt_path) {
+                         int opt_path) {
     struct htp_ops_context * octx = rope_ctx->octx;

     const struct htp_tensor * src0 = &octx->src0;
@@ -268,8 +267,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
     const struct htp_tensor * src2 = &octx->src2;
     struct htp_tensor * dst = &octx->dst;

-    const int32_t mode = rope_ctx->mode;
-    const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
+    const int32_t mode = rope_ctx->mode;
+    const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;

     htp_rope_preamble;

@@ -282,9 +281,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
         freq_factors = (const float *) src2->data;
     }

-    const uint32_t i1_end = MIN(ir1, ne1);
-    const int32_t half_dims = rope_ctx->n_dims / 2;
-    const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
+    int ir = 0;
+
     for (uint32_t i3 = 0; i3 < ne3; i3++) {      // batch
         for (uint32_t i2 = 0; i2 < ne2; i2++) {  // seq-len
             const int32_t p = pos[i2];
@@ -292,7 +290,14 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
             rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
                             rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);

-            for (uint32_t i1 = ir0; i1 < i1_end; i1++) {  // attn-heads
+            for (uint32_t i1 = 0; i1 < ne1; i1++) {  // attn-heads
+                if (ir++ < ir0) {
+                    continue;
+                }
+                if (ir > ir1) {
+                    break;
+                }
+
                 const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
                 float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);

@@ -305,9 +310,6 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                     } else {
                         hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
                     }
-
-                    src_loc += rope_ctx->n_dims;
-                    dst_data_loc += rope_ctx->n_dims;
                 } else {
                     for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
                         const float cos_theta = wp0[i0 + 0];
@@ -315,10 +317,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,

                         if (is_neox) {
                             const float x0 = src_loc[0];
-                            const float x1 = src_loc[half_dims];
+                            const float x1 = src_loc[rope_ctx->n_dims/2];

-                            dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
-                            dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;
+                            dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
+                            dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;

                             src_loc += 1;
                             dst_data_loc += 1;
@@ -333,13 +335,15 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
                             dst_data_loc += 2;
                         }
                     }
-
-                    src_loc += (is_neox ? half_dims : 0);
-                    dst_data_loc += (is_neox ? half_dims : 0);
                 }

-                // TODO: use simd to speed up the remaining elements copy
-                memcpy(dst_data_loc, src_loc, remain_bytes);
+                for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
+                    dst_data_loc[0] = src_loc[0];
+                    dst_data_loc[1] = src_loc[1];
+
+                    src_loc += 2;
+                    dst_data_loc += 2;
+                }
             }
         }
     }
From 09c48991df3b8d0cd6fd90e108184ac7ed61633d Mon Sep 17 00:00:00 2001
From: chraac
Date: Thu, 11 Dec 2025 11:43:49 +0800
Subject: [PATCH 23/23] fix: revert HVX_Vector back to HVX_UVector

---
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 7eb9aedeb5b..48bebc18ca0 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -311,9 +311,9 @@ static inline HVX_Vector_x2 hvx_vec_load_and_mul_d_rx2(const uint8_t * restrict
                                                        const uint8_t * restrict r1_x_d,
                                                        const uint8_t * restrict y_d,
                                                        const HVX_Vector rd_mask) {
-    HVX_Vector vy_d = *(const HVX_Vector *) y_d;
-    HVX_Vector r0_d = *(const HVX_Vector *) r0_x_d;
-    HVX_Vector r1_d = *(const HVX_Vector *) r1_x_d;
+    HVX_Vector vy_d = *(const HVX_UVector *) y_d;
+    HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d;
+    HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d;

     vy_d = Q6_Vh_vshuff_Vh(vy_d);
     HVX_Vector r01_d = Q6_V_vmux_QVV(rd_mask, r0_d, r1_d);
@@ -332,9 +332,9 @@ static inline HVX_Vector_x2 hvx_vec_load_and_mul_d_rx2(const uint8_t * restrict
 static inline HVX_Vector_x4 hvx_vec_load_and_mul_d_r2x2(const uint8_t * restrict r0_x_d,
                                                         const uint8_t * restrict r1_x_d,
                                                         const uint8_t * restrict y_d) {
-    HVX_Vector vy_d = *(const HVX_Vector *) y_d;
-    HVX_Vector r0_d = *(const HVX_Vector *) r0_x_d;
-    HVX_Vector r1_d = *(const HVX_Vector *) r1_x_d;
+    HVX_Vector vy_d = *(const HVX_UVector *) y_d;
+    HVX_Vector r0_d = *(const HVX_UVector *) r0_x_d;
+    HVX_Vector r1_d = *(const HVX_UVector *) r1_x_d;

     vy_d = Q6_Vh_vshuff_Vh(vy_d);
     r0_d = Q6_Vh_vshuff_Vh(r0_d);