Skip to content

Commit a310f28

Browse files
committed
boot-test: add a userspace performance test
Add a test to compare kernel and userspace performance when performing tight loop calculations with no API calls. Signed-off-by: Guennadi Liakhovetski <guennadi.liakhovetski@linux.intel.com>
1 parent a92d0d2 commit a310f28

2 files changed

Lines changed: 173 additions & 0 deletions

File tree

zephyr/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ if(CONFIG_SOF_BOOT_TEST)
44
)
55
zephyr_library_sources_ifdef(CONFIG_USERSPACE
66
userspace/ksem.c
7+
userspace/test_perf.c
78
)
89
endif()
910

zephyr/test/userspace/test_perf.c

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
// SPDX-License-Identifier: BSD-3-Clause
2+
/* Copyright(c) 2026 Intel Corporation. */
3+
4+
/* Test kernel vs. user-space performance. */
5+
6+
#include <sof/boot_test.h>
7+
#include <rtos/alloc.h>
8+
9+
#include <zephyr/kernel.h>
10+
#include <zephyr/ztest.h>
11+
#include <zephyr/logging/log.h>
12+
13+
LOG_MODULE_DECLARE(sof_boot_test, LOG_LEVEL_DBG);
14+
15+
/* Iteration counts of the two phases of the scalar load below */
#define N_ADD (1000 * 1000 * 100)
#define N_DIV 10000

/*
 * Scalar integer load without any API calls.
 *
 * Phase 1 accumulates the integers 0 .. N_ADD - 1 into an unsigned long,
 * phase 2 runs N_DIV rounds of a divide followed by a multiply, where both
 * operands are derived from the last decimal digit of the round number.
 * Unsigned arithmetic is used throughout, so wrap-around is well defined.
 *
 * Returns the accumulator converted to int; the value itself carries no
 * meaning, it only keeps the computation observable.
 */
static int load_add(void)
{
	unsigned long acc = 0;
	unsigned int step;

	/* Phase 1: additions only */
	for (step = 0; step < N_ADD; step++)
		acc += step;

	/* Phase 2: division and multiplication, the divisor is never 0 */
	for (step = 1; step <= N_DIV; step++) {
		const unsigned int digit = step % 10;

		acc = acc / (digit + 1) * (digit + 3);
	}

	return (int)acc;
}
27+
28+
#ifdef __XCC__
29+
#include <xtensa/tie/xt_hifi4.h>
30+
31+
/* Compute dot product of two vectors using HiFi4 SIMD instructions */
32+
static int32_t dot_product_hifi4(const int16_t *a, const int16_t *b, int length)
33+
{
34+
ae_int64 acc = AE_ZERO64(); /* 1. Initialize accumulator to zero */
35+
ae_int16x4 *pa = (ae_int16x4 *)a; /* Pointer to vector a */
36+
ae_int16x4 *pb = (ae_int16x4 *)b; /* Pointer to vector b */
37+
38+
for (int i = 0; i < length / 4; i++) {
39+
ae_int16x4 va, vb;
40+
41+
AE_L16X4_IP(va, pa, 8); /* 2. Load 4x 16-bit values from a */
42+
AE_L16X4_IP(vb, pb, 8); /* 3. Load 4x 16-bit values from b */
43+
AE_MULAAAAQ16(acc, va, vb); /* 4. Multiply-accumulate (4 MACs in parallel) */
44+
}
45+
46+
return AE_TRUNCA32F64S(acc, 0); /* 5. Convert 64-bit result to 32-bit */
47+
}
48+
49+
#define VECTOR_LENGTH 100

/*
 * HiFi4 SIMD load: 1000 rounds of filling two VECTOR_LENGTH-element vectors
 * with round-dependent values and computing their dot product.
 *
 * Returns the sum of all dot products; the value only serves to keep the
 * computation observable.
 */
static int load_hifi4(void)
{
	/*
	 * The element type matches the const int16_t * parameters of
	 * dot_product_hifi4(). The buffers are read there as ae_int16x4 vectors,
	 * 8 bytes at a time, so request 8-byte alignment explicitly instead of
	 * relying on the 2-byte natural alignment of a 16-bit array.
	 */
	int16_t a[VECTOR_LENGTH] __attribute__((aligned(8)));
	int16_t b[VECTOR_LENGTH] __attribute__((aligned(8)));
	int ret = 0;

	for (unsigned int j = 0; j < 1000; j++) {
		for (unsigned int i = 0; i < VECTOR_LENGTH; i++) {
			/* Computed with unsigned wrap-around, low 16 bits are kept */
			a[i] = (int16_t)(i * 3 - 47 * j);
			b[i] = (int16_t)(411 * j - i * 5);
		}

		ret += dot_product_hifi4(a, b, VECTOR_LENGTH);
	}

	return ret;
}
65+
#endif /* __XCC__ */
66+
67+
typedef int (*load_fn_t)(void);
68+
69+
load_fn_t load_fn[] = {
70+
load_add,
71+
#ifdef __XCC__
72+
load_hifi4,
73+
#endif
74+
};
75+
76+
/*
 * Run one load function in a worker thread and measure how long it takes.
 *
 * fn:    load function, its address is handed to the worker as event bits
 * event: event object the worker is waiting on
 * sem:   semaphore the worker gives once the load function has returned
 *
 * The test fails if the semaphore is not given within 200 ms.
 *
 * Returns the difference of k_uptime_ticks() taken before posting the request
 * and after the semaphore was taken, truncated to unsigned int.
 */
static unsigned int test_perf(load_fn_t fn, struct k_event *event,
			      struct k_sem *sem)
{
	const uint64_t t_begin = k_uptime_ticks();
	int err;

	/* Wake the worker up, then block until it reports completion */
	k_event_set(event, (uint32_t)fn);
	err = k_sem_take(sem, K_MSEC(200));

	zassert_ok(err);

	return (unsigned int)(k_uptime_ticks() - t_begin);
}
91+
92+
static void thread_fn(void *p1, void *p2, void *p3)
93+
{
94+
struct k_event *event = p1;
95+
struct k_sem *sem = p2;
96+
bool first = true;
97+
98+
for (;;) {
99+
load_fn_t fn = (load_fn_t)k_event_wait(event, 0xffffffff, !first, K_FOREVER);
100+
101+
first = false;
102+
LOG_INF("fn %p ret %d", (void *)fn, fn());
103+
104+
k_sem_give(sem);
105+
}
106+
}
107+
108+
#define STACK_SIZE 4096
109+
110+
ZTEST(sof_boot, test_perf)
111+
{
112+
/* Synchronization objects allocated on original uncached heap */
113+
struct k_event *u_event = k_object_alloc(K_OBJ_EVENT);
114+
struct k_event *k_event = k_object_alloc(K_OBJ_EVENT);
115+
116+
zassert_not_null(u_event);
117+
zassert_not_null(k_event);
118+
119+
k_event_init(u_event);
120+
k_event_init(k_event);
121+
122+
struct k_sem *sem = k_object_alloc(K_OBJ_SEM);
123+
124+
zassert_not_null(sem);
125+
k_sem_init(sem, 0, 1);
126+
127+
/* Allocate kernel stack and thread and start it */
128+
struct k_thread *k_thread = k_object_alloc(K_OBJ_THREAD);
129+
130+
zassert_not_null(k_thread);
131+
/* Important: Xtensa thread initialization code checks certain fields for 0 */
132+
memset(&k_thread->arch, 0, sizeof(k_thread->arch));
133+
134+
k_thread_stack_t *k_stack = k_thread_stack_alloc(STACK_SIZE, 0);
135+
136+
zassert_not_null(k_stack);
137+
138+
struct k_thread *pk_thread = k_thread_create(k_thread, k_stack, STACK_SIZE, thread_fn,
139+
k_event, sem, NULL, 0, 0, K_FOREVER);
140+
141+
k_thread_start(pk_thread);
142+
143+
/* Allocate userspace stack and thread and start it */
144+
struct k_thread *u_thread = k_object_alloc(K_OBJ_THREAD);
145+
146+
zassert_not_null(u_thread);
147+
memset(&u_thread->arch, 0, sizeof(u_thread->arch));
148+
149+
k_thread_stack_t *u_stack = k_thread_stack_alloc(STACK_SIZE, K_USER);
150+
151+
zassert_not_null(u_stack);
152+
153+
struct k_thread *pu_thread = k_thread_create(u_thread, u_stack, STACK_SIZE, thread_fn,
154+
u_event, sem, NULL, 0, K_USER, K_FOREVER);
155+
156+
zassert_not_null(pu_thread);
157+
k_thread_access_grant(pu_thread, u_event, sem);
158+
k_thread_start(pu_thread);
159+
160+
for (unsigned int i = 0; i < ARRAY_SIZE(load_fn); i++) {
161+
LOG_INF("user: fn %p took %u", load_fn[i], test_perf(load_fn[i], u_event, sem));
162+
LOG_INF("kernel: fn %p took %u", load_fn[i], test_perf(load_fn[i], k_event, sem));
163+
}
164+
165+
k_thread_abort(pu_thread);
166+
k_thread_stack_free(u_stack);
167+
k_thread_abort(pk_thread);
168+
k_thread_stack_free(k_stack);
169+
k_object_free(sem);
170+
k_object_free(u_event);
171+
k_object_free(k_event);
172+
}

0 commit comments

Comments
 (0)