-
Notifications
You must be signed in to change notification settings - Fork 350
Exception dump hook #10517
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Exception dump hook #10517
Changes from all commits
ba001a4
1368589
6854c28
dbb88d7
4ba333c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -10,11 +10,12 @@ | |||
| #include <rtos/string.h> | ||||
| #include <user/debug_stream.h> | ||||
| #include <user/debug_stream_slot.h> | ||||
| #include <zephyr/kernel.h> | ||||
|
|
||||
| LOG_MODULE_REGISTER(debug_stream_slot); | ||||
|
|
||||
| struct cpu_mutex { | ||||
| struct k_mutex m; | ||||
| struct k_spinlock l; | ||||
| } __aligned(CONFIG_DCACHE_LINE_SIZE); | ||||
|
|
||||
| /* CPU specific mutexes for each circular buffer */ | ||||
|
|
@@ -66,6 +67,7 @@ int debug_stream_slot_send_record(struct debug_stream_record *rec) | |||
| debug_stream_get_circular_buffer(&desc, arch_proc_id()); | ||||
| uint32_t record_size = rec->size_words; | ||||
| uint32_t record_start, buf_remain; | ||||
| k_spinlock_key_t key; | ||||
|
|
||||
| LOG_DBG("Sending record %u id %u len %u", rec->seqno, rec->id, rec->size_words); | ||||
|
|
||||
|
|
@@ -77,7 +79,7 @@ int debug_stream_slot_send_record(struct debug_stream_record *rec) | |||
| desc.buf_words, desc.core_id, desc.buf_words, desc.offset); | ||||
| return -ENOMEM; | ||||
| } | ||||
| k_mutex_lock(&cpu_mutex[arch_proc_id()].m, K_FOREVER); | ||||
| key = k_spin_lock(&cpu_mutex[arch_proc_id()].l); | ||||
|
|
||||
| rec->seqno = buf->next_seqno++; | ||||
| rec->size_words = record_size + 1; /* +1 for size at the end of record */ | ||||
|
|
@@ -105,7 +107,7 @@ int debug_stream_slot_send_record(struct debug_stream_record *rec) | |||
| buf->data[buf->w_ptr] = record_size + 1; | ||||
| buf->w_ptr = (buf->w_ptr + 1) % desc.buf_words; | ||||
|
|
||||
| k_mutex_unlock(&cpu_mutex[arch_proc_id()].m); | ||||
| k_spin_unlock(&cpu_mutex[arch_proc_id()].l, key); | ||||
|
|
||||
| LOG_DBG("Record %u id %u len %u sent", rec->seqno, rec->id, record_size); | ||||
| return 0; | ||||
|
|
@@ -159,14 +161,6 @@ static int debug_stream_slot_init(void) | |||
|
|
||||
| buf->next_seqno = 0; | ||||
| buf->w_ptr = 0; | ||||
| k_mutex_init(&cpu_mutex[i].m); | ||||
| /* The core specific mutexes are now .bss which is uncached so the | ||||
| * following line is commented out. However, since the mutexes are | ||||
| * core specific there should be nothing preventing from having them | ||||
| * in cached memory. | ||||
| * | ||||
| * sys_cache_data_flush_range(&cpu_mutex[i], sizeof(cpu_mutex[i])); | ||||
| */ | ||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hrm, I forgot that I already checked first time around that Zephyr spinlocks do not have such function or anything like it.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SOF has one sof/zephyr/include/rtos/spinlock.h Line 13 in a32d983
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's arguably quite confusing we use Zephyr native calls, but have a SOF extension. I'd only use k_spinlock_init if we really need to re-initialize. |
||||
| } | ||||
| LOG_INF("Debug stream slot initialized"); | ||||
|
|
||||
|
|
||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,9 +8,13 @@ | |
| #include <soc.h> | ||
| #include <adsp_debug_window.h> | ||
| #include <sof/common.h> | ||
| #include <zephyr/logging/log.h> | ||
| #include <zephyr/arch/exception.h> | ||
jsarha marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| #include <user/debug_stream_text_msg.h> | ||
|
|
||
| LOG_MODULE_REGISTER(debug_stream_text_msg); | ||
|
|
||
| void ds_msg(const char *format, ...) | ||
| { | ||
| va_list args; | ||
|
|
@@ -33,3 +37,109 @@ void ds_msg(const char *format, ...) | |
| sizeof(buf.msg.hdr.data[0])); | ||
| debug_stream_slot_send_record(&buf.msg.hdr); | ||
| } | ||
|
|
||
| #if defined(CONFIG_EXCEPTION_DUMP_HOOK) | ||
| /* The debug stream debug window slot is 4k, and when it is split | ||
| * between the cores and the header/other overhead is removed, with 5 | ||
| * cores the size is 768 bytes. The dump record must be smaller than | ||
| * that to get through to the host side. | ||
| * | ||
| * Also, because of the limited circular buffer size, we should only | ||
| * send one exception record. Exceptions often happen in bursts in a | ||
| * SOF system, and sending more than one record makes the host-side | ||
| * decoder lose track of things. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. exceptions in bursts? We're talking about fatal exceptions here, which - well - should be fatal? Do you mean recursive (double etc.) exceptions or exceptions on multiple cores?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not exactly sure what goes on in the system, but the when I cause a fatal exception with assert(0), the xtensa_excint1_c() is triggered multiple (tens of) times. The practical solution here is to print only one and filter out the ones that do not have proper data.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jsarha wow, maybe we should understand those multiple exceptions. Could you check their causes? Perhaps addresses too? And if they're double exceptions? Maybe that's exactly what is breaking our exception debugging and by understanding them we can fix it? At least would be good to check the second exception. And then the third. To see from which point they start to recurse, |
||
| */ | ||
| static struct { | ||
| struct debug_stream_text_msg msg; | ||
| char text[640]; | ||
| } __packed ds_buf[CONFIG_MP_MAX_NUM_CPUS]; | ||
| static int reports_sent_cpu[CONFIG_MP_MAX_NUM_CPUS]; | ||
| static size_t ds_pos[CONFIG_MP_MAX_NUM_CPUS]; | ||
|
|
||
| static void ds_exception_drain(bool flush) | ||
| { | ||
| unsigned int cpu = arch_proc_id(); | ||
|
|
||
| if (cpu >= CONFIG_MP_MAX_NUM_CPUS) | ||
| return; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this can happen. In worst case we would crash, so not sure if this check adds value. |
||
|
|
||
| if (flush) { | ||
| ds_pos[cpu] = 0; | ||
| reports_sent_cpu[cpu] = 0; | ||
| return; | ||
| } | ||
|
|
||
| if (ds_pos[cpu] == 0) | ||
| return; | ||
|
|
||
| if (reports_sent_cpu[cpu] > 0) | ||
| return; | ||
|
|
||
| ds_buf[cpu].msg.hdr.id = DEBUG_STREAM_RECORD_ID_TEXT_MSG; | ||
| ds_buf[cpu].msg.hdr.size_words = | ||
| SOF_DIV_ROUND_UP(sizeof(ds_buf[cpu].msg) + ds_pos[cpu], | ||
| sizeof(ds_buf[cpu].msg.hdr.data[0])); | ||
| /* Make sure the possible up to 3 extra bytes at end of msg are '\0' */ | ||
| memset(ds_buf[cpu].text + ds_pos[cpu], 0, | ||
| ds_buf[cpu].msg.hdr.size_words * sizeof(ds_buf[cpu].msg.hdr.data[0]) - ds_pos[cpu]); | ||
| debug_stream_slot_send_record(&ds_buf[cpu].msg.hdr); | ||
| reports_sent_cpu[cpu] = 1; | ||
| ds_pos[cpu] = 0; | ||
| } | ||
|
|
||
| static void ds_exception_dump(const char *format, va_list args) | ||
| { | ||
| ssize_t len; | ||
| size_t avail; | ||
| size_t written; | ||
| unsigned int cpu = arch_proc_id(); | ||
|
|
||
| if (cpu >= CONFIG_MP_MAX_NUM_CPUS) | ||
| return; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ditto here. |
||
|
|
||
| if (reports_sent_cpu[cpu] > 0) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we instead just limit based on availability? I.e. if no space, stop sending reports?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The problem is that we do not know where the host side read pointer is. From DSP side the host side python script is terribly slow, so slow that it practically stands still. So in practice we can only write less than the circular buffer size to avoid host side from loosing track of things. The circular bufffers size on PTL, with 5 cores, is 768 bytes. The current max record size, 640 bytes, is selected to be a bit less that that. The 640 bytes can host a normal exception dump, but if the back trace is present it usually has to be truncated. So there really is room for only one exception dump record (per cpu), so there is no point in counting the bytes. Of course we could busy wait sufficiently long time for the host to decode the previous record and then send another, but that is an advanced feature that can be added later if there is need for it. |
||
| return; | ||
|
|
||
| avail = sizeof(ds_buf[cpu].text) - ds_pos[cpu]; | ||
| if (avail == 0) { | ||
| ds_exception_drain(false); | ||
| return; | ||
| } | ||
|
|
||
| if (format[0] == '\0') | ||
| return; | ||
|
|
||
| /* Skip useless " ** " prefix to save bytes */ | ||
| if (strlen(format) >= 4 && | ||
| format[0] == ' ' && format[1] == '*' && format[2] == '*' && format[3] == ' ') | ||
| format += 4; | ||
|
|
||
| len = vsnprintf(ds_buf[cpu].text + ds_pos[cpu], avail, format, args); | ||
| if (len < 0) { | ||
| ds_pos[cpu] = 0; | ||
| return; | ||
| } | ||
|
|
||
| if (len == 0) | ||
| return; | ||
|
|
||
| if ((size_t)len >= avail) | ||
| written = avail - 1; | ||
| else | ||
| written = (size_t)len; | ||
|
|
||
| ds_pos[cpu] += written; | ||
|
|
||
| if (ds_pos[cpu] >= sizeof(ds_buf[cpu].text)) | ||
| ds_exception_drain(false); | ||
| } | ||
|
|
||
| static int init_exception_dump_hook(void) | ||
| { | ||
| set_exception_dump_hook(ds_exception_dump, ds_exception_drain); | ||
| LOG_INF("exception_dump_hook set"); | ||
| return 0; | ||
| } | ||
|
|
||
| SYS_INIT(init_exception_dump_hook, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT); | ||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -162,8 +162,9 @@ def print_text_msg(self, record, cpu): | |
| buffer = ( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo in commit: s/star/start/ |
||
| ctypes.c_ubyte * (len(record) - ctypes.sizeof(TextMsg)) | ||
| ).from_address(ctypes.addressof(record) + ctypes.sizeof(TextMsg)) | ||
| msg = bytearray(buffer).decode("utf-8") | ||
| print("CPU %u: %s" % (cpu, msg)) | ||
| payload = bytes(buffer) | ||
| msg = payload.split(b"\0", 1)[0].decode("utf-8", errors="replace") | ||
| print("CPU %u:\n%s" % (cpu, msg)) | ||
| return True | ||
|
|
||
| class DebugStreamSectionDescriptor(ctypes.Structure): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, this is ok for this overlay, but needs attention when/if we try to make this the default. The CI systems we have in place do not have debugstream runnign on DUTs, only mtrace, so this would in effect make dumps unavailable in CI.