ESPHome 2026.5.0b1
Loading...
Searching...
No Matches
crash_handler.cpp
Go to the documentation of this file.
1#ifdef USE_ESP32
2
4#ifdef USE_ESP32_CRASH_HANDLER
5
6#include "crash_handler.h"
7#include "esphome/core/log.h"
8
9#include <cinttypes>
10#include <cstring>
11#include <esp_attr.h>
12#include <esp_private/panic_internal.h>
13#include <soc/soc.h>
14
15#if CONFIG_IDF_TARGET_ARCH_XTENSA
16#include <esp_cpu_utils.h>
17#include <esp_debug_helpers.h>
18#include <xtensa_context.h>
19#elif CONFIG_IDF_TARGET_ARCH_RISCV
20#include <riscv/rvruntime-frames.h>
21#endif
22
// Marker stored alongside the crash record; the record is only trusted when it matches.
static constexpr uint32_t CRASH_MAGIC = 0xDEADBEEF;
// Capacity of each saved backtrace array, in entries.
static constexpr size_t MAX_BACKTRACE = 16;
25
26// Check if an address looks like code (flash-mapped or IRAM).
27// Must be safe to call from panic context (no flash access needed).
28static inline bool IRAM_ATTR is_code_addr(uint32_t addr) {
29 return (addr >= SOC_IROM_LOW && addr < SOC_IROM_HIGH) || (addr >= SOC_IRAM_LOW && addr < SOC_IRAM_HIGH);
30}
31
32#if CONFIG_IDF_TARGET_ARCH_RISCV
33// Check if a code address is a real return address by verifying the preceding
34// instruction is a JAL or JALR with rd=ra (x1). Called at log time (not during
35// panic) so flash cache is available and both IRAM and IROM are safely readable.
36static inline bool is_return_addr(uint32_t addr) {
37 if (!is_code_addr(addr) || addr < 4)
38 return false;
39 // A return address on the stack points to the instruction after a call.
40 // Check for 4-byte JAL/JALR call instruction before this address.
41 // Use memcpy for alignment safety — RISC-V C extension means code addresses
42 // are only 2-byte aligned, so addr-4 may not be 4-byte aligned.
43 uint32_t inst;
44 memcpy(&inst, (const void *) (addr - 4), sizeof(inst));
45 // RISC-V instruction encoding: bits [6:0] = opcode, bits [11:7] = rd
46 uint32_t opcode = inst & 0x7f; // Extract 7-bit opcode
47 uint32_t rd = inst & 0xf80; // Extract rd field (bits 11:7)
48 // Match JAL (0x6f) or JALR (0x67) with rd=ra (x1, encoded as 0x80 = 1<<7)
49 if ((opcode == 0x6f || opcode == 0x67) && rd == 0x80)
50 return true;
51 // Check for 2-byte compressed c.jalr before this address (C extension).
52 // c.jalr saves to ra implicitly: funct4=1001, rs1!=0, rs2=0, op=10
53 if (addr >= 2) {
54 uint16_t c_inst = *(uint16_t *) (addr - 2);
55 if ((c_inst & 0xf07f) == 0x9002 && (c_inst & 0x0f80) != 0)
56 return true;
57 }
58 return false;
59}
60#endif
61
62// --- Architecture-specific backtrace helpers ---
63// These run from IRAM during panic (no flash access).
64
65#if CONFIG_IDF_TARGET_ARCH_XTENSA
66// Walk Xtensa backtrace from an exception frame, writing PCs to out[].
67// Returns number of entries written.
68static uint8_t IRAM_ATTR walk_xtensa_backtrace(XtExcFrame *frame, uint32_t *out, uint8_t max) {
69 esp_backtrace_frame_t bt_frame = {
70 .pc = (uint32_t) frame->pc,
71 .sp = (uint32_t) frame->a1,
72 .next_pc = (uint32_t) frame->a0,
73 .exc_frame = frame,
74 };
75 uint8_t count = 0;
76 uint32_t first_pc = esp_cpu_process_stack_pc(bt_frame.pc);
77 if (is_code_addr(first_pc)) {
78 out[count++] = first_pc;
79 }
80 while (count < max && bt_frame.next_pc != 0) {
81 if (!esp_backtrace_get_next_frame(&bt_frame))
82 break;
83 uint32_t pc = esp_cpu_process_stack_pc(bt_frame.pc);
84 if (is_code_addr(pc)) {
85 out[count++] = pc;
86 }
87 }
88 return count;
89}
90#endif
91
92#if CONFIG_IDF_TARGET_ARCH_RISCV
93// Capture RISC-V backtrace: MEPC + RA from registers, then stack scan.
94// Returns total count; *reg_count receives number of register-sourced entries.
95static uint8_t IRAM_ATTR capture_riscv_backtrace(RvExcFrame *frame, uint32_t *out, uint8_t max, uint8_t *reg_count) {
96 uint8_t count = 0;
97 if (is_code_addr(frame->mepc)) {
98 out[count++] = frame->mepc;
99 }
100 if (is_code_addr(frame->ra) && frame->ra != frame->mepc) {
101 out[count++] = frame->ra;
102 }
103 *reg_count = count;
104 auto *scan_start = (uint32_t *) frame->sp;
105 for (uint32_t i = 0; i < 64 && count < max; i++) {
107 if (is_code_addr(val) && val != frame->mepc && val != frame->ra) {
108 out[count++] = val;
109 }
110 }
111 return count;
112}
113#endif
114
115// Raw crash data written by the panic handler wrapper.
116// Lives in .noinit so it survives software reset but contains garbage after power cycle.
117// Validated by magic marker. Static linkage since it's only used within this file.
118// Version field is first so future firmware can always identify the struct layout.
119// Magic is second to validate the data. Remaining fields can change between versions.
120// Version is uint32_t because it would be padded to 4 bytes anyway before the next
121// uint32_t field, so we use the full width rather than wasting 3 bytes of padding.
122static constexpr uint32_t CRASH_DATA_VERSION = 2;
123struct RawCrashData {
124 uint32_t version;
125 uint32_t magic;
126 uint32_t pc;
127 uint8_t backtrace_count;
128 uint8_t reg_frame_count; // Number of entries from registers (not stack-scanned)
129 uint8_t exception; // panic_exception_t enum (FAULT/ABORT/IWDT/TWDT/DEBUG)
130 uint8_t pseudo_excause; // Whether cause is a pseudo exception (Xtensa SoC-level panic)
131 uint32_t backtrace[MAX_BACKTRACE];
132 uint32_t cause; // Architecture-specific: exccause (Xtensa) or mcause (RISC-V)
133 uint8_t crashed_core;
134#if SOC_CPU_CORES_NUM > 1
135 static_assert(SOC_CPU_CORES_NUM == 2, "Dual-core logic assumes exactly 2 cores");
136 uint8_t other_backtrace_count;
137 uint8_t other_reg_frame_count;
138 uint32_t other_backtrace[MAX_BACKTRACE];
139#endif
140};
141static RawCrashData __attribute__((section(".noinit")))
142s_raw_crash_data; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
143
144// Whether crash data was found and validated this boot.
145static bool s_crash_data_valid = false; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
146
147namespace esphome::esp32 {
148
// Log tag used by every ESP_LOGE call in this file.
static const char *const TAG = "esp32.crash";
150
152 if (s_raw_crash_data.magic == CRASH_MAGIC && s_raw_crash_data.version == CRASH_DATA_VERSION) {
153 s_crash_data_valid = true;
154 // Clamp counts to prevent out-of-bounds reads from corrupt .noinit data
155 if (s_raw_crash_data.backtrace_count > MAX_BACKTRACE)
156 s_raw_crash_data.backtrace_count = MAX_BACKTRACE;
157 if (s_raw_crash_data.reg_frame_count > s_raw_crash_data.backtrace_count)
158 s_raw_crash_data.reg_frame_count = s_raw_crash_data.backtrace_count;
159 if (s_raw_crash_data.exception > 4) // panic_exception_t max value
160 s_raw_crash_data.exception = 4; // Default to PANIC_EXCEPTION_FAULT
161 if (s_raw_crash_data.pseudo_excause > 1)
162 s_raw_crash_data.pseudo_excause = 0;
163 if (s_raw_crash_data.crashed_core >= SOC_CPU_CORES_NUM)
164 s_raw_crash_data.crashed_core = 0;
165#if SOC_CPU_CORES_NUM > 1
166 if (s_raw_crash_data.other_backtrace_count > MAX_BACKTRACE)
167 s_raw_crash_data.other_backtrace_count = MAX_BACKTRACE;
168 if (s_raw_crash_data.other_reg_frame_count > s_raw_crash_data.other_backtrace_count)
169 s_raw_crash_data.other_reg_frame_count = s_raw_crash_data.other_backtrace_count;
170#endif
171 }
172 // Don't clear magic here — crash data must survive OTA rollback reboots.
173 // Magic is cleared by crash_handler_clear() after an API client receives the data.
174}
175
176bool crash_handler_has_data() { return s_crash_data_valid; }
177
179 // Only clear the magic so data doesn't survive the next reboot.
180 // Keep s_crash_data_valid so crash_handler_log() still works for
181 // additional API clients connecting during this boot session.
182 s_raw_crash_data.magic = 0;
183}
184
// Translate the stored cause code into a human-readable string.
// The tables mirror ESP-IDF's panic_arch_fill_info(), which keeps them in local
// static arrays that are not reachable through any public API.
// Returns:
//   - the matching table entry when the cause is known,
//   - nullptr on RISC-V for pseudo (SoC-level) causes, so the caller prints only
//     the exception type,
//   - "Unknown" otherwise.
static const char *get_exception_reason() {
#if CONFIG_IDF_TARGET_ARCH_XTENSA
  const uint32_t code = s_raw_crash_data.cause;
  if (s_raw_crash_data.pseudo_excause) {
    // SoC-level panic: watchdog, cache error, etc.
    // Keep in sync with ESP-IDF's PANIC_RSN_* defines
    static const char *const PSEUDO_NAMES[] = {
        "Unknown reason",                 // 0
        "Unhandled debug exception",      // 1
        "Double exception",               // 2
        "Unhandled kernel exception",     // 3
        "Coprocessor exception",          // 4
        "Interrupt wdt timeout on CPU0",  // 5
        "Interrupt wdt timeout on CPU1",  // 6
        "Cache error",                    // 7
    };
    constexpr size_t pseudo_total = sizeof(PSEUDO_NAMES) / sizeof(PSEUDO_NAMES[0]);
    // Out-of-range codes fall back to entry 0.
    return code < pseudo_total ? PSEUDO_NAMES[code] : PSEUDO_NAMES[0];
  }
  // Real Xtensa exception; nullptr marks codes without a name.
  static const char *const EXCCAUSE_NAMES[] = {
      "IllegalInstruction",
      "Syscall",
      "InstructionFetchError",
      "LoadStoreError",
      "Level1Interrupt",
      "Alloca",
      "IntegerDivideByZero",
      "PCValue",
      "Privileged",
      "LoadStoreAlignment",
      nullptr,
      nullptr,
      "InstrPDAddrError",
      "LoadStorePIFDataError",
      "InstrPIFAddrError",
      "LoadStorePIFAddrError",
      "InstTLBMiss",
      "InstTLBMultiHit",
      "InstFetchPrivilege",
      nullptr,
      "InstrFetchProhibited",
      nullptr,
      nullptr,
      nullptr,
      "LoadStoreTLBMiss",
      "LoadStoreTLBMultihit",
      "LoadStorePrivilege",
      nullptr,
      "LoadProhibited",
      "StoreProhibited",
  };
  constexpr size_t exccause_total = sizeof(EXCCAUSE_NAMES) / sizeof(EXCCAUSE_NAMES[0]);
  if (code < exccause_total) {
    const char *name = EXCCAUSE_NAMES[code];
    if (name != nullptr)
      return name;
  }
#elif CONFIG_IDF_TARGET_ARCH_RISCV
  // For SoC-level panics (watchdog, cache error), mcause holds IDF-internal
  // interrupt numbers, not standard RISC-V cause codes. The exception type
  // field already identifies these, so just return null to use the type name.
  if (s_raw_crash_data.pseudo_excause)
    return nullptr;
  // Indexed by mcause; nullptr marks codes without a name.
  static const char *const MCAUSE_NAMES[] = {
      "Instruction address misaligned",
      "Instruction access fault",
      "Illegal instruction",
      "Breakpoint",
      "Load address misaligned",
      "Load access fault",
      "Store address misaligned",
      "Store access fault",
      "Environment call from U-mode",
      "Environment call from S-mode",
      nullptr,
      "Environment call from M-mode",
      "Instruction page fault",
      "Load page fault",
      nullptr,
      "Store page fault",
  };
  constexpr size_t mcause_total = sizeof(MCAUSE_NAMES) / sizeof(MCAUSE_NAMES[0]);
  const uint32_t code = s_raw_crash_data.cause;
  if (code < mcause_total) {
    const char *name = MCAUSE_NAMES[code];
    if (name != nullptr)
      return name;
  }
#endif
  return "Unknown";
}
274
275// Exception type names matching panic_exception_t enum
276static const char *get_exception_type() {
277 static const char *const TYPES[] = {
278 "Debug exception", // PANIC_EXCEPTION_DEBUG
279 "Interrupt wdt", // PANIC_EXCEPTION_IWDT
280 "Task wdt", // PANIC_EXCEPTION_TWDT
281 "Abort", // PANIC_EXCEPTION_ABORT
282 "Fault", // PANIC_EXCEPTION_FAULT
283 };
284 uint8_t exc = s_raw_crash_data.exception;
285 if (exc < sizeof(TYPES) / sizeof(TYPES[0]))
286 return TYPES[exc];
287 return "Unknown";
288}
289
290// Log backtrace entries, filtering stack-scanned addresses on RISC-V.
291static void log_backtrace(const uint32_t *addrs, uint8_t count, uint8_t reg_frame_count) {
292 uint8_t bt_num = 0;
293 for (uint8_t i = 0; i < count; i++) {
294 uint32_t addr = addrs[i];
295#if CONFIG_IDF_TARGET_ARCH_RISCV
296 if (i >= reg_frame_count && !is_return_addr(addr))
297 continue;
298 const char *source = (i < reg_frame_count) ? "backtrace" : "stack scan";
299#else
300 const char *source = "backtrace";
301#endif
302 ESP_LOGE(TAG, " BT%d: 0x%08" PRIX32 " (%s)", bt_num++, addr, source);
303 }
304}
305
// Append " 0xXXXXXXXX" for each address to the addr2line hint string.
//   buf/size:        destination buffer and its total capacity
//   pos:             offset in buf where writing starts
//   addrs/count:     addresses to append
//   reg_frame_count: how many leading entries came from registers (on RISC-V,
//                    later entries are appended only if is_return_addr() accepts them)
// Stops early once fewer than 12 bytes remain (11 characters per entry plus the
// terminator). Returns the offset just past the last character written.
static int append_addrs_to_hint(char *buf, int size, int pos, const uint32_t *addrs, uint8_t count,
                                uint8_t reg_frame_count) {
  const int last_safe_pos = size - 12;
  for (uint8_t idx = 0; idx < count; idx++) {
    if (pos >= last_safe_pos)
      break;
    const uint32_t entry = addrs[idx];
#if CONFIG_IDF_TARGET_ARCH_RISCV
    if (idx >= reg_frame_count && !is_return_addr(entry))
      continue;
#endif
    const int written = snprintf(buf + pos, size - pos, " 0x%08" PRIX32, entry);
    pos += written;
  }
  return pos;
}
319
320// Intentionally uses separate ESP_LOGE calls per line instead of combining into
321// one multi-line log message. This ensures each address appears as its own line
322// on the serial console, making it possible to see partial output if the device
323// crashes again during boot, and allowing the CLI's process_stacktrace to match
324// and decode each address individually.
326 if (!s_crash_data_valid)
327 return;
328
329 ESP_LOGE(TAG, "*** CRASH DETECTED ON PREVIOUS BOOT ***");
330 const char *reason = get_exception_reason();
331 if (reason != nullptr) {
332 ESP_LOGE(TAG, " Reason: %s - %s", get_exception_type(), reason);
333 } else {
334 ESP_LOGE(TAG, " Reason: %s", get_exception_type());
335 }
336 ESP_LOGE(TAG, " Crashed core: %d", s_raw_crash_data.crashed_core);
337 ESP_LOGE(TAG, " PC: 0x%08" PRIX32 " (fault location)", s_raw_crash_data.pc);
338 log_backtrace(s_raw_crash_data.backtrace, s_raw_crash_data.backtrace_count, s_raw_crash_data.reg_frame_count);
339
340#if SOC_CPU_CORES_NUM > 1
341 if (s_raw_crash_data.other_backtrace_count > 0) {
342 int other_core = 1 - s_raw_crash_data.crashed_core;
343 ESP_LOGE(TAG, " Other core (%d) backtrace:", other_core);
344 log_backtrace(s_raw_crash_data.other_backtrace, s_raw_crash_data.other_backtrace_count,
345 s_raw_crash_data.other_reg_frame_count);
346 }
347#endif
348
349 // Build addr2line hint with all captured addresses for easy copy-paste
350 char hint[256];
351 int pos = snprintf(hint, sizeof(hint), "Use: addr2line -pfiaC -e firmware.elf 0x%08" PRIX32, s_raw_crash_data.pc);
352 pos = append_addrs_to_hint(hint, sizeof(hint), pos, s_raw_crash_data.backtrace, s_raw_crash_data.backtrace_count,
353 s_raw_crash_data.reg_frame_count);
354#if SOC_CPU_CORES_NUM > 1
355 append_addrs_to_hint(hint, sizeof(hint), pos, s_raw_crash_data.other_backtrace,
356 s_raw_crash_data.other_backtrace_count, s_raw_crash_data.other_reg_frame_count);
357#endif
358 ESP_LOGE(TAG, "%s", hint);
359}
360
361} // namespace esphome::esp32
362
363// --- Panic handler wrapper ---
364// Intercepts esp_panic_handler() via --wrap linker flag to capture crash data
365// into NOINIT memory before the normal panic handler runs.
366//
367extern "C" {
368// NOLINTBEGIN(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp,readability-identifier-naming)
369// Names are mandated by the --wrap linker mechanism
370extern void __real_esp_panic_handler(panic_info_t *info);
371
372void IRAM_ATTR __wrap_esp_panic_handler(panic_info_t *info) {
373 // Save the faulting PC and exception info
374 s_raw_crash_data.pc = (uint32_t) info->addr;
375 s_raw_crash_data.backtrace_count = 0;
376 s_raw_crash_data.reg_frame_count = 0;
377 s_raw_crash_data.exception = (uint8_t) info->exception;
378 s_raw_crash_data.pseudo_excause = info->pseudo_excause ? 1 : 0;
379 s_raw_crash_data.crashed_core = (uint8_t) info->core;
380#if SOC_CPU_CORES_NUM > 1
381 s_raw_crash_data.other_backtrace_count = 0;
382 s_raw_crash_data.other_reg_frame_count = 0;
383#endif
384
385#if CONFIG_IDF_TARGET_ARCH_XTENSA
386 // Xtensa: walk the backtrace using the public API
387 if (info->frame != nullptr) {
388 auto *xt_frame = (XtExcFrame *) info->frame;
389 s_raw_crash_data.cause = xt_frame->exccause;
390 s_raw_crash_data.backtrace_count = walk_xtensa_backtrace(xt_frame, s_raw_crash_data.backtrace, MAX_BACKTRACE);
391 }
392
393#if SOC_CPU_CORES_NUM > 1
394 // Capture the other core's backtrace from the global frame array.
395 // Both cores save their frames to g_exc_frames[] before esp_panic_handler
396 // is called, so the other core's frame is available here.
397 if (info->core >= 0 && info->core < SOC_CPU_CORES_NUM) {
398 int other_core = 1 - info->core;
399 auto *other_frame = (XtExcFrame *) g_exc_frames[other_core];
400 if (other_frame != nullptr) {
401 s_raw_crash_data.other_backtrace_count =
402 walk_xtensa_backtrace(other_frame, s_raw_crash_data.other_backtrace, MAX_BACKTRACE);
403 }
404 }
405#endif
406
407#elif CONFIG_IDF_TARGET_ARCH_RISCV
408 // RISC-V: capture MEPC + RA, then scan stack for code addresses
409 if (info->frame != nullptr) {
410 auto *rv_frame = (RvExcFrame *) info->frame;
411 s_raw_crash_data.cause = rv_frame->mcause;
412 s_raw_crash_data.backtrace_count =
413 capture_riscv_backtrace(rv_frame, s_raw_crash_data.backtrace, MAX_BACKTRACE, &s_raw_crash_data.reg_frame_count);
414 }
415
416#if SOC_CPU_CORES_NUM > 1
417 // Capture the other core's backtrace from the global frame array.
418 if (info->core >= 0 && info->core < SOC_CPU_CORES_NUM) {
419 int other_core = 1 - info->core;
420 auto *other_frame = (RvExcFrame *) g_exc_frames[other_core];
421 if (other_frame != nullptr) {
422 s_raw_crash_data.other_backtrace_count = capture_riscv_backtrace(
423 other_frame, s_raw_crash_data.other_backtrace, MAX_BACKTRACE, &s_raw_crash_data.other_reg_frame_count);
424 }
425 }
426#endif
427#endif
428
429 // Write version and magic last — ensures all data is written before we mark it valid
430 s_raw_crash_data.version = CRASH_DATA_VERSION;
431 s_raw_crash_data.magic = CRASH_MAGIC;
432
433 // Call the real panic handler (prints to UART, does core dump, reboots, etc.)
435}
436
437// NOLINTEND(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp,readability-identifier-naming)
438} // extern "C"
439
440#endif // USE_ESP32_CRASH_HANDLER
441#endif // USE_ESP32
struct @65::@66 __attribute__
Wake the main loop task from an ISR. ISR-safe.
Definition main_task.h:32
void __real_esp_panic_handler(panic_info_t *info)
void IRAM_ATTR __wrap_esp_panic_handler(panic_info_t *info)
mopeka_std_values val[3]
bool crash_handler_has_data()
Returns true if crash data was found this boot.
void crash_handler_log()
Log crash data if a crash was detected on previous boot.
void crash_handler_read_and_clear()
Read and validate crash data from NOINIT memory.
void crash_handler_clear()
Clear the magic marker and mark crash data as consumed.
uint16_t size
Definition helpers.cpp:25
size_t size_t pos
Definition helpers.h:1038
uint32_t * scan_start
static void uint32_t
uint32_t pc