diff --git a/9_Firmware/9_1_Microcontroller/9_1_3_C_Cpp_Code/main.cpp b/9_Firmware/9_1_Microcontroller/9_1_3_C_Cpp_Code/main.cpp index b8c9392..0e48939 100644 --- a/9_Firmware/9_1_Microcontroller/9_1_3_C_Cpp_Code/main.cpp +++ b/9_Firmware/9_1_Microcontroller/9_1_3_C_Cpp_Code/main.cpp @@ -620,7 +620,8 @@ typedef enum { ERROR_POWER_SUPPLY, ERROR_TEMPERATURE_HIGH, ERROR_MEMORY_ALLOC, - ERROR_WATCHDOG_TIMEOUT + ERROR_WATCHDOG_TIMEOUT, + ERROR_COUNT // must be last — used for bounds checking error_strings[] } SystemError_t; static SystemError_t last_error = ERROR_NONE; @@ -631,21 +632,42 @@ static bool system_emergency_state = false; SystemError_t checkSystemHealth(void) { SystemError_t current_error = ERROR_NONE; - // 1. Check AD9523 Clock Generator - static uint32_t last_clock_check = 0; - if (HAL_GetTick() - last_clock_check > 5000) { - GPIO_PinState s0 = HAL_GPIO_ReadPin(AD9523_STATUS0_GPIO_Port, AD9523_STATUS0_Pin); - GPIO_PinState s1 = HAL_GPIO_ReadPin(AD9523_STATUS1_GPIO_Port, AD9523_STATUS1_Pin); - DIAG_GPIO("CLK", "AD9523 STATUS0", s0); - DIAG_GPIO("CLK", "AD9523 STATUS1", s1); - if (s0 == GPIO_PIN_RESET || s1 == GPIO_PIN_RESET) { - current_error = ERROR_AD9523_CLOCK; - DIAG_ERR("CLK", "AD9523 clock health check FAILED (STATUS0=%d STATUS1=%d)", s0, s1); + // 0. Watchdog: detect main-loop stall (checkSystemHealth not called for >60 s). + // Timestamp is captured at function ENTRY and updated unconditionally, so + // any early return from a sub-check below cannot leave a stale value that + // would later trip a spurious ERROR_WATCHDOG_TIMEOUT. A dedicated cold-start + // branch ensures the first call after boot never trips (last_health_check==0 + // would otherwise make `HAL_GetTick() - 0 > 60000` true forever after the + // 60-s mark of the init sequence). + static uint32_t last_health_check = 0; + uint32_t now_tick = HAL_GetTick(); + if (last_health_check == 0) { + last_health_check = now_tick; // cold start: seed only + } else { + uint32_t elapsed = now_tick - last_health_check; + last_health_check = now_tick; // update BEFORE any early return + if (elapsed > 60000) { + current_error = ERROR_WATCHDOG_TIMEOUT; + DIAG_ERR("SYS", "Health check: Watchdog timeout (>60s since last check)"); return current_error; } - last_clock_check = HAL_GetTick(); } + // 1. Check AD9523 Clock Generator + static uint32_t last_clock_check = 0; + if (HAL_GetTick() - last_clock_check > 5000) { + GPIO_PinState s0 = HAL_GPIO_ReadPin(AD9523_STATUS0_GPIO_Port, AD9523_STATUS0_Pin); + GPIO_PinState s1 = HAL_GPIO_ReadPin(AD9523_STATUS1_GPIO_Port, AD9523_STATUS1_Pin); + DIAG_GPIO("CLK", "AD9523 STATUS0", s0); + DIAG_GPIO("CLK", "AD9523 STATUS1", s1); + if (s0 == GPIO_PIN_RESET || s1 == GPIO_PIN_RESET) { + current_error = ERROR_AD9523_CLOCK; + DIAG_ERR("CLK", "AD9523 clock health check FAILED (STATUS0=%d STATUS1=%d)", s0, s1); + return current_error; + } + last_clock_check = HAL_GetTick(); + } + // 2. Check ADF4382 Lock Status bool tx_locked, rx_locked; if (ADF4382A_CheckLockStatus(&lo_manager, &tx_locked, &rx_locked) == ADF4382A_MANAGER_OK) { @@ -679,26 +701,26 @@ SystemError_t checkSystemHealth(void) { // 4. Check IMU Communication static uint32_t last_imu_check = 0; - if (HAL_GetTick() - last_imu_check > 10000) { - if (!GY85_Update(&imu)) { - current_error = ERROR_IMU_COMM; - DIAG_ERR("IMU", "Health check: GY85_Update() FAILED"); - return current_error; - } - last_imu_check = HAL_GetTick(); - } + if (HAL_GetTick() - last_imu_check > 10000) { + if (!GY85_Update(&imu)) { + current_error = ERROR_IMU_COMM; + DIAG_ERR("IMU", "Health check: GY85_Update() FAILED"); + return current_error; + } + last_imu_check = HAL_GetTick(); + } // 5. Check BMP180 Communication static uint32_t last_bmp_check = 0; - if (HAL_GetTick() - last_bmp_check > 15000) { - double pressure = myBMP.getPressure(); - if (pressure < 30000.0 || pressure > 110000.0 || isnan(pressure)) { - current_error = ERROR_BMP180_COMM; - DIAG_ERR("SYS", "Health check: BMP180 pressure out of range: %.0f", pressure); - return current_error; - } - last_bmp_check = HAL_GetTick(); - } + if (HAL_GetTick() - last_bmp_check > 15000) { + double pressure = myBMP.getPressure(); + if (pressure < 30000.0 || pressure > 110000.0 || isnan(pressure)) { + current_error = ERROR_BMP180_COMM; + DIAG_ERR("SYS", "Health check: BMP180 pressure out of range: %.0f", pressure); + return current_error; + } + last_bmp_check = HAL_GetTick(); + } // 6. Check GPS Communication static uint32_t last_gps_fix = 0; @@ -734,14 +756,7 @@ SystemError_t checkSystemHealth(void) { return current_error; } - // 9. Simple watchdog check - static uint32_t last_health_check = 0; - if (HAL_GetTick() - last_health_check > 60000) { - current_error = ERROR_WATCHDOG_TIMEOUT; - DIAG_ERR("SYS", "Health check: Watchdog timeout (>60s since last check)"); - return current_error; - } - last_health_check = HAL_GetTick(); + // 9. Watchdog check is performed at function entry (see step 0). if (current_error != ERROR_NONE) { DIAG_ERR("SYS", "checkSystemHealth returning error code %d", current_error); @@ -853,7 +868,7 @@ void handleSystemError(SystemError_t error) { DIAG_ERR("SYS", "handleSystemError: error=%d error_count=%lu", error, error_count); char error_msg[100]; - const char* error_strings[] = { + static const char* const error_strings[] = { "No error", "AD9523 Clock failure", "ADF4382 TX LO unlocked", @@ -873,9 +888,16 @@ void handleSystemError(SystemError_t error) { "Watchdog timeout" }; + static_assert(sizeof(error_strings) / sizeof(error_strings[0]) == ERROR_COUNT, + "error_strings[] and SystemError_t enum are out of sync"); + + const char* err_name = (error >= 0 && error < (int)(sizeof(error_strings) / sizeof(error_strings[0]))) + ? error_strings[error] + : "Unknown error"; + snprintf(error_msg, sizeof(error_msg), "ERROR #%d: %s (Count: %lu)\r\n", - error, error_strings[error], error_count); + error, err_name, error_count); HAL_UART_Transmit(&huart3, (uint8_t*)error_msg, strlen(error_msg), 1000); // Blink LED pattern based on error code @@ -901,7 +923,7 @@ void handleSystemError(SystemError_t error) { if ((error >= ERROR_RF_PA_OVERCURRENT && error <= ERROR_POWER_SUPPLY) || error == ERROR_TEMPERATURE_HIGH || error == ERROR_WATCHDOG_TIMEOUT) { - DIAG_ERR("SYS", "CRITICAL ERROR (code %d: %s) -- initiating Emergency_Stop()", error, error_strings[error]); + DIAG_ERR("SYS", "CRITICAL ERROR (code %d: %s) -- initiating Emergency_Stop()", error, err_name); snprintf(error_msg, sizeof(error_msg), "CRITICAL ERROR! Initiating emergency shutdown.\r\n"); HAL_UART_Transmit(&huart3, (uint8_t*)error_msg, strlen(error_msg), 1000); diff --git a/9_Firmware/9_1_Microcontroller/tests/.gitignore b/9_Firmware/9_1_Microcontroller/tests/.gitignore index e185c71..acc7942 100644 --- a/9_Firmware/9_1_Microcontroller/tests/.gitignore +++ b/9_Firmware/9_1_Microcontroller/tests/.gitignore @@ -3,18 +3,38 @@ *.dSYM/ # Test binaries (built by Makefile) +# TESTS_WITH_REAL test_bug1_timed_sync_init_ordering -test_bug2_ad9523_double_setup test_bug3_timed_sync_noop test_bug4_phase_shift_before_check test_bug5_fine_phase_gpio_only +test_bug9_platform_ops_null +test_bug10_spi_cs_not_toggled +test_bug15_htim3_dangling_extern + +# TESTS_MOCK_ONLY +test_bug2_ad9523_double_setup test_bug6_timer_variable_collision test_bug7_gpio_pin_conflict test_bug8_uart_commented_out -test_bug9_platform_ops_null -test_bug10_spi_cs_not_toggled -test_bug11_platform_spi_transmit_only +test_bug14_diag_section_args +test_gap3_emergency_stop_rails + +# TESTS_STANDALONE test_bug12_pa_cal_loop_inverted test_bug13_dac2_adc_buffer_mismatch -test_bug14_diag_section_args -test_bug15_htim3_dangling_extern +test_gap3_iwdg_config +test_gap3_temperature_max +test_gap3_idq_periodic_reread +test_gap3_emergency_state_ordering +test_gap3_overtemp_emergency_stop +test_gap3_health_watchdog_cold_start + +# TESTS_WITH_PLATFORM +test_bug11_platform_spi_transmit_only + +# TESTS_WITH_CXX +test_agc_outer_loop + +# Manual / one-off test builds +test_um982_gps diff --git a/9_Firmware/9_1_Microcontroller/tests/Makefile b/9_Firmware/9_1_Microcontroller/tests/Makefile index 75b7548..2d34e58 100644 --- a/9_Firmware/9_1_Microcontroller/tests/Makefile +++ b/9_Firmware/9_1_Microcontroller/tests/Makefile @@ -65,7 +65,8 @@ TESTS_STANDALONE := test_bug12_pa_cal_loop_inverted \ test_gap3_temperature_max \ test_gap3_idq_periodic_reread \ test_gap3_emergency_state_ordering \ - test_gap3_overtemp_emergency_stop + test_gap3_overtemp_emergency_stop \ + test_gap3_health_watchdog_cold_start # Tests that need platform_noos_stm32.o + mocks TESTS_WITH_PLATFORM := test_bug11_platform_spi_transmit_only @@ -78,7 +79,7 @@ ALL_TESTS := $(TESTS_WITH_REAL) $(TESTS_MOCK_ONLY) $(TESTS_STANDALONE) $(TESTS_W .PHONY: all build test clean \ $(addprefix test_,bug1 bug2 bug3 bug4 bug5 bug6 bug7 bug8 bug9 bug10 bug11 bug12 bug13 bug14 bug15) \ test_gap3_estop test_gap3_iwdg test_gap3_temp test_gap3_idq test_gap3_order \ - test_gap3_overtemp + test_gap3_overtemp test_gap3_wdog all: build test @@ -167,6 +168,9 @@ test_gap3_emergency_state_ordering: test_gap3_emergency_state_ordering.c test_gap3_overtemp_emergency_stop: test_gap3_overtemp_emergency_stop.c $(CC) $(CFLAGS) $< -o $@ +test_gap3_health_watchdog_cold_start: test_gap3_health_watchdog_cold_start.c + $(CC) $(CFLAGS) $< -o $@ + # Tests that need platform_noos_stm32.o + mocks $(TESTS_WITH_PLATFORM): %: %.c $(MOCK_OBJS) $(PLATFORM_OBJ) $(CC) $(CFLAGS) $(INCLUDES) $< $(MOCK_OBJS) $(PLATFORM_OBJ) -o $@ @@ -254,6 +258,9 @@ test_gap3_order: test_gap3_emergency_state_ordering test_gap3_overtemp: test_gap3_overtemp_emergency_stop ./test_gap3_overtemp_emergency_stop +test_gap3_wdog: test_gap3_health_watchdog_cold_start + ./test_gap3_health_watchdog_cold_start + # --- Clean --- clean: diff --git a/9_Firmware/9_1_Microcontroller/tests/test_gap3_health_watchdog_cold_start.c b/9_Firmware/9_1_Microcontroller/tests/test_gap3_health_watchdog_cold_start.c new file mode 100644 index 0000000..01fadfa --- /dev/null +++ b/9_Firmware/9_1_Microcontroller/tests/test_gap3_health_watchdog_cold_start.c @@ -0,0 +1,132 @@ +/******************************************************************************* + * test_gap3_health_watchdog_cold_start.c + * + * Safety bug: checkSystemHealth()'s internal watchdog (step 9, pre-fix) had two + * linked defects that, once ERROR_WATCHDOG_TIMEOUT was escalated to + * Emergency_Stop() by the overtemp/watchdog PR, would false-latch the radar: + * + * (1) Cold-start false trip: + * static uint32_t last_health_check = 0; + * if (HAL_GetTick() - last_health_check > 60000) { ... } + * On the very first call, last_health_check == 0, so once the MCU has + * been up >60 s (which is typical after the ADAR1000 / AD9523 / ADF4382 + * init sequence) the subtraction `now - 0` exceeds 60 000 ms and the + * watchdog trips spuriously. + * + * (2) Stale-timestamp after early returns: + * last_health_check = HAL_GetTick(); // at END of function + * Every earlier sub-check (IMU, BMP180, GPS, PA Idq, temperature) has an + * `if (fault) return current_error;` path that skips the update. After a + * cumulative 60 s of transient faults, the next clean call compares + * `now` against the long-stale `last_health_check` and trips. + * + * After fix: Watchdog logic moved to function ENTRY. A dedicated cold-start + * branch seeds the timestamp on the first call without checking. + * On every subsequent call, the elapsed delta is captured FIRST + * and last_health_check is updated BEFORE any sub-check runs, so + * early returns no longer leave a stale value. + * + * Test strategy: + * Extract the post-fix watchdog predicate into a standalone function that + * takes a simulated HAL_GetTick() value and returns whether the watchdog + * should trip. Walk through boot + fault sequences that would have tripped + * the pre-fix code and assert the post-fix code does NOT trip. + ******************************************************************************/ +#include +#include +#include + +/* --- Post-fix watchdog state + predicate, extracted verbatim --- */ +static uint32_t last_health_check = 0; + +/* Returns 1 iff this call should raise ERROR_WATCHDOG_TIMEOUT. + Updates last_health_check BEFORE returning (matches post-fix behaviour). */ +static int health_watchdog_step(uint32_t now_tick) +{ + if (last_health_check == 0) { + last_health_check = now_tick; /* cold start: seed only, never trip */ + return 0; + } + uint32_t elapsed = now_tick - last_health_check; + last_health_check = now_tick; /* update BEFORE any early return */ + return (elapsed > 60000) ? 1 : 0; +} + +/* Test helper: reset the static state between scenarios. */ +static void reset_state(void) { last_health_check = 0; } + +int main(void) +{ + printf("=== Safety fix: checkSystemHealth() watchdog cold-start + stale-ts ===\n"); + + /* ---------- Scenario 1: cold-start after 60 s of init must NOT trip ---- */ + printf(" Test 1: first call at t=75000 ms (post-init) does not trip... "); + reset_state(); + assert(health_watchdog_step(75000) == 0); + printf("PASS\n"); + + /* ---------- Scenario 2: first call far beyond 60 s (PRE-FIX BUG) ------- */ + printf(" Test 2: first call at t=600000 ms still does not trip... "); + reset_state(); + assert(health_watchdog_step(600000) == 0); + printf("PASS\n"); + + /* ---------- Scenario 3: healthy main-loop pacing (10 ms period) -------- */ + printf(" Test 3: 1000 calls at 10 ms intervals never trip... "); + reset_state(); + (void)health_watchdog_step(1000); /* seed */ + for (int i = 1; i <= 1000; i++) { + assert(health_watchdog_step(1000 + i * 10) == 0); + } + printf("PASS\n"); + + /* ---------- Scenario 4: stale-timestamp after a burst of early returns - + Pre-fix bug: many early returns skipped the timestamp update, so a + later clean call would compare `now` against a 60+ s old value. Post-fix, + every call (including ones that would have early-returned in the real + function) updates the timestamp at the top, so this scenario is modelled + by calling health_watchdog_step() on every iteration of the main loop. */ + printf(" Test 4: 70 s of 100 ms-spaced calls after seed do not trip... "); + reset_state(); + (void)health_watchdog_step(50000); /* seed mid-run */ + for (int i = 1; i <= 700; i++) { /* 70 s @ 100 ms */ + int tripped = health_watchdog_step(50000 + i * 100); + assert(tripped == 0); + } + printf("PASS\n"); + + /* ---------- Scenario 5: genuine stall MUST trip ------------------------ */ + printf(" Test 5: real 60+ s gap between calls does trip... "); + reset_state(); + (void)health_watchdog_step(10000); /* seed */ + assert(health_watchdog_step(10000 + 60001) == 1); + printf("PASS\n"); + + /* ---------- Scenario 6: exactly 60 s gap is the boundary -- do NOT trip + Post-fix predicate uses strict >60000, matching the pre-fix comparator. */ + printf(" Test 6: exactly 60000 ms gap does not trip (boundary)... "); + reset_state(); + (void)health_watchdog_step(10000); + assert(health_watchdog_step(10000 + 60000) == 0); + printf("PASS\n"); + + /* ---------- Scenario 7: trip, then recover on next paced call ---------- */ + printf(" Test 7: after a genuine stall+trip, next paced call does not re-trip... "); + reset_state(); + (void)health_watchdog_step(5000); /* seed */ + assert(health_watchdog_step(5000 + 70000) == 1); /* stall -> trip */ + assert(health_watchdog_step(5000 + 70000 + 10) == 0); /* resume paced */ + printf("PASS\n"); + + /* ---------- Scenario 8: HAL_GetTick() 32-bit wrap (~49.7 days) --------- + Because we subtract unsigned 32-bit values, wrap is handled correctly as + long as the true elapsed time is < 2^32 ms. */ + printf(" Test 8: tick wrap from 0xFFFFFF00 -> 0x00000064 (200 ms span) does not trip... "); + reset_state(); + (void)health_watchdog_step(0xFFFFFF00u); + assert(health_watchdog_step(0x00000064u) == 0); /* elapsed = 0x164 = 356 ms */ + printf("PASS\n"); + + printf("\n=== Safety fix: ALL TESTS PASSED ===\n\n"); + return 0; +}