From 393e8850a07ddcc77aa23e15bf3ea5ab3742483f Mon Sep 17 00:00:00 2001 From: binhusmachado-code <238357170+binhusmachado-code@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:31:39 -0400 Subject: [PATCH] fix(firmware): stop sendto ENOMEM tight loop on ESP32-S3 egress path (#1135) On a fresh ESP32-S3 the node enters a permanent `sendto ENOMEM` loop from the second CSI callback onward and zero UDP frames ever leave the device (the aggregator stays `esp32:offline` and pkt_yield sits at 0 pps), even though Wi-Fi, DHCP and ICMP are healthy. Per the analysis in #1135, during the first ~1 s after boot the 50 Hz self-ping + mmWave UART probe + ESPNOW init + promiscuous sniffer all contend for the same lwIP pbuf / Wi-Fi dynamic-TX pools; `sendto` returns ENOMEM and the fixed 100 ms backoff from #132 is too short to let the pools drain, so it re-fires into a still-full pool every cycle and loops forever. The S3 contends harder for these buffers than the C6 the original 0.6.x/0.7.0 tuning was verified against. Implements the three mitigations proposed in #1135: * csi_collector.c: self-ping cadence 50 Hz -> 10 Hz (interval_ms 20 -> 100). Cuts ~52 back-to-back boot-time datagrams/s of TX flood while keeping the CSI OFDM source alive. Interval comment, header comment and log string updated. * stream_sender.c: ENOMEM_COOLDOWN_MS 100 -> 300 so the backoff outlasts the pool pressure instead of re-firing into a still-full pool. * sdkconfig.defaults: CONFIG_ESP_WIFI_DYNAMIC_TX_BUFFER_NUM 64 -> 128 (max of the IDF 1..128 range) for TX headroom during the boot contention window. Scoped to the S3: the bump lives in the base sdkconfig.defaults, so to leave the untested C6 build unchanged it is pinned back to 64 in sdkconfig.defaults.esp32c6. Also tidied a stale "50 Hz" self-ping header comment in csi_collector.c and a stale "100 ms" backoff comment in adaptive_controller.c so they match the new runtime behavior. 
Measured on an ESP32-S3-DevKitC-1-class board (QFN56 rev v0.2, 16MB/8MB, USB-Serial/JTAG, WPA2 2.4 GHz; aggregator UDP :5005 on macOS), built and flashed with ESP-IDF v5.4: before: sendto ENOMEM tight loop, yield 0 pps, 0 frames reach the host after: yield 9-13 pps, no ENOMEM, 300+ CSI frames/min received, vitals parsing Fixes the egress/ENOMEM half (bug #1) of #1135 only; the phantom-LD2410-on- floating-UART detection (bug #2) is out of scope and belongs with the #1107/#1119 mmwave-validation work. Verified on ESP32-S3 only, not on C6. Refs #132, #521, #954, #1107, #1119. --- firmware/esp32-csi-node/main/adaptive_controller.c | 2 +- firmware/esp32-csi-node/main/csi_collector.c | 8 +++++--- firmware/esp32-csi-node/main/stream_sender.c | 3 ++- firmware/esp32-csi-node/sdkconfig.defaults | 2 +- firmware/esp32-csi-node/sdkconfig.defaults.esp32c6 | 7 +++++++ 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/firmware/esp32-csi-node/main/adaptive_controller.c b/firmware/esp32-csi-node/main/adaptive_controller.c index f85a22b924..e73a19dbf9 100644 --- a/firmware/esp32-csi-node/main/adaptive_controller.c +++ b/firmware/esp32-csi-node/main/adaptive_controller.c @@ -225,7 +225,7 @@ static void fast_loop_cb(TimerHandle_t t) * the default 200 ms fast period), which combined with CSI promiscuous * RX saturated the WiFi TX airtime — measured live on COM8 (S3) and * COM9 (C6): every adaptive cycle showed `sendto ENOMEM — backing off - * for 100 ms`, and bumping LWIP/WiFi buffer pools to 4× had no effect + * for the ENOMEM cooldown`, and bumping LWIP/WiFi buffer pools to 4× had no effect * on the rate because the bottleneck was radio TX time, not pool size. * Dropping to 1 Hz (5× less feature_state traffic) frees the TX queue * for CSI sends and lands well within the spec. 
*/ diff --git a/firmware/esp32-csi-node/main/csi_collector.c b/firmware/esp32-csi-node/main/csi_collector.c index 0dc03676d8..5f2c44a80e 100644 --- a/firmware/esp32-csi-node/main/csi_collector.c +++ b/firmware/esp32-csi-node/main/csi_collector.c @@ -376,7 +376,7 @@ static void wifi_promiscuous_cb(void *buf, wifi_promiscuous_pkt_type_t type) * are sparse beacons (often non-OFDM DSSS), so wifi_csi_callback can starve to * yield=0pps -> DEGRADED -> motion/presence=0 (#521, #954). * - * This guarantees a ~50 Hz OFDM unicast floor by pinging the STA's own gateway: + * This guarantees a ~10 Hz OFDM unicast floor by pinging the STA's own gateway: * the router's ICMP echo replies are OFDM frames destined to this station, which * drive the CSI engine regardless of promiscuous filter state or ambient traffic. * It is ADDITIVE — promiscuous capture (#396/#893) is left fully intact so @@ -409,7 +409,9 @@ static void csi_start_self_ping(void) esp_ping_config_t cfg = ESP_PING_DEFAULT_CONFIG(); cfg.target_addr = target; cfg.count = ESP_PING_COUNT_INFINITE; - cfg.interval_ms = 20; /* 50 Hz -> ~50 received OFDM replies/sec */ + cfg.interval_ms = 100; /* 10 Hz: cut self-ping TX flood that exhausts + S3 WiFi TX buffers -> sendto ENOMEM (#1135). + 10 Hz still keeps the CSI OFDM source alive. 
*/ cfg.data_size = 1; cfg.task_stack_size = 4096; @@ -422,7 +424,7 @@ static void csi_start_self_ping(void) if (esp_ping_new_session(&cfg, &cbs, &s_self_ping) == ESP_OK && s_self_ping != NULL) { esp_ping_start(s_self_ping); - ESP_LOGI(TAG, "self-ping started -> %s @50Hz (CSI OFDM source, fix #521/#954)", gw_str); + ESP_LOGI(TAG, "self-ping started -> %s @10Hz (CSI OFDM source, fix #521/#954, S3 ENOMEM #1135)", gw_str); } else { ESP_LOGW(TAG, "self-ping: esp_ping_new_session failed"); s_self_ping = NULL; diff --git a/firmware/esp32-csi-node/main/stream_sender.c b/firmware/esp32-csi-node/main/stream_sender.c index b85c206a59..7819b1aba4 100644 --- a/firmware/esp32-csi-node/main/stream_sender.c +++ b/firmware/esp32-csi-node/main/stream_sender.c @@ -26,7 +26,8 @@ static struct sockaddr_in s_dest_addr; * rapid-fire CSI callbacks can exhaust the pbuf pool and crash the device. */ static int64_t s_backoff_until_us = 0; /* esp_timer timestamp to resume */ -#define ENOMEM_COOLDOWN_MS 100 /* suppress sends for 100 ms */ +#define ENOMEM_COOLDOWN_MS 300 /* suppress sends for 300 ms — 100 ms + * was too short to drain on S3 (#1135) */ #define ENOMEM_LOG_INTERVAL 50 /* log every Nth suppressed send */ static uint32_t s_enomem_suppressed = 0; diff --git a/firmware/esp32-csi-node/sdkconfig.defaults b/firmware/esp32-csi-node/sdkconfig.defaults index 94ec09222a..d196dd121a 100644 --- a/firmware/esp32-csi-node/sdkconfig.defaults +++ b/firmware/esp32-csi-node/sdkconfig.defaults @@ -41,7 +41,7 @@ CONFIG_LWIP_SO_RCVBUF=y # ~3 KB extra heap cost, measured live on both targets Jun 8 2026. 
CONFIG_LWIP_UDP_RECVMBOX_SIZE=32 CONFIG_LWIP_TCPIP_RECVMBOX_SIZE=64 -CONFIG_ESP_WIFI_DYNAMIC_TX_BUFFER_NUM=64 +CONFIG_ESP_WIFI_DYNAMIC_TX_BUFFER_NUM=128 # NOTE: Empirical 25 s measurements on the S3 at COM8 showed these bumps # eliminate the csi_collector.sendto failure path (`fail #1..5` → # `fail #0`) — real improvement — but do NOT eliminate the broader diff --git a/firmware/esp32-csi-node/sdkconfig.defaults.esp32c6 b/firmware/esp32-csi-node/sdkconfig.defaults.esp32c6 index b6bda708e5..919457450d 100644 --- a/firmware/esp32-csi-node/sdkconfig.defaults.esp32c6 +++ b/firmware/esp32-csi-node/sdkconfig.defaults.esp32c6 @@ -70,6 +70,13 @@ CONFIG_LWIP_SO_RCVBUF=y CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192 CONFIG_FREERTOS_TIMER_TASK_STACK_DEPTH=8192 +# ── Wi-Fi dynamic TX buffers: pin to the C6's verified value ── +# The S3 ENOMEM fix (#1135) raises CONFIG_ESP_WIFI_DYNAMIC_TX_BUFFER_NUM to 128 +# in the base sdkconfig.defaults. That fix was hardware-verified on the S3 only, +# so keep the C6 build at its previously-shipping 64 rather than silently +# inheriting 128 on an untested target. Re-tune here if the C6 ever needs it. +CONFIG_ESP_WIFI_DYNAMIC_TX_BUFFER_NUM=64 + # ── Power: keep CPU at max 160 MHz (C6 ceiling) for DSP throughput ── CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_160=y CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ=160