From 4a9fecde37329f0c99d164b081be61f11befd33e Mon Sep 17 00:00:00 2001
From: Jacob Dahl <dahl.jakejacob@gmail.com>
Date: Tue, 9 Jun 2026 16:12:03 -0600
Subject: [PATCH] fix(main): keep node spin resilient to unrecognised message
 types

Unrecognised data type IDs (e.g. while prototyping new DSDL that isn't
built into the tool) caused two distinct failures.

First, they tripped the 1000-strike successive-error guard and terminated
the local node after a few seconds, logging a full traceback on every
10 ms spin.

Second, and more subtly: Node.spin() drains the RX queue and then runs
its scheduler, but when Transfer.from_frames() raises on an undecodable
transfer the drain aborts and the scheduler poll is skipped. The node
monitor's liveness is scheduler-driven (periodic stale-sweep and
outstanding-request timeouts), so at high message rates the scheduler is
starved -- nodes flap in and out of the monitor and the UI stalls.

Wrap Node._recv_frame so an undecodable transfer is dropped in place,
letting spin() finish draining the queue and schedule normally. Keep a
backstop catch in the spin loop that no longer counts these towards the
fatal threshold, and throttle the logging so a high-rate unknown message
cannot flood the log.
---
 dronecan_gui_tool/main.py | 62 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/dronecan_gui_tool/main.py b/dronecan_gui_tool/main.py
index 2ae8632..9f2fd97 100644
--- a/dronecan_gui_tool/main.py
+++ b/dronecan_gui_tool/main.py
@@ -110,6 +110,8 @@
 
 class MainWindow(QMainWindow):
     MAX_SUCCESSIVE_NODE_ERRORS = 1000
+    TRANSFER_ERROR_LOG_INTERVAL = 10        # Seconds between repeated logs of an identical transfer error
+    MAX_TRACKED_TRANSFER_ERRORS = 64        # Cap on distinct transfer-error messages tracked for throttling
 
     # noinspection PyTypeChecker,PyCallByClass,PyUnresolvedReferences
     def __init__(self, node, iface_name, iface_kwargs):
@@ -120,6 +122,8 @@ def __init__(self, node, iface_name, iface_kwargs):
 
         self._node = node
         self._successive_node_errors = 0
+        self._transfer_error_stats = {}     # Error text -> {'last_log': monotonic, 'suppressed': int}
+        self._make_frame_handling_resilient()
         self._iface_name = iface_name
 
         self._active_data_type_detector = ActiveDataTypeDetector(self._node)
@@ -559,12 +563,42 @@ def _show_node_window(self, node_id):
         w.show()
         self._node_windows[node_id] = w
 
+    def _make_frame_handling_resilient(self):
+        # dronecan's Node.spin() drains the whole RX queue and *then* runs its scheduler -- periodic
+        # NodeStatus broadcasts, outstanding-request timeouts/retries, and the node monitor's 1 Hz
+        # stale-node sweep (node_monitor registers it via node.periodic). If a single frame fails to
+        # decode, Node._recv_frame -> Transfer.from_frames raises (e.g. an unrecognised data type ID
+        # from a node using DSDL we haven't loaded, while prototyping new messages). That exception
+        # aborts the rest of the drain AND skips the scheduler poll for that spin. A message broadcast
+        # at high rate then trips nearly every 10 ms spin, so the scheduler is starved and good frames
+        # queued behind the bad one are delayed -- nodes flap in and out of the monitor and the UI
+        # stalls. Catching it up in _spin_node is too late: the damage is already done inside spin().
+        #
+        # So we wrap _recv_frame to drop an undecodable transfer in place, letting spin() finish
+        # draining the queue and run its scheduler exactly as it would for clean traffic. The raw
+        # frames remain visible in the bus monitor regardless (it taps the driver directly).
+        original_recv_frame = self._node._recv_frame
+
+        def resilient_recv_frame(raw_frame):
+            try:
+                original_recv_frame(raw_frame)
+            except dronecan.transport.TransferError as ex:
+                self._report_transfer_error(ex)
+
+        self._node._recv_frame = resilient_recv_frame
+
     def _spin_node(self):
         # We're running the node in the GUI thread.
         # This is not great, but at the moment seems like other options are even worse.
         try:
             self._node.spin(0)
             self._successive_node_errors = 0
+        except dronecan.transport.TransferError as ex:
+            # Backstop only: undecodable transfers are normally dropped inside spin() by the
+            # _recv_frame wrapper installed in _make_frame_handling_resilient(). This catches a
+            # TransferError that might surface from any other path. Benign and per-transfer, so it
+            # must NOT count towards the fatal successive-error threshold below.
+            self._report_transfer_error(ex)
         except Exception as ex:
             self._successive_node_errors += 1
 
@@ -581,6 +615,34 @@ def _spin_node(self):
             logger.error(msg, exc_info=True)
             self.statusBar().showMessage(msg, 3000)
 
+    def _report_transfer_error(self, ex):
+        # Called for every undecodable transfer -- potentially at hundreds of hertz -- so the common
+        # path must stay cheap. We touch the log and status bar at most once per
+        # TRANSFER_ERROR_LOG_INTERVAL seconds per distinct error, and otherwise just tally how many
+        # identical occurrences were suppressed in between.
+        text = str(ex)
+        now = time.monotonic()
+        stat = self._transfer_error_stats.get(text)
+        if stat is None:
+            # Bound memory in the pathological case of many distinct errors (e.g. CRC mismatches
+            # embed the varying payload bytes in their message text).
+            if len(self._transfer_error_stats) >= self.MAX_TRACKED_TRANSFER_ERRORS:
+                self._transfer_error_stats.clear()
+            stat = self._transfer_error_stats[text] = {'last_log': 0.0, 'suppressed': 0}
+
+        if now - stat['last_log'] >= self.TRANSFER_ERROR_LOG_INTERVAL:
+            if stat['suppressed']:
+                logger.warning('Ignoring undecodable transfer: %s (%d more suppressed in the last %.0f s)',
+                               text, stat['suppressed'], self.TRANSFER_ERROR_LOG_INTERVAL)
+            else:
+                logger.warning('Ignoring undecodable transfer: %s', text)
+            self.statusBar().showMessage('Ignoring undecodable transfer: %s' % text,
+                                         int(self.TRANSFER_ERROR_LOG_INTERVAL * 1000))
+            stat['last_log'] = now
+            stat['suppressed'] = 0
+        else:
+            stat['suppressed'] += 1
+
     def closeEvent(self, qcloseevent):
         self._plotter_manager.close()
         self._console_manager.close()