diff --git a/dronecan_gui_tool/main.py b/dronecan_gui_tool/main.py index 2ae8632..9f2fd97 100644 --- a/dronecan_gui_tool/main.py +++ b/dronecan_gui_tool/main.py @@ -110,6 +110,8 @@ class MainWindow(QMainWindow): MAX_SUCCESSIVE_NODE_ERRORS = 1000 + TRANSFER_ERROR_LOG_INTERVAL = 10 # Seconds between repeated logs of an identical transfer error + MAX_TRACKED_TRANSFER_ERRORS = 64 # Cap on distinct transfer-error messages tracked for throttling # noinspection PyTypeChecker,PyCallByClass,PyUnresolvedReferences def __init__(self, node, iface_name, iface_kwargs): @@ -120,6 +122,8 @@ def __init__(self, node, iface_name, iface_kwargs): self._node = node self._successive_node_errors = 0 + self._transfer_error_stats = {} # Error text -> {'last_log': monotonic, 'suppressed': int} + self._make_frame_handling_resilient() self._iface_name = iface_name self._active_data_type_detector = ActiveDataTypeDetector(self._node) @@ -559,12 +563,42 @@ def _show_node_window(self, node_id): w.show() self._node_windows[node_id] = w + def _make_frame_handling_resilient(self): + # dronecan's Node.spin() drains the whole RX queue and *then* runs its scheduler -- periodic + # NodeStatus broadcasts, outstanding-request timeouts/retries, and the node monitor's 1 Hz + # stale-node sweep (node_monitor registers it via node.periodic). If a single frame fails to + # decode, Node._recv_frame -> Transfer.from_frames raises (e.g. an unrecognised data type ID + # from a node using DSDL we haven't loaded, while prototyping new messages). That exception + # aborts the rest of the drain AND skips the scheduler poll for that spin. A message broadcast + # at high rate then trips nearly every 10 ms spin, so the scheduler is starved and good frames + # queued behind the bad one are delayed -- nodes flap in and out of the monitor and the UI + # stalls. Catching it up in _spin_node is too late: the damage is already done inside spin(). + # + # So we wrap _recv_frame to drop an undecodable transfer in place, letting spin() finish + # draining the queue and run its scheduler exactly as it would for clean traffic. The raw + # frames remain visible in the bus monitor regardless (it taps the driver directly). + original_recv_frame = self._node._recv_frame + + def resilient_recv_frame(raw_frame): + try: + original_recv_frame(raw_frame) + except dronecan.transport.TransferError as ex: + self._report_transfer_error(ex) + + self._node._recv_frame = resilient_recv_frame + def _spin_node(self): # We're running the node in the GUI thread. # This is not great, but at the moment seems like other options are even worse. try: self._node.spin(0) self._successive_node_errors = 0 + except dronecan.transport.TransferError as ex: + # Backstop only: undecodable transfers are normally dropped inside spin() by the + # _recv_frame wrapper installed in _make_frame_handling_resilient(). This catches a + # TransferError that might surface from any other path. Benign and per-transfer, so it + # must NOT count towards the fatal successive-error threshold below. + self._report_transfer_error(ex) except Exception as ex: self._successive_node_errors += 1 @@ -581,6 +615,34 @@ def _spin_node(self): logger.error(msg, exc_info=True) self.statusBar().showMessage(msg, 3000) + def _report_transfer_error(self, ex): + # Called for every undecodable transfer -- potentially at hundreds of hertz -- so the common + # path must stay cheap. We touch the log and status bar at most once per + # TRANSFER_ERROR_LOG_INTERVAL seconds per distinct error, and otherwise just tally how many + # identical occurrences were suppressed in between. + text = str(ex) + now = time.monotonic() + stat = self._transfer_error_stats.get(text) + if stat is None: + # Bound memory in the pathological case of many distinct errors (e.g. CRC mismatches + # embed the varying payload bytes in their message text). + if len(self._transfer_error_stats) >= self.MAX_TRACKED_TRANSFER_ERRORS: + self._transfer_error_stats.clear() + stat = self._transfer_error_stats[text] = {'last_log': 0.0, 'suppressed': 0} + + if now - stat['last_log'] >= self.TRANSFER_ERROR_LOG_INTERVAL: + if stat['suppressed']: + logger.warning('Ignoring undecodable transfer: %s (%d more suppressed in the last %.0f s)', + text, stat['suppressed'], self.TRANSFER_ERROR_LOG_INTERVAL) + else: + logger.warning('Ignoring undecodable transfer: %s', text) + self.statusBar().showMessage('Ignoring undecodable transfer: %s' % text, + int(self.TRANSFER_ERROR_LOG_INTERVAL * 1000)) + stat['last_log'] = now + stat['suppressed'] = 0 + else: + stat['suppressed'] += 1 + def closeEvent(self, qcloseevent): self._plotter_manager.close() self._console_manager.close()