Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions dronecan_gui_tool/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@

class MainWindow(QMainWindow):
MAX_SUCCESSIVE_NODE_ERRORS = 1000
TRANSFER_ERROR_LOG_INTERVAL = 10 # Seconds between repeated logs of an identical transfer error
MAX_TRACKED_TRANSFER_ERRORS = 64 # Cap on distinct transfer-error messages tracked for throttling

# noinspection PyTypeChecker,PyCallByClass,PyUnresolvedReferences
def __init__(self, node, iface_name, iface_kwargs):
Expand All @@ -120,6 +122,8 @@ def __init__(self, node, iface_name, iface_kwargs):

self._node = node
self._successive_node_errors = 0
self._transfer_error_stats = {} # Error text -> {'last_log': monotonic, 'suppressed': int}
self._make_frame_handling_resilient()
self._iface_name = iface_name

self._active_data_type_detector = ActiveDataTypeDetector(self._node)
Expand Down Expand Up @@ -559,12 +563,42 @@ def _show_node_window(self, node_id):
w.show()
self._node_windows[node_id] = w

def _make_frame_handling_resilient(self):
# dronecan's Node.spin() drains the whole RX queue and *then* runs its scheduler -- periodic
# NodeStatus broadcasts, outstanding-request timeouts/retries, and the node monitor's 1 Hz
# stale-node sweep (node_monitor registers it via node.periodic). If a single frame fails to
# decode, Node._recv_frame -> Transfer.from_frames raises (e.g. an unrecognised data type ID
# from a node using DSDL we haven't loaded, while prototyping new messages). That exception
# aborts the rest of the drain AND skips the scheduler poll for that spin. A message broadcast
# at high rate then trips nearly every 10 ms spin, so the scheduler is starved and good frames
# queued behind the bad one are delayed -- nodes flap in and out of the monitor and the UI
# stalls. Catching it up in _spin_node is too late: the damage is already done inside spin().
#
# So we wrap _recv_frame to drop an undecodable transfer in place, letting spin() finish
# draining the queue and run its scheduler exactly as it would for clean traffic. The raw
# frames remain visible in the bus monitor regardless (it taps the driver directly).
original_recv_frame = self._node._recv_frame

def resilient_recv_frame(raw_frame):
try:
original_recv_frame(raw_frame)
except dronecan.transport.TransferError as ex:
self._report_transfer_error(ex)

self._node._recv_frame = resilient_recv_frame

def _spin_node(self):
# We're running the node in the GUI thread.
# This is not great, but at the moment seems like other options are even worse.
try:
self._node.spin(0)
self._successive_node_errors = 0
except dronecan.transport.TransferError as ex:
# Backstop only: undecodable transfers are normally dropped inside spin() by the
# _recv_frame wrapper installed in _make_frame_handling_resilient(). This catches a
# TransferError that might surface from any other path. Benign and per-transfer, so it
# must NOT count towards the fatal successive-error threshold below.
self._report_transfer_error(ex)
except Exception as ex:
self._successive_node_errors += 1

Expand All @@ -581,6 +615,34 @@ def _spin_node(self):
logger.error(msg, exc_info=True)
self.statusBar().showMessage(msg, 3000)

def _report_transfer_error(self, ex):
# Called for every undecodable transfer -- potentially at hundreds of hertz -- so the common
# path must stay cheap. We touch the log and status bar at most once per
# TRANSFER_ERROR_LOG_INTERVAL seconds per distinct error, and otherwise just tally how many
# identical occurrences were suppressed in between.
text = str(ex)
now = time.monotonic()
stat = self._transfer_error_stats.get(text)
if stat is None:
# Bound memory in the pathological case of many distinct errors (e.g. CRC mismatches
# embed the varying payload bytes in their message text).
if len(self._transfer_error_stats) >= self.MAX_TRACKED_TRANSFER_ERRORS:
self._transfer_error_stats.clear()
stat = self._transfer_error_stats[text] = {'last_log': 0.0, 'suppressed': 0}

if now - stat['last_log'] >= self.TRANSFER_ERROR_LOG_INTERVAL:
if stat['suppressed']:
logger.warning('Ignoring undecodable transfer: %s (%d more suppressed in the last %.0f s)',
text, stat['suppressed'], self.TRANSFER_ERROR_LOG_INTERVAL)
else:
logger.warning('Ignoring undecodable transfer: %s', text)
self.statusBar().showMessage('Ignoring undecodable transfer: %s' % text,
int(self.TRANSFER_ERROR_LOG_INTERVAL * 1000))
stat['last_log'] = now
stat['suppressed'] = 0
else:
stat['suppressed'] += 1

def closeEvent(self, qcloseevent):
self._plotter_manager.close()
self._console_manager.close()
Expand Down