From ec12a859a2a13d296574ba08f7a3d1f39f9fd156 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 17 Jun 2026 02:54:12 +0000 Subject: [PATCH] fix(telemetry): don't record a cancelled command as an error Cancelling a command (Ctrl-C / SIGTERM) is a normal part of CLI use, but build_event treated every non-success outcome as a failure: a "cancelled" outcome shipped with status:error and a reserved error.kind block, so it fed Datadog Error Tracking and inflated the crash rate (seen for `assembly llm` and `assembly transcripts get` cancellations). Treat "cancelled" alongside "success" as a non-error outcome via a _NON_ERROR_OUTCOMES set: it now ships as an info log with no error block, while genuine failures still feed Error Tracking unchanged. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_019fXXbmHqMUbRoEHssQEMmi --- aai_cli/core/telemetry.py | 18 +++++++++++++----- tests/test_telemetry.py | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/aai_cli/core/telemetry.py b/aai_cli/core/telemetry.py index 814174df..4ea7919a 100644 --- a/aai_cli/core/telemetry.py +++ b/aai_cli/core/telemetry.py @@ -51,6 +51,12 @@ # error line, while bounding the payload if an upstream message embeds a body. _ERROR_MESSAGE_MAX_CHARS = 500 +# Outcomes that are a normal part of CLI use, not failures: a clean exit and a +# user/SIGTERM cancellation (Ctrl-C, e.g. stopping a long `llm` or +# `transcripts get`). Both ship as ``status: info`` with no ``error`` block, so a +# cancel never lands in Datadog Error Tracking or inflates the crash rate. +_NON_ERROR_OUTCOMES = frozenset({"success", "cancelled"}) + def client_token() -> str: """The write-only intake token: env override first, then the shipped one.""" @@ -143,20 +149,22 @@ def build_event( device id is a random UUID minted locally — no account id, email, or hostname ever rides along. - A failure additionally sets ``status: error`` and the reserved + A genuine failure additionally sets ``status: error`` and the reserved ``error.kind``/``error.message`` so the event feeds Datadog **Error Tracking** (issue grouping), not just log search. ``error.kind`` reuses the anonymous ``outcome`` (the ``CLIError.error_type``); ``error.message`` is the one-line message the user saw (capped at ``_ERROR_MESSAGE_MAX_CHARS``). - Stack traces are still deliberately omitted. + Stack traces are still deliberately omitted. A ``cancelled`` outcome is *not* + a failure (see ``_NON_ERROR_OUTCOMES``) — it stays an ``info`` log with no + ``error`` block, since stopping a command is normal CLI use, not a crash. """ - succeeded = outcome == "success" + is_failure = outcome not in _NON_ERROR_OUTCOMES event: dict[str, object] = { "ddsource": "aai-cli", "service": "aai-cli", "ddtags": f"version:{__version__}", "message": f"{command} {outcome}", - "status": "info" if succeeded else "error", + "status": "error" if is_failure else "info", "command": command, "outcome": outcome, "exit_code": exit_code, @@ -167,7 +175,7 @@ def build_event( "ci": bool(env.get("CI")), "device_id": config.get_device_id(), } - if not succeeded: + if is_failure: error: dict[str, object] = {"kind": outcome} if error_message: error["message"] = error_message[:_ERROR_MESSAGE_MAX_CHARS] diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 36acfb97..d84249d4 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -170,6 +170,15 @@ def test_build_event_success_is_info_with_no_error_attribute(): assert "error" not in event +def test_build_event_cancelled_is_info_with_no_error_attribute(): + # Cancelling a command (Ctrl-C / SIGTERM) is a normal part of CLI use, not a crash: + # it stays an info log with no `error` namespace, so it never lands in Datadog Error + # Tracking nor inflates the crash rate (the regression these telemetry events showed). + event = telemetry.build_event("aai llm", outcome="cancelled", exit_code=130, duration_ms=1) + assert event["status"] == "info" + assert "error" not in event + + def test_build_event_failure_feeds_error_tracking(monkeypatch): monkeypatch.setenv("CI", "true") event = telemetry.build_event("aai stream", outcome="api_error", exit_code=1, duration_ms=5) @@ -371,8 +380,10 @@ def test_track_typer_exit_maps_code(events, code, outcome): (event,) = events assert event["outcome"] == outcome assert event["exit_code"] == code - # A bare typer.Exit carries no message, so the failure event has only the kind. - assert event.get("error") == ({"kind": outcome} if code else None) + # Only a genuine error (exit 3) feeds Error Tracking; a clean exit (0) and a cancel + # (130) are normal CLI use and carry no `error` block. A bare typer.Exit has no message. + assert event.get("error") == ({"kind": outcome} if outcome == "error" else None) + assert event["status"] == ("error" if outcome == "error" else "info") def test_track_keyboard_interrupt_is_cancelled(events):