diff --git a/packages/core/observability/sentry.test.ts b/packages/core/observability/sentry.test.ts new file mode 100644 index 0000000..bf6735a --- /dev/null +++ b/packages/core/observability/sentry.test.ts @@ -0,0 +1,60 @@ +import { describe, expect, it } from "bun:test"; +import { isBenignDeliveryFailure } from "@/core/observability/sentry"; + +describe("isBenignDeliveryFailure", () => { + it("flags Slack message_not_found on update as benign", () => { + expect( + isBenignDeliveryFailure({ + op: "update", + error: "An API error occurred: message_not_found", + }), + ).toBe(true); + }); + + it("flags Slack message_not_found on delete as benign", () => { + expect( + isBenignDeliveryFailure({ + op: "delete", + error: "An API error occurred: message_not_found", + }), + ).toBe(true); + }); + + it("flags Discord unknown_message on update as benign", () => { + expect( + isBenignDeliveryFailure({ + op: "update", + error: "Unknown Message (unknown_message)", + }), + ).toBe(true); + }); + + it("does not flag channel_not_found as benign (bot may have been removed)", () => { + expect( + isBenignDeliveryFailure({ + op: "send", + error: "An API error occurred: channel_not_found", + }), + ).toBe(false); + }); + + it("does not flag benign-looking errors on send", () => { + // Send failures should never be treated as benign — if a send fails, + // the user didn't get the message. + expect( + isBenignDeliveryFailure({ + op: "send", + error: "message_not_found", + }), + ).toBe(false); + }); + + it("does not flag generic auth errors as benign", () => { + expect( + isBenignDeliveryFailure({ + op: "update", + error: "invalid_auth", + }), + ).toBe(false); + }); +}); diff --git a/packages/core/observability/sentry.ts b/packages/core/observability/sentry.ts index 340a2df..be1d722 100644 --- a/packages/core/observability/sentry.ts +++ b/packages/core/observability/sentry.ts @@ -152,6 +152,17 @@ function buildFailureFingerprint( function captureDeliveryFailure(failure: FailureRecord): void { if (!initialized) return; + // Some delivery failures are expected race conditions, not bugs: + // - `message_not_found` on update/delete: the status message was already + // deleted or replaced (common when finalization races with teardown). + // - `unknown_message` / `not_found` on update/delete for Discord/Lark. + // Skipping these entirely keeps Sentry quota focused on real issues like + // auth/network/payload errors. We still keep them in the local + // `deliveryStats` counters for debugging. + if (!failure.rateLimited && isBenignDeliveryFailure(failure)) { + return; + } + const dedupKey = `${failure.platform}|${failure.op}|${failure.rateLimited ? "rl" : "err"}|${failure.channelId}`; const interval = failure.rateLimited ? RATE_LIMIT_CAPTURE_INTERVAL_MS @@ -197,3 +208,25 @@ function captureDeliveryFailure(failure: FailureRecord): void { log.debug("Sentry capture failed", { error: String(hookErr) }); } } + +/** + * Error strings that represent expected races, not real bugs, for a given op. + * Keep this list narrow; when in doubt, send to Sentry rather than suppress. + */ +const BENIGN_UPDATE_DELETE_PATTERNS = [ + // Slack + "message_not_found", + // Discord + "unknown_message", + // Lark (generic "not found" codes vary; substring match keeps it simple) + "not_found", +]; + +export function isBenignDeliveryFailure(failure: { + op: DeliveryOp; + error: string; +}): boolean { + if (failure.op !== "update" && failure.op !== "delete") return false; + const text = failure.error.toLowerCase(); + return BENIGN_UPDATE_DELETE_PATTERNS.some((pattern) => text.includes(pattern)); +} diff --git a/packages/shared/queue/coalesced-update-queue.test.ts b/packages/shared/queue/coalesced-update-queue.test.ts index 4ac2bf0..4025997 100644 --- a/packages/shared/queue/coalesced-update-queue.test.ts +++ b/packages/shared/queue/coalesced-update-queue.test.ts @@ -35,4 +35,45 @@ describe("CoalescedUpdateQueue", () => { expect(seen).toEqual(["M1:a", "M2:b"]); }); + + it("does not leak unhandled rejections when clear() drops waiting jobs", async () => { + // Reproduces ODE-DEAMON-5: Bottleneck rejects dropped jobs with + // "This limiter has been stopped." and a subsequent clear() would + // also reject with "stop() has already been called". Both must be + // swallowed internally so they don't surface as unhandled rejections. + const unhandled: unknown[] = []; + const onUnhandled = (reason: unknown) => { + unhandled.push(reason); + }; + process.on("unhandledRejection", onUnhandled); + + try { + const queue = new CoalescedUpdateQueue(50, async (_key, payload) => { + // Simulate a slow worker so the job is still in the limiter's queue + // when clear() drops it. + await new Promise((resolve) => setTimeout(resolve, 25)); + return payload; + }); + + // First enqueue starts running; subsequent ones wait in the queue and + // will be dropped by clear(). + const p1 = queue.enqueue({ channelId: "C1", messageId: "M1" }, "a"); + const p2 = queue.enqueue({ channelId: "C1", messageId: "M2" }, "b"); + const p3 = queue.enqueue({ channelId: "C1", messageId: "M3" }, "c"); + + queue.clear(); + // Idempotent clear — must not re-throw "stop() has already been called". + queue.clear(); + + // All pending callers receive undefined instead of hanging. + const results = await Promise.all([p1, p2, p3]); + expect(results).toEqual([undefined, undefined, undefined]); + + // Let any async rejections settle before assertion. + await new Promise((resolve) => setTimeout(resolve, 50)); + expect(unhandled).toEqual([]); + } finally { + process.off("unhandledRejection", onUnhandled); + } + }); }); diff --git a/packages/shared/queue/coalesced-update-queue.ts b/packages/shared/queue/coalesced-update-queue.ts index 8d95183..bc9547a 100644 --- a/packages/shared/queue/coalesced-update-queue.ts +++ b/packages/shared/queue/coalesced-update-queue.ts @@ -42,13 +42,29 @@ export class CoalescedUpdateQueue { payload, resolve, }); - void this.limiter.schedule(async () => { - const pending = this.pendingByKey.get(dedupKey); - if (!pending) return; - this.pendingByKey.delete(dedupKey); - const result = await this.worker(pending.key, pending.payload); - pending.resolve(result); - }); + // Use `.catch` instead of `void` so that rejections from a stopped + // limiter (Bottleneck rejects dropped jobs with + // "This limiter has been stopped." and "stop() has already been + // called") don't leak as unhandled rejections. Any in-flight jobs + // dropped by `clear()` are already resolved via `pendingByKey` above, + // so swallowing the rejection here is safe. + this.limiter + .schedule(async () => { + const pending = this.pendingByKey.get(dedupKey); + if (!pending) return; + this.pendingByKey.delete(dedupKey); + const result = await this.worker(pending.key, pending.payload); + pending.resolve(result); + }) + .catch(() => { + // Job was dropped during teardown or rejected by the worker. + // Ensure the enqueue promise still settles so callers don't hang. + const pending = this.pendingByKey.get(dedupKey); + if (pending) { + this.pendingByKey.delete(dedupKey); + pending.resolve(undefined as TResult); + } + }); }); }