|
| 1 | +import type { MollifierBuffer } from "@trigger.dev/redis-worker"; |
| 2 | +import { logger as defaultLogger } from "~/services/logger.server"; |
| 3 | +import { getMollifierBuffer } from "./mollifierBuffer.server"; |
| 4 | +import { recordStaleEntry as defaultRecordStaleEntry } from "./mollifierTelemetry.server"; |
| 5 | + |
// Upper bound on how many entries a single sweep pass requests for any one
// env. Each pass visits every env's queue, so capping the per-env page keeps
// one pathological env from stretching the pass out indefinitely.
const DEFAULT_MAX_ENTRIES_PER_ENV = 1_000;
| 10 | + |
/** Tunables for one stale-entry sweep pass. */
export type StaleSweepConfig = {
  /**
   * Dwell time, in milliseconds, beyond which a buffered entry is reported as
   * stale. Keep it comfortably under `entryTtlSeconds * 1000` so operators
   * get warning before the TTL silently drops entries; the plan doc's cadence
   * is half of `entryTtlSeconds`.
   */
  staleThresholdMs: number;
  /**
   * Per-env page size handed to the buffer when listing entries. When
   * omitted, the sweep falls back to its built-in default.
   */
  maxEntriesPerEnv?: number;
};
| 19 | + |
/**
 * Optional collaborators for the sweep. Every field has a production default,
 * so these exist mainly so tests can substitute fakes.
 */
export type StaleSweepDeps = {
  /** Supplies the buffer to scan; a `null` result makes the sweep a no-op. */
  getBuffer?: () => MollifierBuffer | null;
  /** Telemetry hook, invoked once per stale entry with that entry's env id. */
  recordStaleEntry?: (envId: string) => void;
  /** Sink for the structured warnings emitted by the sweep and its scheduler. */
  logger?: {
    warn: (message: string, fields: Record<string, unknown>) => void;
  };
  /** Clock returning epoch milliseconds; read once at the start of a sweep. */
  now?: () => number;
};
| 26 | + |
/** Counters describing what a single sweep pass covered. */
export type StaleSweepResult = {
  /** Number of orgs returned by the buffer's org listing. */
  orgsScanned: number;
  /** Number of envs visited across all orgs. */
  envsScanned: number;
  /** Number of buffer entries inspected across all envs. */
  entriesScanned: number;
  /** Number of inspected entries whose dwell exceeded the stale threshold. */
  staleCount: number;
};
| 33 | + |
/**
 * Runs one read-only pass over the mollifier buffer, visiting orgs, then each
 * org's envs, then each env's entries. Every entry whose dwell time is
 * strictly greater than `config.staleThresholdMs` triggers the stale-entry
 * telemetry hook and a `mollifier.stale_entry` warning.
 *
 * Nothing is removed or salvaged here; that decision belongs to a separate
 * retention-policy change. The point of the signal is to let ops notice the
 * drainer falling behind well before TTL-induced loss begins.
 *
 * @param config - Stale threshold and optional per-env page size.
 * @param deps - Optional overrides for the buffer accessor, telemetry hook,
 *   logger and clock.
 * @returns Counts of orgs, envs and entries covered, plus how many entries
 *   were stale. All counts are zero when no buffer is available.
 */
export async function runStaleSweepOnce(
  config: StaleSweepConfig,
  deps: StaleSweepDeps = {},
): Promise<StaleSweepResult> {
  const resolveBuffer = deps.getBuffer ?? getMollifierBuffer;
  const emitStaleMetric = deps.recordStaleEntry ?? defaultRecordStaleEntry;
  const sweepLogger = deps.logger ?? defaultLogger;
  const clock = deps.now ?? Date.now;
  // A single timestamp per pass: every entry is measured against the same
  // instant regardless of how long the walk takes.
  const sweepStartedAt = clock();
  const perEnvLimit = config.maxEntriesPerEnv ?? DEFAULT_MAX_ENTRIES_PER_ENV;

  const tally: StaleSweepResult = {
    orgsScanned: 0,
    envsScanned: 0,
    entriesScanned: 0,
    staleCount: 0,
  };

  const buffer = resolveBuffer();
  if (!buffer) {
    return tally;
  }

  const orgIds = await buffer.listOrgs();

  for (const orgId of orgIds) {
    const envIds = await buffer.listEnvsForOrg(orgId);

    for (const envId of envIds) {
      tally.envsScanned += 1;
      const envEntries = await buffer.listEntriesForEnv(envId, perEnvLimit);

      for (const entry of envEntries) {
        tally.entriesScanned += 1;

        const dwellMs = sweepStartedAt - entry.createdAt.getTime();
        // Negated `>` rather than `<=` so a NaN dwell is still skipped.
        if (!(dwellMs > config.staleThresholdMs)) {
          continue;
        }

        emitStaleMetric(envId);
        sweepLogger.warn("mollifier.stale_entry", {
          runId: entry.runId,
          envId,
          orgId,
          dwellMs,
          staleThresholdMs: config.staleThresholdMs,
        });
        tally.staleCount += 1;
      }
    }
  }

  tally.orgsScanned = orgIds.length;
  return tally;
}
| 85 | + |
/** Control handle returned when the periodic stale sweep is started. */
export type StaleSweepIntervalHandle = {
  /** Cancels the schedule so that no further sweeps are started. */
  stop: () => void;
};
| 89 | + |
/**
 * Production wrapper that runs `runStaleSweepOnce` on a fixed interval.
 *
 * Only one pass runs at a time: if a sweep is still in flight when the timer
 * fires, that tick is skipped. Otherwise a backed-up Redis would queue
 * overlapping sweeps that all log the same stale entries. A sweep that throws
 * is logged as a `mollifier.stale_sweep.failed` warning and the schedule
 * carries on.
 *
 * @param config - Sweep settings plus `intervalMs`, the delay between ticks.
 * @param deps - Optional overrides forwarded to every sweep; `deps.logger`
 *   also receives the failure warning.
 * @returns A handle whose `stop()` cancels future ticks. A sweep that is
 *   already running is not interrupted.
 * @throws RangeError when `intervalMs` is not a finite number between 1 and
 *   2147483647 inclusive.
 */
export function startStaleSweepInterval(
  config: StaleSweepConfig & { intervalMs: number },
  deps: StaleSweepDeps = {},
): StaleSweepIntervalHandle {
  // Node coerces a timer delay that is NaN, below 1, or above 2^31 - 1 to
  // 1 ms. Passing such a value through would turn a misconfigured interval
  // into a back-to-back sweep loop, so reject it before any timer exists.
  const MAX_TIMER_DELAY_MS = 2147483647;
  const { intervalMs } = config;
  if (!Number.isFinite(intervalMs) || intervalMs < 1 || intervalMs > MAX_TIMER_DELAY_MS) {
    throw new RangeError(
      `mollifier stale sweep intervalMs must be a finite number between 1 and ${MAX_TIMER_DELAY_MS}, received ${String(intervalMs)}`,
    );
  }

  let stopped = false;
  let inFlight = false;

  const tick = async (): Promise<void> => {
    // Skip when cancelled, or when the previous pass has not finished yet.
    if (stopped || inFlight) return;
    inFlight = true;
    try {
      await runStaleSweepOnce(config, deps);
    } catch (err) {
      const log = deps.logger ?? defaultLogger;
      log.warn("mollifier.stale_sweep.failed", {
        err: err instanceof Error ? err.message : String(err),
      });
    } finally {
      // Always release the guard so one failed pass cannot block later ticks.
      inFlight = false;
    }
  };

  const timer = setInterval(() => {
    void tick();
  }, intervalMs);

  return {
    stop: () => {
      stopped = true;
      clearInterval(timer);
    },
  };
}
0 commit comments