tangle-network · drewstone · Apr 26, 2026 · Apr 26, 2026
diff --git a/.changeset/jobs-robustness-and-orchestrator.md b/.changeset/jobs-robustness-and-orchestrator.md
@@ -0,0 +1,22 @@
+---
+'@tangle-network/browser-agent-driver': minor
+---
+
+feat(jobs): robustness layer + agentic orchestrator
+
+Five hardening additions plus an LLM-driven control loop that wraps the runner. The architectural rule: protocols are deterministic (retry, anti-bot detection, schema gating) and judgment is agentic (when to re-sample broken wayback snapshots, retry vs. skip, conclude). Blurring that line is how you end up paying an LLM tax on exponential backoff.
+
+**Deterministic foundation**
+- `src/jobs/retry.ts` — whitelist-based retry with exponential backoff + jitter. Retries 429 / 5xx / network / timeout / fetch failures; everything else (other 4xx, anti-bot, schema, unknown) is treated as deterministic and not retried. Retryability is configurable per error class via `isRetryable`. Default: 3 attempts, 500ms base, 5s cap. Wired into `runJob` via `RunJobOptions.retryPolicy`.
+- `src/jobs/anti-bot.ts` — pure pattern match against an audit's `report.json`. Title patterns (Cloudflare interstitial, "Just a moment...", "Access denied", etc.) and intent patterns plus a last-resort heuristic (zero findings + low classifier confidence + unknown type). When fired, the runner records `status: 'skipped'` with a reason instead of putting a bogus score on the leaderboard.
+- `src/jobs/cost-history.ts` — adaptive cost estimate from prior job records. Uses the static default until at least 3 completed or partial jobs exist; afterward averages per-target cost across the last 20 such jobs. Floors at 50% of the static default to prevent runaway optimism on a stretch of zero-cost claude-code jobs.
+- Schema versioning: `tokens.json` is now stamped with `schemaVersion: 1` at write time; the aggregator refuses files whose `schemaVersion` is below `MIN_TOKENS_SCHEMA`.
+- Resume: `bad jobs resume <jobId>` re-runs only targets that aren't already `ok`/`skipped`. `RunJobOptions.resume` exposes the same on the API.
+
+**Agentic orchestrator**
+- `src/jobs/orchestrator.ts` — `orchestrateJob(job, opts)` runs the deterministic fan-out via `runJob`, then enters a control loop only if intervention is warranted. `needsIntervention` is the gate: any failures, missing entries, or zero-scored wayback snapshots (broken archive captures) trigger the agent.
+- LLM tool surface (5 tools): `getJobState`, `resampleWayback`, `retryTarget`, `markSkipped`, `concludeJob`. Hard caps: 2 retries per target, 1 resample per URL, cost ≤ `spec.maxCostUSD * 0.9`.
+- Default brain uses the same `claude-code` provider as the audit pipeline (subscription-based, no API key required).
+- CLI: `bad jobs orchestrate --spec <file.json>` runs the spec end-to-end with the agent layer. Same JSON spec as `create`.
+
+**Tests:** +34 across `jobs-retry`, `jobs-anti-bot`, `jobs-cost-history`, `jobs-orchestrator` (deterministic gate), and `jobs-orchestrator-agent` (LLM path with `MockLanguageModelV3`). Total: 1494 passing.
diff --git a/src/cli-jobs.ts b/src/cli-jobs.ts
@@ -35,6 +35,7 @@ interface ParsedArgs {
   json?: boolean
   jobId?: string
   yes?: boolean
+  maxIterations?: number
 }
 
 function parseArgs(argv: string[]): ParsedArgs {
@@ -44,6 +45,7 @@ function parseArgs(argv: string[]): ParsedArgs {
     if (a === '--spec') out.spec = argv[++i]
     else if (a === '--json') out.json = true
     else if (a === '--yes' || a === '-y') out.yes = true
+    else if (a === '--max-iterations') out.maxIterations = Number(argv[++i])
     else if (!a.startsWith('-') && !out.jobId) out.jobId = a
   }
   return out
@@ -68,7 +70,9 @@ export async function runJobsCli(args: string[]): Promise<void> {
   if (sub === 'status') return cmdStatus(opts)
   if (sub === 'estimate') return cmdEstimate(opts)
   if (sub === 'create') return cmdCreate(opts)
-  die(`Unknown subcommand: ${sub}. Use create | list | status | estimate.`)
+  if (sub === 'resume') return cmdResume(opts)
+  if (sub === 'orchestrate') return cmdOrchestrate(opts)
+  die(`Unknown subcommand: ${sub}. Use create | list | status | estimate | resume | orchestrate.`)
 }
 
 function cmdList(opts: ParsedArgs): void {
@@ -112,13 +116,15 @@ async function cmdEstimate(opts: ParsedArgs): Promise<void> {
   if (!opts.spec) die('--spec is required for estimate')
   const spec = readSpec(opts.spec)
   const targets = await discoverTargets(spec.discover)
-  const est = estimateCost(spec, targets.length)
+  const { computePerAuditFromHistory } = await import('./jobs/cost-history.js')
+  const adaptive = computePerAuditFromHistory()
+  const est = estimateCost(spec, targets.length, adaptive.perAuditUSD)
   if (opts.json) {
-    console.log(JSON.stringify({ spec, ...est }, null, 2))
+    console.log(JSON.stringify({ spec, ...est, costSource: adaptive.source, jobsObserved: adaptive.jobsObserved }, null, 2))
     return
   }
   console.log(`  Targets: ${est.targetCount}`)
-  console.log(`  Per-audit: $${est.perAuditUSD.toFixed(2)}`)
+  console.log(`  Per-audit: $${est.perAuditUSD.toFixed(2)} ${chalk.dim(`(${adaptive.source}${adaptive.source === 'history' ? `, n=${adaptive.targetsObserved}` : ''})`)}`)
   console.log(`  Estimated total: $${est.estimatedTotalUSD.toFixed(2)}`)
   if (est.exceedsCap && spec.maxCostUSD !== undefined) {
     console.log(chalk.yellow(`  ⚠ exceeds cap of $${spec.maxCostUSD.toFixed(2)}`))
@@ -146,13 +152,53 @@ async function cmdCreate(opts: ParsedArgs): Promise<void> {
   console.log(`  Status: ${chalk.bold(final?.status ?? 'unknown')}  ·  ok: ${final?.results.filter(r => r.status === 'ok').length ?? 0}/${final?.targets.length ?? 0}  ·  $${final?.totalCostUSD.toFixed(2)}`)
 }
 
+/**
+ * `bad jobs resume <jobId>`: re-run the targets of a stored job that do not
+ * yet have an `ok` or `skipped` result, then print a one-line status summary.
+ *
+ * Calls `die` when no jobId was given or the job cannot be loaded (`die` is
+ * assumed to terminate the process — confirm against its definition). Returns
+ * early, before building the audit function, when nothing remains to run.
+ */
+async function cmdResume(opts: ParsedArgs): Promise<void> {
+  if (!opts.jobId) die('jobId is required: bad jobs resume <jobId>')
+  const job = loadJob(opts.jobId)
+  if (!job) die(`job not found: ${opts.jobId}`)
+  // A target counts as done when some result with the same key (snapshot URL
+  // when present, otherwise the plain URL) has status `ok` or `skipped`.
+  const remaining = job.targets.filter(t => {
+    const key = t.snapshotUrl ?? t.url
+    return !job.results.some(r => (r.snapshotUrl ?? r.url) === key && (r.status === 'ok' || r.status === 'skipped'))
+  })
+  if (remaining.length === 0) {
+    console.log(chalk.green(`  Nothing to resume — all ${job.targets.length} targets already completed or skipped.`))
+    return
+  }
+  console.log(`  Resuming job ${chalk.bold(job.jobId)}  ·  ${remaining.length}/${job.targets.length} targets remain`)
+  const auditFn = await buildAuditFn(job.spec)
+  // `remaining` only drives the messages above; the runner receives the whole
+  // job plus `resume: true` (presumably it re-derives the same set — verify in queue.ts).
+  await runJob(job, { auditFn, resume: true })
+  // Reload from the store so the summary reflects what was persisted.
+  const final = loadJob(job.jobId)
+  console.log(`  Status: ${chalk.bold(final?.status ?? 'unknown')}  ·  ok: ${final?.results.filter(r => r.status === 'ok').length ?? 0}/${final?.targets.length ?? 0}`)
+}
+
+/**
+ * `bad jobs orchestrate --spec <file.json>`: discover targets, create a job,
+ * and run it through `orchestrateJob`, then print a one-line status summary.
+ *
+ * Calls `die` when `--spec` is missing, when `--max-iterations` is not a
+ * number, when discovery yields no targets, or when the estimated cost
+ * exceeds the spec's `maxCostUSD`.
+ */
+async function cmdOrchestrate(opts: ParsedArgs): Promise<void> {
+  if (!opts.spec) die('--spec is required: bad jobs orchestrate --spec <file.json>')
+  // `--max-iterations` is parsed with `Number(...)`, so a missing or
+  // non-numeric value arrives here as NaN. Reject it before any work is done
+  // instead of handing NaN to the orchestrator's control loop.
+  if (opts.maxIterations !== undefined && !Number.isFinite(opts.maxIterations)) {
+    die('--max-iterations requires a numeric value')
+  }
+  const spec = readSpec(opts.spec)
+  const targets = await discoverTargets(spec.discover)
+  if (targets.length === 0) die('discover yielded zero targets — check your URLs / wayback range')
+  const est = estimateCost(spec, targets.length)
+  console.log(`  Targets discovered: ${targets.length}`)
+  console.log(`  Estimated cost: $${est.estimatedTotalUSD.toFixed(2)}`)
+  // Unlike `estimate`, which only warns, orchestrate refuses to start a job
+  // whose estimate is already over the cap.
+  if (est.exceedsCap && spec.maxCostUSD !== undefined) {
+    die(`Estimated cost $${est.estimatedTotalUSD.toFixed(2)} exceeds maxCostUSD $${spec.maxCostUSD.toFixed(2)}`)
+  }
+  const job = createJob(spec, targets)
+  console.log(`  Created job ${chalk.bold(job.jobId)} (orchestrator mode)`)
+  const auditFn = await buildAuditFn(spec)
+  // Imported lazily so other subcommands do not load the orchestrator module.
+  const { orchestrateJob } = await import('./jobs/orchestrator.js')
+  await orchestrateJob(job, { auditFn, verbose: true, maxIterations: opts.maxIterations })
+  // Reload from the store so the summary reflects what was persisted.
+  const final = loadJob(job.jobId)
+  console.log(`  Status: ${chalk.bold(final?.status ?? 'unknown')}  ·  ok: ${final?.results.filter(r => r.status === 'ok').length ?? 0}/${final?.targets.length ?? 0}  ·  $${final?.totalCostUSD.toFixed(2)}`)
+}
+
 /**
  * Wire the runner to the design-audit pipeline. Imported lazily so `bad jobs
  * list` doesn't pull in Playwright. Each target gets its own output dir so
  * we can deterministically locate `report.json` after the audit returns.
  */
 async function buildAuditFn(_spec: JobSpec): Promise<AuditFn> {
   const { runDesignAudit, extractDesignTokens } = await import('./cli-design-audit.js')
+  const { detectBlock } = await import('./jobs/anti-bot.js')
   let counter = 0
   return async (target, opts) => {
     const url = target.snapshotUrl ?? target.url
@@ -187,15 +233,27 @@ async function buildAuditFn(_spec: JobSpec): Promise<AuditFn> {
     const rollupScore = page?.auditResultV2?.rollup?.score ?? page?.rollup?.score ?? page?.score
     const pageType = page?.auditResultV2?.classification?.type ?? page?.classification?.type
 
+    // Anti-bot / blocked-page detection. When fired, runOne records skipped.
+    const blockedReason = detectBlock(data) ?? undefined
+
     let tokensPath: string | undefined
-    if (opts?.extractTokens) {
+    // Skip token extraction on blocked pages — there's no real DOM to mine.
+    if (opts?.extractTokens && !blockedReason) {
       try {
         const tokensDir = path.join(outputDir, 'tokens')
         const { tokens } = await extractDesignTokens({ url, headless: opts?.headless ?? true, outputDir: tokensDir })
         tokensPath = path.resolve(tokensDir, 'tokens.json')
-        // extractDesignTokens persists its own files; ensure tokens.json exists at the canonical path.
+        // extractDesignTokens persists its own files; ensure tokens.json exists at the canonical path,
+        // and stamp it with our schemaVersion so future readers can refuse incompatible shapes.
+        const tokensWithVersion = { schemaVersion: 1, ...tokens }
         if (!fs.existsSync(tokensPath)) {
-          fs.writeFileSync(tokensPath, JSON.stringify(tokens, null, 2))
+          fs.writeFileSync(tokensPath, JSON.stringify(tokensWithVersion, null, 2))
+        } else {
+          // Re-stamp existing file with schemaVersion if missing.
+          const existing = JSON.parse(fs.readFileSync(tokensPath, 'utf-8')) as Record<string, unknown>
+          if (typeof existing.schemaVersion !== 'number') {
+            fs.writeFileSync(tokensPath, JSON.stringify({ schemaVersion: 1, ...existing }, null, 2))
+          }
         }
       } catch (err) {
         // Token extraction is additive — never let it fail the parent audit.
@@ -209,6 +267,7 @@ async function buildAuditFn(_spec: JobSpec): Promise<AuditFn> {
       rollupScore,
       pageType,
       tokensPath,
+      blockedReason,
     }
   }
 }

diff --git a/src/jobs/anti-bot.ts b/src/jobs/anti-bot.ts
@@ -0,0 +1,74 @@
+/**
+ * Anti-bot / blocked-page detection. Pure pattern match against an audit's
+ * report.json — we propagate the existing audit signals rather than re-running
+ * inference.
+ *
+ * Returns a reason string when blocked (so the job can carry it through to
+ * the result envelope), else null.
+ */
+
+/**
+ * Page-title patterns treated as anti-bot / challenge interstitials.
+ * All are case-insensitive; entries anchored with `^` only match at the start
+ * of the title, the rest match anywhere in it.
+ */
+const TITLE_PATTERNS = [
+  /just a moment\.{3}/i,
+  /^attention required/i,
+  /access denied/i,
+  /verify you are human/i,
+  /enable javascript and cookies/i,
+  /one more step/i,
+  /please complete the security check/i,
+  /^cloudflare/i,
+  /challenge[- ]page/i,
+]
+
+/**
+ * Classification-intent patterns that indicate a challenge or blocked page.
+ * All are case-insensitive and match anywhere in the intent string.
+ */
+const INTENT_PATTERNS = [
+  /cloudflare challenge/i,
+  /anti.?bot/i,
+  /captcha/i,
+  /verify (the )?(human|user|browser)/i,
+  /access (denied|restricted|blocked)/i,
+]
+
+/**
+ * The subset of audit signals that `reasonFor` inspects. Every field is
+ * optional; `detectBlock` fills them from the first page of a report.
+ */
+export interface BlockSignals {
+  /** Page title; tested against the title patterns. */
+  title?: string
+  /** Classification intent; tested against the intent patterns. */
+  intent?: string
+  /** Classification page type; the last-resort heuristic looks for `'unknown'`. */
+  type?: string
+  /** Classifier confidence; the last-resort heuristic fires below 0.35. */
+  ensembleConfidence?: number
+  /** Number of findings on the page; the last-resort heuristic requires zero. */
+  findingCount?: number
+}
+
+/**
+ * Check the audit's report.json for anti-bot patterns.
+ *
+ * Only the first entry of `report.pages` is inspected. Classification signals
+ * are read from `auditResultV2.classification` when present, otherwise from
+ * the page's own `classification`.
+ *
+ * @param report Parsed report.json contents. Accepted as `unknown`: anything
+ *   that is not an object with a `pages` array whose first entry is an object
+ *   is treated as "nothing to inspect".
+ * @returns The block reason, or null when the page does not look blocked or
+ *   the report has no inspectable page.
+ */
+export function detectBlock(report: unknown): string | null {
+  // The input is untyped parsed JSON, so guard the shape before dereferencing:
+  // a null/primitive report or a non-array `pages` must not throw.
+  if (typeof report !== 'object' || report === null) return null
+  const r = report as { pages?: Array<{ title?: string; classification?: { type?: string; intent?: string; ensembleConfidence?: number }; findings?: unknown[]; auditResultV2?: { classification?: { intent?: string; type?: string; ensembleConfidence?: number } } }> }
+  if (!Array.isArray(r.pages)) return null
+  const page = r.pages[0]
+  if (typeof page !== 'object' || page === null) return null
+  const v2cls = page.auditResultV2?.classification
+  const cls = v2cls ?? page.classification ?? {}
+  const signals: BlockSignals = {
+    title: page.title,
+    intent: cls.intent,
+    type: cls.type,
+    ensembleConfidence: cls.ensembleConfidence,
+    // A missing or non-array `findings` counts as zero findings.
+    findingCount: Array.isArray(page.findings) ? page.findings.length : 0,
+  }
+  return reasonFor(signals)
+}
+
+/**
+ * Turn a set of page signals into a block reason.
+ *
+ * Checks run in a fixed order — title patterns, then intent patterns, then
+ * the empty-page heuristic — and the first one that fires decides the reason.
+ * Title and intent are trimmed before matching, and at most 80 characters of
+ * the matched text are quoted in the reason.
+ *
+ * @returns The reason string, or null when no check fires.
+ */
+export function reasonFor(s: BlockSignals): string | null {
+  const pageTitle = (s.title ?? '').trim()
+  const pageIntent = (s.intent ?? '').trim()
+
+  for (const pattern of TITLE_PATTERNS) {
+    if (pattern.test(pageTitle)) {
+      return `blocked: page title looks like an anti-bot challenge ("${pageTitle.slice(0, 80)}")`
+    }
+  }
+
+  for (const pattern of INTENT_PATTERNS) {
+    if (pattern.test(pageIntent)) {
+      return `blocked: classification intent indicates a challenge page ("${pageIntent.slice(0, 80)}")`
+    }
+  }
+
+  // Fallback when neither text matched: a page with no findings, a classifier
+  // confidence under 0.35 and an unknown page type is almost always an
+  // anti-bot or empty page, and would only pollute the leaderboard rankings.
+  const hasNoFindings = (s.findingCount ?? 0) === 0
+  const isLowConfidence = typeof s.ensembleConfidence === 'number' && s.ensembleConfidence < 0.35
+  const isUnknownType = s.type === 'unknown'
+  if (hasNoFindings && isLowConfidence && isUnknownType) {
+    return 'blocked: zero findings, low classifier confidence, unknown type — likely empty/blocked'
+  }
+
+  return null
+}
diff --git a/src/jobs/cost-history.ts b/src/jobs/cost-history.ts
@@ -0,0 +1,52 @@
+/**
+ * Adaptive cost estimate from historical jobs. The default flat
+ * `DEFAULT_PER_AUDIT_USD` is still a fine starting point for a fresh user,
+ * but once 3+ jobs have completed we can do better: averaging the actual
+ * per-target cost across recent jobs is closer to ground truth, especially
+ * once ethics / first-principles modes start firing differently per target.
+ *
+ * Pure function of `~/.bad/jobs/` records — no telemetry endpoint required.
+ */
+
+import type { JobIndexEntry } from './store.js'
+import { listJobs, loadJob } from './store.js'
+import { DEFAULT_PER_AUDIT_USD } from './cost-estimate.js'
+
+/** Min number of completed jobs before we trust history over the static default. */
+const MIN_HISTORY = 3
+
+/** Result of `computePerAuditFromHistory`: the estimate plus where it came from. */
+export interface AdaptiveCostStats {
+  /** Estimated cost of one audit in USD: the static default, or the floored historical average. */
+  perAuditUSD: number
+  /** `'history'` when the value was averaged from job records, `'default'` when the static default was used. */
+  source: 'history' | 'default'
+  /** Number of historical job records the estimate was averaged over. */
+  jobsObserved: number
+  /** Number of audited targets the estimate was averaged over. */
+  targetsObserved: number
+}
+
+/**
+ * Estimate the per-audit cost from stored job records.
+ *
+ * Takes the first 20 completed/partial entries returned by `listJobs`. Falls
+ * back to `DEFAULT_PER_AUDIT_USD` when fewer than `MIN_HISTORY` such entries
+ * exist or when none of them contributes a usable sample. Otherwise returns
+ * total job cost divided by the number of `ok` results, floored at half the
+ * static default.
+ *
+ * @param dir Optional jobs directory, forwarded to `listJobs` and `loadJob`.
+ * @returns The estimate and its provenance. On the `history` path
+ *   `jobsObserved` counts only the jobs that actually entered the average.
+ */
+export function computePerAuditFromHistory(dir?: string): AdaptiveCostStats {
+  const entries = listJobs(dir)
+  // Only count completed/partial jobs — failed ones have skewed cost.
+  const usable = entries.filter((e: JobIndexEntry) => e.status === 'completed' || e.status === 'partial').slice(0, 20)
+  if (usable.length < MIN_HISTORY) {
+    return { perAuditUSD: DEFAULT_PER_AUDIT_USD, source: 'default', jobsObserved: usable.length, targetsObserved: 0 }
+  }
+  let totalCost = 0
+  let totalTargets = 0
+  let contributingJobs = 0
+  for (const entry of usable) {
+    const job = loadJob(entry.jobId, dir)
+    if (!job) continue
+    const okCount = job.results.filter(r => r.status === 'ok' && typeof r.costUSD === 'number').length
+    if (okCount === 0) continue
+    // A record with a missing or NaN total would turn the whole average into
+    // NaN, so leave it out of the sample.
+    if (!Number.isFinite(job.totalCostUSD)) continue
+    totalCost += job.totalCostUSD
+    totalTargets += okCount
+    contributingJobs += 1
+  }
+  if (totalTargets === 0) {
+    return { perAuditUSD: DEFAULT_PER_AUDIT_USD, source: 'default', jobsObserved: usable.length, targetsObserved: 0 }
+  }
+  const perAudit = totalCost / totalTargets
+  // Floor at half the static default to prevent runaway optimism on a stretch
+  // of zero-cost jobs (which can happen with the claude-code provider).
+  const floored = Math.max(perAudit, DEFAULT_PER_AUDIT_USD * 0.5)
+  // Report only the jobs that entered the average, so `jobsObserved` matches
+  // its documented meaning even when some records were skipped above.
+  return { perAuditUSD: floored, source: 'history', jobsObserved: contributingJobs, targetsObserved: totalTargets }
+}
diff --git a/src/jobs/index.ts b/src/jobs/index.ts
@@ -27,6 +27,14 @@ export type { JobIndexEntry } from './store.js'
 export { estimateCost, DEFAULT_PER_AUDIT_USD } from './cost-estimate.js'
 export { runJob } from './queue.js'
 export type { AuditFn, RunJobOptions } from './queue.js'
+export { withRetry, isRetryableDefault, DEFAULT_RETRY_POLICY } from './retry.js'
+export type { RetryPolicy } from './retry.js'
+export { detectBlock, reasonFor } from './anti-bot.js'
+export type { BlockSignals } from './anti-bot.js'
+export { computePerAuditFromHistory } from './cost-history.js'
+export type { AdaptiveCostStats } from './cost-history.js'
+export { orchestrateJob, needsIntervention } from './orchestrator.js'
+export type { OrchestrateJobOptions } from './orchestrator.js'
 
 import type { Job, JobSpec, JobTarget } from './types.js'
 import { newJobId, saveJob, appendIndexEntry } from './store.js'