Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions .changeset/jobs-robustness-and-orchestrator.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
---
'@tangle-network/browser-agent-driver': minor
---

feat(jobs): robustness layer + agentic orchestrator

Five hardening additions plus an LLM-driven control loop that wraps the runner. The architectural rule: protocols are deterministic (retry, anti-bot detection, schema gating) and judgment is agentic (when to re-sample broken wayback snapshots, retry vs. skip, conclude). Mixing those lines is how you end up paying LLM tax on exponential backoff.

**Deterministic foundation**
- `src/jobs/retry.ts` — whitelist-based retry with exponential backoff + jitter. Retries 429 / 5xx / network / timeout / fetch failures; everything else (4xx, anti-bot, schema, unknown) is treated as deterministic and not retried. Configurable per-error-class via `isRetryable`. Default: 3 attempts, 500ms base, 5s cap. Wired into `runJob` via `RunJobOptions.retryPolicy`.
- `src/jobs/anti-bot.ts` — pure pattern match against an audit's `report.json`. Title patterns (Cloudflare interstitial, "Just a moment...", "Access denied", etc.) and intent patterns plus a last-resort heuristic (zero findings + low classifier confidence + unknown type). When fired, the runner records `status: 'skipped'` with a reason instead of putting a bogus score on the leaderboard.
- `src/jobs/cost-history.ts` — adaptive cost estimate from prior job records. Uses the static default until at least 3 completed (or partial) jobs exist; after that it averages per-target cost from the last 20 such jobs. The result is floored at 50% of the static default to prevent runaway optimism after a stretch of zero-cost claude-code jobs.
- Schema versioning: `tokens.json` is now stamped with `schemaVersion: 1` at write time; the aggregator refuses files whose `schemaVersion` is older than `MIN_TOKENS_SCHEMA`.
- Resume: `bad jobs resume <jobId>` re-runs only targets that aren't already `ok`/`skipped`. `RunJobOptions.resume` exposes the same on the API.

**Agentic orchestrator**
- `src/jobs/orchestrator.ts` — `orchestrateJob(job, opts)` runs the deterministic fan-out via `runJob`, then enters a control loop only if intervention is warranted. `needsIntervention` is the gate: any failures, missing entries, or zero-scored wayback snapshots (broken archive captures) trigger the agent.
- LLM tool surface (5 tools): `getJobState`, `resampleWayback`, `retryTarget`, `markSkipped`, `concludeJob`. Hard caps: 2 retries per target, 1 resample per URL, cost ≤ `spec.maxCostUSD * 0.9`.
- Default brain uses the same `claude-code` provider as the audit pipeline (subscription-based, no API key required).
- CLI: `bad jobs orchestrate --spec <file.json>` runs the spec end-to-end with the agent layer. Same JSON spec as `create`.

**Tests:** +34 across `jobs-retry`, `jobs-anti-bot`, `jobs-cost-history`, `jobs-orchestrator` (deterministic gate), and `jobs-orchestrator-agent` (LLM path with `MockLanguageModelV3`). Total: 1494 passing.
73 changes: 66 additions & 7 deletions src/cli-jobs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ interface ParsedArgs {
json?: boolean
jobId?: string
yes?: boolean
maxIterations?: number
}

function parseArgs(argv: string[]): ParsedArgs {
Expand All @@ -44,6 +45,7 @@ function parseArgs(argv: string[]): ParsedArgs {
if (a === '--spec') out.spec = argv[++i]
else if (a === '--json') out.json = true
else if (a === '--yes' || a === '-y') out.yes = true
else if (a === '--max-iterations') out.maxIterations = Number(argv[++i])
else if (!a.startsWith('-') && !out.jobId) out.jobId = a
}
return out
Expand All @@ -68,7 +70,9 @@ export async function runJobsCli(args: string[]): Promise<void> {
if (sub === 'status') return cmdStatus(opts)
if (sub === 'estimate') return cmdEstimate(opts)
if (sub === 'create') return cmdCreate(opts)
die(`Unknown subcommand: ${sub}. Use create | list | status | estimate.`)
if (sub === 'resume') return cmdResume(opts)
if (sub === 'orchestrate') return cmdOrchestrate(opts)
die(`Unknown subcommand: ${sub}. Use create | list | status | estimate | resume | orchestrate.`)
}

function cmdList(opts: ParsedArgs): void {
Expand Down Expand Up @@ -112,13 +116,15 @@ async function cmdEstimate(opts: ParsedArgs): Promise<void> {
if (!opts.spec) die('--spec is required for estimate')
const spec = readSpec(opts.spec)
const targets = await discoverTargets(spec.discover)
const est = estimateCost(spec, targets.length)
const { computePerAuditFromHistory } = await import('./jobs/cost-history.js')
const adaptive = computePerAuditFromHistory()
const est = estimateCost(spec, targets.length, adaptive.perAuditUSD)
if (opts.json) {
console.log(JSON.stringify({ spec, ...est }, null, 2))
console.log(JSON.stringify({ spec, ...est, costSource: adaptive.source, jobsObserved: adaptive.jobsObserved }, null, 2))
return
}
console.log(` Targets: ${est.targetCount}`)
console.log(` Per-audit: $${est.perAuditUSD.toFixed(2)}`)
console.log(` Per-audit: $${est.perAuditUSD.toFixed(2)} ${chalk.dim(`(${adaptive.source}${adaptive.source === 'history' ? `, n=${adaptive.targetsObserved}` : ''})`)}`)
console.log(` Estimated total: $${est.estimatedTotalUSD.toFixed(2)}`)
if (est.exceedsCap && spec.maxCostUSD !== undefined) {
console.log(chalk.yellow(` ⚠ exceeds cap of $${spec.maxCostUSD.toFixed(2)}`))
Expand Down Expand Up @@ -146,13 +152,53 @@ async function cmdCreate(opts: ParsedArgs): Promise<void> {
console.log(` Status: ${chalk.bold(final?.status ?? 'unknown')} · ok: ${final?.results.filter(r => r.status === 'ok').length ?? 0}/${final?.targets.length ?? 0} · $${final?.totalCostUSD.toFixed(2)}`)
}

/**
 * `bad jobs resume <jobId>` — re-run an existing job, reporting how many
 * targets still lack an `ok` or `skipped` result before handing the job back
 * to the runner in resume mode.
 */
async function cmdResume(opts: ParsedArgs): Promise<void> {
  if (!opts.jobId) die('jobId is required: bad jobs resume <jobId>')
  const job = loadJob(opts.jobId)
  if (!job) die(`job not found: ${opts.jobId}`)

  // Keys (snapshot URL when present, else the plain URL) of every target that
  // already finished as `ok` or `skipped`.
  const settledKeys = new Set(
    job.results
      .filter(r => r.status === 'ok' || r.status === 'skipped')
      .map(r => r.snapshotUrl ?? r.url),
  )
  const pending = job.targets.filter(t => !settledKeys.has(t.snapshotUrl ?? t.url))

  if (pending.length === 0) {
    console.log(chalk.green(` Nothing to resume — all ${job.targets.length} targets already completed or skipped.`))
    return
  }

  console.log(` Resuming job ${chalk.bold(job.jobId)} · ${pending.length}/${job.targets.length} targets remain`)
  const auditFn = await buildAuditFn(job.spec)
  await runJob(job, { auditFn, resume: true })

  // Reload from the store so the summary reflects what the runner persisted.
  const final = loadJob(job.jobId)
  const statusLabel = chalk.bold(final?.status ?? 'unknown')
  const okCount = final?.results.filter(r => r.status === 'ok').length ?? 0
  const targetCount = final?.targets.length ?? 0
  console.log(` Status: ${statusLabel} · ok: ${okCount}/${targetCount}`)
}

/**
 * `bad jobs orchestrate --spec <file.json>` — discover targets, enforce the
 * spec's cost cap, create the job, and run it through `orchestrateJob`.
 *
 * Exits via `die` when `--spec` is missing, when `--max-iterations` is not a
 * positive integer, when discovery yields no targets, or when the estimated
 * cost exceeds `spec.maxCostUSD`.
 */
async function cmdOrchestrate(opts: ParsedArgs): Promise<void> {
  if (!opts.spec) die('--spec is required: bad jobs orchestrate --spec <file.json>')
  // `--max-iterations` is parsed with Number(), so a missing or non-numeric
  // value arrives here as NaN. Reject it up front instead of forwarding an
  // unusable iteration cap to the orchestrator.
  if (opts.maxIterations !== undefined && (!Number.isInteger(opts.maxIterations) || opts.maxIterations < 1)) {
    die('--max-iterations must be a positive integer')
  }

  const spec = readSpec(opts.spec)
  const targets = await discoverTargets(spec.discover)
  if (targets.length === 0) die('discover yielded zero targets — check your URLs / wayback range')

  const est = estimateCost(spec, targets.length)
  console.log(` Targets discovered: ${targets.length}`)
  console.log(` Estimated cost: $${est.estimatedTotalUSD.toFixed(2)}`)
  if (est.exceedsCap && spec.maxCostUSD !== undefined) {
    die(`Estimated cost $${est.estimatedTotalUSD.toFixed(2)} exceeds maxCostUSD $${spec.maxCostUSD.toFixed(2)}`)
  }

  const job = createJob(spec, targets)
  console.log(` Created job ${chalk.bold(job.jobId)} (orchestrator mode)`)
  const auditFn = await buildAuditFn(spec)
  // Imported lazily so other subcommands do not load the orchestrator module.
  const { orchestrateJob } = await import('./jobs/orchestrator.js')
  await orchestrateJob(job, { auditFn, verbose: true, maxIterations: opts.maxIterations })

  // Reload from the store so the summary reflects what was persisted. Every
  // field falls back to a neutral value if the record cannot be read, so the
  // cost never renders as "$undefined".
  const final = loadJob(job.jobId)
  const statusLabel = chalk.bold(final?.status ?? 'unknown')
  const okCount = final?.results.filter(r => r.status === 'ok').length ?? 0
  const targetCount = final?.targets.length ?? 0
  const totalCost = (final?.totalCostUSD ?? 0).toFixed(2)
  console.log(` Status: ${statusLabel} · ok: ${okCount}/${targetCount} · $${totalCost}`)
}

/**
* Wire the runner to the design-audit pipeline. Imported lazily so `bad jobs
* list` doesn't pull in Playwright. Each target gets its own output dir so
* we can deterministically locate `report.json` after the audit returns.
*/
async function buildAuditFn(_spec: JobSpec): Promise<AuditFn> {
const { runDesignAudit, extractDesignTokens } = await import('./cli-design-audit.js')
const { detectBlock } = await import('./jobs/anti-bot.js')
let counter = 0
return async (target, opts) => {
const url = target.snapshotUrl ?? target.url
Expand Down Expand Up @@ -187,15 +233,27 @@ async function buildAuditFn(_spec: JobSpec): Promise<AuditFn> {
const rollupScore = page?.auditResultV2?.rollup?.score ?? page?.rollup?.score ?? page?.score
const pageType = page?.auditResultV2?.classification?.type ?? page?.classification?.type

// Anti-bot / blocked-page detection. When fired, runOne records skipped.
const blockedReason = detectBlock(data) ?? undefined

let tokensPath: string | undefined
if (opts?.extractTokens) {
// Skip token extraction on blocked pages — there's no real DOM to mine.
if (opts?.extractTokens && !blockedReason) {
try {
const tokensDir = path.join(outputDir, 'tokens')
const { tokens } = await extractDesignTokens({ url, headless: opts?.headless ?? true, outputDir: tokensDir })
tokensPath = path.resolve(tokensDir, 'tokens.json')
// extractDesignTokens persists its own files; ensure tokens.json exists at the canonical path.
// extractDesignTokens persists its own files; ensure tokens.json exists at the canonical path,
// and stamp it with our schemaVersion so future readers can refuse incompatible shapes.
const tokensWithVersion = { schemaVersion: 1, ...tokens }
if (!fs.existsSync(tokensPath)) {
fs.writeFileSync(tokensPath, JSON.stringify(tokens, null, 2))
fs.writeFileSync(tokensPath, JSON.stringify(tokensWithVersion, null, 2))
} else {
// Re-stamp existing file with schemaVersion if missing.
const existing = JSON.parse(fs.readFileSync(tokensPath, 'utf-8')) as Record<string, unknown>
if (typeof existing.schemaVersion !== 'number') {
fs.writeFileSync(tokensPath, JSON.stringify({ schemaVersion: 1, ...existing }, null, 2))
}
}
} catch (err) {
// Token extraction is additive — never let it fail the parent audit.
Expand All @@ -209,6 +267,7 @@ async function buildAuditFn(_spec: JobSpec): Promise<AuditFn> {
rollupScore,
pageType,
tokensPath,
blockedReason,
}
}
}
Expand Down
74 changes: 74 additions & 0 deletions src/jobs/anti-bot.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/**
* Anti-bot / blocked-page detection. Pure pattern match against an audit's
* report.json — we propagate the existing audit signals rather than re-running
* inference.
*
* Returns a reason string when blocked (so the job can carry it through to
* the result envelope), else null.
*/

/** Page-title patterns treated as anti-bot / challenge interstitials. */
const TITLE_PATTERNS = [
  /just a moment\.{3}/i,
  /^attention required/i,
  /access denied/i,
  /verify you are human/i,
  /enable javascript and cookies/i,
  /one more step/i,
  /please complete the security check/i,
  /^cloudflare/i,
  /challenge[- ]page/i,
]

/** Classifier-intent patterns treated as describing a challenge or blocked page. */
const INTENT_PATTERNS = [
  /cloudflare challenge/i,
  /anti.?bot/i,
  /captcha/i,
  /verify (the )?(human|user|browser)/i,
  /access (denied|restricted|blocked)/i,
]

/** The subset of an audit report page that block detection looks at. */
export interface BlockSignals {
  /** Page title as recorded in the report. */
  title?: string
  /** Classifier intent string. */
  intent?: string
  /** Classifier page type; `'unknown'` feeds the last-resort heuristic. */
  type?: string
  /** Classifier ensemble confidence; values below 0.35 feed the last-resort heuristic. */
  ensembleConfidence?: number
  /** Number of findings on the page; missing is treated as zero. */
  findingCount?: number
}

/** Narrow an untrusted value to a plain property bag, or `undefined` if it is not an object. */
function asRecord(value: unknown): Record<string, unknown> | undefined {
  return typeof value === 'object' && value !== null ? (value as Record<string, unknown>) : undefined
}

/**
 * Check the audit's report.json for anti-bot patterns. Returns the reason or null.
 *
 * Only the first entry of `report.pages` is inspected. The `auditResultV2`
 * classification is preferred over the legacy `classification` when present.
 * The report is parsed JSON of unknown shape, so every field is narrowed at
 * runtime: a `null`/non-object report, a missing page, or wrongly-typed
 * fields yield `null` (or are ignored) instead of throwing.
 *
 * @param report Parsed contents of an audit's `report.json`.
 * @returns A human-readable `blocked: …` reason, or `null` when nothing matched.
 */
export function detectBlock(report: unknown): string | null {
  const pages = asRecord(report)?.pages
  if (!Array.isArray(pages)) return null
  const page = asRecord(pages[0])
  if (!page) return null

  const cls: Record<string, unknown> =
    asRecord(asRecord(page.auditResultV2)?.classification) ?? asRecord(page.classification) ?? {}

  const signals: BlockSignals = {
    title: typeof page.title === 'string' ? page.title : undefined,
    intent: typeof cls.intent === 'string' ? cls.intent : undefined,
    type: typeof cls.type === 'string' ? cls.type : undefined,
    ensembleConfidence: typeof cls.ensembleConfidence === 'number' ? cls.ensembleConfidence : undefined,
    findingCount: Array.isArray(page.findings) ? page.findings.length : 0,
  }
  return reasonFor(signals)
}

/**
 * Decide whether a set of page signals looks like a blocked page.
 *
 * Checks run in order: title patterns, then intent patterns, then a
 * last-resort heuristic. The first match wins.
 *
 * @param s Signals extracted from an audit report page.
 * @returns A `blocked: …` reason string, or `null` when no rule fires.
 */
export function reasonFor(s: BlockSignals): string | null {
  const title = (s.title ?? '').trim()
  const intent = (s.intent ?? '').trim()
  if (TITLE_PATTERNS.some(re => re.test(title))) {
    // Quote at most 80 characters so the reason stays short.
    return `blocked: page title looks like an anti-bot challenge ("${title.slice(0, 80)}")`
  }
  if (INTENT_PATTERNS.some(re => re.test(intent))) {
    return `blocked: classification intent indicates a challenge page ("${intent.slice(0, 80)}")`
  }
  // Last-resort heuristic: zero findings + very low ensemble confidence + unknown
  // page-type is overwhelmingly an anti-bot or empty page. Leaving it in the
  // leaderboard pollutes rankings.
  if ((s.findingCount ?? 0) === 0
    && typeof s.ensembleConfidence === 'number'
    && s.ensembleConfidence < 0.35
    && s.type === 'unknown') {
    return 'blocked: zero findings, low classifier confidence, unknown type — likely empty/blocked'
  }
  return null
}
52 changes: 52 additions & 0 deletions src/jobs/cost-history.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/**
* Adaptive cost estimate from historical jobs. The default flat
* `DEFAULT_PER_AUDIT_USD` is still a fine starting point for a fresh user,
* but once 3+ jobs have completed we can do better: averaging the actual
* per-target cost across recent jobs is closer to ground truth, especially
* once ethics / first-principles modes start firing differently per target.
*
* Pure function of `~/.bad/jobs/` records — no telemetry endpoint required.
*/

import type { JobIndexEntry } from './store.js'
import { listJobs, loadJob } from './store.js'
import { DEFAULT_PER_AUDIT_USD } from './cost-estimate.js'

/**
 * Minimum number of usable (completed or partial) job index entries required
 * before the historical average is used instead of the static default.
 */
const MIN_HISTORY = 3

/** Result of the adaptive per-audit cost estimate. */
export interface AdaptiveCostStats {
  /** Estimated cost of one audit in USD: the static default, or the floored historical average. */
  perAuditUSD: number
  /** Whether `perAuditUSD` came from job history or from the static default. */
  source: 'history' | 'default'
  /**
   * Number of usable (completed or partial) job index entries considered,
   * capped at 20. This counts entries, so it includes records that could not
   * be loaded or that had no costed `ok` results.
   */
  jobsObserved: number
  /**
   * Number of `ok` results with a numeric `costUSD` that the average was
   * taken over. Always 0 when `source` is `'default'`.
   */
  targetsObserved: number
}

/**
 * Estimate the per-audit cost from previously recorded jobs.
 *
 * Falls back to `DEFAULT_PER_AUDIT_USD` when fewer than `MIN_HISTORY` usable
 * (completed or partial) jobs are indexed, or when none of them contributes a
 * costed `ok` result. Otherwise it returns each contributing job's
 * `totalCostUSD` summed and divided by the number of costed `ok` results,
 * floored at half the static default.
 *
 * @param dir Optional jobs directory forwarded to `listJobs` / `loadJob`.
 * @returns The estimate together with its source and sample sizes.
 */
export function computePerAuditFromHistory(dir?: string): AdaptiveCostStats {
  const entries = listJobs(dir)
  // Only count completed/partial jobs — failed ones have skewed cost.
  // NOTE(review): taking the first 20 assumes listJobs returns newest-first — confirm.
  const usable = entries.filter((e: JobIndexEntry) => e.status === 'completed' || e.status === 'partial').slice(0, 20)
  const fallback: AdaptiveCostStats = {
    perAuditUSD: DEFAULT_PER_AUDIT_USD,
    source: 'default',
    jobsObserved: usable.length,
    targetsObserved: 0,
  }
  if (usable.length < MIN_HISTORY) return fallback

  let totalCost = 0
  let totalTargets = 0
  for (const entry of usable) {
    const job = loadJob(entry.jobId, dir)
    if (!job) continue
    // A record with a missing or NaN total would turn the whole average into
    // NaN (Math.max propagates NaN), so leave such records out.
    if (!Number.isFinite(job.totalCostUSD)) continue
    const okCount = job.results.filter(r => r.status === 'ok' && typeof r.costUSD === 'number').length
    if (okCount === 0) continue
    totalCost += job.totalCostUSD
    totalTargets += okCount
  }
  if (totalTargets === 0) return fallback

  const perAudit = totalCost / totalTargets
  // Floor at half the static default to prevent runaway optimism on a stretch
  // of zero-cost jobs (which can happen with the claude-code provider).
  const floored = Math.max(perAudit, DEFAULT_PER_AUDIT_USD * 0.5)
  return { perAuditUSD: floored, source: 'history', jobsObserved: usable.length, targetsObserved: totalTargets }
}
8 changes: 8 additions & 0 deletions src/jobs/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@ export type { JobIndexEntry } from './store.js'
export { estimateCost, DEFAULT_PER_AUDIT_USD } from './cost-estimate.js'
export { runJob } from './queue.js'
export type { AuditFn, RunJobOptions } from './queue.js'
export { withRetry, isRetryableDefault, DEFAULT_RETRY_POLICY } from './retry.js'
export type { RetryPolicy } from './retry.js'
export { detectBlock, reasonFor } from './anti-bot.js'
export type { BlockSignals } from './anti-bot.js'
export { computePerAuditFromHistory } from './cost-history.js'
export type { AdaptiveCostStats } from './cost-history.js'
export { orchestrateJob, needsIntervention } from './orchestrator.js'
export type { OrchestrateJobOptions } from './orchestrator.js'

import type { Job, JobSpec, JobTarget } from './types.js'
import { newJobId, saveJob, appendIndexEntry } from './store.js'
Expand Down
Loading
Loading