diff --git a/docs/test-benchmark.md b/docs/test-benchmark.md index f619ff9f..f754e766 100644 --- a/docs/test-benchmark.md +++ b/docs/test-benchmark.md @@ -86,7 +86,7 @@ pnpm test:bench --threshold 30 # 墙钟超 30s 则 exit 1(CI 回归闸) ## 当前瓶颈 & 后续可优化项 -16 核墙钟的下限(~10s)现在由 8 个**真实 spawn `node dist/cli.js`** 的集成测试门控(`workflow-cli` / `workflow-cli-ls-tail` / `preset-export-cli` / `worker-budget-cli` / `workflow-c0-isolation` / `seed-adapter` / `hook-installer` / `tmux-env-isolation`)。它们 **CPU-throughput-bound**:14 个 vitest fork + 每用例再 spawn 一个 node 子进程,16 核被超额订阅,单文件耗时在 5–9s 间抖动。 +16 核墙钟的下限(~10s)现在由 7 个**真实 spawn `node dist/cli.js`** 的集成测试门控(`workflow-cli` / `workflow-cli-ls-tail` / `preset-export-cli` / `workflow-c0-isolation` / `seed-adapter` / `hook-installer` / `tmux-env-isolation`)。它们 **CPU-throughput-bound**:14 个 vitest fork + 每用例再 spawn 一个 node 子进程,16 核被超额订阅,单文件耗时在 5–9s 间抖动。 - **(大杠杆,需取舍)in-process 跑 CLI。** 从 4138 行的 `src/cli.ts` 抽出可测的 `main(argv): Promise` 入口,让这 8 个文件直接进程内调用而非 spawn `node`。能同时砍掉串行总工作量和并行争抢,但牺牲「真实启动二进制 / argv 解析 / 退出码」的保真度,是较大的重构。 - **(CI 侧)`vitest --shard=i/N`** 跨 runner 分片,缩短 CI 墙钟(不影响本地)。 diff --git a/src/bot-registry.ts b/src/bot-registry.ts index 68ede0a3..50da86af 100644 --- a/src/bot-registry.ts +++ b/src/bot-registry.ts @@ -100,6 +100,18 @@ export interface BotConfig { */ sandboxHidePaths?: string[]; backendType?: BackendType; + /** + * Max simultaneously-LIVE sessions for this bot. When the bot's live session + * count exceeds this, the idle-worker sweeper suspends its longest-idle, + * not-currently-busy sessions (resumable backends only) down to the cap — the + * worker AND the CLI are killed to reclaim memory, and the session + * cold-resumes from its on-disk transcript on the next message. Unset → the + * built-in default {@link DEFAULT_MAX_LIVE_WORKERS} (30); an explicit positive + * integer overrides it. Pure count-based: there is NO idle-time threshold. + * Configured per bot from the dashboard (Groups & Bots → bot card). 
Adopted + * sessions are never suspended. See core/idle-worker-sweeper.ts. + */ + maxLiveWorkers?: number; workingDir?: string; workingDirs?: string[]; allowedUsers?: string[]; @@ -752,6 +764,11 @@ export function parseBotConfigsFromText(jsonText: string): BotConfig[] { ? entry.sandboxHidePaths.filter((p: unknown): p is string => typeof p === 'string' && !!p.trim()) : [], backendType: entry.backendType, + // Positive integer only; ≤0 / non-int / absent → undefined (= built-in default cap). + maxLiveWorkers: typeof entry.maxLiveWorkers === 'number' + && Number.isInteger(entry.maxLiveWorkers) && entry.maxLiveWorkers > 0 + ? entry.maxLiveWorkers + : undefined, workingDir: workingDirs?.[0] ?? entry.workingDir, workingDirs, allowedUsers: entry.allowedUsers, diff --git a/src/cli.ts b/src/cli.ts index e9cd7c66..d7c66e95 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -16,7 +16,6 @@ * botmux delete — close a session by ID prefix * botmux delete all — close all active sessions * botmux autostart enable|disable|status — manage boot-time autostart (launchd / user systemd) - * botmux worker-budget status|set|unset — inspect/override idle worker suspension budget */ import { execSync, execFileSync, spawnSync, spawn } from 'node:child_process'; import { existsSync, mkdirSync, copyFileSync, readFileSync, writeFileSync, renameSync, readdirSync, readlinkSync, appendFileSync, statSync, unlinkSync } from 'node:fs'; @@ -71,8 +70,7 @@ import { } from './utils/bot-routing.js'; import { isLocale, localeForBot, setDefaultLocale, SUPPORTED_LOCALES, t, type Locale } from './i18n/index.js'; import { type Brand, chatAppLink, larkHosts, normalizeBrand, sdkDomain } from './im/lark/lark-hosts.js'; -import { mergeGlobalConfig, readGlobalConfig, setGlobalLocale, globalConfigPath, type WorkerConfig } from './global-config.js'; -import { detectWorkerResources, resolveWorkerBudget } from './core/worker-budget.js'; +import { mergeGlobalConfig, readGlobalConfig, setGlobalLocale, globalConfigPath } from 
'./global-config.js'; import { buildBridgeSendMarkerContent } from './services/bridge-fallback-gate.js'; import { writeManualIntentIfAbsentTo } from './services/restart-intent-store.js'; @@ -2591,9 +2589,6 @@ botmux v${getVersion()} — IM ↔ AI 编程 CLI 桥接 autostart enable 注册开机自启(macOS launchd / Linux user systemd,无需 sudo) autostart disable 注销开机自启 autostart status 查看自启状态 - worker-budget [status] 查看 idle worker 自动暂停预算 - set --max-live-workers N [--idle-minutes N] - 覆盖全局 worker 预算,写入 ~/.botmux/config.json(agent 推荐用命令改,不手写 JSON) unset 清除 worker 预算覆盖,恢复按机器 CPU/内存自动推导 lang [zh|en] 切换 UI 语言(无参 = 查看当前设置) --bot N 仅改 bots.json 中第 N 个 bot 的 lang @@ -4953,88 +4948,6 @@ async function cmdLang(args: string[]): Promise { await reportLocaleApplied(); } -// ─── botmux worker-budget ─────────────────────────────────────────────────── - -function parsePositiveInt(value: string | undefined, label: string): number { - const n = Number(value); - if (!Number.isInteger(n) || n <= 0) { - console.error(`${label} must be a positive integer.`); - process.exit(1); - } - return n; -} - -function formatGib(bytes: number): string { - return `${(bytes / 1024 ** 3).toFixed(1)}GiB`; -} - -function cmdWorkerBudget(args: string[]): void { - const sub = (args[0] ?? 
'status').toLowerCase(); - if (sub === '--help' || sub === '-h' || sub === 'help') { - console.log(`Usage: - botmux worker-budget [status] - botmux worker-budget set --max-live-workers [--idle-minutes |--idle-ms ] - botmux worker-budget unset`); - return; - } - - if (sub === 'status') { - const cfg = readGlobalConfig(); - const resources = detectWorkerResources(); - const budget = resolveWorkerBudget(cfg.worker, resources); - console.log('Worker budget'); - console.log(` maxLiveWorkers: ${budget.maxLiveWorkers} (${budget.maxLiveWorkersSource})`); - console.log(` idleSuspendMs: ${budget.idleSuspendMs} (${budget.idleSuspendMsSource})`); - console.log(` auto baseline: ${budget.autoMaxLiveWorkers} from cpu=${resources.cpuCount}, memory=${formatGib(resources.memoryBytes)}`); - console.log(` Config file: ${globalConfigPath()}`); - console.log(''); - console.log('Agent-safe edit commands:'); - console.log(' botmux worker-budget set --max-live-workers 12 --idle-minutes 45'); - console.log(' botmux worker-budget unset'); - return; - } - - if (sub === 'set') { - const rest = args.slice(1); - const maxLive = argValue(rest, '--max-live-workers', '--max-live'); - const idleMs = argValue(rest, '--idle-ms', '--idle-suspend-ms'); - const idleMinutes = argValue(rest, '--idle-minutes', '--idle-min'); - if (maxLive === undefined && idleMs === undefined && idleMinutes === undefined) { - console.error('Usage: botmux worker-budget set --max-live-workers [--idle-minutes |--idle-ms ]'); - process.exit(1); - } - if (idleMs !== undefined && idleMinutes !== undefined) { - console.error('Use only one of --idle-ms or --idle-minutes.'); - process.exit(1); - } - - const current = readGlobalConfig().worker ?? 
{}; - const next: WorkerConfig = { ...current }; - if (maxLive !== undefined) next.maxLiveWorkers = parsePositiveInt(maxLive, '--max-live-workers'); - if (idleMs !== undefined) next.idleSuspendMs = parsePositiveInt(idleMs, '--idle-ms'); - if (idleMinutes !== undefined) next.idleSuspendMs = parsePositiveInt(idleMinutes, '--idle-minutes') * 60_000; - - mergeGlobalConfig({ worker: next }); - const budget = resolveWorkerBudget(next); - console.log('✅ Updated worker budget.'); - console.log(` maxLiveWorkers: ${budget.maxLiveWorkers} (${budget.maxLiveWorkersSource})`); - console.log(` idleSuspendMs: ${budget.idleSuspendMs} (${budget.idleSuspendMsSource})`); - console.log(` Config file: ${globalConfigPath()}`); - console.log('Daemon reads this on the next idle-worker sweep; restart also picks it up.'); - return; - } - - if (sub === 'unset' || sub === 'clear') { - mergeGlobalConfig({ worker: null }); - console.log('✅ Cleared worker budget override; daemon will use the auto-derived budget.'); - console.log(` Config file: ${globalConfigPath()}`); - return; - } - - console.error('Usage: botmux worker-budget [status|set|unset]'); - process.exit(1); -} - // ─── botmux preset ──────────────────────────────────────────────────────────── /** @@ -5403,7 +5316,6 @@ switch (command) { case 'quoted': await cmdQuoted(process.argv.slice(3)); break; case 'lang': await cmdLang(process.argv.slice(3)); break; case 'voice': await cmdVoiceSetup(process.argv.slice(3)); break; - case 'worker-budget': cmdWorkerBudget(process.argv.slice(3)); break; case 'thread': { // Removed in favor of `botmux history` (普通群也兼容). Friendly stderr so // pre-rename scripts/skills surface the rename instead of "unknown command". 
diff --git a/src/core/dashboard-ipc-server.ts b/src/core/dashboard-ipc-server.ts index 77fd4328..1215a508 100644 --- a/src/core/dashboard-ipc-server.ts +++ b/src/core/dashboard-ipc-server.ts @@ -17,7 +17,7 @@ import * as cardPrefsStore from '../services/card-prefs-store.js'; import * as observedBotsStore from '../services/observed-bots-store.js'; import { getDeploymentIdentity } from '../services/deployment-identity.js'; import * as grantPrefsStore from '../services/grant-prefs-store.js'; -import { findConfigField, applyConfigField } from '../services/bot-config-store.js'; +import { findConfigField, applyConfigField, coerceConfigValue } from '../services/bot-config-store.js'; import { config } from '../config.js'; import { computeSandboxDiff, applySandboxDiff } from '../services/sandbox-land.js'; import { readRawConfig, findEntryIndex, requireConfigPath } from '../services/config-store.js'; @@ -845,6 +845,11 @@ ipcRoute('GET', '/api/bot-default-oncall', async (_req, res) => { const grantPrefs = grantPrefsStore.getBotGrantPrefs(cachedLarkAppId); let p2pMode: 'thread' | 'chat' = 'thread'; try { if (getBot(cachedLarkAppId).config.p2pMode === 'chat') p2pMode = 'chat'; } catch { /* default thread */ } + let maxLiveWorkers: number | null = null; + try { + const m = getBot(cachedLarkAppId).config.maxLiveWorkers; + if (typeof m === 'number' && Number.isInteger(m) && m > 0) maxLiveWorkers = m; + } catch { /* null = unset → built-in default cap */ } jsonRes(res, 200, { larkAppId: cachedLarkAppId, botName: getBotName(), @@ -864,6 +869,7 @@ ipcRoute('GET', '/api/bot-default-oncall', async (_req, res) => { restrictGrantCommands: grantPrefs.restrictGrantCommands, messageQuotaDefaultLimit: grantPrefs.messageQuotaDefaultLimit, p2pMode, + maxLiveWorkers, }); }); @@ -968,6 +974,37 @@ ipcRoute('PUT', '/api/bot-p2p-mode', async (req, res) => { jsonRes(res, 200, { ok: true, p2pMode: value ?? 
'thread' }); }); +// Per-bot 最大同时活跃会话数 maxLiveWorkers。Body `{ maxLiveWorkers: number | null }`: +// • 正整数 → 设上限;超过后 idle-worker sweeper 把最久未用的会话休眠到上限内 +// • null → 清除(回落到内置默认 30) +// 走 applyConfigField(与 /config 同一写盘 + 内存热更新路径):sweeper 每分钟读 +// 实时 bot.config.maxLiveWorkers,免重启即生效。 +ipcRoute('PUT', '/api/bot-max-live-workers', async (req, res) => { + if (!cachedLarkAppId) return jsonRes(res, 503, { error: 'larkAppId_not_set' }); + let raw: unknown; + try { raw = await readJsonBody(req); } + catch { return jsonRes(res, 400, { ok: false, error: 'bad_json' }); } + if (typeof raw !== 'object' || raw === null || Array.isArray(raw)) { + return jsonRes(res, 400, { ok: false, error: 'no_valid_fields' }); + } + const body = raw as { maxLiveWorkers?: unknown }; + const spec = findConfigField('maxLiveWorkers'); + if (!spec) return jsonRes(res, 500, { ok: false, error: 'spec_missing' }); + + // null(含 JSON null)= 清除上限;number 走 coerce 校验正整数。 + let value: number | null; + if (body.maxLiveWorkers === null || body.maxLiveWorkers === undefined) { + value = null; + } else { + const c = coerceConfigValue(spec, body.maxLiveWorkers); + if (!c.ok || typeof c.value !== 'number') return jsonRes(res, 400, { ok: false, error: 'invalid_number' }); + value = c.value; + } + const r = await applyConfigField(cachedLarkAppId, spec, value); + if (!r.ok) return jsonRes(res, 400, { ok: false, error: r.reason }); + jsonRes(res, 200, { ok: true, maxLiveWorkers: value }); +}); + // Per-bot file-sandbox toggle. Body `{ enabled: boolean }`. When on, this bot's // CLI sessions run inside a per-session bwrap file sandbox (Linux). For oncall // bots shared with semi-trusted users. 
diff --git a/src/core/idle-worker-sweeper.ts b/src/core/idle-worker-sweeper.ts index f4339813..43a5b961 100644 --- a/src/core/idle-worker-sweeper.ts +++ b/src/core/idle-worker-sweeper.ts @@ -1,12 +1,25 @@ import type { DaemonSession } from './types.js'; -import { readGlobalConfig } from '../global-config.js'; -import { DEFAULT_IDLE_SUSPEND_MS, resolveWorkerBudget, type ResolvedWorkerBudget } from './worker-budget.js'; import { suspendWorker } from './worker-pool.js'; import { isSuspendableBackendType } from './persistent-backend.js'; +/** + * Default per-bot live-session cap applied when a bot has no explicit + * `maxLiveWorkers` configured. Keeps memory bounded out of the box: beyond this + * many live sessions, the least-recently-used ones are suspended (CLI freed, + * cold-resumes from transcript on the next message). A bot can override it from + * the dashboard. NOTE: the dashboard help copy hardcodes this number + * ('botDefaults.maxLiveWorkers*' i18n) — keep them in sync. + */ +export const DEFAULT_MAX_LIVE_WORKERS = 30; + export interface IdleWorkerSweepOptions { - now?: number; - workerBudget?: Pick; + /** + * Explicit per-bot cap for THIS bot (one daemon = one bot, so the whole + * `activeSessions` map belongs to a single bot). `undefined` (bot unset) → + * fall back to {@link DEFAULT_MAX_LIVE_WORKERS}. `≤0` → no cap (escape hatch: + * never suspend). + */ + maxLiveWorkers?: number; } export interface IdleWorkerSweepResult { @@ -14,22 +27,32 @@ export interface IdleWorkerSweepResult { reason: string; } -export const DEFAULT_IDLE_WORKER_MS = DEFAULT_IDLE_SUSPEND_MS; - function liveWorkers(activeSessions: Map): DaemonSession[] { return [...activeSessions.values()].filter(ds => !!ds.worker && !ds.worker.killed); } +/** + * Count-based live-worker cap. When this bot has more live workers than its + * configured `maxLiveWorkers`, suspend its longest-idle (by lastMessageAt), + * not-currently-busy, resumable-backend sessions down to the cap. 
The worker AND + * the CLI are killed to reclaim memory; the next message cold-resumes the + * session from its on-disk transcript (see suspendWorker in worker-pool.ts). + * + * Deliberately has NO idle-time threshold: 申晗's policy is "while resources + * allow, never time out an old session" — suspension only kicks in to enforce + * the per-bot count cap (explicit or default). The only guard kept is correctness, not a + * timeout: a session that is mid-turn (`lastScreenStatus !== 'idle'`) is never + * suspended so an in-flight reply is never interrupted. If every over-cap + * session is busy, none are suspended this round and the next sweep retries. + */ export function sweepIdleWorkers( activeSessions: Map, opts: IdleWorkerSweepOptions = {}, ): IdleWorkerSweepResult[] { - const now = opts.now ?? Date.now(); - const budget = opts.workerBudget ?? resolveWorkerBudget(readGlobalConfig().worker); - const maxLiveWorkers = budget.maxLiveWorkers; - const idleMs = budget.idleSuspendMs; + const cap = opts.maxLiveWorkers ?? DEFAULT_MAX_LIVE_WORKERS; + if (cap <= 0) return []; // explicit ≤0 = unlimited escape hatch const running = liveWorkers(activeSessions); - if (running.length <= maxLiveWorkers) return []; + if (running.length <= cap) return []; const candidates = running // Never suspend an adopted session. forkAdoptWorker stamps its @@ -42,16 +65,17 @@ export function sweepIdleWorkers( // marker so a restored adopt session is excluded too. .filter(ds => !ds.adoptedFrom && !ds.session.adoptedFrom) .filter(ds => isSuspendableBackendType(ds.initConfig?.backendType)) + // Correctness guard (not a timeout): never suspend a session that is + // currently producing output — that would cut off an in-flight reply. 
.filter(ds => ds.lastScreenStatus === 'idle') - .filter(ds => now - (ds.lastMessageAt || 0) >= idleMs) .sort((a, b) => (a.lastMessageAt || 0) - (b.lastMessageAt || 0)); const suspended: IdleWorkerSweepResult[] = []; let liveCount = running.length; for (const ds of candidates) { - if (liveCount <= maxLiveWorkers) break; - if (!suspendWorker(ds, 'idle_worker_budget')) continue; - suspended.push({ sessionId: ds.session.sessionId, reason: 'idle_worker_budget' }); + if (liveCount <= cap) break; + if (!suspendWorker(ds, 'live_worker_cap')) continue; + suspended.push({ sessionId: ds.session.sessionId, reason: 'live_worker_cap' }); liveCount--; } return suspended; diff --git a/src/core/session-manager.ts b/src/core/session-manager.ts index a516e668..3c874a48 100644 --- a/src/core/session-manager.ts +++ b/src/core/session-manager.ts @@ -796,6 +796,15 @@ export async function restoreActiveSessions(activeSessions: Map 0 ? n : undefined; -} - -function cgroupMemoryLimitBytes(hostMemoryBytes: number): number | undefined { - const limits = [ - readMemoryLimitFile('/sys/fs/cgroup/memory.max'), - readMemoryLimitFile('/sys/fs/cgroup/memory/memory.limit_in_bytes'), - ].filter((n): n is number => n !== undefined); - if (limits.length === 0) return undefined; - const limit = Math.min(...limits); - return limit < hostMemoryBytes ? limit : undefined; -} - -export function detectWorkerResources(): WorkerResources { - const hostMemoryBytes = totalmem(); - return { - cpuCount: Math.max(1, availableParallelism?.() ?? 1), - memoryBytes: cgroupMemoryLimitBytes(hostMemoryBytes) ?? 
hostMemoryBytes, - }; -} - -export function autoMaxLiveWorkers(resources: WorkerResources = detectWorkerResources()): number { - const cpuBudget = Math.max(1, resources.cpuCount) * 2; - const memoryBudget = Math.max(1, Math.round(resources.memoryBytes / 1024 ** 3)); - return clamp(Math.min(cpuBudget, memoryBudget), MIN_AUTO_MAX_LIVE_WORKERS, MAX_AUTO_MAX_LIVE_WORKERS); -} - -export function resolveWorkerBudget( - workerConfig?: WorkerConfig, - resources: WorkerResources = detectWorkerResources(), -): ResolvedWorkerBudget { - const auto = autoMaxLiveWorkers(resources); - return { - maxLiveWorkers: workerConfig?.maxLiveWorkers ?? auto, - idleSuspendMs: workerConfig?.idleSuspendMs ?? DEFAULT_IDLE_SUSPEND_MS, - autoMaxLiveWorkers: auto, - maxLiveWorkersSource: workerConfig?.maxLiveWorkers === undefined ? 'auto' : 'config', - idleSuspendMsSource: workerConfig?.idleSuspendMs === undefined ? 'default' : 'config', - }; -} diff --git a/src/core/worker-pool.ts b/src/core/worker-pool.ts index 3f0090b3..4b8eaec9 100644 --- a/src/core/worker-pool.ts +++ b/src/core/worker-pool.ts @@ -1034,6 +1034,16 @@ export function suspendWorker(ds: DaemonSession, reason = 'suspended_idle'): boo ds.workerPort = null; ds.workerToken = null; ds.session.webPort = undefined; + // The worker's suspend handler destroys the backing session + CLI (frees + // memory), so there is no live CLI to reattach to: the next turn MUST + // cold-resume from the on-disk transcript. forkWorker(resume=true) builds the + // CLI's `--resume ` args, so mark this session as having history + // (the normal `claude_exit` path that sets this never fires on suspend — + // process.exit(0) races it). Also persist `suspendedColdResume` so a daemon + // restart treats a 'missing' backing session as a deliberate lazy-resume + // rather than a zombie to close. See sweepIdleWorkers + restoreActiveSessions. 
+ ds.hasHistory = true; + ds.session.suspendedColdResume = true; sessionStore.updateSessionPid(ds.session.sessionId, null); sessionStore.updateSession(ds.session); @@ -1044,7 +1054,7 @@ export function suspendWorker(ds: DaemonSession, reason = 'suspended_idle'): boo body: { sessionId: ds.session.sessionId, reason }, }); } - logger.info(`[${tag(ds)}] Worker suspended (${reason}); session remains active`); + logger.info(`[${tag(ds)}] Worker + CLI suspended (${reason}); session stays active, cold-resumes from transcript on next message`); return true; } @@ -1465,6 +1475,14 @@ export function forkWorker(ds: DaemonSession, prompt: string, resume = false): v ds.workerToken = null; } + // Re-establishing a worker ends the cold-resume-suspended state: clear the + // persisted marker so a future restart no longer treats this session's + // backing as a deliberate-missing (a genuine later zombie must still close). + if (ds.session.suspendedColdResume) { + ds.session.suspendedColdResume = undefined; + sessionStore.updateSession(ds.session); + } + ensureCliEnv(botCfg.cliId, botCfg.cliPathOverride); // Claude Code blocks on the interactive folder-trust dialog the first time // it runs in an untrusted workingDir; pre-accept it so the spawn doesn't hang. 
diff --git a/src/daemon.ts b/src/daemon.ts index c64e5ffa..c7880773 100644 --- a/src/daemon.ts +++ b/src/daemon.ts @@ -89,7 +89,7 @@ import { } from './core/session-manager.js'; import { beginReplyTargetTurn, resolveSessionReplyTarget, syncReplyTargetState } from './core/reply-target.js'; import { sweepOrphanSandboxes } from './adapters/backend/sandbox.js'; -import { sweepIdleWorkers } from './core/idle-worker-sweeper.js'; +import { sweepIdleWorkers, DEFAULT_MAX_LIVE_WORKERS } from './core/idle-worker-sweeper.js'; import { handleCardAction } from './im/lark/card-handler.js'; import type { CardHandlerDeps } from './im/lark/card-handler.js'; import { @@ -3751,9 +3751,13 @@ export async function startDaemon(botIndex?: number): Promise { } const idleWorkerSweepTimer = setInterval(() => { - const suspended = sweepIdleWorkers(activeSessions); + // Re-read the live per-bot cap each tick so a dashboard edit (which mutates + // bot.config in place via applyConfigField) takes effect within 60s without + // a restart. Unset → DEFAULT_MAX_LIVE_WORKERS; ≤0 → no cap. + const maxLiveWorkers = getBot(cfg.larkAppId).config.maxLiveWorkers; + const suspended = sweepIdleWorkers(activeSessions, { maxLiveWorkers }); if (suspended.length > 0) { - logger.info(`[idle-worker-sweeper] suspended ${suspended.length} idle worker(s)`); + logger.info(`[idle-worker-sweeper] suspended ${suspended.length} session(s) over per-bot cap ${maxLiveWorkers ?? DEFAULT_MAX_LIVE_WORKERS}`); } }, 60_000); idleWorkerSweepTimer.unref?.(); diff --git a/src/dashboard.ts b/src/dashboard.ts index 1bbe2e5b..c90a2186 100644 --- a/src/dashboard.ts +++ b/src/dashboard.ts @@ -1169,6 +1169,7 @@ const server = createServer(async (req, res) => { restrictGrantCommands: j.restrictGrantCommands === true, messageQuotaDefaultLimit: typeof j.messageQuotaDefaultLimit === 'number' ? j.messageQuotaDefaultLimit : null, p2pMode: j.p2pMode === 'chat' ? 'chat' : 'thread', + maxLiveWorkers: typeof j.maxLiveWorkers === 'number' ? 
j.maxLiveWorkers : null, }; } catch (e: any) { return { larkAppId: d.larkAppId, botName: d.botName, online: true, error: e?.message ?? String(e) }; @@ -1283,6 +1284,25 @@ const server = createServer(async (req, res) => { return; } + // PUT /api/bots/:appId/max-live-workers — proxy to that bot's daemon. Body + // `{ maxLiveWorkers: number | null }` (null = clear → fall back to the + // built-in default of 30; a positive integer overrides it). + let mBotMaxLive: RegExpMatchArray | null; + if (req.method === 'PUT' && (mBotMaxLive = url.pathname.match(/^\/api\/bots\/([^/]+)\/max-live-workers$/))) { + const appId = decodeURIComponent(mBotMaxLive[1]); + const chunks: Buffer[] = []; + for await (const c of req) chunks.push(c as Buffer); + const raw = Buffer.concat(chunks).toString('utf8') || '{}'; + const upstream = await proxyToDaemon(appId, `/api/bot-max-live-workers`, { + method: 'PUT', + headers: { 'content-type': 'application/json' }, + body: raw, + }); + res.writeHead(upstream.status, { 'content-type': 'application/json' }); + res.end(await upstream.text()); + return; + } + // Create a new chat — pick a creator from the user-selected larkAppIds // (Feishu makes the calling bot the implicit first member, so picking // anything else would silently add an unwanted bot). Auto-invite the diff --git a/src/dashboard/web/bot-defaults.ts b/src/dashboard/web/bot-defaults.ts index 021a675f..e3d16455 100644 --- a/src/dashboard/web/bot-defaults.ts +++ b/src/dashboard/web/bot-defaults.ts @@ -207,7 +207,7 @@ export async function renderBotDefaultsPage(root: HTMLElement) { ${renderSandboxSection(b)}
${renderRoleSection(b)}
-
${renderSessionModeSection(b)}
+
${renderSessionModeSection(b)}${renderSessionCapSection(b)}
${renderCardBehaviorSection(b)}${renderBrandSection(b)}
${renderGrantSection(b)}
@@ -379,6 +379,37 @@ export async function renderBotDefaultsPage(root: HTMLElement) { `; } + function sessionCapStateLabel(cap: number | null): string { + return cap == null + ? t('botDefaults.maxLiveWorkersStateDefault') + : t('botDefaults.maxLiveWorkersStateOn', { count: cap }); + } + + // 最大同时活跃会话数(maxLiveWorkers):数字输入 + 保存/恢复默认按钮(空=用默认 30)。 + // 超过上限时最久未用的会话自动休眠(worker+CLI 一起杀回收内存),下条消息冷恢复。 + // PUT /api/bots/:appId/max-live-workers 落 bots.json,daemon 每分钟读实时值即时生效。 + function renderSessionCapSection(b: any): string { + const cap: number | null = typeof b.maxLiveWorkers === 'number' ? b.maxLiveWorkers : null; + return `
+

${t('botDefaults.sectionSessionCap')}

+
+ + ${escapeHtml(sessionCapStateLabel(cap))} + ${t('botDefaults.maxLiveWorkersHelp')} +
+ + + +
+
+
`; + } + // File sandbox (oncall): a per-bot toggle. ON → this bot's sessions run inside // a per-session bwrap file sandbox (Linux). Auto-saves on change. function renderSandboxSection(b: any): string { @@ -965,6 +996,69 @@ export async function renderBotDefaultsPage(root: HTMLElement) { putGrantPref({ messageQuotaDefaultLimit: null }, quotaOffBtn); }); } + + // ── 最大同时活跃会话数 maxLiveWorkers(空=回落默认 30) ────────────────── + const capInput = card.querySelector('input[data-input=maxLiveWorkers]'); + const capSaveBtn = card.querySelector('button[data-action=save-session-cap]'); + const capOffBtn = card.querySelector('button[data-action=off-session-cap]'); + const capStatusEl = card.querySelector('[data-session-cap-status]'); + const capStateEl = card.querySelector('[data-session-cap-state]'); + + // PUT { maxLiveWorkers: number | null } to the bot's daemon (via the + // dashboard proxy). null = clear the override (built-in default). Mirrors putGrantPref. + async function putMaxLiveWorkers(value: number | null, selfEl: HTMLInputElement | HTMLButtonElement) { + if (!capStatusEl) return; + capStatusEl.textContent = ''; + capStatusEl.className = 'oncall-status'; + selfEl.disabled = true; + try { + const r = await fetch(`/api/bots/${encodeURIComponent(appId)}/max-live-workers`, { + method: 'PUT', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ maxLiveWorkers: value }), + }); + const body = await r.json().catch(() => ({})); + if (r.ok && body.ok) { + capStatusEl.textContent = `✓ ${t('botDefaults.cardPrefSaved')}`; + capStatusEl.classList.add('hint-ok'); + const next: number | null = typeof body.maxLiveWorkers === 'number' ? body.maxLiveWorkers : null; + const cached = cache.bots.find((bb: any) => bb.larkAppId === appId); + if (cached) cached.maxLiveWorkers = next; + if (capStateEl) capStateEl.textContent = sessionCapStateLabel(next); + if (capInput) capInput.value = next == null ? '' : String(next); + } else { + capStatusEl.textContent = `✗ ${body.error ?? 
r.status}`; + capStatusEl.classList.add('hint-warn-inline'); + } + } catch (e: any) { + capStatusEl.textContent = `✗ ${e?.message ?? e}`; + capStatusEl.classList.add('hint-warn-inline'); + } finally { + selfEl.disabled = false; + } + } + + if (capInput && capSaveBtn) { + capSaveBtn.addEventListener('click', () => { + const raw = capInput.value.trim(); + if (raw === '') { putMaxLiveWorkers(null, capSaveBtn); return; } // 空=清回默认 30 + // 只认纯正整数 token(拒 1e2 / 1.0 / 01),与额度输入同口径。 + if (!/^[1-9]\d*$/.test(raw)) { + if (capStatusEl) { + capStatusEl.textContent = `✗ ${t('botDefaults.maxLiveWorkersInvalid')}`; + capStatusEl.className = 'oncall-status hint-warn-inline'; + } + return; + } + putMaxLiveWorkers(Number(raw), capSaveBtn); + }); + } + if (capInput && capOffBtn) { + capOffBtn.addEventListener('click', () => { + capInput.value = ''; + putMaxLiveWorkers(null, capOffBtn); + }); + } }); } diff --git a/src/dashboard/web/i18n.ts b/src/dashboard/web/i18n.ts index c4597808..bdbdc950 100644 --- a/src/dashboard/web/i18n.ts +++ b/src/dashboard/web/i18n.ts @@ -540,6 +540,15 @@ const zh: DashboardMessages = { 'botDefaults.quotaInvalid': '额度必须是正整数', 'botDefaults.quotaStateOff': '当前:未配置默认额度', 'botDefaults.quotaStateOn': '当前:每人 {count} 条', + 'botDefaults.sectionSessionCap': '会话数上限', + 'botDefaults.maxLiveWorkers': '最大同时活跃会话数', + 'botDefaults.maxLiveWorkersPlaceholder': '留空=默认 30', + 'botDefaults.maxLiveWorkersHelp': '本 Bot 同时保活的会话数上限,用于控制内存。超过后最久未用的会话自动休眠:worker 和 CLI 进程一起杀掉、回收全部内存,下条消息时从磁盘 transcript 冷恢复(--resume 重建上下文,几秒,不再常驻占内存)。留空=用默认 30;填数字=本 Bot 自定义(想要更多就填大数字)。注意:休眠只对可恢复后端(tmux/herdr/zellij)生效,纯 PTY 会话不受影响;正在出答案的会话不会被打断。', + 'botDefaults.maxLiveWorkersSave': '保存上限', + 'botDefaults.maxLiveWorkersOff': '恢复默认', + 'botDefaults.maxLiveWorkersInvalid': '上限必须是正整数', + 'botDefaults.maxLiveWorkersStateDefault': '当前:默认 30 个活跃会话', + 'botDefaults.maxLiveWorkersStateOn': '当前:最多 {count} 个活跃会话', 'nav.roles': '角色管理', 'roles.title': '角色管理', 'roles.subtitle': '为每个群组的每个 Bot 单独设置角色提示词,Bot 
在该群中会以此角色行事。', @@ -1278,6 +1287,15 @@ const en: DashboardMessages = { 'botDefaults.quotaInvalid': 'Quota must be a positive integer', 'botDefaults.quotaStateOff': 'Current: no default quota', 'botDefaults.quotaStateOn': 'Current: {count} per grantee', + 'botDefaults.sectionSessionCap': 'Session limit', + 'botDefaults.maxLiveWorkers': 'Max live sessions', + 'botDefaults.maxLiveWorkersPlaceholder': 'Empty = default 30', + 'botDefaults.maxLiveWorkersHelp': 'Cap on this bot\'s simultaneously-live sessions, to bound memory. Beyond it, the least-recently-used sessions are suspended: both the worker AND the CLI process are killed to reclaim all their memory, and the session cold-resumes from its on-disk transcript on the next message (--resume rebuilds context in a few seconds — nothing stays resident). Empty = use the default of 30; a number = a per-bot override (set a larger number if you want more). Note: suspension only applies to resumable backends (tmux/herdr/zellij), never plain PTY; a session that is mid-reply is never interrupted.', + 'botDefaults.maxLiveWorkersSave': 'Save limit', + 'botDefaults.maxLiveWorkersOff': 'Reset to default', + 'botDefaults.maxLiveWorkersInvalid': 'Limit must be a positive integer', + 'botDefaults.maxLiveWorkersStateDefault': 'Current: default 30 live sessions', + 'botDefaults.maxLiveWorkersStateOn': 'Current: up to {count} live sessions', 'nav.roles': 'Roles', 'roles.title': 'Role Management', 'roles.subtitle': 'Set per-bot role prompts for each group. 
Each bot adopts its own persona in the selected group.', diff --git a/src/global-config.ts b/src/global-config.ts index 5229c1d6..f7daf80a 100644 --- a/src/global-config.ts +++ b/src/global-config.ts @@ -19,11 +19,6 @@ import { homedir } from 'node:os'; import { isLocale, type Locale } from './i18n/types.js'; import type { VoiceConfig } from './services/voice/types.js'; -export interface WorkerConfig { - maxLiveWorkers?: number; - idleSuspendMs?: number; -} - export type RepoPickerMode = 'all' | 'repos'; export interface GlobalConfig { @@ -40,9 +35,6 @@ export interface GlobalConfig { * services/voice/types.ts. Presence (with usable creds) gates the * "🔊 语音总结" button. */ voice?: VoiceConfig; - /** Machine-wide worker resource policy. Daemon falls back to an - * auto-derived live-worker budget when this block is absent. */ - worker?: WorkerConfig; /** Machine-wide auto-update / auto-restart schedule. Off unless explicitly * enabled. Only the primary daemon (bot-0) acts on it — see core/maintenance.ts. */ maintenance?: MaintenanceConfig; @@ -181,22 +173,6 @@ function readDashboard(raw: unknown): DashboardGlobalConfig | undefined { return Object.keys(out).length > 0 ? out : undefined; } -function readPositiveInteger(raw: unknown): number | undefined { - if (typeof raw !== 'number' || !Number.isInteger(raw) || raw <= 0) return undefined; - return raw; -} - -function readWorker(raw: unknown): WorkerConfig | undefined { - if (!raw || typeof raw !== 'object') return undefined; - const v = raw as Record; - const worker: WorkerConfig = {}; - const maxLiveWorkers = readPositiveInteger(v.maxLiveWorkers); - const idleSuspendMs = readPositiveInteger(v.idleSuspendMs); - if (maxLiveWorkers !== undefined) worker.maxLiveWorkers = maxLiveWorkers; - if (idleSuspendMs !== undefined) worker.idleSuspendMs = idleSuspendMs; - return Object.keys(worker).length > 0 ? 
worker : undefined; -} - export function globalConfigPath(): string { return join(homedir(), '.botmux', 'config.json'); } @@ -247,8 +223,6 @@ export function readGlobalConfig(): GlobalConfig { if (dashboard) out.dashboard = dashboard; const voice = readVoice(raw.voice); if (voice) out.voice = voice; - const worker = readWorker(raw.worker); - if (worker) out.worker = worker; const maintenance = readMaintenance(raw.maintenance); if (maintenance) out.maintenance = maintenance; if (typeof raw.httpProxy === 'string' && raw.httpProxy.trim()) out.httpProxy = raw.httpProxy.trim(); diff --git a/src/services/bot-config-store.ts b/src/services/bot-config-store.ts index 09af2b80..67b7ce71 100644 --- a/src/services/bot-config-store.ts +++ b/src/services/bot-config-store.ts @@ -26,7 +26,7 @@ import { logger } from '../utils/logger.js'; */ export type ConfigEffect = 'immediate' | 'next-session'; -export type ConfigFieldKind = 'string' | 'boolean' | 'enum' | 'cli' | 'dir' | 'allowedUsers'; +export type ConfigFieldKind = 'string' | 'boolean' | 'number' | 'enum' | 'cli' | 'dir' | 'allowedUsers'; export interface ConfigFieldSpec { /** 用户面命令里用的字段名(大小写不敏感匹配,见 {@link findConfigField})。 */ @@ -64,6 +64,7 @@ export const CONFIG_FIELDS: readonly ConfigFieldSpec[] = [ { key: 'disableCliBypass', configKey: 'disableCliBypass', kind: 'boolean', effect: 'next-session', clearable: false, hint: '不加 CLI 审批/sandbox 绕过参数 on|off' }, { key: 'restrictGrantCommands', configKey: 'restrictGrantCommands', kind: 'boolean', effect: 'immediate', clearable: false, hint: '被授权人仅能纯对话、拦截斜杠命令 on|off' }, { key: 'p2pMode', configKey: 'p2pMode', kind: 'enum', effect: 'immediate', clearable: true, enumValues: ['thread', 'chat'], hint: '私聊单聊模式 thread|chat;chat=扁平连续会话,thread/unset 回默认(每条 DM 独立会话)' }, + { key: 'maxLiveWorkers', configKey: 'maxLiveWorkers', kind: 'number', effect: 'immediate', clearable: true, hint: '最大同时活跃会话数;超过后最久未用的会话自动休眠(杀 worker+CLI 回收内存,下条消息冷恢复);unset=默认 30' }, ]; /** 大小写不敏感地按 key 找字段 spec。 */ @@ 
-140,7 +141,7 @@ export type ApplyFieldResult = export async function applyConfigField( larkAppId: string, spec: ConfigFieldSpec, - value: string | boolean | null, + value: string | boolean | number | null, ): Promise<ApplyFieldResult> { if (spec.kind === 'allowedUsers') return { ok: false, reason: 'use_setBotAllowedUsers' }; let bot; @@ -214,8 +215,8 @@ export async function setBotAllowedUsers( } export type CoerceResult = - | { ok: true; value: string | boolean } - | { ok: false; reason: 'invalid_bool' | 'invalid_enum' | 'invalid_cli' | 'invalid_dir' | 'empty' }; + | { ok: true; value: string | boolean | number } + | { ok: false; reason: 'invalid_bool' | 'invalid_enum' | 'invalid_cli' | 'invalid_dir' | 'invalid_number' | 'empty' }; /** * 把一个**原始**字段值(来自卡片下拉/输入或别处)按字段 kind 解析校验成可落盘的 @@ -228,6 +229,10 @@ export function coerceConfigValue(spec: ConfigFieldSpec, raw: unknown): CoerceRe const b = parseBooleanValue(String(raw ?? '')); return b === undefined ? { ok: false, reason: 'invalid_bool' } : { ok: true, value: b }; } + if (spec.kind === 'number') { + const n = typeof raw === 'number' ? raw : Number(String(raw ?? '').trim()); + return Number.isInteger(n) && n > 0 ? { ok: true, value: n } : { ok: false, reason: 'invalid_number' }; + } const s = String(raw ?? 
'').trim(); if (!s) return { ok: false, reason: 'empty' }; switch (spec.kind) { diff --git a/src/skills/definitions.ts b/src/skills/definitions.ts index 4036dfee..ae328f79 100644 --- a/src/skills/definitions.ts +++ b/src/skills/definitions.ts @@ -1001,31 +1001,6 @@ stdout 为一行 JSON。注意:\`--json\` 覆盖所有结果类型;超时 / - 默认超时 300 秒,可用 \`--timeout \` 调整 `; -const WORKER_BUDGET_SKILL = `--- -name: botmux-worker-budget -description: 查看或调整 botmux idle worker 自动暂停预算。触发场景:用户提到 OOM、内存占用、live worker 太多、闲置会话暂停、maxLiveWorkers、idleSuspendMs,或要求 agent 修改 worker 预算配置。必须使用 botmux worker-budget 命令,不要手写 ~/.botmux/config.json。 ---- - -# botmux-worker-budget — worker 预算配置 - -当用户要求查看或调整 botmux worker 资源预算时,使用 \`botmux worker-budget\`。不要直接编辑 \`~/.botmux/config.json\`;命令会校验正整数、保留未知配置,并写入 daemon 读取的全局配置。 - -## 用法 - -\`\`\`bash -# 查看自动推导值、当前覆盖值、配置文件路径 -botmux worker-budget status - -# 覆盖 live worker 上限;idle 阈值可选 -botmux worker-budget set --max-live-workers 12 --idle-minutes 45 - -# 清除覆盖,恢复按 CPU/内存自动推导 -botmux worker-budget unset -\`\`\` - -\`maxLiveWorkers\` 控制多少个 live worker 以内保持常驻;超过预算时 daemon 会优先暂停最久未活跃的 worker。\`idleSuspendMs\` 控制 worker 需要闲置多久才允许被暂停。 -`; - const ORCHESTRATE_SKILL = `--- name: botmux-orchestrate description: 作为「主 bot/编排者」把一个大项目拆成多个子项目,在普通群里自动开多话题、把不同 bot(常 coder+reviewer 一组)派进各话题并行干活,用飞书任务清单当共享进度板,收齐结果再汇总。触发:用户提到「多话题协作模式」,或要「把大项目拆给多个机器人并行做」「协调多个 bot」「多话题并行推进」「你当总控/编排」「一个写一个 review 多组并行」,或显式提到 botmux orchestrate / botmux dispatch 派活。 @@ -1109,7 +1084,6 @@ export const BUILTIN_SKILLS: SkillDef[] = [ { name: 'botmux-bots', content: BOTS_SKILL }, { name: 'botmux-handoff', content: HANDOFF_SKILL }, { name: 'botmux-workflow-create', content: WORKFLOW_CREATE_SKILL }, - { name: 'botmux-worker-budget', content: WORKER_BUDGET_SKILL }, { name: 'botmux-orchestrate', content: ORCHESTRATE_SKILL }, ]; @@ -1121,4 +1095,8 @@ export const RETIRED_SKILL_NAMES: string[] = [ // Folded into botmux-send as the `--attention` flag. Installer prunes the old // standalone skill dir. 
'botmux-needs-help', + // Retired in favour of a per-bot "max live sessions" dashboard field + // (Groups & Bots → bot card). The CLI subcommand was removed too, so the + // skill has nothing to drive — prune it from every CLI's skills dir on upgrade. + 'botmux-worker-budget', ]; diff --git a/src/types.ts b/src/types.ts index 5aaf271a..3d0a00b8 100644 --- a/src/types.ts +++ b/src/types.ts @@ -106,6 +106,16 @@ export interface Session { lastCliInput?: string; /** CLI-native resume id when it differs from botmux's sessionId (for example Codex thread id). */ cliSessionId?: string; + /** + * Set true when the idle-worker sweeper suspends this session over the per-bot + * live cap: the worker AND the backing tmux/herdr/zellij session (+ CLI) were + * intentionally killed to reclaim memory, but the session stays `active` and + * cold-resumes from its on-disk transcript on the next message. Distinguishes + * this deliberate state from a real zombie (pane gone while the server runs): + * `restoreActiveSessions` must NOT close a suspended session whose backing + * session probes 'missing'. Cleared once a live worker is re-established. + */ + suspendedColdResume?: boolean; /** CLI used to spawn this session — stamped on every save so closed sessions retain it. */ cliId?: import('./adapters/cli/types.js').CliId; /** diff --git a/src/worker.ts b/src/worker.ts index 35629e7b..f087529c 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -4958,7 +4958,15 @@ process.on('message', async (raw: unknown) => { log('Suspend requested'); stopScreenshotLoop(); stopBridgeWatcher(); - try { backend?.kill(); } catch { /* detach best-effort */ } + // Free the CLI's memory, not just the worker's: destroySession kills the + // backing tmux/herdr/zellij session AND the CLI process inside it (kill() + // would only detach the pty viewer and leave the CLI running in the + // background — defeating the whole point of a session cap, since the CLI + // is the memory hog). 
On the next message the session cold-resumes via + // forkWorker(resume=true) → a fresh `new-session --resume <id>` + // that rebuilds context from the on-disk transcript (same path the daemon + // uses to recover sessions after a reboot kills the tmux server). + try { (backend?.destroySession ?? backend?.kill)?.call(backend); } catch { /* best-effort */ } backend = null; isPromptReady = false; // Suspend INTENDS to resume later: preserve the sandbox overlay mount + the diff --git a/test/bot-config-store.test.ts b/test/bot-config-store.test.ts index e062ff54..a3df26db 100644 --- a/test/bot-config-store.test.ts +++ b/test/bot-config-store.test.ts @@ -124,6 +124,35 @@ describe('bot-config store', () => { expect(registry.getBot('app_default').config.disableStreamingCard).toBeUndefined(); }); + it('number field (maxLiveWorkers) round-trips and clears on null', async () => { + const { registry, store } = await loaded(); + const spec = store.findConfigField('maxLiveWorkers')!; + expect(spec.kind).toBe('number'); + expect(spec.effect).toBe('immediate'); + + const r1 = await store.applyConfigField('app_default', spec, 6); + expect(r1.ok).toBe(true); + if (r1.ok) { expect(r1.oldText).toBe('∅'); expect(r1.newText).toBe('6'); } + expect(readConfig().maxLiveWorkers).toBe(6); + expect(registry.getBot('app_default').config.maxLiveWorkers).toBe(6); + + const r2 = await store.applyConfigField('app_default', spec, null); + expect(r2.ok).toBe(true); + expect(readConfig().maxLiveWorkers).toBeUndefined(); + expect(registry.getBot('app_default').config.maxLiveWorkers).toBeUndefined(); + }); + + it('coerceConfigValue(number) accepts positive integers and rejects junk/≤0/fractions', async () => { + const { store } = await loaded(); + const spec = store.findConfigField('maxLiveWorkers')!; + expect(store.coerceConfigValue(spec, 4)).toEqual({ ok: true, value: 4 }); + expect(store.coerceConfigValue(spec, '12')).toEqual({ ok: true, value: 12 }); + expect(store.coerceConfigValue(spec, 
0)).toEqual({ ok: false, reason: 'invalid_number' }); + expect(store.coerceConfigValue(spec, -3)).toEqual({ ok: false, reason: 'invalid_number' }); + expect(store.coerceConfigValue(spec, 1.5)).toEqual({ ok: false, reason: 'invalid_number' }); + expect(store.coerceConfigValue(spec, 'abc')).toEqual({ ok: false, reason: 'invalid_number' }); + }); + it('cli field persists the chosen adapter id', async () => { const { registry, store } = await loaded(); const spec = store.findConfigField('cli')!; diff --git a/test/bot-registry.test.ts b/test/bot-registry.test.ts index a4fe2b63..ca2c9af7 100644 --- a/test/bot-registry.test.ts +++ b/test/bot-registry.test.ts @@ -141,6 +141,29 @@ describe('parseBotConfigsFromText — brand', () => { ])); expect(cfg.brand).toBeUndefined(); }); + + it('keeps a positive-integer maxLiveWorkers cap', () => { + const [cfg] = mod.parseBotConfigsFromText(JSON.stringify([ + { larkAppId: 'a', larkAppSecret: 's', maxLiveWorkers: 8 }, + ])); + expect(cfg.maxLiveWorkers).toBe(8); + }); + + it('leaves maxLiveWorkers undefined (= built-in default cap) when unset', () => { + const [cfg] = mod.parseBotConfigsFromText(JSON.stringify([ + { larkAppId: 'a', larkAppSecret: 's' }, + ])); + expect(cfg.maxLiveWorkers).toBeUndefined(); + }); + + it('drops ≤0 / fractional / non-numeric maxLiveWorkers to undefined', () => { + for (const bad of [0, -2, 1.5, '4', null] as const) { + const [cfg] = mod.parseBotConfigsFromText(JSON.stringify([ + { larkAppId: 'a', larkAppSecret: 's', maxLiveWorkers: bad }, + ])); + expect(cfg.maxLiveWorkers).toBeUndefined(); + } + }); }); // ─── getBot / getBotClient ──────────────────────────────────────────────── diff --git a/test/builtin-skills.test.ts b/test/builtin-skills.test.ts index 05172810..fe87f4b1 100644 --- a/test/builtin-skills.test.ts +++ b/test/builtin-skills.test.ts @@ -125,16 +125,10 @@ describe('built-in botmux-handoff skill', () => { }); }); -describe('built-in botmux-worker-budget skill', () => { - it('teaches agents to use the CLI 
command instead of hand-editing JSON', () => { - const skill = BUILTIN_SKILLS.find(s => s.name === 'botmux-worker-budget'); - expect(skill).toBeDefined(); - expect(skill!.content).toContain('botmux worker-budget status'); - expect(skill!.content).toContain('botmux worker-budget set --max-live-workers'); - expect(skill!.content).toContain('botmux worker-budget unset'); - expect(skill!.content).toContain('不要直接编辑 `~/.botmux/config.json`'); - expect(skill!.content).toContain('maxLiveWorkers'); - expect(skill!.content).toContain('idleSuspendMs'); +describe('botmux-worker-budget skill retired (moved to per-bot dashboard field)', () => { + it('is no longer a standalone skill and is pruned on upgrade', () => { + expect(BUILTIN_SKILLS.find(s => s.name === 'botmux-worker-budget')).toBeUndefined(); + expect(RETIRED_SKILL_NAMES).toContain('botmux-worker-budget'); }); }); diff --git a/test/global-config-worker.test.ts b/test/global-config-worker.test.ts deleted file mode 100644 index 881ff7af..00000000 --- a/test/global-config-worker.test.ts +++ /dev/null @@ -1,65 +0,0 @@ -import { afterEach, describe, expect, it } from 'vitest'; -import { mkdirSync, mkdtempSync, readFileSync, writeFileSync } from 'node:fs'; -import { tmpdir } from 'node:os'; -import { join } from 'node:path'; - -import { globalConfigPath, mergeGlobalConfig, readGlobalConfig } from '../src/global-config.js'; - -const originalHome = process.env.HOME; -const originalUserProfile = process.env.USERPROFILE; - -function withHome(): string { - const home = mkdtempSync(join(tmpdir(), 'botmux-global-config-worker-')); - process.env.HOME = home; - process.env.USERPROFILE = home; - mkdirSync(join(home, '.botmux'), { recursive: true }); - return home; -} - -afterEach(() => { - process.env.HOME = originalHome; - process.env.USERPROFILE = originalUserProfile; -}); - -describe('global worker config', () => { - it('reads valid worker budget settings from the global config', () => { - const home = withHome(); - writeFileSync( - 
join(home, '.botmux', 'config.json'), - JSON.stringify({ - worker: { - maxLiveWorkers: 12, - idleSuspendMs: 45 * 60_000, - }, - }), - { flag: 'w' }, - ); - - expect(readGlobalConfig().worker).toEqual({ - maxLiveWorkers: 12, - idleSuspendMs: 45 * 60_000, - }); - }); - - it('drops invalid worker budget values while preserving unknown keys on write', () => { - const home = withHome(); - writeFileSync( - join(home, '.botmux', 'config.json'), - JSON.stringify({ - unknown: 'keep-me', - worker: { - maxLiveWorkers: -1, - idleSuspendMs: 'nope', - }, - }), - { flag: 'w' }, - ); - - expect(readGlobalConfig().worker).toBeUndefined(); - - mergeGlobalConfig({ worker: { maxLiveWorkers: 10 } as any }); - const raw = JSON.parse(readFileSync(globalConfigPath(), 'utf-8')); - expect(raw.unknown).toBe('keep-me'); - expect(raw.worker).toEqual({ maxLiveWorkers: 10 }); - }); -}); diff --git a/test/idle-worker-sweeper.test.ts b/test/idle-worker-sweeper.test.ts index dae1e081..665fb156 100644 --- a/test/idle-worker-sweeper.test.ts +++ b/test/idle-worker-sweeper.test.ts @@ -16,7 +16,7 @@ vi.mock('../src/utils/logger.js', () => ({ }, })); -import { sweepIdleWorkers } from '../src/core/idle-worker-sweeper.js'; +import { sweepIdleWorkers, DEFAULT_MAX_LIVE_WORKERS } from '../src/core/idle-worker-sweeper.js'; function ds(sessionId: string, backendType: string, lastMessageAt: number, worker = {}) { return { @@ -39,9 +39,35 @@ function ds(sessionId: string, backendType: string, lastMessageAt: number, worke } as any; } -describe('sweepIdleWorkers', () => { - it('does nothing while live workers are at or under the resolved budget', () => { - const now = 1_000_000; +const now = 1_000_000; + +describe('sweepIdleWorkers (per-bot count cap)', () => { + it('falls back to the default cap (30) when the bot has no explicit value', () => { + expect(DEFAULT_MAX_LIVE_WORKERS).toBe(30); + // DEFAULT_MAX_LIVE_WORKERS + 2 sessions, oldest first by lastMessageAt. 
+ const n = DEFAULT_MAX_LIVE_WORKERS + 2; + const entries: [string, any][] = []; + for (let i = 0; i < n; i++) entries.push([`s${i}`, ds(`s${i}`, 'tmux', now - (n - i) * 60_000)]); + const activeSessions = new Map(entries); + + // No explicit cap → default 30 → suspend the 2 oldest (s0, s1). + const suspended = sweepIdleWorkers(activeSessions, {}); + expect(suspended.map(s => s.sessionId)).toEqual(['s0', 's1']); + expect(activeSessions.get('s0').worker).toBe(null); + expect(activeSessions.get('s2').worker).not.toBe(null); + }); + + it('treats an explicit ≤0 cap as the unlimited escape hatch (never suspends)', () => { + const make = () => new Map([ + ['a', ds('a', 'tmux', now - 90 * 60_000)], + ['b', ds('b', 'herdr', now - 80 * 60_000)], + ['c', ds('c', 'zellij', now - 70 * 60_000)], + ]); + expect(sweepIdleWorkers(make(), { maxLiveWorkers: 0 })).toEqual([]); + expect(sweepIdleWorkers(make(), { maxLiveWorkers: -5 })).toEqual([]); + }); + + it('does nothing while live workers are at or under the cap', () => { const activeSessions = new Map([ ['a', ds('a', 'tmux', now - 60 * 60_000)], ['b', ds('b', 'herdr', now - 50 * 60_000)], @@ -49,20 +75,14 @@ describe('sweepIdleWorkers', () => { ['d', ds('d', 'tmux', now - 2 * 60_000)], ]); - const suspended = sweepIdleWorkers(activeSessions, { - now, - workerBudget: { maxLiveWorkers: 8, idleSuspendMs: 30 * 60_000 }, - }); + const suspended = sweepIdleWorkers(activeSessions, { maxLiveWorkers: 4 }); expect(suspended).toEqual([]); expect(activeSessions.get('a').worker).not.toBe(null); - expect(activeSessions.get('b').worker).not.toBe(null); - expect(activeSessions.get('c').worker).not.toBe(null); expect(activeSessions.get('d').worker).not.toBe(null); }); - it('uses the resolved policy and suspends oldest idle workers over the live-worker budget', () => { - const now = 1_000_000; + it('suspends the oldest (by lastMessageAt) sessions down to the cap', () => { const activeSessions = new Map([ ['a', ds('a', 'tmux', now - 90 * 60_000)], 
['b', ds('b', 'herdr', now - 80 * 60_000)], @@ -70,74 +90,52 @@ describe('sweepIdleWorkers', () => { ['d', ds('d', 'tmux', now - 60 * 60_000)], ['e', ds('e', 'herdr', now - 50 * 60_000)], ['f', ds('f', 'zellij', now - 40 * 60_000)], - ['g', ds('g', 'tmux', now - 35 * 60_000)], - ['h', ds('h', 'herdr', now - 31 * 60_000)], - ['i', ds('i', 'zellij', now - 30 * 60_000)], - ['j', ds('j', 'tmux', now - 2 * 60_000)], ]); - const suspended = sweepIdleWorkers(activeSessions, { - now, - workerBudget: { maxLiveWorkers: 8, idleSuspendMs: 30 * 60_000 }, - }); + const suspended = sweepIdleWorkers(activeSessions, { maxLiveWorkers: 4 }); expect(suspended.map(s => s.sessionId)).toEqual(['a', 'b']); - expect(activeSessions.get('a').session.status).toBe('active'); - expect(activeSessions.get('b').session.status).toBe('active'); + expect(suspended.every(s => s.reason === 'live_worker_cap')).toBe(true); + expect(activeSessions.get('a').worker).toBe(null); + expect(activeSessions.get('b').worker).toBe(null); expect(activeSessions.get('c').worker).not.toBe(null); - expect(activeSessions.get('j').worker).not.toBe(null); + expect(activeSessions.get('f').worker).not.toBe(null); }); - it('honors configured max live workers and idle suspend threshold', () => { - const now = 1_000_000; + it('is purely count-based: suspends a recently-active session with NO idle-time threshold', () => { + // Both sessions are only a couple minutes idle. The old budget had a 30-min + // idle gate that would have suspended nothing here; the new policy caps by + // count alone, so the single oldest session is suspended down to the cap. 
const activeSessions = new Map([ - ['a', ds('a', 'tmux', now - 120 * 60_000)], - ['b', ds('b', 'herdr', now - 90 * 60_000)], - ['c', ds('c', 'zellij', now - 45 * 60_000)], - ['d', ds('d', 'tmux', now - 20 * 60_000)], - ['e', ds('e', 'herdr', now - 10 * 60_000)], - ['f', ds('f', 'zellij', now - 5 * 60_000)], + ['a', ds('a', 'tmux', now - 2 * 60_000)], + ['b', ds('b', 'herdr', now - 1 * 60_000)], ]); - const suspended = sweepIdleWorkers(activeSessions, { - now, - workerBudget: { maxLiveWorkers: 4, idleSuspendMs: 30 * 60_000 }, - }); + const suspended = sweepIdleWorkers(activeSessions, { maxLiveWorkers: 1 }); - expect(suspended.map(s => s.sessionId)).toEqual(['a', 'b']); + expect(suspended.map(s => s.sessionId)).toEqual(['a']); expect(activeSessions.get('a').worker).toBe(null); - expect(activeSessions.get('b').worker).toBe(null); - expect(activeSessions.get('c').worker).not.toBe(null); - expect(activeSessions.get('d').worker).not.toBe(null); + expect(activeSessions.get('b').worker).not.toBe(null); }); - it('never suspends pty workers', () => { - const now = 1_000_000; + it('never suspends pty (non-resumable) workers', () => { const activeSessions = new Map([ ['a', ds('a', 'pty', now - 60 * 60_000)], ['b', ds('b', 'pty', now - 60 * 60_000)], ['c', ds('c', 'pty', now - 60 * 60_000)], - ['d', ds('d', 'pty', now - 60 * 60_000)], - ['e', ds('e', 'pty', now - 60 * 60_000)], - ['f', ds('f', 'pty', now - 60 * 60_000)], - ['g', ds('g', 'pty', now - 60 * 60_000)], - ['h', ds('h', 'pty', now - 60 * 60_000)], - ['i', ds('i', 'pty', now - 60 * 60_000)], - ['j', ds('j', 'tmux', now - 60 * 60_000)], + ['d', ds('d', 'tmux', now - 60 * 60_000)], ]); - const suspended = sweepIdleWorkers(activeSessions, { - now, - workerBudget: { maxLiveWorkers: 8, idleSuspendMs: 30 * 60_000 }, - }); + const suspended = sweepIdleWorkers(activeSessions, { maxLiveWorkers: 1 }); - expect(suspended.map(s => s.sessionId)).toEqual(['j']); + // Cap 1, 4 live → wants to drop 3, but only the single tmux session 
is + // resumable, so only 'd' can be suspended. + expect(suspended.map(s => s.sessionId)).toEqual(['d']); expect(activeSessions.get('a').worker).not.toBe(null); - expect(activeSessions.get('i').worker).not.toBe(null); + expect(activeSessions.get('c').worker).not.toBe(null); }); - it('does not suspend workers that are not idle', () => { - const now = 1_000_000; + it('never suspends a session that is mid-turn (lastScreenStatus !== idle)', () => { const activeSessions = new Map([ ['a', { ...ds('a', 'tmux', now - 90 * 60_000), lastScreenStatus: 'working' }], ['b', { ...ds('b', 'herdr', now - 80 * 60_000), lastScreenStatus: 'analyzing' }], @@ -145,32 +143,20 @@ describe('sweepIdleWorkers', () => { ['d', { ...ds('d', 'tmux', now - 60 * 60_000), lastScreenStatus: undefined }], ['e', ds('e', 'herdr', now - 50 * 60_000)], ['f', ds('f', 'zellij', now - 40 * 60_000)], - ['g', ds('g', 'tmux', now - 35 * 60_000)], - ['h', ds('h', 'herdr', now - 31 * 60_000)], - ['i', ds('i', 'zellij', now - 30 * 60_000)], - ['j', ds('j', 'tmux', now - 2 * 60_000)], ]); - const suspended = sweepIdleWorkers(activeSessions, { - now, - workerBudget: { maxLiveWorkers: 8, idleSuspendMs: 30 * 60_000 }, - }); + const suspended = sweepIdleWorkers(activeSessions, { maxLiveWorkers: 4 }); + // a–d are busy → only the two idle ones (e, f) are eligible. expect(suspended.map(s => s.sessionId)).toEqual(['e', 'f']); expect(activeSessions.get('a').worker).not.toBe(null); - expect(activeSessions.get('b').worker).not.toBe(null); - expect(activeSessions.get('c').worker).not.toBe(null); expect(activeSessions.get('d').worker).not.toBe(null); }); - it('never suspends adopt sessions even when idle, over-budget, and on a suspendable backend', () => { - const now = 1_000_000; - // 'a' and 'b' are the oldest idle workers — without the adopt guard they - // would be the first picked. 
They are adopt sessions (one marked via the - // runtime mirror ds.adoptedFrom, one via the persisted ds.session.adoptedFrom) - // so they must be skipped; the sweeper falls through to the oldest *normal* - // sessions ('c', 'd') instead. Suspending an adopt session would break it: - // the worker-null resume path re-forks via forkWorker, not forkAdoptWorker. + it('never suspends adopt sessions even when oldest and over cap', () => { + // 'a' (runtime mirror) and 'b' (persisted marker) are the oldest, but are + // adopt sessions → skipped; the sweeper falls through to the oldest normal + // sessions ('c', 'd'). const adoptRuntime = { ...ds('a', 'tmux', now - 90 * 60_000), adoptedFrom: { tmuxTarget: 'user:0.1' } }; const adoptPersisted = ds('b', 'herdr', now - 80 * 60_000); adoptPersisted.session.adoptedFrom = { herdrTarget: 'user-herdr' }; @@ -181,36 +167,23 @@ describe('sweepIdleWorkers', () => { ['d', ds('d', 'tmux', now - 60 * 60_000)], ['e', ds('e', 'herdr', now - 50 * 60_000)], ['f', ds('f', 'zellij', now - 40 * 60_000)], - ['g', ds('g', 'tmux', now - 35 * 60_000)], - ['h', ds('h', 'herdr', now - 31 * 60_000)], - ['i', ds('i', 'zellij', now - 30 * 60_000)], - ['j', ds('j', 'tmux', now - 2 * 60_000)], ]); - const suspended = sweepIdleWorkers(activeSessions, { - now, - workerBudget: { maxLiveWorkers: 8, idleSuspendMs: 30 * 60_000 }, - }); + const suspended = sweepIdleWorkers(activeSessions, { maxLiveWorkers: 4 }); expect(suspended.map(s => s.sessionId)).toEqual(['c', 'd']); expect(activeSessions.get('a').worker).not.toBe(null); expect(activeSessions.get('b').worker).not.toBe(null); }); - it('does not suspend an adopt session even if it is the only over-budget candidate', () => { - const now = 1_000_000; - // Budget 1, but the single eligible-looking over-budget worker is an adopt - // session → nothing is suspended (an adopt session is never a candidate). 
+ it('does not suspend an adopt session even if it is the only over-cap candidate', () => { const adopt = { ...ds('a', 'tmux', now - 90 * 60_000), adoptedFrom: { tmuxTarget: 'user:0.1' } }; const activeSessions = new Map([ ['a', adopt], - ['b', ds('b', 'tmux', now - 2 * 60_000)], // recent, under idle threshold + ['b', ds('b', 'pty', now - 2 * 60_000)], // pty → also never suspendable ]); - const suspended = sweepIdleWorkers(activeSessions, { - now, - workerBudget: { maxLiveWorkers: 1, idleSuspendMs: 30 * 60_000 }, - }); + const suspended = sweepIdleWorkers(activeSessions, { maxLiveWorkers: 1 }); expect(suspended).toEqual([]); expect(activeSessions.get('a').worker).not.toBe(null); diff --git a/test/restore-zombie-close.test.ts b/test/restore-zombie-close.test.ts index 9c40b936..5a5d7c41 100644 --- a/test/restore-zombie-close.test.ts +++ b/test/restore-zombie-close.test.ts @@ -228,6 +228,30 @@ describe('restoreActiveSessions — persistent-backend zombie-close decision', ( expect(forkWorker).not.toHaveBeenCalled(); }); + it('"missing" + server UP but session was cap-suspended → keeps active for cold-resume (NOT a zombie)', async () => { + // The idle-worker sweeper deliberately kills a session's backing pane + CLI + // over the per-bot cap. The server stays up (only one pane was killed), so + // without the suspend-intent marker this looks exactly like a solo zombie + // and would be wrongly closed — losing a session that should lazily + // cold-resume on the next message. 
+ probe.result = 'missing'; + server.state = 'running'; + const s = makeActivePersistentSession('om_cap_suspended'); + s.suspendedColdResume = true; + sessionStore.updateSession(s); + const map = new Map(); + wp.registry = map; + + await restoreActiveSessions(map); + + expect(closeSession).not.toHaveBeenCalled(); + const ds = map.get(sessionKey('om_cap_suspended', 'app_test')); + expect(ds).toBeDefined(); // active record retained… + expect(ds!.worker).toBeNull(); // …worker-less, cold-resumes on next message + expect(sessionStore.getSession(s.sessionId)!.status).toBe('active'); // NOT closed + expect(forkWorker).not.toHaveBeenCalled(); + }); + it('"missing" + server state UNKNOWN → closes (conservative, server may be up)', async () => { probe.result = 'missing'; server.state = 'unknown'; diff --git a/test/worker-budget-cli.test.ts b/test/worker-budget-cli.test.ts deleted file mode 100644 index 0b98f0b5..00000000 --- a/test/worker-budget-cli.test.ts +++ /dev/null @@ -1,59 +0,0 @@ -import { afterAll, beforeAll, describe, expect, it } from 'vitest'; -import { spawnSync } from 'node:child_process'; -import { existsSync, mkdirSync, mkdtempSync, readFileSync, rmSync } from 'node:fs'; -import { tmpdir } from 'node:os'; -import { join } from 'node:path'; - -const CLI_PATH = join(__dirname, '..', 'dist', 'cli.js'); - -let home: string; - -beforeAll(() => { - if (!existsSync(CLI_PATH)) { - throw new Error('dist/cli.js missing — run `pnpm build` first'); - } - home = mkdtempSync(join(tmpdir(), 'botmux-worker-budget-cli-')); - mkdirSync(join(home, '.botmux'), { recursive: true }); -}); - -afterAll(() => { - if (home) rmSync(home, { recursive: true, force: true }); -}); - -function runCli(args: string[]): { status: number; stdout: string; stderr: string } { - const r = spawnSync('node', [CLI_PATH, ...args], { - cwd: home, - env: { ...process.env, HOME: home, USERPROFILE: home }, - stdio: ['ignore', 'pipe', 'pipe'], - encoding: 'utf-8', - }); - return { status: r.status ?? 
1, stdout: r.stdout ?? '', stderr: r.stderr ?? '' }; -} - -function readConfig(): any { - return JSON.parse(readFileSync(join(home, '.botmux', 'config.json'), 'utf-8')); -} - -describe('botmux worker-budget CLI', () => { - it('shows the auto-derived budget and the agent-safe edit command', () => { - const out = runCli(['worker-budget', 'status']); - - expect(out.status).toBe(0); - expect(out.stdout).toContain('Worker budget'); - expect(out.stdout).toContain('maxLiveWorkers'); - expect(out.stdout).toContain('botmux worker-budget set'); - }); - - it('sets and unsets worker budget config without manual JSON editing', () => { - const set = runCli(['worker-budget', 'set', '--max-live-workers', '12', '--idle-minutes', '45']); - expect(set.status).toBe(0); - expect(readConfig().worker).toEqual({ - maxLiveWorkers: 12, - idleSuspendMs: 45 * 60_000, - }); - - const unset = runCli(['worker-budget', 'unset']); - expect(unset.status).toBe(0); - expect(readConfig().worker).toBeUndefined(); - }); -}); diff --git a/test/worker-budget.test.ts b/test/worker-budget.test.ts deleted file mode 100644 index 35b6474a..00000000 --- a/test/worker-budget.test.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { describe, expect, it } from 'vitest'; - -import { resolveWorkerBudget } from '../src/core/worker-budget.js'; - -const gib = (n: number) => n * 1024 ** 3; - -describe('resolveWorkerBudget', () => { - it('derives the default live-worker budget from CPU and memory', () => { - expect(resolveWorkerBudget(undefined, { cpuCount: 4, memoryBytes: gib(8) }).maxLiveWorkers).toBe(8); - expect(resolveWorkerBudget(undefined, { cpuCount: 8, memoryBytes: gib(16) }).maxLiveWorkers).toBe(16); - expect(resolveWorkerBudget(undefined, { cpuCount: 64, memoryBytes: gib(128) }).maxLiveWorkers).toBe(32); - }); - - it('lets global config override max live workers and idle threshold independently', () => { - const resolved = resolveWorkerBudget( - { maxLiveWorkers: 12, idleSuspendMs: 45 * 60_000 }, - { cpuCount: 4, 
memoryBytes: gib(8) }, - ); - - expect(resolved).toEqual({ - maxLiveWorkers: 12, - idleSuspendMs: 45 * 60_000, - autoMaxLiveWorkers: 8, - maxLiveWorkersSource: 'config', - idleSuspendMsSource: 'config', - }); - }); -}); diff --git a/test/worker-suspend.test.ts b/test/worker-suspend.test.ts index 5d4f4cf6..d3b93a7c 100644 --- a/test/worker-suspend.test.ts +++ b/test/worker-suspend.test.ts @@ -87,6 +87,11 @@ describe('suspendWorker', () => { expect(ds.worker).toBe(null); expect(ds.workerPort).toBe(null); expect(ds.workerToken).toBe(null); + // The worker's suspend handler destroys the backing session + CLI, so the + // next turn must cold-resume: mark history (→ forkWorker resume=true builds + // --resume) and persist the suspend intent (→ restore won't zombie-close it). + expect(ds.hasHistory).toBe(true); + expect(ds.session.suspendedColdResume).toBe(true); }); it('does not suspend pty workers', () => {