From b90380fad7aab77a943b54fe7e2a474bbc9043e7 Mon Sep 17 00:00:00 2001 From: chenzhenghao Date: Thu, 19 Mar 2026 04:30:28 +0800 Subject: [PATCH 1/2] Fix SubprocessRunner blocking cooperative thread pool, causing permanent refresh stall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SubprocessRunner.run() called waitUntilExit() and readDataToEndOfFile() inside Task closures that ran on Swift's cooperative thread pool. When multiple providers refreshed concurrently, these blocking calls starved the pool, preventing the timeout mechanism (Task.sleep) from firing. A single hung subprocess could permanently block all future refreshes via the isRefreshing guard in UsageStore.refresh(). Changes: - Move waitUntilExit() and readDataToEndOfFile() to DispatchQueue.global() via withCheckedContinuation, freeing the cooperative pool for timeout scheduling - Kill the process inside the timeout task before throwing, so withThrowingTaskGroup can exit promptly (previously it waited for all child tasks, which couldn't complete because the process kill was in the catch block) - Add race guard: if the timeout kills the process but the exit code arrives at group.next() first, detect via terminationReason == .uncaughtSignal and reclassify as .timedOut - Extract terminateProcess() helper to deduplicate SIGTERM→SIGKILL escalation The timeout regression test deleted in 3961770 ("waitUntilExit() blocks the cooperative thread pool, starving the timeout task on low-core CI runners") is restored and now passes reliably. Closes #189 Co-Authored-By: Claude Opus 4.6 (1M context) --- .../Host/Process/SubprocessRunner.swift | 95 ++++++++++++++----- .../CodexBarTests/SubprocessRunnerTests.swift | 94 ++++++++++++++++++ 2 files changed, 165 insertions(+), 24 deletions(-) diff --git a/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift b/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift index 19bf47639..087029884 100644 --- a/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift +++ b/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift @@ -37,6 +37,49 @@ public struct SubprocessResult: Sendable { public enum SubprocessRunner { private static let log = CodexBarLog.logger(LogCategories.subprocess) + // MARK: - Helpers to move blocking calls off the cooperative thread pool + + /// Runs `readDataToEndOfFile()` on a GCD thread so it does not block the Swift cooperative pool. + private static func readDataOffPool(_ fileHandle: FileHandle) async -> Data { + await withCheckedContinuation { continuation in + DispatchQueue.global().async { + let data = fileHandle.readDataToEndOfFile() + continuation.resume(returning: data) + } + } + } + + /// Runs `waitUntilExit()` on a GCD thread so it does not block the Swift cooperative pool. + private static func waitForExitOffPool(_ process: Process) async -> Int32 { + await withCheckedContinuation { continuation in + DispatchQueue.global().async { + process.waitUntilExit() + continuation.resume(returning: process.terminationStatus) + } + } + } + + /// Terminates a process and its process group, escalating from SIGTERM to SIGKILL. + private static func terminateProcess(_ process: Process, processGroup: pid_t?) { + guard process.isRunning else { return } + process.terminate() + if let pgid = processGroup { + kill(-pgid, SIGTERM) + } + let killDeadline = Date().addingTimeInterval(0.4) + while process.isRunning, Date() < killDeadline { + usleep(50000) + } + if process.isRunning { + if let pgid = processGroup { + kill(-pgid, SIGKILL) + } + kill(process.processIdentifier, SIGKILL) + } + } + + // MARK: - Public API + public static func run( binary: String, arguments: [String], @@ -66,10 +109,10 @@ public enum SubprocessRunner { process.standardInput = nil let stdoutTask = Task { - stdoutPipe.fileHandleForReading.readDataToEndOfFile() + await self.readDataOffPool(stdoutPipe.fileHandleForReading) } let stderrTask = Task { - stderrPipe.fileHandleForReading.readDataToEndOfFile() + await self.readDataOffPool(stderrPipe.fileHandleForReading) } do { @@ -82,15 +125,11 @@ public enum SubprocessRunner { throw SubprocessRunnerError.launchFailed(error.localizedDescription) } - var processGroup: pid_t? let pid = process.processIdentifier - if setpgid(pid, pid) == 0 { - processGroup = pid - } + let processGroup: pid_t? = setpgid(pid, pid) == 0 ? pid : nil let exitCodeTask = Task { - process.waitUntilExit() - return process.terminationStatus + await self.waitForExitOffPool(process) } do { @@ -98,6 +137,9 @@ public enum SubprocessRunner { group.addTask { await exitCodeTask.value } group.addTask { try await Task.sleep(for: .seconds(timeout)) + // Kill the process BEFORE throwing so the exit-code task can complete + // and withThrowingTaskGroup can exit promptly. + self.terminateProcess(process, processGroup: processGroup) throw SubprocessRunnerError.timedOut(label) } let code = try await group.next()! @@ -105,6 +147,25 @@ public enum SubprocessRunner { return code } + // Race guard: if the timeout task killed the process but the exit code arrived + // at group.next() before the .timedOut throw, the process will have been killed + // by a signal. Reclassify as timeout so callers get the correct error type. + if process.terminationReason == .uncaughtSignal { + let duration = Date().timeIntervalSince(start) + self.log.warning( + "Subprocess error", + metadata: [ + "label": label, + "binary": binaryName, + "duration_ms": "\(Int(duration * 1000))", + ]) + stdoutTask.cancel() + stderrTask.cancel() + stdoutPipe.fileHandleForReading.closeFile() + stderrPipe.fileHandleForReading.closeFile() + throw SubprocessRunnerError.timedOut(label) + } + let stdoutData = await stdoutTask.value let stderrData = await stderrTask.value let stdout = String(data: stdoutData, encoding: .utf8) ?? "" @@ -142,22 +203,8 @@ public enum SubprocessRunner { "binary": binaryName, "duration_ms": "\(Int(duration * 1000))", ]) - if process.isRunning { - process.terminate() - if let pgid = processGroup { - kill(-pgid, SIGTERM) - } - let killDeadline = Date().addingTimeInterval(0.4) - while process.isRunning, Date() < killDeadline { - usleep(50000) - } - if process.isRunning { - if let pgid = processGroup { - kill(-pgid, SIGKILL) - } - kill(process.processIdentifier, SIGKILL) - } - } + // Safety net: ensure the process is dead (may already be killed by timeout task). + self.terminateProcess(process, processGroup: processGroup) exitCodeTask.cancel() stdoutTask.cancel() stderrTask.cancel() diff --git a/Tests/CodexBarTests/SubprocessRunnerTests.swift b/Tests/CodexBarTests/SubprocessRunnerTests.swift index fee8707ca..0436bc799 100644 --- a/Tests/CodexBarTests/SubprocessRunnerTests.swift +++ b/Tests/CodexBarTests/SubprocessRunnerTests.swift @@ -15,4 +15,98 @@ struct SubprocessRunnerTests { #expect(result.stdout.count >= 1_000_000) #expect(result.stderr.isEmpty) } + + /// Regression test for #474: a hung subprocess must be killed and throw `.timedOut` + /// instead of blocking indefinitely. + /// + /// This test was previously deleted (commit 3961770) because `waitUntilExit()` blocked + /// the cooperative thread pool, starving the timeout task. The fix moves blocking calls + /// to `DispatchQueue.global()`, making this test reliable. + @Test + func throwsTimedOutWhenProcessHangs() async throws { + let start = Date() + do { + _ = try await SubprocessRunner.run( + binary: "/bin/sleep", + arguments: ["30"], + environment: ProcessInfo.processInfo.environment, + timeout: 1, + label: "hung-process-test") + Issue.record("Expected SubprocessRunnerError.timedOut but no error was thrown") + } catch let error as SubprocessRunnerError { + guard case let .timedOut(label) = error else { + Issue.record("Expected .timedOut, got \(error)") + return + } + #expect(label == "hung-process-test") + } catch { + Issue.record("Expected SubprocessRunnerError.timedOut, got unexpected error: \(error)") + } + + let elapsed = Date().timeIntervalSince(start) + // Must complete in well under 30s (the sleep duration). Allow generous bound for CI. + #expect(elapsed < 10, "Timeout should fire in ~1s, not wait for process to exit naturally") + } + + /// Multiple concurrent hung subprocesses must all time out independently, proving that + /// one blocked subprocess does not starve the timeout mechanism of others. + /// This is the core scenario that caused the original permanent-refresh-stall bug. + @Test + func concurrentHungProcessesAllTimeOut() async { + let start = Date() + let count = 8 + + await withTaskGroup(of: Void.self) { group in + for i in 0.. Date: Thu, 19 Mar 2026 05:28:45 +0800 Subject: [PATCH 2/2] Address Codex bot review: fix timeout race classification - terminateProcess() now returns Bool: true if it actually killed the process, false if it had already exited. The timeout task only throws .timedOut when the process was actually killed, avoiding false timeouts for near-threshold commands. - Race guard uses both terminationReason == .uncaughtSignal AND elapsed time >= timeout to distinguish our timeout kills from unrelated signal crashes. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../Host/Process/SubprocessRunner.swift | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift b/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift index 087029884..0398ececd 100644 --- a/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift +++ b/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift @@ -60,8 +60,10 @@ public enum SubprocessRunner { } /// Terminates a process and its process group, escalating from SIGTERM to SIGKILL. - private static func terminateProcess(_ process: Process, processGroup: pid_t?) { - guard process.isRunning else { return } + /// Returns `true` if the process was actually killed, `false` if it had already exited. + @discardableResult + private static func terminateProcess(_ process: Process, processGroup: pid_t?) -> Bool { + guard process.isRunning else { return false } process.terminate() if let pgid = processGroup { kill(-pgid, SIGTERM) @@ -76,6 +78,7 @@ public enum SubprocessRunner { } kill(process.processIdentifier, SIGKILL) } + return true } // MARK: - Public API @@ -138,8 +141,12 @@ public enum SubprocessRunner { group.addTask { try await Task.sleep(for: .seconds(timeout)) // Kill the process BEFORE throwing so the exit-code task can complete - // and withThrowingTaskGroup can exit promptly. - self.terminateProcess(process, processGroup: processGroup) + // and withThrowingTaskGroup can exit promptly. Only throw if we + // actually killed the process; if it already exited, let the exit + // code win the race naturally. + guard self.terminateProcess(process, processGroup: processGroup) else { + return await exitCodeTask.value + } throw SubprocessRunnerError.timedOut(label) } let code = try await group.next()! @@ -147,10 +154,13 @@ public enum SubprocessRunner { return code } - // Race guard: if the timeout task killed the process but the exit code arrived - // at group.next() before the .timedOut throw, the process will have been killed - // by a signal. Reclassify as timeout so callers get the correct error type. - if process.terminationReason == .uncaughtSignal { + // Race guard: our timeout task killed the process (SIGTERM/SIGKILL), but + // the exit code arrived at group.next() before the .timedOut throw. + // Detect by requiring BOTH a signal termination AND elapsed time at/past the + // timeout — this avoids misclassifying processes that crash on their own. + if process.terminationReason == .uncaughtSignal, + Date().timeIntervalSince(start) >= timeout - 0.5 + { let duration = Date().timeIntervalSince(start) self.log.warning( "Subprocess error",