diff --git a/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift b/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift index 19bf47639..0398ececd 100644 --- a/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift +++ b/Sources/CodexBarCore/Host/Process/SubprocessRunner.swift @@ -37,6 +37,52 @@ public struct SubprocessResult: Sendable { public enum SubprocessRunner { private static let log = CodexBarLog.logger(LogCategories.subprocess) + // MARK: - Helpers to move blocking calls off the cooperative thread pool + + /// Runs `readDataToEndOfFile()` on a GCD thread so it does not block the Swift cooperative pool. + private static func readDataOffPool(_ fileHandle: FileHandle) async -> Data { + await withCheckedContinuation { continuation in + DispatchQueue.global().async { + let data = fileHandle.readDataToEndOfFile() + continuation.resume(returning: data) + } + } + } + + /// Runs `waitUntilExit()` on a GCD thread so it does not block the Swift cooperative pool. + private static func waitForExitOffPool(_ process: Process) async -> Int32 { + await withCheckedContinuation { continuation in + DispatchQueue.global().async { + process.waitUntilExit() + continuation.resume(returning: process.terminationStatus) + } + } + } + + /// Terminates a process and its process group, escalating from SIGTERM to SIGKILL. + /// Returns `true` if the process was actually killed, `false` if it had already exited. + @discardableResult + private static func terminateProcess(_ process: Process, processGroup: pid_t?) -> Bool { + guard process.isRunning else { return false } + process.terminate() + if let pgid = processGroup { + kill(-pgid, SIGTERM) + } + let killDeadline = Date().addingTimeInterval(0.4) + while process.isRunning, Date() < killDeadline { + usleep(50000) + } + if process.isRunning { + if let pgid = processGroup { + kill(-pgid, SIGKILL) + } + kill(process.processIdentifier, SIGKILL) + } + return true + } + + // MARK: - Public API + public static func run( binary: String, arguments: [String], @@ -66,10 +112,10 @@ public enum SubprocessRunner { process.standardInput = nil let stdoutTask = Task { - stdoutPipe.fileHandleForReading.readDataToEndOfFile() + await self.readDataOffPool(stdoutPipe.fileHandleForReading) } let stderrTask = Task { - stderrPipe.fileHandleForReading.readDataToEndOfFile() + await self.readDataOffPool(stderrPipe.fileHandleForReading) } do { @@ -82,15 +128,11 @@ public enum SubprocessRunner { throw SubprocessRunnerError.launchFailed(error.localizedDescription) } - var processGroup: pid_t? let pid = process.processIdentifier - if setpgid(pid, pid) == 0 { - processGroup = pid - } + let processGroup: pid_t? = setpgid(pid, pid) == 0 ? pid : nil let exitCodeTask = Task { - process.waitUntilExit() - return process.terminationStatus + await self.waitForExitOffPool(process) } do { @@ -98,6 +140,13 @@ public enum SubprocessRunner { group.addTask { await exitCodeTask.value } group.addTask { try await Task.sleep(for: .seconds(timeout)) + // Kill the process BEFORE throwing so the exit-code task can complete + // and withThrowingTaskGroup can exit promptly. Only throw if we + // actually killed the process; if it already exited, let the exit + // code win the race naturally. + guard self.terminateProcess(process, processGroup: processGroup) else { + return await exitCodeTask.value + } throw SubprocessRunnerError.timedOut(label) } let code = try await group.next()! @@ -105,6 +154,28 @@ public enum SubprocessRunner { return code } + // Race guard: our timeout task killed the process (SIGTERM/SIGKILL), but + // the exit code arrived at group.next() before the .timedOut throw. + // Detect by requiring BOTH a signal termination AND elapsed time at/past the + // timeout — this avoids misclassifying processes that crash on their own. + if process.terminationReason == .uncaughtSignal, + Date().timeIntervalSince(start) >= timeout - 0.5 + { + let duration = Date().timeIntervalSince(start) + self.log.warning( + "Subprocess error", + metadata: [ + "label": label, + "binary": binaryName, + "duration_ms": "\(Int(duration * 1000))", + ]) + stdoutTask.cancel() + stderrTask.cancel() + stdoutPipe.fileHandleForReading.closeFile() + stderrPipe.fileHandleForReading.closeFile() + throw SubprocessRunnerError.timedOut(label) + } + let stdoutData = await stdoutTask.value let stderrData = await stderrTask.value let stdout = String(data: stdoutData, encoding: .utf8) ?? "" @@ -142,22 +213,8 @@ public enum SubprocessRunner { "binary": binaryName, "duration_ms": "\(Int(duration * 1000))", ]) - if process.isRunning { - process.terminate() - if let pgid = processGroup { - kill(-pgid, SIGTERM) - } - let killDeadline = Date().addingTimeInterval(0.4) - while process.isRunning, Date() < killDeadline { - usleep(50000) - } - if process.isRunning { - if let pgid = processGroup { - kill(-pgid, SIGKILL) - } - kill(process.processIdentifier, SIGKILL) - } - } + // Safety net: ensure the process is dead (may already be killed by timeout task). + self.terminateProcess(process, processGroup: processGroup) exitCodeTask.cancel() stdoutTask.cancel() stderrTask.cancel() diff --git a/Tests/CodexBarTests/SubprocessRunnerTests.swift b/Tests/CodexBarTests/SubprocessRunnerTests.swift index fee8707ca..0436bc799 100644 --- a/Tests/CodexBarTests/SubprocessRunnerTests.swift +++ b/Tests/CodexBarTests/SubprocessRunnerTests.swift @@ -15,4 +15,98 @@ struct SubprocessRunnerTests { #expect(result.stdout.count >= 1_000_000) #expect(result.stderr.isEmpty) } + + /// Regression test for #474: a hung subprocess must be killed and throw `.timedOut` + /// instead of blocking indefinitely. + /// + /// This test was previously deleted (commit 3961770) because `waitUntilExit()` blocked + /// the cooperative thread pool, starving the timeout task. The fix moves blocking calls + /// to `DispatchQueue.global()`, making this test reliable. + @Test + func throwsTimedOutWhenProcessHangs() async throws { + let start = Date() + do { + _ = try await SubprocessRunner.run( + binary: "/bin/sleep", + arguments: ["30"], + environment: ProcessInfo.processInfo.environment, + timeout: 1, + label: "hung-process-test") + Issue.record("Expected SubprocessRunnerError.timedOut but no error was thrown") + } catch let error as SubprocessRunnerError { + guard case let .timedOut(label) = error else { + Issue.record("Expected .timedOut, got \(error)") + return + } + #expect(label == "hung-process-test") + } catch { + Issue.record("Expected SubprocessRunnerError.timedOut, got unexpected error: \(error)") + } + + let elapsed = Date().timeIntervalSince(start) + // Must complete in well under 30s (the sleep duration). Allow generous bound for CI. + #expect(elapsed < 10, "Timeout should fire in ~1s, not wait for process to exit naturally") + } + + /// Multiple concurrent hung subprocesses must all time out independently, proving that + /// one blocked subprocess does not starve the timeout mechanism of others. + /// This is the core scenario that caused the original permanent-refresh-stall bug. + @Test + func concurrentHungProcessesAllTimeOut() async { + let start = Date() + let count = 8 + + await withTaskGroup(of: Void.self) { group in + for i in 0..