-
Notifications
You must be signed in to change notification settings - Fork 694
Fix SubprocessRunner blocking cooperative thread pool, causing permanent refresh stall #566
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,52 @@ public struct SubprocessResult: Sendable { | |
| public enum SubprocessRunner { | ||
| private static let log = CodexBarLog.logger(LogCategories.subprocess) | ||
|
|
||
| // MARK: - Helpers to move blocking calls off the cooperative thread pool | ||
|
|
||
| /// Runs `readDataToEndOfFile()` on a GCD thread so it does not block the Swift cooperative pool. | ||
| private static func readDataOffPool(_ fileHandle: FileHandle) async -> Data { | ||
| await withCheckedContinuation { continuation in | ||
| DispatchQueue.global().async { | ||
| let data = fileHandle.readDataToEndOfFile() | ||
| continuation.resume(returning: data) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Runs `waitUntilExit()` on a GCD thread so it does not block the Swift cooperative pool. | ||
| private static func waitForExitOffPool(_ process: Process) async -> Int32 { | ||
| await withCheckedContinuation { continuation in | ||
| DispatchQueue.global().async { | ||
| process.waitUntilExit() | ||
| continuation.resume(returning: process.terminationStatus) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Terminates a process and its process group, escalating from SIGTERM to SIGKILL. | ||
| /// Returns `true` if the process was actually killed, `false` if it had already exited. | ||
| @discardableResult | ||
| private static func terminateProcess(_ process: Process, processGroup: pid_t?) -> Bool { | ||
| guard process.isRunning else { return false } | ||
| process.terminate() | ||
| if let pgid = processGroup { | ||
| kill(-pgid, SIGTERM) | ||
| } | ||
| let killDeadline = Date().addingTimeInterval(0.4) | ||
| while process.isRunning, Date() < killDeadline { | ||
| usleep(50000) | ||
| } | ||
| if process.isRunning { | ||
| if let pgid = processGroup { | ||
| kill(-pgid, SIGKILL) | ||
| } | ||
| kill(process.processIdentifier, SIGKILL) | ||
| } | ||
| return true | ||
| } | ||
|
|
||
| // MARK: - Public API | ||
|
|
||
| public static func run( | ||
| binary: String, | ||
| arguments: [String], | ||
|
|
@@ -66,10 +112,10 @@ public enum SubprocessRunner { | |
| process.standardInput = nil | ||
|
|
||
| let stdoutTask = Task<Data, Never> { | ||
| stdoutPipe.fileHandleForReading.readDataToEndOfFile() | ||
| await self.readDataOffPool(stdoutPipe.fileHandleForReading) | ||
| } | ||
| let stderrTask = Task<Data, Never> { | ||
| stderrPipe.fileHandleForReading.readDataToEndOfFile() | ||
| await self.readDataOffPool(stderrPipe.fileHandleForReading) | ||
| } | ||
|
|
||
| do { | ||
|
|
@@ -82,29 +128,54 @@ public enum SubprocessRunner { | |
| throw SubprocessRunnerError.launchFailed(error.localizedDescription) | ||
| } | ||
|
|
||
| var processGroup: pid_t? | ||
| let pid = process.processIdentifier | ||
| if setpgid(pid, pid) == 0 { | ||
| processGroup = pid | ||
| } | ||
| let processGroup: pid_t? = setpgid(pid, pid) == 0 ? pid : nil | ||
|
|
||
| let exitCodeTask = Task<Int32, Never> { | ||
| process.waitUntilExit() | ||
| return process.terminationStatus | ||
| await self.waitForExitOffPool(process) | ||
| } | ||
|
|
||
| do { | ||
| let exitCode = try await withThrowingTaskGroup(of: Int32.self) { group in | ||
| group.addTask { await exitCodeTask.value } | ||
| group.addTask { | ||
| try await Task.sleep(for: .seconds(timeout)) | ||
| // Kill the process BEFORE throwing so the exit-code task can complete | ||
| // and withThrowingTaskGroup can exit promptly. Only throw if we | ||
| // actually killed the process; if it already exited, let the exit | ||
| // code win the race naturally. | ||
| guard self.terminateProcess(process, processGroup: processGroup) else { | ||
| return await exitCodeTask.value | ||
| } | ||
| throw SubprocessRunnerError.timedOut(label) | ||
| } | ||
| let code = try await group.next()! | ||
| group.cancelAll() | ||
| return code | ||
| } | ||
|
|
||
| // Race guard: our timeout task killed the process (SIGTERM/SIGKILL), but | ||
| // the exit code arrived at group.next() before the .timedOut throw. | ||
| // Detect by requiring BOTH a signal termination AND elapsed time at/past the | ||
| // timeout — this avoids misclassifying processes that crash on their own. | ||
| if process.terminationReason == .uncaughtSignal, | ||
| Date().timeIntervalSince(start) >= timeout - 0.5 | ||
|
Comment on lines
+161
to
+162
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
A subprocess that exits by signal on its own shortly before the deadline is still reclassified as Useful? React with 👍 / 👎. |
||
| { | ||
| let duration = Date().timeIntervalSince(start) | ||
| self.log.warning( | ||
| "Subprocess error", | ||
| metadata: [ | ||
| "label": label, | ||
| "binary": binaryName, | ||
| "duration_ms": "\(Int(duration * 1000))", | ||
| ]) | ||
| stdoutTask.cancel() | ||
| stderrTask.cancel() | ||
| stdoutPipe.fileHandleForReading.closeFile() | ||
| stderrPipe.fileHandleForReading.closeFile() | ||
| throw SubprocessRunnerError.timedOut(label) | ||
| } | ||
|
|
||
| let stdoutData = await stdoutTask.value | ||
| let stderrData = await stderrTask.value | ||
| let stdout = String(data: stdoutData, encoding: .utf8) ?? "" | ||
|
|
@@ -142,22 +213,8 @@ public enum SubprocessRunner { | |
| "binary": binaryName, | ||
| "duration_ms": "\(Int(duration * 1000))", | ||
| ]) | ||
| if process.isRunning { | ||
| process.terminate() | ||
| if let pgid = processGroup { | ||
| kill(-pgid, SIGTERM) | ||
| } | ||
| let killDeadline = Date().addingTimeInterval(0.4) | ||
| while process.isRunning, Date() < killDeadline { | ||
| usleep(50000) | ||
| } | ||
| if process.isRunning { | ||
| if let pgid = processGroup { | ||
| kill(-pgid, SIGKILL) | ||
| } | ||
| kill(process.processIdentifier, SIGKILL) | ||
| } | ||
| } | ||
| // Safety net: ensure the process is dead (may already be killed by timeout task). | ||
| self.terminateProcess(process, processGroup: processGroup) | ||
| exitCodeTask.cancel() | ||
| stdoutTask.cancel() | ||
| stderrTask.cancel() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After
waitUntilExit()was moved behindDispatchQueue.global()inSubprocessRunner.run, there is now an extra scheduling hop beforeexitCodeTaskcan finish. For commands that complete right around the deadline, this timeout task can wake first,terminateProcesswill no-op becauseprocess.isRunningis already false, and we still throw.timedOut. That turns successful near-threshold commands into false failures/fallbacks; the timeout branch should only throw if the process was still running or if it actually performed the kill.Useful? React with 👍 / 👎.