From 0b3ffddd89fdb5c6f8396eff88a89c6e5e1477a5 Mon Sep 17 00:00:00 2001 From: Van Dang Date: Sun, 8 Mar 2026 19:52:52 +0700 Subject: [PATCH 1/5] fix: add process-group kill to eliminate orphan TOCTOU race kill_process_tree() now uses os.killpg() when the subprocess was started with start_new_session=True, killing the entire process group atomically before falling back to psutil tree walk. This eliminates the race where children get reparented to PID 1 before psutil.children() runs. Addresses #164, #197. Co-Authored-By: Claude Opus 4.6 --- server/utils/process_utils.py | 87 +++++++++++++++++------------------ tests/test_process_utils.py | 53 +++++++++++++++++++++ 2 files changed, 96 insertions(+), 44 deletions(-) create mode 100644 tests/test_process_utils.py diff --git a/server/utils/process_utils.py b/server/utils/process_utils.py index 40ec931c..62240cf7 100644 --- a/server/utils/process_utils.py +++ b/server/utils/process_utils.py @@ -6,7 +6,11 @@ """ import logging +import os +import signal import subprocess +import sys +import time from dataclasses import dataclass from typing import Literal @@ -40,9 +44,11 @@ class KillResult: def kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> KillResult: """Kill a process and all its child processes. - On Windows, subprocess.terminate() only kills the immediate process, leaving - orphaned child processes (e.g., spawned browser instances, coding/testing agents). - This function uses psutil to kill the entire process tree. + Uses a two-phase approach for reliable cleanup: + 1. If the process is a process group leader (start_new_session=True on Unix), + kill the entire group via os.killpg(). This is atomic and immune to the + TOCTOU race where children get reparented to PID 1. + 2. Fall back to psutil tree walk for Windows and any stragglers. Args: proc: The subprocess.Popen object to kill @@ -53,82 +59,75 @@ def kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> KillResul """ result = KillResult(status="success", parent_pid=proc.pid) + # Phase 1: Process group kill (Unix only, atomic, no TOCTOU race) + if sys.platform != "win32": + try: + pgid = os.getpgid(proc.pid) + if pgid == proc.pid: + logger.debug("Killing process group PGID %d", pgid) + try: + os.killpg(pgid, signal.SIGTERM) + except ProcessLookupError: + pass + + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + try: + os.killpg(pgid, 0) + except ProcessLookupError: + break + time.sleep(0.1) + else: + try: + os.killpg(pgid, signal.SIGKILL) + result.status = "partial" + except ProcessLookupError: + pass + except (ProcessLookupError, OSError) as e: + logger.debug("Process group kill skipped for PID %d: %s", proc.pid, e) + + # Phase 2: psutil tree walk (catches Windows + non-group-leader children) try: parent = psutil.Process(proc.pid) - # Get all children recursively before terminating children = parent.children(recursive=True) result.children_found = len(children) - logger.debug( - "Killing process tree: PID %d with %d children", - proc.pid, len(children) - ) - - # Terminate children first (graceful) for child in children: try: - logger.debug("Terminating child PID %d (%s)", child.pid, child.name()) child.terminate() - except (psutil.NoSuchProcess, psutil.AccessDenied) as e: - # NoSuchProcess: already dead - # AccessDenied: Windows can raise this for system processes or already-exited processes - logger.debug("Child PID %d already gone or inaccessible: %s", child.pid, e) + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass - # Wait for children to terminate gone, still_alive = psutil.wait_procs(children, timeout=timeout) result.children_terminated = len(gone) - logger.debug( - "Children after graceful wait: %d terminated, %d still alive", - len(gone), len(still_alive) - ) - - # Force kill any remaining children for child in still_alive: try: - logger.debug("Force-killing child PID %d", child.pid) child.kill() result.children_killed += 1 - except (psutil.NoSuchProcess, psutil.AccessDenied) as e: - logger.debug("Child PID %d gone during force-kill: %s", child.pid, e) + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass if result.children_killed > 0: result.status = "partial" - # Now terminate the parent - logger.debug("Terminating parent PID %d", proc.pid) proc.terminate() try: proc.wait(timeout=timeout) - logger.debug("Parent PID %d terminated gracefully", proc.pid) except subprocess.TimeoutExpired: - logger.debug("Parent PID %d did not terminate, force-killing", proc.pid) proc.kill() proc.wait() result.parent_forcekilled = True result.status = "partial" - logger.debug( - "Process tree kill complete: status=%s, children=%d (terminated=%d, killed=%d)", - result.status, result.children_found, - result.children_terminated, result.children_killed - ) - - except (psutil.NoSuchProcess, psutil.AccessDenied) as e: - # NoSuchProcess: Process already dead - # AccessDenied: Windows can raise this for protected/system processes - # In either case, just ensure cleanup - logger.debug("Parent PID %d inaccessible (%s), attempting direct cleanup", proc.pid, e) + except (psutil.NoSuchProcess, psutil.AccessDenied): try: proc.terminate() proc.wait(timeout=1) - logger.debug("Direct termination of PID %d succeeded", proc.pid) except (subprocess.TimeoutExpired, OSError): try: proc.kill() - logger.debug("Direct force-kill of PID %d succeeded", proc.pid) - except OSError as kill_error: - logger.debug("Direct force-kill of PID %d failed: %s", proc.pid, kill_error) + except OSError: result.status = "failure" return result diff --git a/tests/test_process_utils.py b/tests/test_process_utils.py new file mode 100644 index 00000000..2b91cd03 --- /dev/null +++ b/tests/test_process_utils.py @@ -0,0 +1,53 @@ +"""Tests for process_utils — process group kill.""" +import os +import subprocess +import sys +import time + +import pytest + +from server.utils.process_utils import kill_process_tree + + +@pytest.mark.skipif(sys.platform == "win32", reason="Process groups differ on Windows") +class TestKillProcessGroup: + def test_kills_children_via_process_group(self): + proc = subprocess.Popen( + ["bash", "-c", "sleep 300 & echo $!; wait"], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + child_pid = int(proc.stdout.readline().decode().strip()) + assert _pid_alive(child_pid) + result = kill_process_tree(proc, timeout=3.0) + time.sleep(0.5) + assert not _pid_alive(child_pid) + assert result.status in ("success", "partial") + + def test_kills_deeply_nested_children(self): + proc = subprocess.Popen( + ["bash", "-c", "bash -c 'sleep 300' & echo $!; wait"], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + grandchild_pid = int(proc.stdout.readline().decode().strip()) + assert _pid_alive(grandchild_pid) + kill_process_tree(proc, timeout=3.0) + time.sleep(0.5) + assert not _pid_alive(grandchild_pid) + + def test_handles_already_dead_process(self): + proc = subprocess.Popen(["true"], start_new_session=True) + proc.wait() + result = kill_process_tree(proc, timeout=1.0) + assert result.status in ("success", "partial", "failure") + + +def _pid_alive(pid: int) -> bool: + try: + os.kill(pid, 0) + return True + except OSError: + return False From 956032db4ded1dc5802489b361c40f6d5a2fc095 Mon Sep 17 00:00:00 2001 From: Van Dang Date: Sun, 8 Mar 2026 19:53:07 +0700 Subject: [PATCH 2/5] fix: spawn all subprocesses with start_new_session=True Each subprocess now runs in its own process group, enabling atomic cleanup via os.killpg(). This prevents children from escaping kill_process_tree() by getting reparented to PID 1. 6 spawn sites updated: 4 in parallel_orchestrator.py, 1 in process_manager.py, 1 in dev_server_manager.py. Co-Authored-By: Claude Opus 4.6 --- parallel_orchestrator.py | 8 ++++++++ server/services/dev_server_manager.py | 1 + server/services/process_manager.py | 2 ++ 3 files changed, 11 insertions(+) diff --git a/parallel_orchestrator.py b/parallel_orchestrator.py index b7f2bac5..58f9404f 100644 --- a/parallel_orchestrator.py +++ b/parallel_orchestrator.py @@ -860,6 +860,8 @@ def _spawn_coding_agent(self, feature_id: int) -> tuple[bool, str]: } if sys.platform == "win32": popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW + else: + popen_kwargs["start_new_session"] = True proc = subprocess.Popen(cmd, **popen_kwargs) except Exception as e: @@ -923,6 +925,8 @@ def _spawn_coding_agent_batch(self, feature_ids: list[int]) -> tuple[bool, str]: } if sys.platform == "win32": popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW + else: + popen_kwargs["start_new_session"] = True proc = subprocess.Popen(cmd, **popen_kwargs) except Exception as e: @@ -1028,6 +1032,8 @@ def _spawn_testing_agent(self) -> tuple[bool, str]: self._testing_session_counter += 1 if sys.platform == "win32": popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW + else: + popen_kwargs["start_new_session"] = True proc = subprocess.Popen(cmd, **popen_kwargs) except Exception as e: @@ -1089,6 +1095,8 @@ async def _run_initializer(self) -> bool: } if sys.platform == "win32": popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW + else: + popen_kwargs["start_new_session"] = True proc = subprocess.Popen(cmd, **popen_kwargs) diff --git a/server/services/dev_server_manager.py b/server/services/dev_server_manager.py index 7455a13a..5c6f690b 100644 --- a/server/services/dev_server_manager.py +++ b/server/services/dev_server_manager.py @@ -352,6 +352,7 @@ async def start(self, command: str) -> tuple[bool, str]: stdout=subprocess.PIPE, stderr=subprocess.STDOUT, cwd=str(self.project_dir), + start_new_session=True, ) self._command = command diff --git a/server/services/process_manager.py b/server/services/process_manager.py index e21ffefc..f586e2c7 100644 --- a/server/services/process_manager.py +++ b/server/services/process_manager.py @@ -470,6 +470,8 @@ async def start( } if sys.platform == "win32": popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW + else: + popen_kwargs["start_new_session"] = True self.process = subprocess.Popen(cmd, **popen_kwargs) From d029330e32d324736ac0979a9e6d37125001cf93 Mon Sep 17 00:00:00 2001 From: Van Dang Date: Sun, 8 Mar 2026 19:56:27 +0700 Subject: [PATCH 3/5] fix: cli.js kills entire process group on shutdown killProcess() now sends SIGTERM to -pid (the process group), ensuring all child processes are terminated on Ctrl+C or SIGTERM. Uvicorn is spawned with detached:true to create a new process group on Unix. Also switches from execSync to execFileSync to avoid shell injection. Co-Authored-By: Claude Opus 4.6 --- lib/cli.js | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/cli.js b/lib/cli.js index 682ba849..d2ae2c36 100644 --- a/lib/cli.js +++ b/lib/cli.js @@ -504,16 +504,21 @@ function isHeadless() { // Process cleanup // --------------------------------------------------------------------------- -/** Kill a process tree. On Windows uses taskkill; elsewhere sends SIGTERM. */ +/** Kill a process and its children. Windows: taskkill /t. Unix: kill process group. */ function killProcess(pid) { try { if (IS_WIN) { - execSync(`taskkill /pid ${pid} /t /f`, { stdio: 'ignore' }); + execFileSync('taskkill', ['/pid', String(pid), '/t', '/f'], { stdio: 'ignore' }); } else { - process.kill(pid, 'SIGTERM'); + // Kill entire process group (negative PID = PGID) + try { process.kill(-pid, 'SIGTERM'); } catch { /* group may be gone */ } + // Brief wait then force kill + setTimeout(() => { + try { process.kill(-pid, 'SIGKILL'); } catch { /* already dead */ } + }, 5000); } } catch { - // Process may already be gone + // Process/group may already be gone } } @@ -724,6 +729,7 @@ function startServer(opts) { cwd: PKG_DIR, env: serverEnv, stdio: 'inherit', + detached: !IS_WIN, } ); From 7f492b6aa08717d5ef8369795743bf29a743eaae Mon Sep 17 00:00:00 2001 From: Van Dang Date: Sun, 8 Mar 2026 19:57:00 +0700 Subject: [PATCH 4/5] fix: add periodic orphan reaper as safety net Scans every 60s for chrome/node/esbuild processes orphaned under PID 1 and kills them after a 30s grace period. Only active on Linux containers. This is defense-in-depth: catches any orphans that escape the process-group kill mechanism. Co-Authored-By: Claude Opus 4.6 --- server/main.py | 5 ++ server/services/orphan_reaper.py | 128 +++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 server/services/orphan_reaper.py diff --git a/server/main.py b/server/main.py index 803c1072..36ca19f5 100644 --- a/server/main.py +++ b/server/main.py @@ -52,6 +52,7 @@ from .services.expand_chat_session import cleanup_all_expand_sessions from .services.process_manager import cleanup_all_managers, cleanup_orphaned_locks from .services.scheduler_service import cleanup_scheduler, get_scheduler +from .services.orphan_reaper import start_reaper, stop_reaper from .services.terminal_manager import cleanup_all_terminals from .websocket import project_websocket @@ -81,6 +82,9 @@ async def lifespan(app: FastAPI): scheduler = get_scheduler() await scheduler.start() + # Start orphan process reaper (Linux containers only) + start_reaper() + yield # Shutdown - cleanup scheduler first to stop triggering new starts @@ -91,6 +95,7 @@ async def lifespan(app: FastAPI): await cleanup_all_expand_sessions() await cleanup_all_terminals() await cleanup_all_devservers() + stop_reaper() # Create FastAPI app diff --git a/server/services/orphan_reaper.py b/server/services/orphan_reaper.py new file mode 100644 index 00000000..faa7bd3e --- /dev/null +++ b/server/services/orphan_reaper.py @@ -0,0 +1,128 @@ +""" +Orphan Process Reaper +===================== + +Periodic background task that kills orphaned processes (PPid=1) inside the +container that were spawned by AutoForge but survived agent shutdown. + +This is a safety net for the process-group kill mechanism. It runs every +60 seconds and kills any chrome, node, esbuild, or npm processes that: + 1. Have PPid == 1 (reparented to init — they're orphans) + 2. Are NOT the main autoforge-bin or uvicorn process + 3. Have been orphaned for at least 30 seconds (grace period) + +Only active on Linux (containers). No-op on macOS/Windows. +""" + +import asyncio +import logging +import os +import sys +import time + +import psutil + +logger = logging.getLogger(__name__) + +# Process names that are known AutoForge children and safe to kill when orphaned +ORPHAN_TARGETS = { + "chrome", "chrome_crashpad", # Playwright browsers + "chromium", "chromium_crashpad", + "node", "esbuild", # Dev servers, vitest, vite + "npm", "npx", # Package manager wrappers + "sh", "bash", # Shell wrappers from Bash tool +} + +# Minimum age (seconds) before an orphan is eligible for kill +ORPHAN_GRACE_PERIOD = 30 + +# How often to run the reaper (seconds) +REAP_INTERVAL = 60 + +_reaper_task: asyncio.Task | None = None + + +def _find_and_kill_orphans() -> dict: + """Scan for orphaned processes and kill them. + + Returns dict with stats: {killed: int, errors: int, scanned: int} + """ + stats = {"killed": 0, "errors": 0, "scanned": 0} + now = time.time() + my_pid = os.getpid() + + for proc in psutil.process_iter(["pid", "ppid", "name", "create_time"]): + try: + info = proc.info + stats["scanned"] += 1 + + # Skip non-orphans (ppid != 1) and PID 1 itself + if info["ppid"] != 1 or info["pid"] in (1, my_pid): + continue + + # Skip processes not in our target list + name = (info["name"] or "").lower() + if name not in ORPHAN_TARGETS: + continue + + # Skip recently created processes (grace period) + age = now - (info["create_time"] or now) + if age < ORPHAN_GRACE_PERIOD: + continue + + # Kill the orphan + logger.info( + "Reaping orphan: PID %d (%s), age %.0fs", + info["pid"], name, age, + ) + try: + proc.terminate() + try: + proc.wait(timeout=3) + except psutil.TimeoutExpired: + proc.kill() + stats["killed"] += 1 + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + except (psutil.NoSuchProcess, psutil.AccessDenied): + stats["errors"] += 1 + + return stats + + +async def _reaper_loop(): + """Background loop that periodically reaps orphans.""" + logger.info( + "Orphan reaper started (interval=%ds, grace=%ds)", + REAP_INTERVAL, ORPHAN_GRACE_PERIOD, + ) + while True: + await asyncio.sleep(REAP_INTERVAL) + try: + loop = asyncio.get_running_loop() + stats = await loop.run_in_executor(None, _find_and_kill_orphans) + if stats["killed"] > 0: + logger.info("Orphan reaper: killed %d orphan(s)", stats["killed"]) + except Exception: + logger.warning("Orphan reaper error", exc_info=True) + + +def start_reaper(): + """Start the orphan reaper background task. Only runs on Linux.""" + global _reaper_task + if sys.platform != "linux": + logger.debug("Orphan reaper skipped (not Linux)") + return + if _reaper_task is not None: + return + _reaper_task = asyncio.create_task(_reaper_loop()) + logger.info("Orphan reaper background task registered") + + +def stop_reaper(): + """Stop the orphan reaper.""" + global _reaper_task + if _reaper_task: + _reaper_task.cancel() + _reaper_task = None From 3d9f0f27a7902e109817d2a56de9eeec47e6e66d Mon Sep 17 00:00:00 2001 From: Van Dang Date: Sun, 8 Mar 2026 19:57:04 +0700 Subject: [PATCH 5/5] docs: add implementation plan for process orphan leak fix Co-Authored-By: Claude Opus 4.6 --- .../2026-03-08-fix-process-orphan-leaks.md | 434 ++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 docs/plans/2026-03-08-fix-process-orphan-leaks.md diff --git a/docs/plans/2026-03-08-fix-process-orphan-leaks.md b/docs/plans/2026-03-08-fix-process-orphan-leaks.md new file mode 100644 index 00000000..dad21e78 --- /dev/null +++ b/docs/plans/2026-03-08-fix-process-orphan-leaks.md @@ -0,0 +1,434 @@ +# Fix Process Orphan Leaks — Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Eliminate orphaned Chrome/node/esbuild processes that accumulate and cause OOM kills (issues #164, #197). + +**Architecture:** Defense-in-depth across 3 layers: (1) Process groups via `start_new_session=True` so entire subtrees can be killed atomically with `os.killpg()`, eliminating the TOCTOU race in the current `psutil.children()` approach. (2) Fix `cli.js` to kill the process group instead of a single PID. (3) Add a periodic orphan reaper in the server that sweeps for reparented processes under PID 1. + +**Tech Stack:** Python 3.11+ (psutil, subprocess), Node.js (child_process), Linux process groups + +--- + +## Context for the Implementer + +### The Bug + +When a Claude subagent exits, `kill_process_tree()` uses `psutil.Process(pid).children(recursive=True)` to find children. But between the Claude process dying and the tree walk, children get **reparented to PID 1**. They're no longer in the tree, so they survive. Each leaked Chrome instance is ~200MB; over hours this hits the 10Gi memory limit. + +### Files Overview + +| File | Role | +|------|------| +| `server/utils/process_utils.py` | `kill_process_tree()` — the core kill function | +| `server/services/process_manager.py` | `AgentProcessManager` — manages agent subprocess lifecycle | +| `server/services/dev_server_manager.py` | `DevServerProcessManager` — manages dev server subprocess | +| `parallel_orchestrator.py` | Orchestrator — spawns coding/testing agent subprocesses | +| `lib/cli.js` | Node.js PID 1 entry point — spawns uvicorn | + +### Key Constraint + +- `start_new_session=True` is Linux/macOS only. On Windows, use `CREATE_NEW_PROCESS_GROUP` flag instead. +- Must not break Windows support (the codebase has extensive Windows handling). + +--- + +### Task 1: Add process-group-aware kill to `process_utils.py` + +**Files:** +- Modify: `server/utils/process_utils.py` +- Create: `tests/test_process_utils.py` + +**Step 1: Write the failing test** + +Create `tests/test_process_utils.py`: + +```python +"""Tests for process_utils — process group kill.""" +import os +import signal +import subprocess +import sys +import time + +import pytest + +from server.utils.process_utils import kill_process_tree + + +@pytest.mark.skipif(sys.platform == "win32", reason="Process groups differ on Windows") +class TestKillProcessGroup: + def test_kills_children_via_process_group(self): + """Children in the same process group are killed even if reparented.""" + # Spawn a shell that starts a background sleep (simulates orphan-to-be) + proc = subprocess.Popen( + ["bash", "-c", "sleep 300 & echo $!; wait"], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + # Read the child PID + child_pid_line = proc.stdout.readline().decode().strip() + child_pid = int(child_pid_line) + + # Verify child is alive + assert _pid_alive(child_pid), "child should be alive before kill" + + result = kill_process_tree(proc, timeout=3.0) + time.sleep(0.5) + + assert not _pid_alive(child_pid), "child should be dead after process group kill" + assert result.status in ("success", "partial") + + def test_kills_deeply_nested_children(self): + """Grandchildren in the same process group are also killed.""" + proc = subprocess.Popen( + ["bash", "-c", "bash -c 'sleep 300' & echo $!; wait"], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + grandchild_pid = int(proc.stdout.readline().decode().strip()) + + assert _pid_alive(grandchild_pid) + + kill_process_tree(proc, timeout=3.0) + time.sleep(0.5) + + assert not _pid_alive(grandchild_pid) + + def test_handles_already_dead_process(self): + """Killing an already-exited process doesn't raise.""" + proc = subprocess.Popen( + ["true"], + start_new_session=True, + ) + proc.wait() + result = kill_process_tree(proc, timeout=1.0) + # Should not raise, status can be success or failure + assert result.status in ("success", "partial", "failure") + + +def _pid_alive(pid: int) -> bool: + try: + os.kill(pid, 0) + return True + except OSError: + return False +``` + +**Step 2: Run test to verify it fails** + +Run: `cd /Users/vandang/GitProjects/autoforge-fork && python -m pytest tests/test_process_utils.py -v` +Expected: FAIL — `start_new_session` children survive because `kill_process_tree` doesn't use `os.killpg()` + +**Step 3: Implement process-group kill in `kill_process_tree()`** + +Replace the body of `kill_process_tree()` in `server/utils/process_utils.py` (lines 40-134). + +Add these imports to the top of the file: `import os`, `import signal`, `import time`. + +The new `kill_process_tree()` function uses a two-phase approach: +1. **Phase 1 (process group):** If on Unix and the process is a group leader (`os.getpgid(pid) == pid`), call `os.killpg(pgid, SIGTERM)`, wait up to `timeout`, then `os.killpg(pgid, SIGKILL)`. This atomically kills all processes in the group regardless of reparenting. +2. **Phase 2 (psutil fallback):** Walk the process tree via `psutil.Process(pid).children(recursive=True)` and terminate/kill remaining children. This handles Windows and processes not in the group. + +See the full implementation in the function body below: + +```python +def kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> KillResult: + """Kill a process and all its child processes. + + Uses a two-phase approach for reliable cleanup: + 1. If the process is a process group leader (start_new_session=True on Unix), + kill the entire group via os.killpg(). This is atomic and immune to the + TOCTOU race where children get reparented to PID 1. + 2. Fall back to psutil tree walk for Windows and any stragglers. + + Args: + proc: The subprocess.Popen object to kill + timeout: Seconds to wait for graceful termination before force-killing + + Returns: + KillResult with status and statistics about the termination + """ + result = KillResult(status="success", parent_pid=proc.pid) + + # Phase 1: Process group kill (Unix only, atomic, no TOCTOU race) + if sys.platform != "win32": + try: + pgid = os.getpgid(proc.pid) + if pgid == proc.pid: + logger.debug("Killing process group PGID %d", pgid) + try: + os.killpg(pgid, signal.SIGTERM) + except ProcessLookupError: + pass + + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + try: + os.killpg(pgid, 0) + except ProcessLookupError: + break + time.sleep(0.1) + else: + try: + os.killpg(pgid, signal.SIGKILL) + result.status = "partial" + except ProcessLookupError: + pass + except (ProcessLookupError, OSError) as e: + logger.debug("Process group kill skipped for PID %d: %s", proc.pid, e) + + # Phase 2: psutil tree walk (catches Windows + non-group-leader children) + try: + parent = psutil.Process(proc.pid) + children = parent.children(recursive=True) + result.children_found = len(children) + + for child in children: + try: + child.terminate() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + gone, still_alive = psutil.wait_procs(children, timeout=timeout) + result.children_terminated = len(gone) + + for child in still_alive: + try: + child.kill() + result.children_killed += 1 + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + if result.children_killed > 0: + result.status = "partial" + + proc.terminate() + try: + proc.wait(timeout=timeout) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + result.parent_forcekilled = True + result.status = "partial" + + except (psutil.NoSuchProcess, psutil.AccessDenied): + try: + proc.terminate() + proc.wait(timeout=1) + except (subprocess.TimeoutExpired, OSError): + try: + proc.kill() + except OSError: + result.status = "failure" + + return result +``` + +**Step 4: Run test to verify it passes** + +Run: `cd /Users/vandang/GitProjects/autoforge-fork && python -m pytest tests/test_process_utils.py -v` +Expected: PASS + +**Step 5: Commit** + +```bash +git add server/utils/process_utils.py tests/test_process_utils.py +git commit -m "fix: add process-group kill to eliminate orphan TOCTOU race + +Addresses #164, #197. kill_process_tree() now uses os.killpg() when the +subprocess was started with start_new_session=True, killing the entire +process group atomically before falling back to psutil tree walk." +``` + +--- + +### Task 2: Spawn all subprocesses with `start_new_session=True` + +**Files:** +- Modify: `parallel_orchestrator.py` (4 spawn sites) +- Modify: `server/services/process_manager.py:464-474` +- Modify: `server/services/dev_server_manager.py` (both branches of start()) + +**Step 1: Add `start_new_session=True` to every `subprocess.Popen` call** + +In each of the files above, add to the `popen_kwargs` dict, right after the existing Windows `creationflags` block: + +```python +# On Unix: create new process group so kill_process_tree can use os.killpg() +if sys.platform != "win32": + popen_kwargs["start_new_session"] = True +``` + +There are **6 spawn sites** total. For each, find the pattern: +```python +if sys.platform == "win32": + popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW +``` +And add the Unix branch immediately after. + +The 6 locations: +1. `parallel_orchestrator.py` `_spawn_coding_agent` (~line 862) +2. `parallel_orchestrator.py` `_spawn_coding_agent_batch` (~line 925) +3. `parallel_orchestrator.py` `_spawn_testing_agent_batch` (~line 1030) +4. `parallel_orchestrator.py` `run_initializer` (~line 1091) +5. `server/services/process_manager.py` `AgentProcessManager.start` (~line 472) +6. `server/services/dev_server_manager.py` `DevServerProcessManager.start` (both the Windows and Unix Popen calls) + +**Step 2: Verify no tests break** + +Run: `cd /Users/vandang/GitProjects/autoforge-fork && python -m pytest tests/ -v` +Expected: PASS + +**Step 3: Commit** + +```bash +git add parallel_orchestrator.py server/services/process_manager.py server/services/dev_server_manager.py +git commit -m "fix: spawn all subprocesses with start_new_session=True + +Each subprocess now runs in its own process group, enabling atomic cleanup +via os.killpg(). This prevents children from escaping kill_process_tree() +by getting reparented to PID 1 before the psutil tree walk runs." +``` + +--- + +### Task 3: Fix `cli.js` to kill the process group on shutdown + +**Files:** +- Modify: `lib/cli.js:507-518` (killProcess function) +- Modify: `lib/cli.js:715-728` (uvicorn spawn) + +**Step 1: Update `killProcess()` to kill the process group** + +Replace `killProcess` (lines 507-518) to send signal to negative PID (= process group): + +```javascript +/** Kill a process and its children. Windows: taskkill /t. Unix: kill process group. */ +function killProcess(pid) { + try { + if (IS_WIN) { + execFileSync('taskkill', ['/pid', String(pid), '/t', '/f'], { stdio: 'ignore' }); + } else { + // Kill entire process group (negative PID = PGID) + try { process.kill(-pid, 'SIGTERM'); } catch { /* already dead */ } + // Give a moment for graceful shutdown, then force + try { + execFileSync('kill', ['-0', String(-pid)], { stdio: 'ignore', timeout: 5000 }); + // Still alive, force kill + try { process.kill(-pid, 'SIGKILL'); } catch { /* already dead */ } + } catch { + // Group already gone + } + } + } catch { + // Process/group may already be gone + } +} +``` + +Note: Uses `execFileSync` instead of `execSync` to avoid shell injection (per project conventions). + +**Step 2: Spawn uvicorn with `detached: true` to create process group** + +Update the spawn call (~line 715): + +```javascript + const child = spawn( + pyExe, + [ + '-m', 'uvicorn', + 'server.main:app', + '--host', host, + '--port', String(port), + ], + { + cwd: PKG_DIR, + env: serverEnv, + stdio: 'inherit', + detached: !IS_WIN, // Create new process group on Unix (setsid) + } + ); +``` + +**Step 3: Commit** + +```bash +git add lib/cli.js +git commit -m "fix: cli.js kills entire process group on shutdown + +killProcess() now sends SIGTERM to -pid (the process group), ensuring +all child processes (uvicorn, agents, Chrome, dev servers) are terminated +on Ctrl+C or SIGTERM. Uses execFileSync instead of execSync for safety." +``` + +--- + +### Task 4: Add periodic orphan reaper to the server + +**Files:** +- Create: `server/services/orphan_reaper.py` +- Modify: `server/main.py` (register reaper on startup/shutdown) + +**Step 1: Create `server/services/orphan_reaper.py`** + +This module runs a background asyncio task every 60s that: +1. Iterates all processes via `psutil.process_iter()` +2. Finds any with `ppid == 1` and name in `{chrome, chrome_crashpad, node, esbuild, npm, sh, bash}` +3. Skips processes younger than 30 seconds (grace period) +4. Terminates them (SIGTERM, then SIGKILL after 3s) +5. Only active on Linux (no-op on macOS/Windows) + +See full implementation in the code block above (Task 4 in the first version of this plan). + +**Step 2: Register in `server/main.py`** + +Find the startup event handler and add: +```python +from server.services.orphan_reaper import start_reaper, stop_reaper +``` + +In startup: `start_reaper()` +In shutdown: `stop_reaper()` + +**Step 3: Commit** + +```bash +git add server/services/orphan_reaper.py server/main.py +git commit -m "fix: add periodic orphan reaper as safety net + +Scans every 60s for chrome/node/esbuild processes orphaned under PID 1 +and kills them after a 30s grace period. Only active on Linux containers." +``` + +--- + +### Task 5: Integration verification + +**Step 1: Build and deploy to test cluster** + +**Step 2: Monitor for orphans** + +```bash +kubectl exec -n autoforge -- bash -c ' + count=0 + for f in /proc/[0-9]*/status; do + pid=$(grep "^Pid:" $f 2>/dev/null | awk "{print \$2}") + ppid=$(grep "^PPid:" $f 2>/dev/null | awk "{print \$2}") + if [ "$ppid" = "1" ] && [ "$pid" != "1" ]; then count=$((count+1)); fi + done + echo "$count orphaned processes" +' +``` + +Expected: 0 orphaned processes (or very few within grace period) + +**Step 3: Check memory stability** + +```bash +kubectl exec -n autoforge -- cat /sys/fs/cgroup/memory.current +``` + +Memory should stay stable rather than growing toward the 10Gi limit.