Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
434 changes: 434 additions & 0 deletions docs/plans/2026-03-08-fix-process-orphan-leaks.md

Large diffs are not rendered by default.

14 changes: 10 additions & 4 deletions lib/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -504,16 +504,21 @@ function isHeadless() {
// Process cleanup
// ---------------------------------------------------------------------------

/** Kill a process tree. On Windows uses taskkill; elsewhere sends SIGTERM. */
/** Kill a process and its children. Windows: taskkill /t. Unix: kill process group. */
function killProcess(pid) {
try {
if (IS_WIN) {
execSync(`taskkill /pid ${pid} /t /f`, { stdio: 'ignore' });
execFileSync('taskkill', ['/pid', String(pid), '/t', '/f'], { stdio: 'ignore' });
} else {
process.kill(pid, 'SIGTERM');
// Kill entire process group (negative PID = PGID)
try { process.kill(-pid, 'SIGTERM'); } catch { /* group may be gone */ }
// Brief wait then force kill
setTimeout(() => {
try { process.kill(-pid, 'SIGKILL'); } catch { /* already dead */ }
}, 5000);
}
} catch {
// Process may already be gone
// Process/group may already be gone
}
}

Expand Down Expand Up @@ -724,6 +729,7 @@ function startServer(opts) {
cwd: PKG_DIR,
env: serverEnv,
stdio: 'inherit',
detached: !IS_WIN,
}
);

Expand Down
8 changes: 8 additions & 0 deletions parallel_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,8 @@ def _spawn_coding_agent(self, feature_id: int) -> tuple[bool, str]:
}
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
else:
popen_kwargs["start_new_session"] = True

proc = subprocess.Popen(cmd, **popen_kwargs)
except Exception as e:
Expand Down Expand Up @@ -923,6 +925,8 @@ def _spawn_coding_agent_batch(self, feature_ids: list[int]) -> tuple[bool, str]:
}
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
else:
popen_kwargs["start_new_session"] = True

proc = subprocess.Popen(cmd, **popen_kwargs)
except Exception as e:
Expand Down Expand Up @@ -1028,6 +1032,8 @@ def _spawn_testing_agent(self) -> tuple[bool, str]:
self._testing_session_counter += 1
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
else:
popen_kwargs["start_new_session"] = True

proc = subprocess.Popen(cmd, **popen_kwargs)
except Exception as e:
Expand Down Expand Up @@ -1089,6 +1095,8 @@ async def _run_initializer(self) -> bool:
}
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
else:
popen_kwargs["start_new_session"] = True

proc = subprocess.Popen(cmd, **popen_kwargs)

Expand Down
5 changes: 5 additions & 0 deletions server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
from .services.expand_chat_session import cleanup_all_expand_sessions
from .services.process_manager import cleanup_all_managers, cleanup_orphaned_locks
from .services.scheduler_service import cleanup_scheduler, get_scheduler
from .services.orphan_reaper import start_reaper, stop_reaper
from .services.terminal_manager import cleanup_all_terminals
from .websocket import project_websocket

Expand Down Expand Up @@ -81,6 +82,9 @@ async def lifespan(app: FastAPI):
scheduler = get_scheduler()
await scheduler.start()

# Start orphan process reaper (Linux containers only)
start_reaper()

yield

# Shutdown - cleanup scheduler first to stop triggering new starts
Expand All @@ -91,6 +95,7 @@ async def lifespan(app: FastAPI):
await cleanup_all_expand_sessions()
await cleanup_all_terminals()
await cleanup_all_devservers()
stop_reaper()


# Create FastAPI app
Expand Down
1 change: 1 addition & 0 deletions server/services/dev_server_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ async def start(self, command: str) -> tuple[bool, str]:
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
cwd=str(self.project_dir),
start_new_session=True,
)

self._command = command
Expand Down
138 changes: 138 additions & 0 deletions server/services/orphan_reaper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
"""
Orphan Process Reaper
=====================

Periodic background task that kills orphaned processes (PPid=1) inside the
container that were spawned by AutoForge but survived agent shutdown.

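This is a safety net for the process-group kill mechanism. It runs every
60 seconds and kills any process whose name is listed in ORPHAN_TARGETS
(chrome, chromium, node, esbuild, npm, npx, sh, bash, ...) that:
1. Has PPid == 1 (reparented to init — it is an orphan)
2. Is neither PID 1 itself nor this server process (the reaper's own PID)
3. Was created at least 30 seconds ago (grace period; measured from the
process creation time, not from the moment it was orphaned)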

Only active on Linux (containers). No-op on macOS/Windows.

TODO: The reaper kills live orphans but cannot clean up zombie processes
(state Z). Zombies occur when terminated children are reparented to PID 1
but PID 1 never calls waitpid(). This happens in containerized deployments
where PID 1 is not a proper init (e.g. no tini/dumb-init). Zombies don't
consume memory but accumulate PID table entries. Known solutions:
- Container-side: use tini or dumb-init as PID 1 (ENTRYPOINT ["tini", "--"])
- Code-side: prctl(PR_SET_CHILD_SUBREAPER) to adopt orphans into this
process, then reap via SIGCHLD + os.waitpid(-1, WNOHANG)
See: https://github.com/AutoForgeAI/autoforge/pull/222
"""

import asyncio
import logging
import os
import sys
import time

import psutil

logger = logging.getLogger(__name__)

# Process names that are known AutoForge children and safe to kill when orphaned.
# Names are compared lower-cased against psutil's Process.name(). The Linux kernel
# truncates process names to 15 characters ("chrome_crashpad"), but psutil may
# expand a truncated name to the full executable name when the command line is
# readable, so the crashpad helper is listed in both forms.
ORPHAN_TARGETS = {
    "chrome", "chrome_crashpad",        # Playwright browsers
    "chrome_crashpad_handler",          # same helper, un-truncated name
    "chromium", "chromium_crashpad",
    "node", "esbuild",                  # Dev servers, vitest, vite
    "npm", "npx",                       # Package manager wrappers
    "sh", "bash",                       # Shell wrappers from Bash tool
}

# Minimum age (seconds) before an orphan is eligible for kill.
# Age is computed from the process creation time, not from when it was orphaned.
ORPHAN_GRACE_PERIOD = 30

# How often to run the reaper (seconds); the loop sleeps this long before each
# scan, so it is also the delay before the first scan.
REAP_INTERVAL = 60

# Handle to the background reaper task; None while the reaper is not running.
_reaper_task: asyncio.Task | None = None


def _find_and_kill_orphans() -> dict:
    """Scan for orphaned target processes and kill them.

    A process is reaped when all of the following hold:
    - its parent PID is 1 (it was reparented to init),
    - it is neither PID 1 nor this process,
    - its lower-cased name is in ORPHAN_TARGETS,
    - it was created at least ORPHAN_GRACE_PERIOD seconds ago.

    Each match is asked to terminate first and is force-killed if it has not
    exited within 3 seconds. The call therefore blocks for up to ~3 seconds
    per orphan and should be run off the event loop.

    Returns:
        dict with stats: {killed: int, errors: int, scanned: int}.
        ``killed`` counts orphans that were signalled, ``errors`` counts
        orphans that could not be signalled (permission denied), and
        ``scanned`` counts every process inspected.
    """
    stats = {"killed": 0, "errors": 0, "scanned": 0}
    now = time.time()
    my_pid = os.getpid()

    # proc.info is pre-populated by process_iter(); fields psutil could not
    # read are None, so the lookups below never raise psutil errors.
    for proc in psutil.process_iter(["pid", "ppid", "name", "create_time"]):
        info = proc.info
        stats["scanned"] += 1

        # Skip non-orphans (ppid != 1) and PID 1 / ourselves.
        # NOTE(review): if this server itself runs as PID 1, its direct
        # children also have ppid == 1 and pass this test — confirm the
        # deployment uses a separate init process.
        if info["ppid"] != 1 or info["pid"] in (1, my_pid):
            continue

        # Skip processes not in our target list
        name = (info["name"] or "").lower()
        if name not in ORPHAN_TARGETS:
            continue

        # Skip recently created processes (grace period). An unreadable
        # create_time yields age 0, i.e. the process is left alone.
        age = now - (info["create_time"] or now)
        if age < ORPHAN_GRACE_PERIOD:
            continue

        logger.info(
            "Reaping orphan: PID %d (%s), age %.0fs",
            info["pid"], name, age,
        )
        try:
            proc.terminate()
            try:
                proc.wait(timeout=3)
            except psutil.TimeoutExpired:
                # Did not exit gracefully in time: force kill
                proc.kill()
        except psutil.NoSuchProcess:
            # Exited on its own between the scan and the signal; not our kill
            continue
        except psutil.AccessDenied:
            # Previously swallowed silently; surface it in the stats instead
            stats["errors"] += 1
            logger.debug(
                "Orphan reaper: access denied for PID %d (%s)",
                info["pid"], name,
            )
            continue
        stats["killed"] += 1

    return stats


async def _reaper_loop():
    """Run the orphan scan forever, once every REAP_INTERVAL seconds.

    Each cycle sleeps first, then runs the blocking scan in the event loop's
    default executor so the loop stays responsive. Any exception raised by a
    cycle is logged and the loop carries on; only task cancellation ends it.
    """
    logger.info(
        "Orphan reaper started (interval=%ds, grace=%ds)",
        REAP_INTERVAL, ORPHAN_GRACE_PERIOD,
    )
    while True:
        await asyncio.sleep(REAP_INTERVAL)
        try:
            scan_result = await asyncio.get_running_loop().run_in_executor(
                None, _find_and_kill_orphans
            )
            reaped = scan_result["killed"]
            if reaped > 0:
                logger.info("Orphan reaper: killed %d orphan(s)", reaped)
        except Exception:
            # Top-level boundary of the background task: log and keep going
            logger.warning("Orphan reaper error", exc_info=True)


def start_reaper():
    """Start the orphan reaper background task. Only runs on Linux.

    Must be called while an event loop is running, because the task is
    created with asyncio.create_task(). Calling it again while the reaper
    task is still active is a no-op. A previous task that has already
    finished or been cancelled is replaced, so the reaper can be restarted.
    """
    global _reaper_task
    if sys.platform != "linux":
        logger.debug("Orphan reaper skipped (not Linux)")
        return
    # A finished/cancelled task must not block a restart
    if _reaper_task is not None and not _reaper_task.done():
        return
    _reaper_task = asyncio.create_task(_reaper_loop())
    logger.info("Orphan reaper background task registered")


def stop_reaper():
    """Cancel the reaper task, if one is registered, and forget its handle.

    Cancellation is only requested; this function does not wait for the task
    to finish. Does nothing when no reaper task is registered.
    """
    global _reaper_task
    task = _reaper_task
    if not task:
        return
    task.cancel()
    _reaper_task = None
2 changes: 2 additions & 0 deletions server/services/process_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,8 @@ async def start(
}
if sys.platform == "win32":
popen_kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
else:
popen_kwargs["start_new_session"] = True

self.process = subprocess.Popen(cmd, **popen_kwargs)

Expand Down
87 changes: 43 additions & 44 deletions server/utils/process_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
"""

import logging
import os
import signal
import subprocess
import sys
import time
from dataclasses import dataclass
from typing import Literal

Expand Down Expand Up @@ -40,9 +44,11 @@ class KillResult:
def kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> KillResult:
"""Kill a process and all its child processes.

On Windows, subprocess.terminate() only kills the immediate process, leaving
orphaned child processes (e.g., spawned browser instances, coding/testing agents).
This function uses psutil to kill the entire process tree.
Uses a two-phase approach for reliable cleanup:
1. If the process is a process group leader (start_new_session=True on Unix),
kill the entire group via os.killpg(). This is atomic and immune to the
TOCTOU race where children get reparented to PID 1.
2. Fall back to psutil tree walk for Windows and any stragglers.

Args:
proc: The subprocess.Popen object to kill
Expand All @@ -53,82 +59,75 @@ def kill_process_tree(proc: subprocess.Popen, timeout: float = 5.0) -> KillResul
"""
result = KillResult(status="success", parent_pid=proc.pid)

# Phase 1: Process group kill (Unix only, atomic, no TOCTOU race)
if sys.platform != "win32":
try:
pgid = os.getpgid(proc.pid)
if pgid == proc.pid:
logger.debug("Killing process group PGID %d", pgid)
try:
os.killpg(pgid, signal.SIGTERM)
except ProcessLookupError:
pass

deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
try:
os.killpg(pgid, 0)
except ProcessLookupError:
break
time.sleep(0.1)
else:
try:
os.killpg(pgid, signal.SIGKILL)
result.status = "partial"
except ProcessLookupError:
pass
except (ProcessLookupError, OSError) as e:
logger.debug("Process group kill skipped for PID %d: %s", proc.pid, e)

# Phase 2: psutil tree walk (catches Windows + non-group-leader children)
try:
parent = psutil.Process(proc.pid)
# Get all children recursively before terminating
children = parent.children(recursive=True)
result.children_found = len(children)

logger.debug(
"Killing process tree: PID %d with %d children",
proc.pid, len(children)
)

# Terminate children first (graceful)
for child in children:
try:
logger.debug("Terminating child PID %d (%s)", child.pid, child.name())
child.terminate()
except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
# NoSuchProcess: already dead
# AccessDenied: Windows can raise this for system processes or already-exited processes
logger.debug("Child PID %d already gone or inaccessible: %s", child.pid, e)
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass

# Wait for children to terminate
gone, still_alive = psutil.wait_procs(children, timeout=timeout)
result.children_terminated = len(gone)

logger.debug(
"Children after graceful wait: %d terminated, %d still alive",
len(gone), len(still_alive)
)

# Force kill any remaining children
for child in still_alive:
try:
logger.debug("Force-killing child PID %d", child.pid)
child.kill()
result.children_killed += 1
except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
logger.debug("Child PID %d gone during force-kill: %s", child.pid, e)
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass

if result.children_killed > 0:
result.status = "partial"

# Now terminate the parent
logger.debug("Terminating parent PID %d", proc.pid)
proc.terminate()
try:
proc.wait(timeout=timeout)
logger.debug("Parent PID %d terminated gracefully", proc.pid)
except subprocess.TimeoutExpired:
logger.debug("Parent PID %d did not terminate, force-killing", proc.pid)
proc.kill()
proc.wait()
result.parent_forcekilled = True
result.status = "partial"

logger.debug(
"Process tree kill complete: status=%s, children=%d (terminated=%d, killed=%d)",
result.status, result.children_found,
result.children_terminated, result.children_killed
)

except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
# NoSuchProcess: Process already dead
# AccessDenied: Windows can raise this for protected/system processes
# In either case, just ensure cleanup
logger.debug("Parent PID %d inaccessible (%s), attempting direct cleanup", proc.pid, e)
except (psutil.NoSuchProcess, psutil.AccessDenied):
try:
proc.terminate()
proc.wait(timeout=1)
logger.debug("Direct termination of PID %d succeeded", proc.pid)
except (subprocess.TimeoutExpired, OSError):
try:
proc.kill()
logger.debug("Direct force-kill of PID %d succeeded", proc.pid)
except OSError as kill_error:
logger.debug("Direct force-kill of PID %d failed: %s", proc.pid, kill_error)
except OSError:
result.status = "failure"

return result
Loading