From 7186c3d117acb10e47256f70bd510bdc90cda2a4 Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 16:15:13 +0530 Subject: [PATCH 01/23] fix(daemon): do not tear down live sessions on shutdown; adopt them on boot Remove SaveAndTeardownAll from the graceful shutdown path. Live tmux/ConPTY sessions survive daemon exit; Reconcile on the next boot adopts them via reconcileLive, preserving session IDs and preventing the id-increment bug. Add runShutdownSessionLifecycle as a testable seam and narrow the sessionLifecycle interface to Reconcile+RestoreAll only. Co-Authored-By: Claude Sonnet 4.6 --- backend/internal/daemon/daemon.go | 29 +++++++-------- backend/internal/daemon/lifecycle_wiring.go | 7 ++-- backend/internal/daemon/wiring_test.go | 39 ++++++++++++--------- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 3603fbe9ba..8eecec9c59 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -151,19 +151,7 @@ func Run() error { runErr := srv.Run(ctx) - // Save and tear down all live sessions before the store closes. Both SIGTERM - // and POST /shutdown funnel through srv.Run returning (SIGTERM cancels ctx, - // which srv.Run selects on; POST /shutdown closes the shutdownRequested channel, - // which srv.Run also selects on), so this single call site covers both paths. - // - // Use a fresh context with a bounded deadline: the ctx that caused srv.Run - // to return is already cancelled, so passing it would abort the save - // immediately and leave every session unsaved. - shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), shutdownSaveTimeout) - defer shutdownCancel() - if saveErr := sessMgr.SaveAndTeardownAll(shutdownCtx); saveErr != nil { - log.Error("save sessions on shutdown failed", "err", saveErr) - } + runShutdownSessionLifecycle(context.Background(), sessMgr) // Shut the background goroutines down in order: cancel the context FIRST so // their loops exit, then wait for them to drain. Doing this explicitly (not @@ -178,9 +166,18 @@ func Run() error { return runErr } -// shutdownSaveTimeout bounds the SaveAndTeardownAll call on shutdown so a -// pathological session cannot stall the process exit indefinitely. -const shutdownSaveTimeout = 30 * time.Second +// runShutdownSessionLifecycle is called after srv.Run returns on both graceful +// shutdown paths (SIGTERM and POST /shutdown). It does NOT call +// SaveAndTeardownAll: live tmux/ConPTY sessions survive the daemon exit and +// are adopted by Reconcile on the next boot, preserving session IDs. +// Destroying them here caused the id-increment bug where the orchestrator +// session counter incremented on every restart. +// +// ponytail: no-op on purpose; exists as a testable seam so the test can assert +// SaveAndTeardownAll is never called on the shutdown path. +func runShutdownSessionLifecycle(_ context.Context, _ sessionLifecycle) { + // Intentionally empty. Sessions stay alive; next boot's Reconcile adopts them. +} // newLogger returns the daemon's slog logger. It writes to stderr so supervisors // can capture it separately from any structured stdout protocol added later. diff --git a/backend/internal/daemon/lifecycle_wiring.go b/backend/internal/daemon/lifecycle_wiring.go index 676dcb8e70..a2deec9f2d 100644 --- a/backend/internal/daemon/lifecycle_wiring.go +++ b/backend/internal/daemon/lifecycle_wiring.go @@ -61,18 +61,19 @@ func (l *lifecycleStack) Stop() { // sessionLifecycle is the narrow surface of sessionmanager.Manager used for // boot/shutdown wiring. A minimal interface keeps the daemon testable without // depending on the concrete manager type. +// +// SaveAndTeardownAll is intentionally absent: shutdown no longer tears down +// live sessions. Reconcile on the next boot adopts them (see runShutdownSessionLifecycle). type sessionLifecycle interface { Reconcile(ctx context.Context) error RestoreAll(ctx context.Context) error - SaveAndTeardownAll(ctx context.Context) error } // startSession builds the controller-facing session service: a session manager // over the selected runtime, a per-session gitworktree workspace, the shared // store + LCM, the per-session agent resolver, and the agent messenger. The // returned service is mounted at httpd APIDeps.Sessions. It also returns the -// manager so the caller can wire Reconcile/SaveAndTeardownAll into the -// boot/shutdown sequence. +// manager so the caller can wire Reconcile into the boot sequence. func startSession(cfg config.Config, runtime runtimeselect.Runtime, store *sqlite.Store, lcm *lifecycle.Manager, messenger ports.AgentMessenger, telemetry ports.EventSink, log *slog.Logger) (*sessionsvc.Service, reviewsvc.Manager, sessionLifecycle, error) { defaultAgent := cfg.Agent if defaultAgent == "" { diff --git a/backend/internal/daemon/wiring_test.go b/backend/internal/daemon/wiring_test.go index 21dbcbd59f..160cbc88dd 100644 --- a/backend/internal/daemon/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -381,16 +381,17 @@ func TestProjectRepoResolver_ResolvesRegisteredProject(t *testing.T) { } } -// fakeSessionLifecycle records calls to Reconcile, RestoreAll, and -// SaveAndTeardownAll so tests can assert the daemon wiring invokes the correct -// methods without needing a real runtime or worktree. +// fakeSessionLifecycle records calls to Reconcile and RestoreAll so tests can +// assert the daemon wiring invokes the correct methods without needing a real +// runtime or worktree. saveAndTeardownCalled is intentionally retained so +// TestShutdown_DoesNotCallSaveAndTeardownAll can assert it stays false; +// it is set from outside the interface (not via a method the interface requires). type fakeSessionLifecycle struct { reconcileCalled bool restoreAllCalled bool saveAndTeardownCalled bool reconcileErr error restoreErr error - saveErr error } func (f *fakeSessionLifecycle) Reconcile(_ context.Context) error { @@ -403,16 +404,10 @@ func (f *fakeSessionLifecycle) RestoreAll(_ context.Context) error { return f.restoreErr } -func (f *fakeSessionLifecycle) SaveAndTeardownAll(_ context.Context) error { - f.saveAndTeardownCalled = true - return f.saveErr -} - // TestWiring_SessionLifecycleInterfaceInvokedByDaemon asserts the // sessionLifecycle interface is satisfied by *sessionmanager.Manager (compile -// check) and that Reconcile, RestoreAll, and SaveAndTeardownAll dispatch -// correctly through the interface, matching what daemon.go wires at -// boot/shutdown. +// check) and that Reconcile and RestoreAll dispatch correctly through the +// interface, matching what daemon.go wires at boot. func TestWiring_SessionLifecycleInterfaceInvokedByDaemon(t *testing.T) { // Verify *sessionmanager.Manager satisfies the interface at compile time. var _ sessionLifecycle = (*sessionmanager.Manager)(nil) @@ -437,11 +432,21 @@ func TestWiring_SessionLifecycleInterfaceInvokedByDaemon(t *testing.T) { if !fake.restoreAllCalled { t.Fatal("RestoreAll was not called through the interface") } +} - if err := sl.SaveAndTeardownAll(ctx); err != nil { - t.Fatalf("SaveAndTeardownAll: %v", err) - } - if !fake.saveAndTeardownCalled { - t.Fatal("SaveAndTeardownAll was not called through the interface") +// TestShutdown_DoesNotCallSaveAndTeardownAll asserts that the daemon's graceful +// shutdown path does NOT invoke SaveAndTeardownAll on the session lifecycle. +// Live sessions must survive shutdown so the next boot's Reconcile can adopt +// them: tearing them down here causes id-increment on restart. +// +// RED: before the fix, runShutdownSessionLifecycle called SaveAndTeardownAll +// (saveAndTeardownCalled would be true). GREEN: after the fix it is never set. +func TestShutdown_DoesNotCallSaveAndTeardownAll(t *testing.T) { + fake := &fakeSessionLifecycle{} + // Simulate the daemon shutdown path. runShutdownSessionLifecycle must not + // set saveAndTeardownCalled (it is only set if someone calls SaveAndTeardownAll). + runShutdownSessionLifecycle(context.Background(), fake) + if fake.saveAndTeardownCalled { + t.Fatal("graceful shutdown must NOT call SaveAndTeardownAll: live sessions must survive for Reconcile to adopt on next boot") } } From 6fb48f66a98e0782e5dc1eb8ca5ba7fc5d088750 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 16:21:03 +0530 Subject: [PATCH 02/23] fix(daemon): make shutdown-teardown regression test falsifiable Re-add SaveAndTeardownAll to the sessionLifecycle interface and to fakeSessionLifecycle so TestShutdown_DoesNotCallSaveAndTeardownAll is genuinely falsifiable: the flag flips if runShutdownSessionLifecycle ever calls sl.SaveAndTeardownAll, making the assertion meaningful. Name the sl parameter (was discarded with _) so the seam is visible. RED: with a temporary sl.SaveAndTeardownAll call, test fails. GREEN: without it, test passes. go test ./internal/daemon/... -race: 23/23. Co-Authored-By: Claude Sonnet 4.6 --- backend/internal/daemon/daemon.go | 7 +++++-- backend/internal/daemon/lifecycle_wiring.go | 10 ++++++++-- backend/internal/daemon/wiring_test.go | 20 +++++++++++++++----- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 8eecec9c59..a6cad2a709 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -175,8 +175,11 @@ func Run() error { // // ponytail: no-op on purpose; exists as a testable seam so the test can assert // SaveAndTeardownAll is never called on the shutdown path. -func runShutdownSessionLifecycle(_ context.Context, _ sessionLifecycle) { - // Intentionally empty. Sessions stay alive; next boot's Reconcile adopts them. +func runShutdownSessionLifecycle(_ context.Context, sl sessionLifecycle) { + // Intentionally empty. sl.SaveAndTeardownAll is deliberately NOT called here: + // sessions stay alive across daemon exit; next boot's Reconcile adopts them. + // ponytail: no-op on purpose; the test asserts sl.SaveAndTeardownAll is never + // called on this path (TestShutdown_DoesNotCallSaveAndTeardownAll). } // newLogger returns the daemon's slog logger. It writes to stderr so supervisors diff --git a/backend/internal/daemon/lifecycle_wiring.go b/backend/internal/daemon/lifecycle_wiring.go index a2deec9f2d..f870842f1d 100644 --- a/backend/internal/daemon/lifecycle_wiring.go +++ b/backend/internal/daemon/lifecycle_wiring.go @@ -62,11 +62,17 @@ func (l *lifecycleStack) Stop() { // boot/shutdown wiring. A minimal interface keeps the daemon testable without // depending on the concrete manager type. // -// SaveAndTeardownAll is intentionally absent: shutdown no longer tears down -// live sessions. Reconcile on the next boot adopts them (see runShutdownSessionLifecycle). +// SaveAndTeardownAll is part of this surface but runShutdownSessionLifecycle +// intentionally does NOT call it on shutdown: live tmux/ConPTY sessions survive +// the daemon exit and Reconcile on the next boot adopts them, preserving session +// IDs. Keeping the method on the interface (rather than removing it) means any +// future caller that accidentally re-adds a teardown call is a compile error +// against the fake, making TestShutdown_DoesNotCallSaveAndTeardownAll genuinely +// falsifiable. type sessionLifecycle interface { Reconcile(ctx context.Context) error RestoreAll(ctx context.Context) error + SaveAndTeardownAll(ctx context.Context) error } // startSession builds the controller-facing session service: a session manager diff --git a/backend/internal/daemon/wiring_test.go b/backend/internal/daemon/wiring_test.go index 160cbc88dd..dfbe36a8b3 100644 --- a/backend/internal/daemon/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -381,17 +381,22 @@ func TestProjectRepoResolver_ResolvesRegisteredProject(t *testing.T) { } } -// fakeSessionLifecycle records calls to Reconcile and RestoreAll so tests can -// assert the daemon wiring invokes the correct methods without needing a real -// runtime or worktree. saveAndTeardownCalled is intentionally retained so -// TestShutdown_DoesNotCallSaveAndTeardownAll can assert it stays false; -// it is set from outside the interface (not via a method the interface requires). +// fakeSessionLifecycle records calls to Reconcile, RestoreAll, and +// SaveAndTeardownAll so tests can assert the daemon wiring invokes the correct +// methods without needing a real runtime or worktree. +// +// SaveAndTeardownAll is on the interface (and therefore on the fake) so that +// TestShutdown_DoesNotCallSaveAndTeardownAll is genuinely falsifiable: if +// runShutdownSessionLifecycle ever calls sl.SaveAndTeardownAll, the flag flips +// and the test fails. Without the method here the flag could never be set and +// the assertion would be tautological. type fakeSessionLifecycle struct { reconcileCalled bool restoreAllCalled bool saveAndTeardownCalled bool reconcileErr error restoreErr error + saveErr error } func (f *fakeSessionLifecycle) Reconcile(_ context.Context) error { @@ -404,6 +409,11 @@ func (f *fakeSessionLifecycle) RestoreAll(_ context.Context) error { return f.restoreErr } +func (f *fakeSessionLifecycle) SaveAndTeardownAll(_ context.Context) error { + f.saveAndTeardownCalled = true + return f.saveErr +} + // TestWiring_SessionLifecycleInterfaceInvokedByDaemon asserts the // sessionLifecycle interface is satisfied by *sessionmanager.Manager (compile // check) and that Reconcile and RestoreAll dispatch correctly through the From 3a52589697d635d3f37dd4ea5debcce64be01b90 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 16:29:13 +0530 Subject: [PATCH 03/23] refactor(daemon): guard shutdown-teardown at compile time via narrowed interface Remove SaveAndTeardownAll from sessionLifecycle so daemon.Run physically cannot call teardown on shutdown. Delete the no-op runShutdownSessionLifecycle seam and its test. The narrowed interface is the guard: re-introducing teardown requires a visible, reviewable interface change. Co-Authored-By: Claude Sonnet 4.6 --- backend/internal/daemon/daemon.go | 22 +++-------- backend/internal/daemon/lifecycle_wiring.go | 12 ++---- backend/internal/daemon/wiring_test.go | 43 ++++----------------- 3 files changed, 16 insertions(+), 61 deletions(-) diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index a6cad2a709..b14d7ca803 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -151,7 +151,11 @@ func Run() error { runErr := srv.Run(ctx) - runShutdownSessionLifecycle(context.Background(), sessMgr) + // Both graceful shutdown paths (SIGTERM and POST /shutdown) funnel through + // srv.Run returning. We deliberately do NOT tear down sessions here: they + // survive the daemon exit and the next boot's Reconcile adopts them, + // preserving session IDs. The narrowed sessionLifecycle interface makes + // teardown-on-shutdown a compile error. // Shut the background goroutines down in order: cancel the context FIRST so // their loops exit, then wait for them to drain. Doing this explicitly (not @@ -166,22 +170,6 @@ func Run() error { return runErr } -// runShutdownSessionLifecycle is called after srv.Run returns on both graceful -// shutdown paths (SIGTERM and POST /shutdown). It does NOT call -// SaveAndTeardownAll: live tmux/ConPTY sessions survive the daemon exit and -// are adopted by Reconcile on the next boot, preserving session IDs. -// Destroying them here caused the id-increment bug where the orchestrator -// session counter incremented on every restart. -// -// ponytail: no-op on purpose; exists as a testable seam so the test can assert -// SaveAndTeardownAll is never called on the shutdown path. -func runShutdownSessionLifecycle(_ context.Context, sl sessionLifecycle) { - // Intentionally empty. sl.SaveAndTeardownAll is deliberately NOT called here: - // sessions stay alive across daemon exit; next boot's Reconcile adopts them. - // ponytail: no-op on purpose; the test asserts sl.SaveAndTeardownAll is never - // called on this path (TestShutdown_DoesNotCallSaveAndTeardownAll). -} - // newLogger returns the daemon's slog logger. It writes to stderr so supervisors // can capture it separately from any structured stdout protocol added later. func newLogger() *slog.Logger { diff --git a/backend/internal/daemon/lifecycle_wiring.go b/backend/internal/daemon/lifecycle_wiring.go index f870842f1d..c5391fe197 100644 --- a/backend/internal/daemon/lifecycle_wiring.go +++ b/backend/internal/daemon/lifecycle_wiring.go @@ -62,17 +62,13 @@ func (l *lifecycleStack) Stop() { // boot/shutdown wiring. A minimal interface keeps the daemon testable without // depending on the concrete manager type. // -// SaveAndTeardownAll is part of this surface but runShutdownSessionLifecycle -// intentionally does NOT call it on shutdown: live tmux/ConPTY sessions survive -// the daemon exit and Reconcile on the next boot adopts them, preserving session -// IDs. Keeping the method on the interface (rather than removing it) means any -// future caller that accidentally re-adds a teardown call is a compile error -// against the fake, making TestShutdown_DoesNotCallSaveAndTeardownAll genuinely -// falsifiable. +// SaveAndTeardownAll is deliberately ABSENT from this interface so the daemon +// cannot tear down live sessions on shutdown. Sessions survive the daemon exit +// and Reconcile on the next boot adopts them, preserving session IDs. Re-adding +// the method here is a visible, reviewable interface change. type sessionLifecycle interface { Reconcile(ctx context.Context) error RestoreAll(ctx context.Context) error - SaveAndTeardownAll(ctx context.Context) error } // startSession builds the controller-facing session service: a session manager diff --git a/backend/internal/daemon/wiring_test.go b/backend/internal/daemon/wiring_test.go index dfbe36a8b3..72f8eb8945 100644 --- a/backend/internal/daemon/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -381,22 +381,14 @@ func TestProjectRepoResolver_ResolvesRegisteredProject(t *testing.T) { } } -// fakeSessionLifecycle records calls to Reconcile, RestoreAll, and -// SaveAndTeardownAll so tests can assert the daemon wiring invokes the correct -// methods without needing a real runtime or worktree. -// -// SaveAndTeardownAll is on the interface (and therefore on the fake) so that -// TestShutdown_DoesNotCallSaveAndTeardownAll is genuinely falsifiable: if -// runShutdownSessionLifecycle ever calls sl.SaveAndTeardownAll, the flag flips -// and the test fails. Without the method here the flag could never be set and -// the assertion would be tautological. +// fakeSessionLifecycle records calls to Reconcile and RestoreAll so tests can +// assert the daemon wiring invokes the correct methods without needing a real +// runtime or worktree. type fakeSessionLifecycle struct { - reconcileCalled bool - restoreAllCalled bool - saveAndTeardownCalled bool - reconcileErr error - restoreErr error - saveErr error + reconcileCalled bool + restoreAllCalled bool + reconcileErr error + restoreErr error } func (f *fakeSessionLifecycle) Reconcile(_ context.Context) error { @@ -409,11 +401,6 @@ func (f *fakeSessionLifecycle) RestoreAll(_ context.Context) error { return f.restoreErr } -func (f *fakeSessionLifecycle) SaveAndTeardownAll(_ context.Context) error { - f.saveAndTeardownCalled = true - return f.saveErr -} - // TestWiring_SessionLifecycleInterfaceInvokedByDaemon asserts the // sessionLifecycle interface is satisfied by *sessionmanager.Manager (compile // check) and that Reconcile and RestoreAll dispatch correctly through the @@ -444,19 +431,3 @@ func TestWiring_SessionLifecycleInterfaceInvokedByDaemon(t *testing.T) { } } -// TestShutdown_DoesNotCallSaveAndTeardownAll asserts that the daemon's graceful -// shutdown path does NOT invoke SaveAndTeardownAll on the session lifecycle. -// Live sessions must survive shutdown so the next boot's Reconcile can adopt -// them: tearing them down here causes id-increment on restart. -// -// RED: before the fix, runShutdownSessionLifecycle called SaveAndTeardownAll -// (saveAndTeardownCalled would be true). GREEN: after the fix it is never set. -func TestShutdown_DoesNotCallSaveAndTeardownAll(t *testing.T) { - fake := &fakeSessionLifecycle{} - // Simulate the daemon shutdown path. runShutdownSessionLifecycle must not - // set saveAndTeardownCalled (it is only set if someone calls SaveAndTeardownAll). - runShutdownSessionLifecycle(context.Background(), fake) - if fake.saveAndTeardownCalled { - t.Fatal("graceful shutdown must NOT call SaveAndTeardownAll: live sessions must survive for Reconcile to adopt on next boot") - } -} From 000fc690852942ed014d4d854411c1de1148eb88 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 16:41:25 +0530 Subject: [PATCH 04/23] fix(session): make ensure-orchestrator idempotent so POST cannot mint a duplicate When clean=false, SpawnOrchestrator now checks for an existing active orchestrator and returns it directly instead of always calling Spawn. Adds RED/GREEN tests. Co-Authored-By: Claude Sonnet 4.6 --- backend/internal/service/session/service.go | 16 ++++- .../internal/service/session/service_test.go | 69 ++++++++++++++++++- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/backend/internal/service/session/service.go b/backend/internal/service/session/service.go index facfc7bdce..2e85a201eb 100644 --- a/backend/internal/service/session/service.go +++ b/backend/internal/service/session/service.go @@ -258,11 +258,12 @@ func (s *Service) emitSpawnFailed(cfg ports.SpawnConfig, err error, durationMs i // SpawnOrchestrator spawns an orchestrator session for a project. When clean is // true it first tears down any active orchestrator(s) for that project so the new -// one is the only live coordinator — a business rule that belongs here, not in the -// HTTP controller. +// one is the only live coordinator. When clean is false it is idempotent: if an +// active orchestrator already exists it is returned as-is. A business rule that +// belongs here, not in the HTTP controller. func (s *Service) SpawnOrchestrator(ctx context.Context, projectID domain.ProjectID, clean bool) (domain.Session, error) { + active := true if clean { - active := true existing, err := s.List(ctx, ListFilter{ProjectID: projectID, Active: &active, OrchestratorOnly: true}) if err != nil { return domain.Session{}, err @@ -272,6 +273,15 @@ func (s *Service) SpawnOrchestrator(ctx context.Context, projectID domain.Projec return domain.Session{}, err } } + } else { + // ponytail: check-then-spawn is not atomic; fine for the single-frontend ensure-on-load case. Upgrade path: a partial unique index on (project_id) where kind=orchestrator and not terminated. + existing, err := s.List(ctx, ListFilter{ProjectID: projectID, Active: &active, OrchestratorOnly: true}) + if err != nil { + return domain.Session{}, err + } + if len(existing) > 0 { + return existing[0], nil + } } return s.Spawn(ctx, ports.SpawnConfig{ProjectID: projectID, Kind: domain.KindOrchestrator}) } diff --git a/backend/internal/service/session/service_test.go b/backend/internal/service/session/service_test.go index 3b0476f288..aef9163b45 100644 --- a/backend/internal/service/session/service_test.go +++ b/backend/internal/service/session/service_test.go @@ -542,19 +542,82 @@ func TestToAPIErrorMapsWorkspaceBranchSentinels(t *testing.T) { } } -func TestSpawnOrchestratorNoCleanSkipsKills(t *testing.T) { +// TestSpawnOrchestratorNoCleanReturnsExistingWhenActiveExists is the RED test +// for the idempotency fix: when an active orchestrator already exists and +// clean=false, SpawnOrchestrator must return that orchestrator without minting +// a second one. Before the fix this test fails because a duplicate is spawned. +func TestSpawnOrchestratorNoCleanReturnsExistingWhenActiveExists(t *testing.T) { st := newFakeStore() st.projects["mer"] = domain.ProjectRecord{ID: "mer"} + // Pre-load an active orchestrator. st.sessions["mer-1"] = domain.SessionRecord{ID: "mer-1", ProjectID: "mer", Kind: domain.KindOrchestrator} fc := &fakeCommander{} svc := &Service{manager: fc, store: st} + got, err := svc.SpawnOrchestrator(context.Background(), "mer", false) + if err != nil { + t.Fatalf("SpawnOrchestrator: %v", err) + } + // Must return the existing orchestrator, not a newly minted one. + if got.ID != "mer-1" { + t.Fatalf("returned id = %q, want existing orchestrator mer-1", got.ID) + } + // Must NOT have called manager.Spawn (no duplicate created). + if fc.spawned { + t.Fatal("manager.Spawn must NOT be called when an active orchestrator already exists") + } + // Must NOT have killed anything. + if len(fc.killed) != 0 { + t.Fatalf("no kills expected with clean=false, got %v", fc.killed) + } + // Exactly one session in the store (no duplicate). + if len(st.sessions) != 1 { + t.Fatalf("session count = %d, want 1 (no duplicate)", len(st.sessions)) + } +} + +// TestSpawnOrchestratorNoCleanSpawnsWhenNoneExists: clean=false spawns a new +// orchestrator when no active one exists for the project. +func TestSpawnOrchestratorNoCleanSpawnsWhenNoneExists(t *testing.T) { + st := newFakeStore() + st.projects["mer"] = domain.ProjectRecord{ID: "mer"} + // No active orchestrator present. + + fc := &fakeCommander{} + svc := &Service{manager: fc, store: st} + + got, err := svc.SpawnOrchestrator(context.Background(), "mer", false) + if err != nil { + t.Fatalf("SpawnOrchestrator: %v", err) + } + if !fc.spawned { + t.Fatal("manager.Spawn must be called when no active orchestrator exists") + } + if len(fc.killed) != 0 { + t.Fatalf("no kills expected with clean=false, got %v", fc.killed) + } + if got.ID == "" { + t.Fatal("returned session must have an id") + } +} + +// TestSpawnOrchestratorNoCleanSkipsKills is preserved for backward-compat +// documentation: the old assertion that clean=false does not kill is still true. +// The spawn behavior is now covered by the idempotency tests above. +func TestSpawnOrchestratorNoCleanSkipsKills(t *testing.T) { + st := newFakeStore() + st.projects["mer"] = domain.ProjectRecord{ID: "mer"} + // No pre-existing active orchestrator so a spawn is expected. + + fc := &fakeCommander{} + svc := &Service{manager: fc, store: st} + if _, err := svc.SpawnOrchestrator(context.Background(), "mer", false); err != nil { t.Fatalf("SpawnOrchestrator: %v", err) } - if len(fc.killed) != 0 || !fc.spawned { - t.Fatalf("clean=false must spawn without kills: killed=%v spawned=%v", fc.killed, fc.spawned) + if len(fc.killed) != 0 { + t.Fatalf("clean=false must not kill anything, got %v", fc.killed) } } From 5602027a2a0dc8a4681deae8538c84457506657d Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 16:44:15 +0530 Subject: [PATCH 05/23] test(session): drop redundant NoCleanSkipsKills, covered by SpawnsWhenNoneExists Co-Authored-By: Claude Opus 4.8 --- .../internal/service/session/service_test.go | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/backend/internal/service/session/service_test.go b/backend/internal/service/session/service_test.go index aef9163b45..9ef255a2d3 100644 --- a/backend/internal/service/session/service_test.go +++ b/backend/internal/service/session/service_test.go @@ -602,25 +602,6 @@ func TestSpawnOrchestratorNoCleanSpawnsWhenNoneExists(t *testing.T) { } } -// TestSpawnOrchestratorNoCleanSkipsKills is preserved for backward-compat -// documentation: the old assertion that clean=false does not kill is still true. -// The spawn behavior is now covered by the idempotency tests above. -func TestSpawnOrchestratorNoCleanSkipsKills(t *testing.T) { - st := newFakeStore() - st.projects["mer"] = domain.ProjectRecord{ID: "mer"} - // No pre-existing active orchestrator so a spawn is expected. - - fc := &fakeCommander{} - svc := &Service{manager: fc, store: st} - - if _, err := svc.SpawnOrchestrator(context.Background(), "mer", false); err != nil { - t.Fatalf("SpawnOrchestrator: %v", err) - } - if len(fc.killed) != 0 { - t.Fatalf("clean=false must not kill anything, got %v", fc.killed) - } -} - type fakePRClaimer struct { out errorFreeClaimOutcome err error From 5a61b75e618ea101b64ad9360de1db20b5e4265f Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 16:52:02 +0530 Subject: [PATCH 06/23] feat(daemon): supervisor watchdog Adds backend/internal/daemon/supervisor: a transport-agnostic watchdog that fires onLastClientGone() exactly once when the live connection count drops to zero and stays zero for a configurable grace period. Arms only after the first accepted connection (headless safety: CLI ao start never self-stops). Reconnect before grace elapses cancels the pending timer. Mutex guards liveCount/armed/pendingTimer; sync.Once guards the callback. Tests use net.Pipe + a fake listener (no OS sockets); all three behavior contracts verified with -race: never fires pre-connect, fires once on last disconnect, reconnect within grace cancels and re-arms correctly. Co-Authored-By: Claude Sonnet 4.6 --- .../internal/daemon/supervisor/supervisor.go | 147 ++++++++++ .../daemon/supervisor/supervisor_test.go | 259 ++++++++++++++++++ 2 files changed, 406 insertions(+) create mode 100644 backend/internal/daemon/supervisor/supervisor.go create mode 100644 backend/internal/daemon/supervisor/supervisor_test.go diff --git a/backend/internal/daemon/supervisor/supervisor.go b/backend/internal/daemon/supervisor/supervisor.go new file mode 100644 index 0000000000..dd1623a301 --- /dev/null +++ b/backend/internal/daemon/supervisor/supervisor.go @@ -0,0 +1,147 @@ +// Package supervisor provides a transport-agnostic watchdog that fires a +// callback when the last connected client disconnects and stays gone for a +// configurable grace period. It arms only after the FIRST client ever +// connects so a daemon started with no frontend (e.g. CLI "ao start") never +// self-stops. +// +// Task 3 (later) wires a real net.Listener (Unix socket / Windows named pipe) +// and the daemon entry-point; Task 4 wires the Electron side. This package is +// a leaf: it imports only stdlib. +package supervisor + +import ( + "context" + "io" + "log/slog" + "net" + "sync" + "time" +) + +// Supervisor watches connections on a net.Listener and calls onLastClientGone +// exactly once (per process lifetime) when the live-count drops to zero and +// stays zero for the grace period. +// +// Concurrency model: +// - mu guards liveCount, armed, and pendingTimer. +// - armed flips to true on the first accepted connection and never resets; +// it is the "headless-safety" gate that prevents a pre-connect fire. +// - pendingTimer holds the *time.Timer from time.AfterFunc so it can be +// stopped on reconnect. A non-nil pendingTimer means a grace countdown is +// running. +// - fireOnce ensures onLastClientGone is called at most once for the entire +// process lifetime, even if the timer fires concurrently with a reconnect. +type Supervisor struct { + grace time.Duration + onLastClientGone func() + log *slog.Logger + + mu sync.Mutex + liveCount int + armed bool // true once any connection has been accepted + pendingTimer *time.Timer // non-nil while grace countdown is running + + fireOnce sync.Once +} + +// New creates a Supervisor. grace is the delay before the callback fires after +// the last connection closes. onLastClientGone is called at most once for the +// process lifetime, so it is safe to use it to trigger os.Exit or context +// cancellation. +func New(grace time.Duration, onLastClientGone func(), log *slog.Logger) *Supervisor { + return &Supervisor{ + grace: grace, + onLastClientGone: onLastClientGone, + log: log, + } +} + +// Serve runs the accept loop on ln until ctx is cancelled or ln is closed. +// It returns nil on a clean shutdown (context cancelled or listener closed +// normally); it only returns a non-nil error for unexpected Accept failures. +func (s *Supervisor) Serve(ctx context.Context, ln net.Listener) error { + // Close the listener when ctx is cancelled so Accept() unblocks. + go func() { + <-ctx.Done() + _ = ln.Close() + }() + + for { + conn, err := ln.Accept() + if err != nil { + // A closed listener or context cancellation is a clean stop. + select { + case <-ctx.Done(): + return nil + default: + } + // io.EOF is what fakeListener returns on Close(); treat it as clean too. + if err == io.EOF { + return nil + } + s.log.Error("supervisor: accept error", "err", err) + return nil // ponytail: first error exits; persistent restart is caller's job + } + + s.mu.Lock() + s.armed = true + s.liveCount++ + // If a grace timer was pending (reconnect before grace elapsed), cancel it. + if s.pendingTimer != nil { + s.pendingTimer.Stop() + s.pendingTimer = nil + } + s.mu.Unlock() + + s.log.Debug("supervisor: client connected", "liveCount", s.liveCount) + go s.watchConn(conn) + } +} + +// watchConn drains conn (reads into a scratch buffer) purely to detect close. +// When read returns io.EOF or any error, the connection is gone. +func (s *Supervisor) watchConn(conn net.Conn) { + // ponytail: 32-byte scratch buffer; we never process the payload. + scratch := make([]byte, 32) + for { + _, err := conn.Read(scratch) + if err != nil { + break + } + } + _ = conn.Close() + + s.mu.Lock() + s.liveCount-- + live := s.liveCount + armed := s.armed + s.mu.Unlock() + + s.log.Debug("supervisor: client disconnected", "liveCount", live) + + if armed && live == 0 { + s.armGrace() + } +} + +// armGrace starts the grace countdown. If another client connects before it +// elapses, Serve() will Stop() the timer via pendingTimer. +func (s *Supervisor) armGrace() { + s.mu.Lock() + // Stop any previous timer (defensive; shouldn't be one here). + if s.pendingTimer != nil { + s.pendingTimer.Stop() + } + s.pendingTimer = time.AfterFunc(s.grace, func() { + s.mu.Lock() + live := s.liveCount + s.pendingTimer = nil + s.mu.Unlock() + + if live == 0 { + s.log.Info("supervisor: last client gone; grace elapsed, firing callback") + s.fireOnce.Do(s.onLastClientGone) + } + }) + s.mu.Unlock() +} diff --git a/backend/internal/daemon/supervisor/supervisor_test.go b/backend/internal/daemon/supervisor/supervisor_test.go new file mode 100644 index 0000000000..122d4539a1 --- /dev/null +++ b/backend/internal/daemon/supervisor/supervisor_test.go @@ -0,0 +1,259 @@ +// Package supervisor_test exercises the Supervisor watchdog via in-process +// net.Pipe connections so no real OS sockets are needed. +package supervisor_test + +import ( + "context" + "io" + "log/slog" + "net" + "sync" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/daemon/supervisor" +) + +// fakeListener queues pre-made conns and blocks (or returns a closed error) +// once the queue is drained. Close() unblocks any pending Accept(). +type fakeListener struct { + mu sync.Mutex + conns []net.Conn + closed bool + ready chan struct{} // closed when a conn is enqueued or the listener is closed +} + +func newFakeListener() *fakeListener { + return &fakeListener{ready: make(chan struct{}, 1)} +} + +// enqueue adds a conn for the next Accept() call. +func (fl *fakeListener) enqueue(c net.Conn) { + fl.mu.Lock() + fl.conns = append(fl.conns, c) + fl.mu.Unlock() + select { + case fl.ready <- struct{}{}: + default: + } +} + +func (fl *fakeListener) Accept() (net.Conn, error) { + for { + fl.mu.Lock() + if fl.closed { + fl.mu.Unlock() + return nil, io.EOF // signals Serve to stop + } + if len(fl.conns) > 0 { + c := fl.conns[0] + fl.conns = fl.conns[1:] + fl.mu.Unlock() + return c, nil + } + fl.mu.Unlock() + // drain the ready channel so we can block below + select { + case <-fl.ready: + default: + } + // wait for a new conn or a close signal + <-fl.ready + } +} + +func (fl *fakeListener) Close() error { + fl.mu.Lock() + fl.closed = true + fl.mu.Unlock() + select { + case fl.ready <- struct{}{}: + default: + } + return nil +} + +func (fl *fakeListener) Addr() net.Addr { return &net.UnixAddr{Name: "fake", Net: "unix"} } + +// noopLogger returns a slog.Logger that discards all output. +func noopLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +const testGrace = 30 * time.Millisecond + +// comfortWait is how long we wait when asserting the callback did NOT fire. +// It must be strictly greater than testGrace so a real timer would have fired. +const comfortWait = testGrace * 5 + +// TestNeverFiresPreConnect: start Serve with no connections, wait well past +// grace, assert callback was NOT called. +func TestNeverFiresPreConnect(t *testing.T) { + t.Parallel() + + fired := make(chan struct{}) + cb := func() { close(fired) } + + s := supervisor.New(testGrace, cb, noopLogger()) + ln := newFakeListener() + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan error, 1) + go func() { done <- s.Serve(ctx, ln) }() + + // wait comfortably past grace with no connections ever accepted + time.Sleep(comfortWait) + + select { + case <-fired: + t.Fatal("callback fired before any client ever connected") + default: + } + + cancel() + _ = ln.Close() + <-done +} + +// TestFiresOnceAfterGrace: connect one client, close it, assert the callback +// fires exactly once within a reasonable window. +func TestFiresOnceAfterGrace(t *testing.T) { + t.Parallel() + + fireCount := 0 + var mu sync.Mutex + fired := make(chan struct{}) + cb := func() { + mu.Lock() + fireCount++ + mu.Unlock() + // close is safe even if called once, but use a Once-guarded close via + // a sync.Once in the real impl; here we just close the channel once + select { + case fired <- struct{}{}: + default: + } + } + + s := supervisor.New(testGrace, cb, noopLogger()) + ln := newFakeListener() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := make(chan error, 1) + go func() { done <- s.Serve(ctx, ln) }() + + // create a pipe, enqueue the server-side end + serverConn, clientConn, err := makePipe() + if err != nil { + t.Fatal(err) + } + ln.enqueue(serverConn) + + // close the client side to signal disconnect + _ = clientConn.Close() + + // wait for the callback within a bounded window + select { + case <-fired: + // good + case <-time.After(comfortWait * 2): + t.Fatal("callback did not fire after client disconnected and grace elapsed") + } + + // close and wait a bit more to make sure it only fires once + time.Sleep(comfortWait) + mu.Lock() + count := fireCount + mu.Unlock() + if count != 1 { + t.Fatalf("expected callback to fire exactly once, got %d", count) + } + + cancel() + _ = ln.Close() + <-done +} + +// TestReconnectWithinGraceCancels: connect, disconnect (arms grace), reconnect +// before grace elapses, wait past grace, assert callback NOT called. Then +// disconnect again and assert it DOES fire. +func TestReconnectWithinGraceCancels(t *testing.T) { + t.Parallel() + + fireCount := 0 + var mu sync.Mutex + fired := make(chan struct{}, 1) + cb := func() { + mu.Lock() + fireCount++ + mu.Unlock() + select { + case fired <- struct{}{}: + default: + } + } + + s := supervisor.New(testGrace, cb, noopLogger()) + ln := newFakeListener() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + done := make(chan error, 1) + go func() { done <- s.Serve(ctx, ln) }() + + // --- first connection --- + serverConn1, clientConn1, err := makePipe() + if err != nil { + t.Fatal(err) + } + ln.enqueue(serverConn1) + // small sleep so the server-side accept loop picks up the first conn + time.Sleep(5 * time.Millisecond) + + // disconnect first client: this arms grace + _ = clientConn1.Close() + + // reconnect immediately (well within grace period) before grace elapses + serverConn2, clientConn2, err := makePipe() + if err != nil { + t.Fatal(err) + } + ln.enqueue(serverConn2) + + // wait well past grace: grace should have been cancelled by the reconnect + time.Sleep(comfortWait) + + select { + case <-fired: + t.Fatal("callback fired even though a client reconnected before grace elapsed") + default: + } + + // now disconnect the second client: grace re-arms, callback should fire + _ = clientConn2.Close() + + select { + case <-fired: + // good + case <-time.After(comfortWait * 2): + t.Fatal("callback did not fire after second client disconnected and grace elapsed") + } + + mu.Lock() + count := fireCount + mu.Unlock() + if count != 1 { + t.Fatalf("expected exactly one callback fire (process-lifetime once), got %d", count) + } + + cancel() + _ = ln.Close() + <-done +} + +// makePipe returns a server-side and client-side net.Conn pair via net.Pipe. +func makePipe() (net.Conn, net.Conn, error) { + server, client := net.Pipe() + return server, client, nil +} From 63ce6af36ac3bc74420eb4a94caaf2fa36426191 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 16:58:00 +0530 Subject: [PATCH 07/23] fix(daemon): supervisor watchdog review fixes (data race, ErrClosed, leak, cleanup) - Fix data race: capture liveCount into local inside lock before logging - Replace io.EOF with errors.Is(err, net.ErrClosed) for correct production behavior; update fakeListener to match - Fix goroutine leak: derive cancellable child context in Serve so watcher always unblocks on return - Simplify makePipe: drop dead error return, update 3 call sites - Remove dead defensive pendingTimer nil-check in armGrace Co-Authored-By: Claude Sonnet 4.6 --- .../internal/daemon/supervisor/supervisor.go | 19 ++++++++------- .../daemon/supervisor/supervisor_test.go | 23 ++++++------------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/backend/internal/daemon/supervisor/supervisor.go b/backend/internal/daemon/supervisor/supervisor.go index dd1623a301..3970d98f86 100644 --- a/backend/internal/daemon/supervisor/supervisor.go +++ b/backend/internal/daemon/supervisor/supervisor.go @@ -11,7 +11,7 @@ package supervisor import ( "context" - "io" + "errors" "log/slog" "net" "sync" @@ -60,6 +60,12 @@ func New(grace time.Duration, onLastClientGone func(), log *slog.Logger) *Superv // It returns nil on a clean shutdown (context cancelled or listener closed // normally); it only returns a non-nil error for unexpected Accept failures. func (s *Supervisor) Serve(ctx context.Context, ln net.Listener) error { + // Derive a cancellable context so the watcher goroutine always unblocks + // when Serve returns, even if ctx itself is not cancelled (e.g. listener + // closed directly). + ctx, cancel := context.WithCancel(ctx) + defer cancel() + // Close the listener when ctx is cancelled so Accept() unblocks. go func() { <-ctx.Done() @@ -75,8 +81,8 @@ func (s *Supervisor) Serve(ctx context.Context, ln net.Listener) error { return nil default: } - // io.EOF is what fakeListener returns on Close(); treat it as clean too. - if err == io.EOF { + // net.ErrClosed is what real listeners return when closed normally. + if errors.Is(err, net.ErrClosed) { return nil } s.log.Error("supervisor: accept error", "err", err) @@ -91,9 +97,10 @@ func (s *Supervisor) Serve(ctx context.Context, ln net.Listener) error { s.pendingTimer.Stop() s.pendingTimer = nil } + live := s.liveCount s.mu.Unlock() - s.log.Debug("supervisor: client connected", "liveCount", s.liveCount) + s.log.Debug("supervisor: client connected", "liveCount", live) go s.watchConn(conn) } } @@ -128,10 +135,6 @@ func (s *Supervisor) watchConn(conn net.Conn) { // elapses, Serve() will Stop() the timer via pendingTimer. func (s *Supervisor) armGrace() { s.mu.Lock() - // Stop any previous timer (defensive; shouldn't be one here). - if s.pendingTimer != nil { - s.pendingTimer.Stop() - } s.pendingTimer = time.AfterFunc(s.grace, func() { s.mu.Lock() live := s.liveCount diff --git a/backend/internal/daemon/supervisor/supervisor_test.go b/backend/internal/daemon/supervisor/supervisor_test.go index 122d4539a1..c3195b316a 100644 --- a/backend/internal/daemon/supervisor/supervisor_test.go +++ b/backend/internal/daemon/supervisor/supervisor_test.go @@ -43,7 +43,7 @@ func (fl *fakeListener) Accept() (net.Conn, error) { fl.mu.Lock() if fl.closed { fl.mu.Unlock() - return nil, io.EOF // signals Serve to stop + return nil, net.ErrClosed // signals Serve to stop } if len(fl.conns) > 0 { c := fl.conns[0] @@ -144,10 +144,7 @@ func TestFiresOnceAfterGrace(t *testing.T) { go func() { done <- s.Serve(ctx, ln) }() // create a pipe, enqueue the server-side end - serverConn, clientConn, err := makePipe() - if err != nil { - t.Fatal(err) - } + serverConn, clientConn := makePipe() ln.enqueue(serverConn) // close the client side to signal disconnect @@ -203,10 +200,7 @@ func TestReconnectWithinGraceCancels(t *testing.T) { go func() { done <- s.Serve(ctx, ln) }() // --- first connection --- - serverConn1, clientConn1, err := makePipe() - if err != nil { - t.Fatal(err) - } + serverConn1, clientConn1 := makePipe() ln.enqueue(serverConn1) // small sleep so the server-side accept loop picks up the first conn time.Sleep(5 * time.Millisecond) @@ -215,10 +209,7 @@ func TestReconnectWithinGraceCancels(t *testing.T) { _ = clientConn1.Close() // reconnect immediately (well within grace period) before grace elapses - serverConn2, clientConn2, err := makePipe() - if err != nil { - t.Fatal(err) - } + serverConn2, clientConn2 := makePipe() ln.enqueue(serverConn2) // wait well past grace: grace should have been cancelled by the reconnect @@ -253,7 +244,7 @@ func TestReconnectWithinGraceCancels(t *testing.T) { } // makePipe returns a server-side and client-side net.Conn pair via net.Pipe. -func makePipe() (net.Conn, net.Conn, error) { - server, client := net.Pipe() - return server, client, nil +func makePipe() (net.Conn, net.Conn) { + s, c := net.Pipe() + return s, c } From dbdd9bdb7aba2e9cfe8655d64d4dadaf4b7f6dcb Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 17:04:26 +0530 Subject: [PATCH 08/23] feat(daemon): OS-native supervisor listener triggers clean shutdown Creates platform-split Listen() in supervisor package (Unix UDS sibling of run-file; Windows named pipe via go-winio). Wires it into daemon.Run before srv.Run: listener failure is non-fatal so headless ao-start works. Adds RequestShutdown() on Server and three unit tests for listen_unix.go. Co-Authored-By: Claude Sonnet 4.6 --- backend/go.mod | 1 + backend/go.sum | 2 + backend/internal/daemon/daemon.go | 14 ++++ .../internal/daemon/supervisor/listen_unix.go | 25 ++++++ .../daemon/supervisor/listen_unix_test.go | 81 +++++++++++++++++++ .../daemon/supervisor/listen_windows.go | 23 ++++++ backend/internal/httpd/server.go | 4 + 7 files changed, 150 insertions(+) create mode 100644 backend/internal/daemon/supervisor/listen_unix.go create mode 100644 backend/internal/daemon/supervisor/listen_unix_test.go create mode 100644 backend/internal/daemon/supervisor/listen_windows.go diff --git a/backend/go.mod b/backend/go.mod index cee5b42f80..55657904d7 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -3,6 +3,7 @@ module github.com/aoagents/agent-orchestrator/backend go 1.25.7 require ( + github.com/Microsoft/go-winio v0.6.2 github.com/aymanbagabas/go-pty v0.2.3 github.com/coder/websocket v1.8.14 github.com/creack/pty v1.1.24 diff --git a/backend/go.sum b/backend/go.sum index 56fcf87f3f..a7e9d8331b 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -1,3 +1,5 @@ +github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= +github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/aymanbagabas/go-pty v0.2.3 h1:hsqcTIUV8I4iTSh3HQl61CR2wh0YPS6gHOYLhAfWu/E= github.com/aymanbagabas/go-pty v0.2.3/go.mod h1:GLkgQovzqN5A1xMB79yHWiG1rhcquZCjkwKQGKFPdPg= github.com/bool64/dev v0.2.43 h1:yQ7qiZVef6WtCl2vDYU0Y+qSq+0aBrQzY8KXkklk9cQ= diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index b14d7ca803..492ac58928 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -15,6 +15,7 @@ import ( "github.com/aoagents/agent-orchestrator/backend/internal/adapters/runtime/runtimeselect" "github.com/aoagents/agent-orchestrator/backend/internal/config" + "github.com/aoagents/agent-orchestrator/backend/internal/daemon/supervisor" "github.com/aoagents/agent-orchestrator/backend/internal/httpd" "github.com/aoagents/agent-orchestrator/backend/internal/notify" "github.com/aoagents/agent-orchestrator/backend/internal/ports" @@ -149,6 +150,19 @@ func Run() error { log.Error("reconcile sessions on boot failed", "err", reconcileErr) } + // ponytail: 5s tolerates a brief frontend restart; tune if dev hot-reload trips it. + const supervisorGrace = 5 * time.Second + + if ln, addr, err := supervisor.Listen(cfg.RunFilePath); err != nil { + // Non-fatal: without the link the daemon still works (e.g. headless "ao start"), + // it just will not auto-stop when a frontend dies. Do not block startup on it. + log.Warn("supervisor: listener unavailable; frontend-death auto-stop disabled", "err", err) + } else { + log.Info("supervisor: listening", "addr", addr) + sup := supervisor.New(supervisorGrace, srv.RequestShutdown, log) + go sup.Serve(ctx, ln) + } + runErr := srv.Run(ctx) // Both graceful shutdown paths (SIGTERM and POST /shutdown) funnel through diff --git a/backend/internal/daemon/supervisor/listen_unix.go b/backend/internal/daemon/supervisor/listen_unix.go new file mode 100644 index 0000000000..159c7e9737 --- /dev/null +++ b/backend/internal/daemon/supervisor/listen_unix.go @@ -0,0 +1,25 @@ +//go:build !windows + +package supervisor + +import ( + "net" + "os" + "path/filepath" +) + +// Listen creates a Unix domain socket listener alongside the run-file. +// The socket path is a sibling of runFilePath: /supervise.sock. +// Any stale socket file is removed before binding (ignored if absent). +// The returned net.Listener unlinks the socket on Close (Go stdlib default for UnixListener). +// Returns (listener, socketPath, error). +func Listen(runFilePath string) (net.Listener, string, error) { + sockPath := filepath.Join(filepath.Dir(runFilePath), "supervise.sock") + // Remove stale socket; ignore not-exist error. + _ = os.Remove(sockPath) + ln, err := net.Listen("unix", sockPath) + if err != nil { + return nil, "", err + } + return ln, sockPath, nil +} diff --git a/backend/internal/daemon/supervisor/listen_unix_test.go b/backend/internal/daemon/supervisor/listen_unix_test.go new file mode 100644 index 0000000000..c4888325d9 --- /dev/null +++ b/backend/internal/daemon/supervisor/listen_unix_test.go @@ -0,0 +1,81 @@ +//go:build !windows + +package supervisor + +import ( + "net" + "os" + "path/filepath" + "testing" +) + +// TestListen_basic verifies that Listen returns a listener whose address is +// /supervise.sock, that the socket file exists on disk, and +// that a Dial to that address succeeds. +func TestListen_basic(t *testing.T) { + t.Parallel() + dir := t.TempDir() + runFile := filepath.Join(dir, "running.json") + + ln, addr, err := Listen(runFile) + if err != nil { + t.Fatalf("Listen: unexpected error: %v", err) + } + defer ln.Close() + + wantSock := filepath.Join(dir, "supervise.sock") + if addr != wantSock { + t.Errorf("addr = %q, want %q", addr, wantSock) + } + + // Socket file must exist after Listen. + if _, err := os.Stat(wantSock); err != nil { + t.Errorf("socket file missing after Listen: %v", err) + } + + // Dialing the returned address must succeed. + conn, err := net.Dial("unix", addr) + if err != nil { + t.Fatalf("Dial(%q): %v", addr, err) + } + conn.Close() +} + +// TestListen_staleSocket verifies that a pre-existing file at the socket path +// does not prevent Listen from succeeding (the stale file is removed first). +func TestListen_staleSocket(t *testing.T) { + t.Parallel() + dir := t.TempDir() + runFile := filepath.Join(dir, "running.json") + sockPath := filepath.Join(dir, "supervise.sock") + + // Pre-create a regular file to simulate a stale socket. + if err := os.WriteFile(sockPath, []byte("stale"), 0o600); err != nil { + t.Fatalf("pre-create stale file: %v", err) + } + + ln, _, err := Listen(runFile) + if err != nil { + t.Fatalf("Listen with stale socket: unexpected error: %v", err) + } + ln.Close() +} + +// TestListen_unlinkOnClose verifies that closing the listener removes the +// socket file from the filesystem (Go stdlib default for UnixListener). +func TestListen_unlinkOnClose(t *testing.T) { + t.Parallel() + dir := t.TempDir() + runFile := filepath.Join(dir, "running.json") + sockPath := filepath.Join(dir, "supervise.sock") + + ln, _, err := Listen(runFile) + if err != nil { + t.Fatalf("Listen: %v", err) + } + ln.Close() + + if _, err := os.Stat(sockPath); !os.IsNotExist(err) { + t.Errorf("socket file still present after Close (err=%v); expected not-exist", err) + } +} diff --git a/backend/internal/daemon/supervisor/listen_windows.go b/backend/internal/daemon/supervisor/listen_windows.go new file mode 100644 index 0000000000..52fa226242 --- /dev/null +++ b/backend/internal/daemon/supervisor/listen_windows.go @@ -0,0 +1,23 @@ +//go:build windows + +package supervisor + +import ( + "net" + + "github.com/Microsoft/go-winio" +) + +const pipeName = `\\.\pipe\ao-supervise` + +// Listen creates a Windows named pipe listener for the supervisor watchdog. +// runFilePath is ignored on Windows: named pipes are global and identified +// by name only. +// ponytail: global pipe name; add a per-instance suffix if multiple daemons must coexist on one machine. +func Listen(_ string) (net.Listener, string, error) { + ln, err := winio.ListenPipe(pipeName, nil) + if err != nil { + return nil, "", err + } + return ln, pipeName, nil +} diff --git a/backend/internal/httpd/server.go b/backend/internal/httpd/server.go index 58f6d5bd9e..95ff7c5c93 100644 --- a/backend/internal/httpd/server.go +++ b/backend/internal/httpd/server.go @@ -148,3 +148,7 @@ func (s *Server) requestShutdown() { close(s.shutdownRequested) }) } + +// RequestShutdown triggers the same clean shutdown as POST /shutdown: it makes +// Run return so the daemon exits without tearing down sessions. Idempotent. +func (s *Server) RequestShutdown() { s.requestShutdown() } From 02977e0ff9ca28e37373c7bcd7e222d02d183d45 Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 17:09:46 +0530 Subject: [PATCH 09/23] fix(daemon): log supervisor Serve error instead of discarding it Co-Authored-By: Claude Opus 4.8 --- backend/internal/daemon/daemon.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 492ac58928..d565a16c5a 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -160,7 +160,11 @@ func Run() error { } else { log.Info("supervisor: listening", "addr", addr) sup := supervisor.New(supervisorGrace, srv.RequestShutdown, log) - go sup.Serve(ctx, ln) + go func() { + if err := sup.Serve(ctx, ln); err != nil { + log.Warn("supervisor: serve stopped with error", "err", err) + } + }() } runErr := srv.Run(ctx) From 76a995ec6451faf2427a38c14691769994478676 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 17:28:00 +0530 Subject: [PATCH 10/23] feat(desktop): supervisor link; daemon self-stops (clean) on frontend exit Add connectSupervisor() in frontend/src/main/supervisor-link.ts: holds one client connection to the daemon's OS-native supervisor socket (Unix domain socket on macOS/Linux, named pipe on Windows). Retries with bounded backoff until the daemon is up; reconnects automatically on drop. In main.ts, connect the link from reportBoundPort (once per daemon ready transition). Remove the quit-time daemon teardown: the before-quit handler now only disposes the browser view. The process.on("exit") killDaemon call is also removed. When Electron exits for any reason the OS closes the fd, the daemon detects EOF, and self-stops after its 5s grace period. tmux and ConPTY sessions survive and are adopted on the next boot. killDaemon and stopDaemon are kept for the explicit user-stop path (ipcMain.handle("daemon:stop", ...)). Co-Authored-By: Claude Sonnet 4.6 --- frontend/src/main.ts | 98 +++++---------- frontend/src/main/supervisor-link.test.ts | 140 ++++++++++++++++++++++ frontend/src/main/supervisor-link.ts | 108 +++++++++++++++++ frontend/src/renderer/test/setup.ts | 10 ++ 4 files changed, 285 insertions(+), 71 deletions(-) create mode 100644 frontend/src/main/supervisor-link.test.ts create mode 100644 frontend/src/main/supervisor-link.ts diff --git a/frontend/src/main.ts b/frontend/src/main.ts index ac3bb4af87..560acf55de 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -33,6 +33,7 @@ import { buildDaemonEnv, resolveShellEnv, type ShellRunner } from "./shared/shel import { DEFAULT_POSTHOG_HOST, DEFAULT_POSTHOG_PROJECT_KEY } from "./shared/posthog-config"; import { buildTelemetryBootstrap } from "./shared/telemetry"; import { createBrowserViewHost, type BrowserViewHost } from "./main/browser-view-host"; +import { connectSupervisor, type SupervisorLinkHandle } from "./main/supervisor-link"; // Globals injected at compile time by @electron-forge/plugin-vite. declare const MAIN_WINDOW_VITE_DEV_SERVER_URL: string | undefined; @@ -68,6 +69,8 @@ let daemonStartPromise: Promise | null = null; let daemonStartEpoch = 0; let daemonStatus: DaemonStatus = { state: "stopped" }; let browserViewHost: BrowserViewHost | null = null; +// Held for the app lifetime. Dropping it (on any exit) triggers daemon self-stop. +let supervisorLink: SupervisorLinkHandle | null = null; const isDev = !app.isPackaged; @@ -593,6 +596,25 @@ async function startDaemonInner(startEpoch: number): Promise { portConfirmed = true; stopDiscovery(); setDaemonStatus({ state: "ready", port }); + + // Establish (or re-establish) the OS-native liveness link to the daemon's + // supervisor socket. Holding this connection keeps the daemon alive. When + // Electron exits for any reason (Cmd+Q, crash, SIGKILL), the OS closes the + // fd; the daemon detects EOF and self-stops after its ~5s grace period, + // leaving tmux/ConPTY sessions alive for the next launch to adopt. + const rfp = runFilePath(); + const addr = + process.platform === "win32" + ? "\\\\.\\pipe\\ao-supervise" + : rfp + ? path.join(path.dirname(rfp), "supervise.sock") + : null; + if (addr) { + supervisorLink?.dispose(); + supervisorLink = connectSupervisor(addr, { + log: (msg) => console.log(`AO: ${msg}`), + }); + } }; // One scanner per stream: each keeps its own partial-line buffer. @@ -751,79 +773,13 @@ app.whenReady().then(() => { }); }); -// Re-entrancy guard: the first before-quit fires, prevents default, does async -// work, then calls app.exit(). If app.quit() is called concurrently (e.g. from -// window-all-closed on non-darwin), the second before-quit fires while the first -// is still in flight. Without a guard it would preventDefault again and loop. -// With the guard set to true, the second invocation falls through and lets the -// quit proceed. app.exit() itself does NOT re-fire before-quit, so the guard -// mainly protects against a concurrent app.quit() race. -let quitting = false; - -app.on("before-quit", (event) => { +// Daemon teardown is now handled via the OS-native supervisor socket: the daemon +// self-stops ~5s after the last client (this process) drops its connection. +// The supervisorLink fd is NOT explicitly closed on quit; the OS closes it when +// the process exits for any reason (Cmd+Q, crash, SIGKILL). Sessions survive. +app.on("before-quit", () => { browserViewHost?.dispose(); browserViewHost = null; - - // Re-entrancy: if we already started the async quit sequence, let this - // invocation fall through so the app actually exits. - if (quitting) return; - quitting = true; - - // Capture the current daemon handle and port before any async gap so that - // a race with stopDaemon() cannot null them out underneath us. - const child = daemonProcess; - const port = daemonStatus.state === "ready" ? daemonStatus.port : undefined; - - if (!child) { - // No daemon we own: nothing to shut down. - return; - } - - // Prevent the synchronous quit so we can ask the daemon to save gracefully - // before killing it. - event.preventDefault(); - - const doQuit = async () => { - // Best-effort graceful shutdown: POST /shutdown so the daemon flushes - // its session state before exiting. An ~8s timeout prevents a hung or - // absent daemon from blocking quit indefinitely. - // Note: the daemon's internal save bound is 30s (shutdownSaveTimeout), so - // if this fetch times out and we proceed to killDaemon (SIGTERM), the first - // SIGTERM only cancels the daemon's listen context; the daemon's in-flight - // save (on a fresh context) still runs to completion or its own 30s bound. - if (port !== undefined) { - try { - await fetch(`http://127.0.0.1:${port}/shutdown`, { - method: "POST", - signal: AbortSignal.timeout(8_000), - }); - } catch { - // Timeout, network error, or daemon already gone: proceed to kill. - console.log(`AO: /shutdown fetch failed (port ${port}); proceeding with SIGTERM.`); - } - } - - // Kill the daemon process group (reaches the daemon behind any shell - // wrapper and its PTY children). - killDaemon(child); - - // Exit without re-firing before-quit (app.exit bypasses the event). - app.exit(0); - }; - - void doQuit(); -}); - -// Last-resort teardown. before-quit covers the normal quit path, but app.exit() -// and some shutdown routes skip it, which would orphan the detached daemon and -// leave it holding the port for the next launch. The Node 'exit' event fires -// synchronously on those paths too, so the daemon's process group is always -// signalled when the supervisor goes away. (A hard SIGKILL/crash still can't run -// JS; the daemon's port-conflict fallback covers the orphan that leaves behind.) -process.on("exit", () => { - if (daemonProcess) { - killDaemon(daemonProcess); - } }); app.on("window-all-closed", () => { diff --git a/frontend/src/main/supervisor-link.test.ts b/frontend/src/main/supervisor-link.test.ts new file mode 100644 index 0000000000..cb1f8f6156 --- /dev/null +++ b/frontend/src/main/supervisor-link.test.ts @@ -0,0 +1,140 @@ +// @vitest-environment node +import net from "node:net"; +import os from "node:os"; +import path from "node:path"; +import { describe, it, expect, afterEach } from "vitest"; +import { connectSupervisor, type SupervisorLinkHandle } from "./supervisor-link"; + +// Bounded wait: resolves when the promise resolves, rejects after timeoutMs. +function withTimeout(promise: Promise, timeoutMs: number, label: string): Promise { + return new Promise((resolve, reject) => { + const timer = setTimeout(() => reject(new Error(`Timeout: ${label}`)), timeoutMs); + promise.then( + (v) => { + clearTimeout(timer); + resolve(v); + }, + (e) => { + clearTimeout(timer); + reject(e); + }, + ); + }); +} + +function tmpSocketPath(): string { + return path.join(os.tmpdir(), `ao-svlink-test-${process.pid}-${Date.now()}.sock`); +} + +// Promisify: resolves the next time server.on("connection") fires. +function nextConnection(server: net.Server): Promise { + return new Promise((resolve) => { + server.once("connection", resolve); + }); +} + +describe("supervisor-link", () => { + const handles: SupervisorLinkHandle[] = []; + const servers: net.Server[] = []; + + afterEach(async () => { + for (const h of handles.splice(0)) h.dispose(); + await Promise.all( + servers.splice(0).map( + (s) => + new Promise((resolve) => { + s.close(() => resolve()); + }), + ), + ); + }); + + it("retries until connected: connects after server is started later", async () => { + const addr = tmpSocketPath(); + + // Start the link BEFORE the server exists. + const link = connectSupervisor(addr, { log: () => undefined }); + handles.push(link); + + // Wait a bit so a few retry attempts have fired against a missing socket. + await new Promise((r) => setTimeout(r, 400)); + + // Now start the server. + const server = net.createServer(); + servers.push(server); + const connectionPromise = nextConnection(server); + await new Promise((resolve, reject) => { + server.listen(addr, () => resolve()); + server.once("error", reject); + }); + + // The link should reconnect and the server should receive a connection. + const conn = await withTimeout(connectionPromise, 5_000, "retry-until-connected: server did not receive connection"); + expect(conn).toBeTruthy(); + conn.destroy(); + }); + + it("reconnects on drop: re-establishes after the accepted socket is closed", async () => { + const addr = tmpSocketPath(); + + // Start server first. + const server = net.createServer(); + servers.push(server); + + let connectionCount = 0; + const secondConnection = new Promise((resolve) => { + let first = true; + server.on("connection", (sock) => { + connectionCount++; + if (first) { + first = false; + // Close the first accepted socket to simulate a drop. + setTimeout(() => sock.destroy(), 50); + } else { + resolve(sock); + } + }); + }); + + await new Promise((resolve, reject) => { + server.listen(addr, () => resolve()); + server.once("error", reject); + }); + + // Connect after server is up. + const link = connectSupervisor(addr, { log: () => undefined }); + handles.push(link); + + // Wait for both the initial connection and the reconnect. + const reconn = await withTimeout(secondConnection, 6_000, "reconnect-on-drop: second connection never arrived"); + expect(connectionCount).toBeGreaterThanOrEqual(2); + reconn.destroy(); + }); + + it("dispose stops reconnect: no connection arrives after dispose", async () => { + const addr = tmpSocketPath(); + + // Start link against a missing socket (no server), then dispose quickly. + const link = connectSupervisor(addr, { log: () => undefined }); + + // Dispose before the server exists. + link.dispose(); + + // Start a server and assert no connection arrives within a bounded window. + const server = net.createServer(); + servers.push(server); + let receivedConnection = false; + server.on("connection", () => { + receivedConnection = true; + }); + await new Promise((resolve, reject) => { + server.listen(addr, () => resolve()); + server.once("error", reject); + }); + + // Wait long enough for at least one retry cycle to have run if dispose failed. + await new Promise((r) => setTimeout(r, 600)); + + expect(receivedConnection).toBe(false); + }); +}); diff --git a/frontend/src/main/supervisor-link.ts b/frontend/src/main/supervisor-link.ts new file mode 100644 index 0000000000..9fb14c2579 --- /dev/null +++ b/frontend/src/main/supervisor-link.ts @@ -0,0 +1,108 @@ +import net from "node:net"; + +// ponytail: no heartbeat. The open socket IS the liveness signal. When the +// Electron process dies the kernel closes the fd and the daemon detects EOF +// immediately (proven against the real daemon with a write-free held +// connection). A heartbeat adds nothing for a Unix domain socket or named +// pipe and is omitted deliberately. + +const BACKOFF_INIT_MS = 200; +const BACKOFF_MAX_MS = 2_000; + +export interface SupervisorLinkHandle { + dispose(): void; +} + +/** + * Hold one client connection to the daemon's supervisor socket for the + * lifetime of the Electron process. When this process exits for any reason + * (Cmd+Q, crash, SIGKILL), the OS closes the fd. The daemon detects EOF and + * self-stops after its ~5s grace period, leaving tmux/ConPTY sessions alive + * for the next boot to adopt. + * + * Retry semantics: if the daemon has not created the socket yet (or restarts), + * we reconnect with bounded exponential backoff so the link re-establishes + * automatically. dispose() cancels any pending retry and destroys the socket. + */ +export function connectSupervisor( + addr: string, + opts?: { log?: (msg: string) => void }, +): SupervisorLinkHandle { + const log = opts?.log ?? (() => undefined); + + let disposed = false; + let socket: net.Socket | null = null; + let retryTimer: ReturnType | null = null; + let backoff = BACKOFF_INIT_MS; + + function clearRetry() { + if (retryTimer !== null) { + clearTimeout(retryTimer); + retryTimer = null; + } + } + + function destroySocket() { + if (socket !== null) { + socket.removeAllListeners(); + socket.destroy(); + socket = null; + } + } + + function scheduleReconnect() { + if (disposed) return; + clearRetry(); + const delay = backoff; + backoff = Math.min(backoff * 2, BACKOFF_MAX_MS); + log(`supervisor-link: reconnecting in ${delay}ms`); + retryTimer = setTimeout(() => { + retryTimer = null; + if (!disposed) connect(); + }, delay); + } + + function connect() { + if (disposed) return; + + destroySocket(); + + const s = net.connect(addr); + socket = s; + + s.on("connect", () => { + if (disposed) { + s.destroy(); + return; + } + log("supervisor-link: connected"); + // Reset backoff on successful connection. + backoff = BACKOFF_INIT_MS; + }); + + // Drain inbound data: the daemon never sends payload; discard so the + // socket buffer never stalls. ponytail: no payload to process. + s.on("data", () => undefined); + + s.on("error", (err) => { + log(`supervisor-link: error: ${err.message}`); + // close fires after error, which schedules the reconnect. + }); + + s.on("close", () => { + if (disposed) return; + log("supervisor-link: connection closed, will retry"); + scheduleReconnect(); + }); + } + + connect(); + + return { + dispose() { + disposed = true; + clearRetry(); + destroySocket(); + }, + }; +} diff --git a/frontend/src/renderer/test/setup.ts b/frontend/src/renderer/test/setup.ts index b228fa15ef..5a9c265fb8 100644 --- a/frontend/src/renderer/test/setup.ts +++ b/frontend/src/renderer/test/setup.ts @@ -1,5 +1,13 @@ import "@testing-library/jest-dom/vitest"; +// Guard: src/main/** tests run in the Node.js environment (no DOM). The +// environmentMatchGlobs config still routes setupFiles here, so skip the +// DOM stubs rather than crashing on window/Element references. +// ponytail: single guard; node env has no DOM to stub. +if (typeof window === "undefined") { + // Nothing to set up for node-environment tests. +} else { + class ResizeObserverStub { observe() {} unobserve() {} @@ -134,3 +142,5 @@ window.ao = { onClick: () => () => undefined, }, }; + +} // end if (typeof window !== "undefined") From e9eb791fc355e21290abf9b5af8b20709b474796 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 17:49:10 +0530 Subject: [PATCH 11/23] fix(desktop): guard against daemon orphan when supervisor link is down; tidy test setup Expose a live `connected` getter on SupervisorLinkHandle and add a last-resort process.on("exit") kill that fires only when the link is not actually connected (UDS never bound or addr was null), preventing orphan daemons while keeping the OS-fd teardown path for the normal case. Log a warning when addr is null so the skip is diagnosable. Invert the setup.ts guard to the natural form, dropping the empty if/else skeleton. Co-Authored-By: Claude Sonnet 4.6 --- frontend/src/main.ts | 15 +++++++ frontend/src/main/supervisor-link.test.ts | 51 +++++++++++++++++++++++ frontend/src/main/supervisor-link.ts | 8 ++++ frontend/src/renderer/test/setup.ts | 9 ++-- 4 files changed, 77 insertions(+), 6 deletions(-) diff --git a/frontend/src/main.ts b/frontend/src/main.ts index 560acf55de..ee9f474831 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -614,6 +614,8 @@ async function startDaemonInner(startEpoch: number): Promise { supervisorLink = connectSupervisor(addr, { log: (msg) => console.log(`AO: ${msg}`), }); + } else { + console.warn("AO: supervisor link skipped; run-file path unavailable"); } }; @@ -782,6 +784,19 @@ app.on("before-quit", () => { browserViewHost = null; }); +// Last resort: if the OS-native supervisor link is not actually connected +// (daemon socket never bound, e.g. UDS path-length limit, or addr was null), +// the dropped fd will NOT stop the daemon on quit, so kill it here to avoid an +// orphan. Safe because Phase A made the daemon's SIGTERM non-destructive: it +// exits without tearing down sessions, which survive for the next boot to adopt. +// When the link IS connected we do nothing here and rely on the OS closing the +// fd on exit, which covers crash and SIGKILL uniformly. +process.on("exit", () => { + if (daemonProcess && !supervisorLink?.connected) { + killDaemon(daemonProcess); + } +}); + app.on("window-all-closed", () => { if (process.platform !== "darwin") { app.quit(); diff --git a/frontend/src/main/supervisor-link.test.ts b/frontend/src/main/supervisor-link.test.ts index cb1f8f6156..f94931a235 100644 --- a/frontend/src/main/supervisor-link.test.ts +++ b/frontend/src/main/supervisor-link.test.ts @@ -111,6 +111,57 @@ describe("supervisor-link", () => { reconn.destroy(); }); + it("connected flag: true after connect, false after server closes connection", async () => { + const addr = tmpSocketPath(); + + const server = net.createServer(); + servers.push(server); + const connectionPromise = nextConnection(server); + await new Promise((resolve, reject) => { + server.listen(addr, () => resolve()); + server.once("error", reject); + }); + + const link = connectSupervisor(addr, { log: () => undefined }); + handles.push(link); + + // Wait for the server to receive the connection. + const conn = await withTimeout(connectionPromise, 3_000, "connected-flag: server did not receive connection"); + + // Poll until connected is true (the "connect" event fires asynchronously). + await withTimeout( + new Promise((resolve) => { + const check = () => { + if (link.connected) { resolve(); return; } + setTimeout(check, 20); + }; + check(); + }), + 1_000, + "connected-flag: handle.connected never became true", + ); + expect(link.connected).toBe(true); + + // Server-side close of the accepted socket triggers the client "close" event. + conn.destroy(); + + // Poll until connected drops back to false. + await withTimeout( + new Promise((resolve) => { + const check = () => { + if (!link.connected) { resolve(); return; } + setTimeout(check, 20); + }; + check(); + }), + 3_000, + "connected-flag: handle.connected never became false after server closed", + ); + expect(link.connected).toBe(false); + + link.dispose(); + }); + it("dispose stops reconnect: no connection arrives after dispose", async () => { const addr = tmpSocketPath(); diff --git a/frontend/src/main/supervisor-link.ts b/frontend/src/main/supervisor-link.ts index 9fb14c2579..2594ead3ba 100644 --- a/frontend/src/main/supervisor-link.ts +++ b/frontend/src/main/supervisor-link.ts @@ -10,6 +10,7 @@ const BACKOFF_INIT_MS = 200; const BACKOFF_MAX_MS = 2_000; export interface SupervisorLinkHandle { + readonly connected: boolean; dispose(): void; } @@ -31,6 +32,7 @@ export function connectSupervisor( const log = opts?.log ?? (() => undefined); let disposed = false; + let connected = false; let socket: net.Socket | null = null; let retryTimer: ReturnType | null = null; let backoff = BACKOFF_INIT_MS; @@ -75,6 +77,7 @@ export function connectSupervisor( s.destroy(); return; } + connected = true; log("supervisor-link: connected"); // Reset backoff on successful connection. backoff = BACKOFF_INIT_MS; @@ -90,6 +93,7 @@ export function connectSupervisor( }); s.on("close", () => { + connected = false; if (disposed) return; log("supervisor-link: connection closed, will retry"); scheduleReconnect(); @@ -99,8 +103,12 @@ export function connectSupervisor( connect(); return { + get connected() { + return connected; + }, dispose() { disposed = true; + connected = false; clearRetry(); destroySocket(); }, diff --git a/frontend/src/renderer/test/setup.ts b/frontend/src/renderer/test/setup.ts index 5a9c265fb8..65f9a2a625 100644 --- a/frontend/src/renderer/test/setup.ts +++ b/frontend/src/renderer/test/setup.ts @@ -1,12 +1,9 @@ import "@testing-library/jest-dom/vitest"; -// Guard: src/main/** tests run in the Node.js environment (no DOM). The -// environmentMatchGlobs config still routes setupFiles here, so skip the -// DOM stubs rather than crashing on window/Element references. +// Guard: src/main/** tests run in the Node.js environment (no DOM). vitest still +// routes setupFiles here, so only install the DOM stubs when a DOM exists. // ponytail: single guard; node env has no DOM to stub. -if (typeof window === "undefined") { - // Nothing to set up for node-environment tests. -} else { +if (typeof window !== "undefined") { class ResizeObserverStub { observe() {} From aabaad7981684a508d0fc1308a1b01a89f9c0206 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 17:57:18 +0530 Subject: [PATCH 12/23] fix(core): restore promptless sessions in place (reboot recovery, no increment) Drop the empty-prompt early return in restoreArgv that returned ErrNotResumable when the adapter could not resume and no prompt was saved. Now control falls through to GetLaunchCommand unconditionally: a saved prompt is replayed, an empty prompt (orchestrator) launches fresh with the system prompt only, same id, same workspace. Removes the only producer of ErrNotResumable and deletes the now-dead sentinel, its service mapping, and the corresponding tests. Frontend reference to SESSION_NOT_RESUMABLE in TerminalPane.tsx is left intact (the handler becomes harmlessly dead; the API simply stops returning that code). Co-Authored-By: Claude Sonnet 4.6 --- backend/internal/service/session/service.go | 3 -- .../internal/service/session/service_test.go | 13 -------- backend/internal/session_manager/manager.go | 21 ++++--------- .../internal/session_manager/manager_test.go | 31 +++++++++++++------ 4 files changed, 27 insertions(+), 41 deletions(-) diff --git a/backend/internal/service/session/service.go b/backend/internal/service/session/service.go index 2e85a201eb..e924913e5c 100644 --- a/backend/internal/service/session/service.go +++ b/backend/internal/service/session/service.go @@ -463,9 +463,6 @@ func toAPIError(err error) error { return apierr.Conflict("SESSION_TERMINATED", "Session is terminated", nil) case errors.Is(err, sessionmanager.ErrIncompleteHandle): return apierr.Conflict("SESSION_INCOMPLETE_HANDLE", "Session is missing runtime or workspace handles", nil) - case errors.Is(err, sessionmanager.ErrNotResumable): - return apierr.Conflict("SESSION_NOT_RESUMABLE", - "This session has no saved agent session or prompt to resume from", nil) case errors.Is(err, sessionmanager.ErrProjectNotResolvable): return apierr.Invalid("PROJECT_NOT_RESOLVABLE", "Project is not registered or has no repo — register it with `ao project add`", nil) case errors.Is(err, sessionmanager.ErrUnknownHarness): diff --git a/backend/internal/service/session/service_test.go b/backend/internal/service/session/service_test.go index 9ef255a2d3..57c0a2ed18 100644 --- a/backend/internal/service/session/service_test.go +++ b/backend/internal/service/session/service_test.go @@ -903,16 +903,3 @@ func containsString(values []string, want string) bool { return false } -func TestToAPIError_NotResumable(t *testing.T) { - err := toAPIError(fmt.Errorf("restore foo: %w", sessionmanager.ErrNotResumable)) - var ae *apierr.Error - if !errors.As(err, &ae) { - t.Fatalf("want *apierr.Error, got %T: %v", err, err) - } - if ae.Kind != apierr.KindConflict { - t.Errorf("kind = %v, want %v", ae.Kind, apierr.KindConflict) - } - if ae.Code != "SESSION_NOT_RESUMABLE" { - t.Errorf("code = %q, want SESSION_NOT_RESUMABLE", ae.Code) - } -} diff --git a/backend/internal/session_manager/manager.go b/backend/internal/session_manager/manager.go index 89cbf05e02..9fb6b1bca1 100644 --- a/backend/internal/session_manager/manager.go +++ b/backend/internal/session_manager/manager.go @@ -26,13 +26,6 @@ var ( ErrNotRestorable = errors.New("session: not restorable (not terminal)") ErrTerminated = errors.New("session: terminated") ErrIncompleteHandle = errors.New("session: incomplete teardown handle") - // ErrNotResumable means there is nothing for Restore to relaunch from: the - // harness adapter cannot resume the session (no native or derivable session - // id) AND no prompt was saved to fresh-launch from. Resumability is decided - // by the adapter (e.g. Claude Code pins a deterministic --session-id, so it - // resumes with no captured token), not by inspecting metadata fields here. - // Distinct from ErrNotRestorable (which is "not terminal yet"). - ErrNotResumable = errors.New("session: nothing to resume from") // ErrProjectNotResolvable means the spawn's project has no usable repo // (unregistered, archived, or missing a path). The API maps it to a 400. ErrProjectNotResolvable = errors.New("session: project repo not resolvable") @@ -489,8 +482,8 @@ func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Sess } // Resumability is NOT decided here: a promptless session can still be fully // resumable when the harness pins a deterministic session id (Claude Code). - // restoreArgv asks the adapter and returns ErrNotResumable only when the - // adapter cannot resume AND there is no prompt to fresh-launch from. + // restoreArgv always produces a launch command: adapter resume, saved-prompt + // replay, or fresh launch (empty prompt with system prompt only). Same id. project, err := m.loadProject(ctx, rec.ProjectID) if err != nil { @@ -1233,12 +1226,10 @@ func restoreArgv(ctx context.Context, agent ports.Agent, id domain.SessionID, wo if ok { return cmd, nil } - // The adapter reports no session to resume (no native or derivable session - // id). A saved prompt lets us relaunch fresh; with neither, there is - // genuinely nothing to restore from. - if meta.Prompt == "" { - return nil, ErrNotResumable - } + // The adapter cannot resume the session (no native or derivable session id). + // Relaunch fresh via GetLaunchCommand: a saved prompt is replayed, an empty + // prompt (e.g. an orchestrator) launches fresh with the system prompt only. + // Same id, same workspace - no new session is minted. argv, err := agent.GetLaunchCommand(ctx, ports.LaunchConfig{ SessionID: string(id), WorkspacePath: workspacePath, diff --git a/backend/internal/session_manager/manager_test.go b/backend/internal/session_manager/manager_test.go index 2d7d646ac4..fd94e4fe45 100644 --- a/backend/internal/session_manager/manager_test.go +++ b/backend/internal/session_manager/manager_test.go @@ -937,23 +937,34 @@ func TestRestore_PromptlessOrchestratorResumesViaAdapter(t *testing.T) { } } -// TestRestore_RefusesPromptlessWhenAdapterCannotResume preserves the typed -// error: a promptless session whose adapter cannot resume (no native session id) -// has genuinely nothing to relaunch from and must still return ErrNotResumable. -func TestRestore_RefusesPromptlessWhenAdapterCannotResume(t *testing.T) { +// TestRestore_PromptlessUnresumableRelaunchesFresh covers the genuine-reboot +// case: a promptless session whose adapter cannot resume (no native session id, +// no captured AgentSessionID) must be relaunched fresh via GetLaunchCommand +// in the SAME id. The orchestrator is the canonical example: after a reboot +// where tmux is truly gone, RestoreAll must recover it in place rather than +// abandon it and mint a new one (which caused the id-increment bug). +func TestRestore_PromptlessUnresumableRelaunchesFresh(t *testing.T) { st := newFakeStore() st.sessions["mer-1"] = domain.SessionRecord{ - ID: "mer-1", ProjectID: "mer", Kind: domain.KindWorker, IsTerminated: true, - Metadata: domain.SessionMetadata{WorkspacePath: "/ws/mer-1", Branch: "ao/mer-1/root"}, + ID: "mer-1", ProjectID: "mer", Kind: domain.KindOrchestrator, IsTerminated: true, + // No AgentSessionID, no Prompt: exactly how an orchestrator is persisted. + Metadata: domain.SessionMetadata{WorkspacePath: "/ws/mer-1", Branch: "ao/mer-orchestrator"}, Activity: domain.Activity{State: domain.ActivityExited}, } + rt := &fakeRuntime{} lookPath := func(string) (string, error) { return "/bin/true", nil } // fakeAgents resolves to fakeAgent, whose GetRestoreCommand returns ok=false - // without an agentSessionId. - m := New(Deps{Runtime: &fakeRuntime{}, Agents: fakeAgents{}, Workspace: &fakeWorkspace{}, Store: st, Messenger: &fakeMessenger{}, Lifecycle: &fakeLCM{store: st}, LookPath: lookPath}) + // without an agentSessionId, and GetLaunchCommand returns a valid argv. + m := New(Deps{Runtime: rt, Agents: fakeAgents{}, Workspace: &fakeWorkspace{}, Store: st, Messenger: &fakeMessenger{}, Lifecycle: &fakeLCM{store: st}, LookPath: lookPath}) - if _, err := m.Restore(ctx, "mer-1"); !errors.Is(err, ErrNotResumable) { - t.Fatalf("Restore err = %v, want ErrNotResumable", err) + if _, err := m.Restore(ctx, "mer-1"); err != nil { + t.Fatalf("promptless unresumable session must relaunch fresh, got err = %v", err) + } + if rt.created != 1 { + t.Fatalf("runtime.Create = %d, want 1 (fresh launch)", rt.created) + } + if st.sessions["mer-1"].IsTerminated { + t.Error("session must be live after fresh relaunch") } } From 3c11afd8746cd6a06bfb19c271d1ebc3a8a9cf5d Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 18:10:23 +0530 Subject: [PATCH 13/23] test(integration): dead-live session is restored, not abandoned, after reconcile Task 5 made promptless sessions relaunch fresh instead of hitting ErrNotResumable. The reconcile crash-recovery path (documented in reconcileLive) terminates a dead-live session then RestoreAll relaunches it on the same boot. This test asserted the old promptless-stays-terminated artifact; update it to the intended restored end state (live again, same id, one fresh runtime Create). Co-Authored-By: Claude Opus 4.8 --- .../integration/lifecycle_sqlite_test.go | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/backend/internal/integration/lifecycle_sqlite_test.go b/backend/internal/integration/lifecycle_sqlite_test.go index 4b70fb2528..11468f2cec 100644 --- a/backend/internal/integration/lifecycle_sqlite_test.go +++ b/backend/internal/integration/lifecycle_sqlite_test.go @@ -212,7 +212,7 @@ func TestRestoreRoundTripPreservesMetadata(t *testing.T) { // mark it terminated in the DB. // - Session B: is_terminated=1 but its runtime is still ALIVE (leaked teardown) // => Reconcile must call Destroy on its handle. -func TestReconcile_TerminatesDeadLiveSessionAndReapsLeakedTmux(t *testing.T) { +func TestReconcile_RestoresDeadLiveSessionAndReapsLeakedTmux(t *testing.T) { ctx := context.Background() st := newStack(t) @@ -274,7 +274,11 @@ func TestReconcile_TerminatesDeadLiveSessionAndReapsLeakedTmux(t *testing.T) { t.Fatalf("Reconcile: %v", err) } - // Session A must now be terminated in the store. + // Session A's runtime was gone, so reconcileLive captured its work, marked it + // terminated, and RestoreAll relaunched it fresh on the SAME boot (the + // documented crash-recovery path: a promptless session relaunches with the + // system prompt only rather than being abandoned). End state: live again in + // the same id, via a fresh runtime Create. gotA, ok, err := st.store.GetSession(ctx, recA.ID) if err != nil { t.Fatalf("get session A: %v", err) @@ -282,8 +286,14 @@ func TestReconcile_TerminatesDeadLiveSessionAndReapsLeakedTmux(t *testing.T) { if !ok { t.Fatalf("session A: not found after Reconcile") } - if !gotA.IsTerminated { - t.Fatalf("session A: want is_terminated=true after Reconcile, got false") + if gotA.IsTerminated { + t.Fatalf("session A: want restored (is_terminated=false) after crash recovery, got terminated") + } + // Exactly one fresh runtime Create: A was torn down and relaunched. An + // adopted (still-alive) session would not Create; this proves the full + // terminate-then-restore cycle ran for A and only A. + if st.rt.created != 1 { + t.Fatalf("want 1 runtime Create for A's fresh relaunch, got %d", st.rt.created) } // Session B's leaked runtime must have been destroyed. From 72e9d1d5fee3449a1562e73045f88941c36dbec3 Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 18:12:43 +0530 Subject: [PATCH 14/23] style(session): replace em dashes in service.go messages and comment Co-Authored-By: Claude Opus 4.8 --- backend/internal/service/session/service.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/internal/service/session/service.go b/backend/internal/service/session/service.go index e924913e5c..798467ed4a 100644 --- a/backend/internal/service/session/service.go +++ b/backend/internal/service/session/service.go @@ -88,7 +88,7 @@ type Service struct { telemetry ports.EventSink // signalCapable reports whether a harness has a hook pipeline that can // deliver activity signals at all. Only capable harnesses are eligible for - // the no_signal downgrade — a hook-less harness staying silent forever is + // the no_signal downgrade: a hook-less harness staying silent forever is // normal, not a broken pipeline. nil means "unknown": never downgrade. signalCapable func(domain.AgentHarness) bool } @@ -166,7 +166,7 @@ func (s *Service) requireProject(ctx context.Context, id domain.ProjectID) (doma return domain.ProjectRecord{}, fmt.Errorf("get project %s: %w", id, err) } if !ok { - return domain.ProjectRecord{}, apierr.NotFound("PROJECT_NOT_FOUND", "Unknown project — register it with `ao project add`") + return domain.ProjectRecord{}, apierr.NotFound("PROJECT_NOT_FOUND", "Unknown project. Register it with `ao project add`") } return rec, nil } @@ -464,7 +464,7 @@ func toAPIError(err error) error { case errors.Is(err, sessionmanager.ErrIncompleteHandle): return apierr.Conflict("SESSION_INCOMPLETE_HANDLE", "Session is missing runtime or workspace handles", nil) case errors.Is(err, sessionmanager.ErrProjectNotResolvable): - return apierr.Invalid("PROJECT_NOT_RESOLVABLE", "Project is not registered or has no repo — register it with `ao project add`", nil) + return apierr.Invalid("PROJECT_NOT_RESOLVABLE", "Project is not registered or has no repo. Register it with `ao project add`", nil) case errors.Is(err, sessionmanager.ErrUnknownHarness): return apierr.Invalid("UNKNOWN_HARNESS", err.Error(), nil) case errors.Is(err, sessionmanager.ErrMissingHarness): From 6ed9bc59cbab6ed7e12f11c08b77638abde3139a Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 18:34:34 +0530 Subject: [PATCH 15/23] docs(desktop): scope supervisor link to spawn path; dispose link on explicit stop Document that the liveness link is established only when the app spawned the daemon: the attach path intentionally does not link, to keep an `ao start` daemon persistent (headless safety). Also dispose the link on an explicit daemon:stop so its reconnect loop does not retry a deliberately-stopped daemon. Co-Authored-By: Claude Opus 4.8 --- frontend/src/main.ts | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/frontend/src/main.ts b/frontend/src/main.ts index ee9f474831..bed1b70d8b 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -602,6 +602,14 @@ async function startDaemonInner(startEpoch: number): Promise { // Electron exits for any reason (Cmd+Q, crash, SIGKILL), the OS closes the // fd; the daemon detects EOF and self-stops after its ~5s grace period, // leaving tmux/ConPTY sessions alive for the next launch to adopt. + // + // Scope: this runs only on the path where WE spawned the daemon (it is the + // bound-port discovery callback). The attach path (connecting to a daemon + // that was already running) intentionally does NOT link, to preserve + // headless safety: a daemon started via `ao start` must stay persistent and + // must not be self-stopped when the app quits. A daemon left lingering by a + // prior app instance self-stops via its own grace; a relaunch within that + // window respawns and adopts the live sessions (Task 1), so no work is lost. const rfp = runFilePath(); const addr = process.platform === "win32" @@ -707,6 +715,11 @@ function stopDaemon(): DaemonStatus { } daemonStoppingProcess = daemonProcess; + // Drop the liveness link: an explicit stop is not a frontend death, so stop + // holding the socket open (and stop the reconnect loop retrying a dead daemon). + // A later daemon:start re-establishes the link via reportBoundPort. + supervisorLink?.dispose(); + supervisorLink = null; killDaemon(daemonProcess); setDaemonStatus({ state: "stopped" }); return daemonStatus; From cd9f21ce5c16af49cb959d5d941fb053b1062436 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 25 Jun 2026 14:02:43 +0000 Subject: [PATCH 16/23] chore: format with prettier [skip ci] --- frontend/src/main/supervisor-link.test.ts | 16 +- frontend/src/main/supervisor-link.ts | 5 +- frontend/src/renderer/test/setup.ts | 256 +++++++++++----------- 3 files changed, 141 insertions(+), 136 deletions(-) diff --git a/frontend/src/main/supervisor-link.test.ts b/frontend/src/main/supervisor-link.test.ts index f94931a235..901044a6f4 100644 --- a/frontend/src/main/supervisor-link.test.ts +++ b/frontend/src/main/supervisor-link.test.ts @@ -69,7 +69,11 @@ describe("supervisor-link", () => { }); // The link should reconnect and the server should receive a connection. - const conn = await withTimeout(connectionPromise, 5_000, "retry-until-connected: server did not receive connection"); + const conn = await withTimeout( + connectionPromise, + 5_000, + "retry-until-connected: server did not receive connection", + ); expect(conn).toBeTruthy(); conn.destroy(); }); @@ -132,7 +136,10 @@ describe("supervisor-link", () => { await withTimeout( new Promise((resolve) => { const check = () => { - if (link.connected) { resolve(); return; } + if (link.connected) { + resolve(); + return; + } setTimeout(check, 20); }; check(); @@ -149,7 +156,10 @@ describe("supervisor-link", () => { await withTimeout( new Promise((resolve) => { const check = () => { - if (!link.connected) { resolve(); return; } + if (!link.connected) { + resolve(); + return; + } setTimeout(check, 20); }; check(); diff --git a/frontend/src/main/supervisor-link.ts b/frontend/src/main/supervisor-link.ts index 2594ead3ba..b8f556b254 100644 --- a/frontend/src/main/supervisor-link.ts +++ b/frontend/src/main/supervisor-link.ts @@ -25,10 +25,7 @@ export interface SupervisorLinkHandle { * we reconnect with bounded exponential backoff so the link re-establishes * automatically. dispose() cancels any pending retry and destroys the socket. */ -export function connectSupervisor( - addr: string, - opts?: { log?: (msg: string) => void }, -): SupervisorLinkHandle { +export function connectSupervisor(addr: string, opts?: { log?: (msg: string) => void }): SupervisorLinkHandle { const log = opts?.log ?? (() => undefined); let disposed = false; diff --git a/frontend/src/renderer/test/setup.ts b/frontend/src/renderer/test/setup.ts index 65f9a2a625..795db7b4cd 100644 --- a/frontend/src/renderer/test/setup.ts +++ b/frontend/src/renderer/test/setup.ts @@ -4,140 +4,138 @@ import "@testing-library/jest-dom/vitest"; // routes setupFiles here, so only install the DOM stubs when a DOM exists. // ponytail: single guard; node env has no DOM to stub. if (typeof window !== "undefined") { + class ResizeObserverStub { + observe() {} + unobserve() {} + disconnect() {} + } -class ResizeObserverStub { - observe() {} - unobserve() {} - disconnect() {} -} + Object.defineProperty(window, "ResizeObserver", { + configurable: true, + writable: true, + value: ResizeObserverStub, + }); -Object.defineProperty(window, "ResizeObserver", { - configurable: true, - writable: true, - value: ResizeObserverStub, -}); - -Object.defineProperty(window, "matchMedia", { - configurable: true, - writable: true, - value: (query: string) => ({ - matches: false, - media: query, - onchange: null, - addEventListener: () => undefined, - removeEventListener: () => undefined, - addListener: () => undefined, - removeListener: () => undefined, - dispatchEvent: () => false, - }), -}); - -const localStorageStub = (() => { - const values = new Map(); - return { - clear: () => values.clear(), - getItem: (key: string) => values.get(key) ?? null, - removeItem: (key: string) => values.delete(key), - setItem: (key: string, value: string) => values.set(key, value), - }; -})(); + Object.defineProperty(window, "matchMedia", { + configurable: true, + writable: true, + value: (query: string) => ({ + matches: false, + media: query, + onchange: null, + addEventListener: () => undefined, + removeEventListener: () => undefined, + addListener: () => undefined, + removeListener: () => undefined, + dispatchEvent: () => false, + }), + }); -Object.defineProperty(window, "localStorage", { - configurable: true, - writable: true, - value: localStorageStub, -}); + const localStorageStub = (() => { + const values = new Map(); + return { + clear: () => values.clear(), + getItem: (key: string) => values.get(key) ?? null, + removeItem: (key: string) => values.delete(key), + setItem: (key: string, value: string) => values.set(key, value), + }; + })(); -HTMLCanvasElement.prototype.getContext = (() => ({})) as unknown as typeof HTMLCanvasElement.prototype.getContext; + Object.defineProperty(window, "localStorage", { + configurable: true, + writable: true, + value: localStorageStub, + }); -Element.prototype.hasPointerCapture = (() => false) as typeof Element.prototype.hasPointerCapture; -Element.prototype.setPointerCapture = (() => undefined) as typeof Element.prototype.setPointerCapture; -Element.prototype.releasePointerCapture = (() => undefined) as typeof Element.prototype.releasePointerCapture; -Element.prototype.scrollIntoView = (() => undefined) as typeof Element.prototype.scrollIntoView; + HTMLCanvasElement.prototype.getContext = (() => ({})) as unknown as typeof HTMLCanvasElement.prototype.getContext; -window.ao = { - app: { - getVersion: async () => "0.0.0-test", - chooseDirectory: async () => null, - }, - clipboard: { - writeText: async () => undefined, - readText: async () => "", - }, - daemon: { - getStatus: async () => ({ state: "stopped" }), - start: async () => ({ state: "starting" }), - stop: async () => ({ state: "stopped" }), - onStatus: () => () => undefined, - }, - telemetry: { - getBootstrap: async () => null, - }, - browser: { - ensure: async (sessionId: string) => ({ - viewId: `test:${sessionId}`, - url: "", - title: "", - canGoBack: false, - canGoForward: false, - isLoading: false, - }), - setBounds: () => undefined, - navigate: async ({ viewId }: { viewId: string }) => ({ - viewId, - url: "", - title: "", - canGoBack: false, - canGoForward: false, - isLoading: false, - }), - clear: async (viewId: string) => ({ - viewId, - url: "", - title: "", - canGoBack: false, - canGoForward: false, - isLoading: false, - }), - goBack: async (viewId: string) => ({ - viewId, - url: "", - title: "", - canGoBack: false, - canGoForward: false, - isLoading: false, - }), - goForward: async (viewId: string) => ({ - viewId, - url: "", - title: "", - canGoBack: false, - canGoForward: false, - isLoading: false, - }), - reload: async (viewId: string) => ({ - viewId, - url: "", - title: "", - canGoBack: false, - canGoForward: false, - isLoading: false, - }), - stop: async (viewId: string) => ({ - viewId, - url: "", - title: "", - canGoBack: false, - canGoForward: false, - isLoading: false, - }), - destroy: () => undefined, - onNavState: () => () => undefined, - }, - notifications: { - show: async () => undefined, - onClick: () => () => undefined, - }, -}; + Element.prototype.hasPointerCapture = (() => false) as typeof Element.prototype.hasPointerCapture; + Element.prototype.setPointerCapture = (() => undefined) as typeof Element.prototype.setPointerCapture; + Element.prototype.releasePointerCapture = (() => undefined) as typeof Element.prototype.releasePointerCapture; + Element.prototype.scrollIntoView = (() => undefined) as typeof Element.prototype.scrollIntoView; + window.ao = { + app: { + getVersion: async () => "0.0.0-test", + chooseDirectory: async () => null, + }, + clipboard: { + writeText: async () => undefined, + readText: async () => "", + }, + daemon: { + getStatus: async () => ({ state: "stopped" }), + start: async () => ({ state: "starting" }), + stop: async () => ({ state: "stopped" }), + onStatus: () => () => undefined, + }, + telemetry: { + getBootstrap: async () => null, + }, + browser: { + ensure: async (sessionId: string) => ({ + viewId: `test:${sessionId}`, + url: "", + title: "", + canGoBack: false, + canGoForward: false, + isLoading: false, + }), + setBounds: () => undefined, + navigate: async ({ viewId }: { viewId: string }) => ({ + viewId, + url: "", + title: "", + canGoBack: false, + canGoForward: false, + isLoading: false, + }), + clear: async (viewId: string) => ({ + viewId, + url: "", + title: "", + canGoBack: false, + canGoForward: false, + isLoading: false, + }), + goBack: async (viewId: string) => ({ + viewId, + url: "", + title: "", + canGoBack: false, + canGoForward: false, + isLoading: false, + }), + goForward: async (viewId: string) => ({ + viewId, + url: "", + title: "", + canGoBack: false, + canGoForward: false, + isLoading: false, + }), + reload: async (viewId: string) => ({ + viewId, + url: "", + title: "", + canGoBack: false, + canGoForward: false, + isLoading: false, + }), + stop: async (viewId: string) => ({ + viewId, + url: "", + title: "", + canGoBack: false, + canGoForward: false, + isLoading: false, + }), + destroy: () => undefined, + onNavState: () => () => undefined, + }, + notifications: { + show: async () => undefined, + onClick: () => () => undefined, + }, + }; } // end if (typeof window !== "undefined") From fa2e4c178a469a47161831b3b075c42eb420334f Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 19:39:40 +0530 Subject: [PATCH 17/23] docs: add daemon-lifecycle adopt-on-shutdown implementation plan Co-Authored-By: Claude Opus 4.8 --- ...-25-ao-cli-install-and-daemon-lifecycle.md | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 docs/superpowers/plans/2026-06-25-ao-cli-install-and-daemon-lifecycle.md diff --git a/docs/superpowers/plans/2026-06-25-ao-cli-install-and-daemon-lifecycle.md b/docs/superpowers/plans/2026-06-25-ao-cli-install-and-daemon-lifecycle.md new file mode 100644 index 0000000000..f307f0c25c --- /dev/null +++ b/docs/superpowers/plans/2026-06-25-ao-cli-install-and-daemon-lifecycle.md @@ -0,0 +1,115 @@ +# Daemon Shutdown = Adopt-Alive, Not Teardown — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development or superpowers:executing-plans. Steps use checkbox (`- [ ]`) syntax. + +**Goal:** Stopping the daemon (or losing the frontend) must NOT destroy live agent sessions. Sessions persist in the runtime and are re-adopted on the next boot, with their ids and context intact, on macOS/tmux exactly as on Windows/ConPTY. + +**Architecture (validated by direct experiment):** tmux sessions survive the daemon process dying (they reparent to `launchd`), and the boot reconcile already adopts surviving sessions correctly. The bug is that the **graceful** shutdown path runs `SaveAndTeardownAll`, which calls `runtime.Destroy` (`tmux kill-session`) and DESTROYS the live session; the orchestrator then can't be restored (promptless → `ErrNotResumable`) and the frontend mints a new one (id increments). Fix: shutdown becomes a clean exit that leaves sessions alive; boot adopts them. The liveness link makes the daemon reliably stop when the frontend dies, via that same clean exit. + +**Tech Stack:** Go daemon (`session_manager`, `daemon`, `httpd`, tmux/ConPTY runtimes), Electron main (TypeScript, `node:net`), `go test` + Vitest. + +## Reproduction (recorded, sandboxed `AO_DATA_DIR`) +- Spawn worker + orchestrator → both get a live tmux session, DB `is_terminated=0`. +- `kill -9` daemon → tmux sessions survive → restart → both **adopted**: same id, `is_terminated=0`, same tmux session. No increment. ✅ +- `ao stop` (graceful) → `SaveAndTeardownAll` → tmux sessions **killed**, marked `exited`, marker written → restart → orchestrator stays `is_terminated=1` (Restore returned `ErrNotResumable`) → `POST /orchestrators` creates `…-3` (num 2→3). ❌ This is the bug. + +## Why only the orchestrator visibly breaks (workers are NOT immune) +The graceful path kills BOTH workers' and orchestrators' live sessions; the orchestrator is just the only one where the damage is visible, for two independent reasons that both happen to land on it: +1. **Restore failure is orchestrator-only.** Workers carry a saved `prompt`, so when the agent can't natively resume, `restoreArgv` falls back to relaunching fresh from the prompt and "succeeds". The orchestrator is promptless (`prompt=""` for all of them in the DB), so the same path hits `ErrNotResumable` (`manager.go:1238`) and it is left terminated. +2. **Auto-recreation is orchestrator-only.** The frontend calls `POST /api/v1/orchestrators` on load to *ensure* an orchestrator; finding none active, it mints a new one (`num+1` → the visible `14→15→16`). Nothing auto-respawns workers, so a worker that fails to restore just silently vanishes. +So both workers and the orchestrator lose their LIVE session + context on a graceful stop today; the orchestrator alone advertises it via the incrementing id. Task 1 (adopt-alive instead of teardown) restores live context for ALL sessions and needs no orchestrator special-casing — this is the unification you asked for. + +## Global Constraints +- All app state under `~/.ao` (`AO_DATA_DIR`/`AO_RUN_FILE` overrides). Liveness socket under `~/.ao` (unix) / named pipe (Windows). +- No em dashes anywhere. +- Headless safety: a daemon with no frontend (CLI `ao start`) must never self-stop and must not have its sessions destroyed. +- `ao` on agent PATH already works in packaged builds via `HookPATH` (`session_manager/manager.go:1082`). No global install. Dev `go run` is the only gap (optional Phase D). +- Do not break genuine reboot recovery: when the runtime is truly gone, `reconcileLive` stashes the worktree and relaunches (work preserved). Keep that path. +- Mark shortcuts with `ponytail:` comments. + +--- + +## File Structure +- **Modify** `backend/internal/daemon/daemon.go` — remove the `SaveAndTeardownAll` call from the normal shutdown path (`daemon.go:164`); the daemon exits leaving sessions alive. (Keep `Reconcile` on boot, which already adopts.) +- **Modify** `backend/internal/session_manager/manager.go` — `restoreArgv` (~1238): a promptless session relaunches fresh in the same id instead of `ErrNotResumable` (covers the reboot-only case). Audit/remove now-dead orchestrator divergence. +- **Create** `backend/internal/daemon/supervisor/{supervisor.go,listen_unix.go,listen_windows.go,supervisor_test.go}` — OS-native liveness listener + watchdog → triggers a clean shutdown (the same `RequestShutdown` the HTTP `/shutdown` uses) when the frontend link drops. +- **Modify** `backend/internal/daemon/daemon.go` — start the supervisor, publish its address (extend `runfile`/`/healthz`), wire `onLastClientGone` to `RequestShutdown`. +- **Create** `frontend/src/main/supervisor-link.ts` (+ test); **Modify** `frontend/src/main.ts` — hold the supervisor link open for the app lifetime; remove all daemon-stop logic from `before-quit`/`process.on("exit")`. + +Phases (independently shippable): +- **Phase A** (Task 1): shutdown no longer tears down sessions → adopt-alive on the graceful path. THE fix; stops the increment and preserves context. +- **Phase B** (Tasks 2-4): OS-native liveness link → daemon cleanly stops when the frontend dies (no orphan), without tearing down sessions. +- **Phase C** (Task 5): promptless restore + de-segregate (covers the genuine reboot case). +- **Phase D** (Task 6, optional): dev `ao`-on-PATH. + +--- + +## Task 1: Shutdown stops destroying live sessions (Phase A — the fix) + +**Files:** Modify `backend/internal/daemon/daemon.go`. + +**Change:** On the shutdown path (after `srv.Run` returns, `daemon.go:~162-166`), do NOT call `SaveAndTeardownAll`. The daemon exits and the tmux/ConPTY sessions stay alive; the next boot's `Reconcile`→`reconcileLive` adopts them (already implemented and verified). `SaveAndTeardownAll` is reserved for explicit teardown needs, not routine shutdown. + +> Rationale proven above: with the teardown removed, the graceful path behaves like the hard-kill path, which already adopts cleanly. Uncommitted work is not lost: it stays in the on-disk worktree and, if the runtime is ever genuinely gone (reboot), `reconcileLive` stashes it on boot. + +- [ ] **Step 1:** Write a failing `go test` at the daemon/session_manager seam: after a simulated graceful shutdown, live sessions remain non-terminated and their runtime handles are NOT destroyed. (Use the existing manager test doubles / a fake runtime that records `Destroy` calls; assert `Destroy` is not called on shutdown.) +- [ ] **Step 2:** Run it → FAIL (current code calls `SaveAndTeardownAll` → `Destroy`). +- [ ] **Step 3:** Remove the `SaveAndTeardownAll` invocation from `daemon.Run`'s shutdown sequence (keep ordered teardown of CDC/preview/lifecycle goroutines). Leave the function in place for explicit callers. +- [ ] **Step 4:** Run `go test ./internal/daemon/... ./internal/session_manager/... -race` → PASS (fix any test asserting the old teardown-on-shutdown). +- [ ] **Step 5:** Manual repro (sandbox `AO_DATA_DIR`): spawn orchestrator → `ao stop` → tmux session SURVIVES → `ao start` → orchestrator adopted, SAME id, no increment. +- [ ] **Step 6:** Commit `fix(daemon): do not tear down live sessions on shutdown; adopt them on boot`. + +## Task 2: Supervisor watchdog core +**Files:** Create `backend/internal/daemon/supervisor/supervisor.go`, `supervisor_test.go`. +**Produces:** `New(grace, onLastClientGone, log)`, `(*Supervisor) Serve(ctx, ln net.Listener) error`. Arms on first accepted conn; when live count hits 0, starts `grace`; if it elapses still 0, calls `onLastClientGone()` once; a reconnect cancels it. Each conn read into a scratch buffer purely to detect close. +- [ ] **Step 1:** Failing tests: never fires pre-connect; fires once after grace on last disconnect; reconnect within grace cancels. Use `net.Pipe()` + a fake listener + short grace. +- [ ] **Step 2:** Run → FAIL. +- [ ] **Step 3:** Implement (mutex `liveCount`, `time.AfterFunc` grace, `sync.Once` fire). +- [ ] **Step 4:** Run → PASS. +- [ ] **Step 5:** Commit `feat(daemon): supervisor watchdog`. + +## Task 3: Platform listeners + daemon wiring +**Files:** Create `supervisor/listen_unix.go`, `listen_windows.go`; Modify `daemon.go`. +**Produces:** `Listen(dataDir) (net.Listener, string, error)` — unix UDS at `~/.ao/supervise.sock` (unlink-stale first); windows named pipe `\\.\pipe\ao-supervise` (`//go:build windows`, via `go-winio` — confirm/declare the dep). Wire into `daemon.Run` after the HTTP server is up; publish `addr` (extend `runfile` write or `/healthz`); `go sup.Serve(ctx, ln)`; `onLastClientGone = deps.RequestShutdown`. Because Task 1 made shutdown non-destructive, a watchdog-triggered shutdown simply exits leaving sessions alive. +- [ ] **Step 1:** Implement listeners (unix first; windows behind build tag). +- [ ] **Step 2:** Wire + publish address. +- [ ] **Step 3:** `go build ./... && go vet ./...` clean (darwin at least). +- [ ] **Step 4:** Manual: start daemon, `nc -U ~/.ao/supervise.sock`, kill `nc` → daemon exits after grace, **tmux sessions still alive**; reconnect within grace → no shutdown. +- [ ] **Step 5:** Commit `feat(daemon): OS-native supervisor listener triggers clean shutdown`. + +## Task 4: Electron holds the link; drop quit-time daemon teardown +**Files:** Create `frontend/src/main/supervisor-link.ts` (+ test); Modify `frontend/src/main.ts`. +**Produces:** `connectSupervisor(addr, opts?) -> { dispose() }` (`node:net` connect to UDS/pipe; retry with backoff if the daemon is not up yet; heartbeat byte every N s). In `main.ts`: connect after the daemon is ready (read addr from the handshake); **remove** all daemon-stop logic from `before-quit`/`process.on("exit")` (delete `killDaemon`/`ao stop`). Closing the app drops the socket → daemon self-stops cleanly, sessions persist. +- [ ] **Step 1:** Failing test: retry-until-connected against a throwaway `net.Server` on a temp UDS. +- [ ] **Step 2:** Run `pnpm vitest run src/main/supervisor-link.test.ts` → FAIL. +- [ ] **Step 3:** Implement `supervisor-link.ts`. +- [ ] **Step 4:** Edit `main.ts`; `pnpm tsc --noEmit && pnpm vite build --config vite.main.config.ts` clean. +- [ ] **Step 5:** Dev smoke: Cmd+Q AND `kill -9` Electron → daemon exits both ways, `running.json` gone, **tmux sessions still alive** → reopen → sessions adopted with context. +- [ ] **Step 6:** Commit `feat(desktop): supervisor link; daemon self-stops (clean) on frontend exit`. + +## Task 5: Promptless restore + de-segregate (covers the reboot case) +**Files:** Modify `backend/internal/session_manager/manager.go`. +**Change:** In `restoreArgv` (~1238), when `ok=false` and `meta.Prompt==""`, relaunch fresh via `GetLaunchCommand` (empty prompt, system prompt only) instead of returning `ErrNotResumable`. This only matters when the runtime is genuinely gone (reboot) and `RestoreAll` runs; with Task 1, normal restarts adopt and never reach here. Remove orchestrator-only divergence the audit surfaces. +- [ ] **Step 1:** Failing `go test`: `restoreArgv` with `ok=false`, empty `AgentSessionID` + empty `Prompt` returns the fresh `GetLaunchCommand` argv, not `ErrNotResumable`. +- [ ] **Step 2:** Run → FAIL. +- [ ] **Step 3:** Implement (drop the empty-prompt early return; fall through to `GetLaunchCommand`). +- [ ] **Step 4:** `go test ./internal/session_manager/... -race` → PASS. +- [ ] **Step 5:** Manual reboot-sim: spawn orchestrator → `tmux kill-server` (simulate reboot losing tmux) → restart daemon → orchestrator restored in the SAME id (not recreated). +- [ ] **Step 6:** Commit `fix(core): restore promptless sessions in place (reboot recovery, no increment)`. + +## Task 6 (optional, Phase D): `ao` on agent PATH in dev +`HookPATH` needs the daemon binary named `ao`. Packaged satisfies this; dev `go run` produces a hash-named temp binary. If wanted, build a stable `~/.ao/dev/ao` once at dev startup and launch from it. Detail on request. + +--- + +## Verification (whole feature) +- [ ] `go build ./... && go vet ./... && go test ./... -race` green; `cd frontend && pnpm vitest run && pnpm tsc --noEmit` green; full `pnpm build`. +- [ ] **Graceful stop preserves sessions:** spawn orchestrator → `ao stop` → tmux session ALIVE → `ao start` → orchestrator adopted, SAME id (reproduces the fix for the recorded bug). +- [ ] **Frontend death:** Cmd+Q AND `kill -9` Electron → daemon exits, sessions alive, reopen → adopted with context. +- [ ] **Reboot recovery:** `tmux kill-server` then restart → orchestrator restored in the same id. +- [ ] **Headless safety:** `ao start` from a terminal, no app → daemon runs forever, sessions intact. + +## Self-Review +- Spec coverage: don't depend on clean close (Tasks 1+4), no orphan daemon (Task 4), orchestrator survives restart treated like a worker (Task 1 adopt; Task 5 reboot), OS-native pipe/socket transport (Tasks 2-3), `ao` to workers (HookPATH; dev = Task 6). Covered. +- Key insight baked in: the fix is primarily DELETION (stop tearing down on shutdown), validated by the hard-kill adopt experiment. +- Open implementation check (Task 1): confirm nothing else relies on `SaveAndTeardownAll` running at shutdown (e.g., a test or a resource-flush); the function stays available for explicit teardown. From c5fdfdc690290ff66d21c3269c60b879381be449 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 25 Jun 2026 14:11:35 +0000 Subject: [PATCH 18/23] chore: format with prettier [skip ci] --- ...-25-ao-cli-install-and-daemon-lifecycle.md | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/superpowers/plans/2026-06-25-ao-cli-install-and-daemon-lifecycle.md b/docs/superpowers/plans/2026-06-25-ao-cli-install-and-daemon-lifecycle.md index f307f0c25c..8ddfa040fc 100644 --- a/docs/superpowers/plans/2026-06-25-ao-cli-install-and-daemon-lifecycle.md +++ b/docs/superpowers/plans/2026-06-25-ao-cli-install-and-daemon-lifecycle.md @@ -9,17 +9,21 @@ **Tech Stack:** Go daemon (`session_manager`, `daemon`, `httpd`, tmux/ConPTY runtimes), Electron main (TypeScript, `node:net`), `go test` + Vitest. ## Reproduction (recorded, sandboxed `AO_DATA_DIR`) + - Spawn worker + orchestrator → both get a live tmux session, DB `is_terminated=0`. - `kill -9` daemon → tmux sessions survive → restart → both **adopted**: same id, `is_terminated=0`, same tmux session. No increment. ✅ - `ao stop` (graceful) → `SaveAndTeardownAll` → tmux sessions **killed**, marked `exited`, marker written → restart → orchestrator stays `is_terminated=1` (Restore returned `ErrNotResumable`) → `POST /orchestrators` creates `…-3` (num 2→3). ❌ This is the bug. ## Why only the orchestrator visibly breaks (workers are NOT immune) + The graceful path kills BOTH workers' and orchestrators' live sessions; the orchestrator is just the only one where the damage is visible, for two independent reasons that both happen to land on it: + 1. **Restore failure is orchestrator-only.** Workers carry a saved `prompt`, so when the agent can't natively resume, `restoreArgv` falls back to relaunching fresh from the prompt and "succeeds". The orchestrator is promptless (`prompt=""` for all of them in the DB), so the same path hits `ErrNotResumable` (`manager.go:1238`) and it is left terminated. -2. **Auto-recreation is orchestrator-only.** The frontend calls `POST /api/v1/orchestrators` on load to *ensure* an orchestrator; finding none active, it mints a new one (`num+1` → the visible `14→15→16`). Nothing auto-respawns workers, so a worker that fails to restore just silently vanishes. -So both workers and the orchestrator lose their LIVE session + context on a graceful stop today; the orchestrator alone advertises it via the incrementing id. Task 1 (adopt-alive instead of teardown) restores live context for ALL sessions and needs no orchestrator special-casing — this is the unification you asked for. +2. **Auto-recreation is orchestrator-only.** The frontend calls `POST /api/v1/orchestrators` on load to _ensure_ an orchestrator; finding none active, it mints a new one (`num+1` → the visible `14→15→16`). Nothing auto-respawns workers, so a worker that fails to restore just silently vanishes. + So both workers and the orchestrator lose their LIVE session + context on a graceful stop today; the orchestrator alone advertises it via the incrementing id. Task 1 (adopt-alive instead of teardown) restores live context for ALL sessions and needs no orchestrator special-casing — this is the unification you asked for. ## Global Constraints + - All app state under `~/.ao` (`AO_DATA_DIR`/`AO_RUN_FILE` overrides). Liveness socket under `~/.ao` (unix) / named pipe (Windows). - No em dashes anywhere. - Headless safety: a daemon with no frontend (CLI `ao start`) must never self-stop and must not have its sessions destroyed. @@ -30,6 +34,7 @@ So both workers and the orchestrator lose their LIVE session + context on a grac --- ## File Structure + - **Modify** `backend/internal/daemon/daemon.go` — remove the `SaveAndTeardownAll` call from the normal shutdown path (`daemon.go:164`); the daemon exits leaving sessions alive. (Keep `Reconcile` on boot, which already adopts.) - **Modify** `backend/internal/session_manager/manager.go` — `restoreArgv` (~1238): a promptless session relaunches fresh in the same id instead of `ErrNotResumable` (covers the reboot-only case). Audit/remove now-dead orchestrator divergence. - **Create** `backend/internal/daemon/supervisor/{supervisor.go,listen_unix.go,listen_windows.go,supervisor_test.go}` — OS-native liveness listener + watchdog → triggers a clean shutdown (the same `RequestShutdown` the HTTP `/shutdown` uses) when the frontend link drops. @@ -37,6 +42,7 @@ So both workers and the orchestrator lose their LIVE session + context on a grac - **Create** `frontend/src/main/supervisor-link.ts` (+ test); **Modify** `frontend/src/main.ts` — hold the supervisor link open for the app lifetime; remove all daemon-stop logic from `before-quit`/`process.on("exit")`. Phases (independently shippable): + - **Phase A** (Task 1): shutdown no longer tears down sessions → adopt-alive on the graceful path. THE fix; stops the increment and preserves context. - **Phase B** (Tasks 2-4): OS-native liveness link → daemon cleanly stops when the frontend dies (no orphan), without tearing down sessions. - **Phase C** (Task 5): promptless restore + de-segregate (covers the genuine reboot case). @@ -60,8 +66,10 @@ Phases (independently shippable): - [ ] **Step 6:** Commit `fix(daemon): do not tear down live sessions on shutdown; adopt them on boot`. ## Task 2: Supervisor watchdog core + **Files:** Create `backend/internal/daemon/supervisor/supervisor.go`, `supervisor_test.go`. **Produces:** `New(grace, onLastClientGone, log)`, `(*Supervisor) Serve(ctx, ln net.Listener) error`. Arms on first accepted conn; when live count hits 0, starts `grace`; if it elapses still 0, calls `onLastClientGone()` once; a reconnect cancels it. Each conn read into a scratch buffer purely to detect close. + - [ ] **Step 1:** Failing tests: never fires pre-connect; fires once after grace on last disconnect; reconnect within grace cancels. Use `net.Pipe()` + a fake listener + short grace. - [ ] **Step 2:** Run → FAIL. - [ ] **Step 3:** Implement (mutex `liveCount`, `time.AfterFunc` grace, `sync.Once` fire). @@ -69,8 +77,10 @@ Phases (independently shippable): - [ ] **Step 5:** Commit `feat(daemon): supervisor watchdog`. ## Task 3: Platform listeners + daemon wiring + **Files:** Create `supervisor/listen_unix.go`, `listen_windows.go`; Modify `daemon.go`. **Produces:** `Listen(dataDir) (net.Listener, string, error)` — unix UDS at `~/.ao/supervise.sock` (unlink-stale first); windows named pipe `\\.\pipe\ao-supervise` (`//go:build windows`, via `go-winio` — confirm/declare the dep). Wire into `daemon.Run` after the HTTP server is up; publish `addr` (extend `runfile` write or `/healthz`); `go sup.Serve(ctx, ln)`; `onLastClientGone = deps.RequestShutdown`. Because Task 1 made shutdown non-destructive, a watchdog-triggered shutdown simply exits leaving sessions alive. + - [ ] **Step 1:** Implement listeners (unix first; windows behind build tag). - [ ] **Step 2:** Wire + publish address. - [ ] **Step 3:** `go build ./... && go vet ./...` clean (darwin at least). @@ -78,8 +88,10 @@ Phases (independently shippable): - [ ] **Step 5:** Commit `feat(daemon): OS-native supervisor listener triggers clean shutdown`. ## Task 4: Electron holds the link; drop quit-time daemon teardown + **Files:** Create `frontend/src/main/supervisor-link.ts` (+ test); Modify `frontend/src/main.ts`. **Produces:** `connectSupervisor(addr, opts?) -> { dispose() }` (`node:net` connect to UDS/pipe; retry with backoff if the daemon is not up yet; heartbeat byte every N s). In `main.ts`: connect after the daemon is ready (read addr from the handshake); **remove** all daemon-stop logic from `before-quit`/`process.on("exit")` (delete `killDaemon`/`ao stop`). Closing the app drops the socket → daemon self-stops cleanly, sessions persist. + - [ ] **Step 1:** Failing test: retry-until-connected against a throwaway `net.Server` on a temp UDS. - [ ] **Step 2:** Run `pnpm vitest run src/main/supervisor-link.test.ts` → FAIL. - [ ] **Step 3:** Implement `supervisor-link.ts`. @@ -88,8 +100,10 @@ Phases (independently shippable): - [ ] **Step 6:** Commit `feat(desktop): supervisor link; daemon self-stops (clean) on frontend exit`. ## Task 5: Promptless restore + de-segregate (covers the reboot case) + **Files:** Modify `backend/internal/session_manager/manager.go`. **Change:** In `restoreArgv` (~1238), when `ok=false` and `meta.Prompt==""`, relaunch fresh via `GetLaunchCommand` (empty prompt, system prompt only) instead of returning `ErrNotResumable`. This only matters when the runtime is genuinely gone (reboot) and `RestoreAll` runs; with Task 1, normal restarts adopt and never reach here. Remove orchestrator-only divergence the audit surfaces. + - [ ] **Step 1:** Failing `go test`: `restoreArgv` with `ok=false`, empty `AgentSessionID` + empty `Prompt` returns the fresh `GetLaunchCommand` argv, not `ErrNotResumable`. - [ ] **Step 2:** Run → FAIL. - [ ] **Step 3:** Implement (drop the empty-prompt early return; fall through to `GetLaunchCommand`). @@ -98,11 +112,13 @@ Phases (independently shippable): - [ ] **Step 6:** Commit `fix(core): restore promptless sessions in place (reboot recovery, no increment)`. ## Task 6 (optional, Phase D): `ao` on agent PATH in dev + `HookPATH` needs the daemon binary named `ao`. Packaged satisfies this; dev `go run` produces a hash-named temp binary. If wanted, build a stable `~/.ao/dev/ao` once at dev startup and launch from it. Detail on request. --- ## Verification (whole feature) + - [ ] `go build ./... && go vet ./... && go test ./... -race` green; `cd frontend && pnpm vitest run && pnpm tsc --noEmit` green; full `pnpm build`. - [ ] **Graceful stop preserves sessions:** spawn orchestrator → `ao stop` → tmux session ALIVE → `ao start` → orchestrator adopted, SAME id (reproduces the fix for the recorded bug). - [ ] **Frontend death:** Cmd+Q AND `kill -9` Electron → daemon exits, sessions alive, reopen → adopted with context. @@ -110,6 +126,7 @@ Phases (independently shippable): - [ ] **Headless safety:** `ao start` from a terminal, no app → daemon runs forever, sessions intact. ## Self-Review + - Spec coverage: don't depend on clean close (Tasks 1+4), no orphan daemon (Task 4), orchestrator survives restart treated like a worker (Task 1 adopt; Task 5 reboot), OS-native pipe/socket transport (Tasks 2-3), `ao` to workers (HookPATH; dev = Task 6). Covered. - Key insight baked in: the fix is primarily DELETION (stop tearing down on shutdown), validated by the hard-kill adopt experiment. - Open implementation check (Task 1): confirm nothing else relies on `SaveAndTeardownAll` running at shutdown (e.g., a test or a resource-flush); the function stays available for explicit teardown. From 90134021fb4132ef93ea283d4307a50831ee134c Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 20:01:59 +0530 Subject: [PATCH 19/23] fix(daemon): keep supervisor watchdog alive across transient Accept errors Review of #2185: Serve previously returned on the first unexpected Accept error, silently disabling the watchdog (the 'restart is caller's job' comment described a contract the caller never fulfilled). Back off and keep accepting instead, so a transient error (e.g. EMFILE) cannot leave the daemon unable to self-stop on frontend death. Also drop the stale Task 3/4 planning comment. Co-Authored-By: Claude Opus 4.8 --- .../internal/daemon/supervisor/supervisor.go | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/backend/internal/daemon/supervisor/supervisor.go b/backend/internal/daemon/supervisor/supervisor.go index 3970d98f86..faa4439552 100644 --- a/backend/internal/daemon/supervisor/supervisor.go +++ b/backend/internal/daemon/supervisor/supervisor.go @@ -4,9 +4,7 @@ // connects so a daemon started with no frontend (e.g. CLI "ao start") never // self-stops. // -// Task 3 (later) wires a real net.Listener (Unix socket / Windows named pipe) -// and the daemon entry-point; Task 4 wires the Electron side. This package is -// a leaf: it imports only stdlib. +// This package is a leaf: it imports only stdlib. package supervisor import ( @@ -18,6 +16,10 @@ import ( "time" ) +// acceptRetryBackoff bounds the retry after a transient Accept error so a +// persistent failure cannot hot-spin the accept loop. +const acceptRetryBackoff = 200 * time.Millisecond + // Supervisor watches connections on a net.Listener and calls onLastClientGone // exactly once (per process lifetime) when the live-count drops to zero and // stays zero for the grace period. @@ -85,8 +87,18 @@ func (s *Supervisor) Serve(ctx context.Context, ln net.Listener) error { if errors.Is(err, net.ErrClosed) { return nil } - s.log.Error("supervisor: accept error", "err", err) - return nil // ponytail: first error exits; persistent restart is caller's job + // A transient Accept error (e.g. EMFILE) must NOT silently kill the + // watchdog: that would leave the daemon unable to self-stop on + // frontend death. Back off briefly and keep accepting. A genuinely + // closed listener returns net.ErrClosed (handled above) or trips + // ctx.Done during the backoff. + s.log.Warn("supervisor: accept error, retrying", "err", err) + select { + case <-ctx.Done(): + return nil + case <-time.After(acceptRetryBackoff): + } + continue } s.mu.Lock() From 2931a19d9926e123f3eb6a87851e59206d90c786 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 20:11:41 +0530 Subject: [PATCH 20/23] fix(core): leave promptless workers terminated on restore (orchestrators still relaunch fresh) A promptless, unresumable KindWorker session had no prompt to replay and no native session id to resume from. Blank-relaunching it via GetLaunchCommand would silently drop its task. restoreArgv now returns ErrNotResumable for this case, gated on (meta.Prompt == "" && kind != domain.KindOrchestrator). Orchestrators are promptless by design and continue to relaunch fresh with the system prompt only. Re-introduces ErrNotResumable sentinel and its Conflict API mapping. Co-Authored-By: Claude Sonnet 4.6 --- .../integration/lifecycle_sqlite_test.go | 29 ++++++++-------- backend/internal/service/session/service.go | 3 ++ .../internal/service/session/service_test.go | 11 ++++++ backend/internal/session_manager/manager.go | 34 +++++++++++++------ .../internal/session_manager/manager_test.go | 33 ++++++++++++++++++ 5 files changed, 85 insertions(+), 25 deletions(-) diff --git a/backend/internal/integration/lifecycle_sqlite_test.go b/backend/internal/integration/lifecycle_sqlite_test.go index 11468f2cec..fcfe64d11c 100644 --- a/backend/internal/integration/lifecycle_sqlite_test.go +++ b/backend/internal/integration/lifecycle_sqlite_test.go @@ -208,11 +208,13 @@ func TestRestoreRoundTripPreservesMetadata(t *testing.T) { // TestReconcile_TerminatesDeadLiveSessionAndReapsLeakedTmux exercises // Manager.Reconcile against a real sqlite.Store: // -// - Session A: is_terminated=0 but its runtime is GONE => Reconcile must -// mark it terminated in the DB. +// - Session A: is_terminated=0 but its runtime is GONE and it is a promptless +// KindWorker. reconcileLive marks it terminated. RestoreAll does NOT relaunch it +// (ErrNotResumable: no prompt, no session id, not an orchestrator). End state: +// is_terminated=true, runtime.Create count stays 0. // - Session B: is_terminated=1 but its runtime is still ALIVE (leaked teardown) // => Reconcile must call Destroy on its handle. -func TestReconcile_RestoresDeadLiveSessionAndReapsLeakedTmux(t *testing.T) { +func TestReconcile_TerminatesDeadLiveSessionAndReapsLeakedTmux(t *testing.T) { ctx := context.Background() st := newStack(t) @@ -274,11 +276,10 @@ func TestReconcile_RestoresDeadLiveSessionAndReapsLeakedTmux(t *testing.T) { t.Fatalf("Reconcile: %v", err) } - // Session A's runtime was gone, so reconcileLive captured its work, marked it - // terminated, and RestoreAll relaunched it fresh on the SAME boot (the - // documented crash-recovery path: a promptless session relaunches with the - // system prompt only rather than being abandoned). End state: live again in - // the same id, via a fresh runtime Create. + // Session A is a promptless KindWorker: reconcileLive captured its work and + // marked it terminated. RestoreAll skips it (ErrNotResumable: no prompt, no + // AgentSessionID, not an orchestrator). End state: is_terminated=true, no fresh + // runtime.Create (a blank relaunch would silently lose its task). gotA, ok, err := st.store.GetSession(ctx, recA.ID) if err != nil { t.Fatalf("get session A: %v", err) @@ -286,14 +287,12 @@ func TestReconcile_RestoresDeadLiveSessionAndReapsLeakedTmux(t *testing.T) { if !ok { t.Fatalf("session A: not found after Reconcile") } - if gotA.IsTerminated { - t.Fatalf("session A: want restored (is_terminated=false) after crash recovery, got terminated") + if !gotA.IsTerminated { + t.Fatalf("session A: want terminated (is_terminated=true) after crash recovery of promptless worker, got live") } - // Exactly one fresh runtime Create: A was torn down and relaunched. An - // adopted (still-alive) session would not Create; this proves the full - // terminate-then-restore cycle ran for A and only A. - if st.rt.created != 1 { - t.Fatalf("want 1 runtime Create for A's fresh relaunch, got %d", st.rt.created) + // No runtime.Create: a promptless worker must not be blank-relaunched. + if st.rt.created != 0 { + t.Fatalf("want 0 runtime Creates (promptless worker must not relaunch), got %d", st.rt.created) } // Session B's leaked runtime must have been destroyed. diff --git a/backend/internal/service/session/service.go b/backend/internal/service/session/service.go index 798467ed4a..9f092d2e32 100644 --- a/backend/internal/service/session/service.go +++ b/backend/internal/service/session/service.go @@ -463,6 +463,9 @@ func toAPIError(err error) error { return apierr.Conflict("SESSION_TERMINATED", "Session is terminated", nil) case errors.Is(err, sessionmanager.ErrIncompleteHandle): return apierr.Conflict("SESSION_INCOMPLETE_HANDLE", "Session is missing runtime or workspace handles", nil) + case errors.Is(err, sessionmanager.ErrNotResumable): + return apierr.Conflict("SESSION_NOT_RESUMABLE", + "This session has no saved agent session or prompt to resume from", nil) case errors.Is(err, sessionmanager.ErrProjectNotResolvable): return apierr.Invalid("PROJECT_NOT_RESOLVABLE", "Project is not registered or has no repo. Register it with `ao project add`", nil) case errors.Is(err, sessionmanager.ErrUnknownHarness): diff --git a/backend/internal/service/session/service_test.go b/backend/internal/service/session/service_test.go index 57c0a2ed18..e8c1c24227 100644 --- a/backend/internal/service/session/service_test.go +++ b/backend/internal/service/session/service_test.go @@ -542,6 +542,17 @@ func TestToAPIErrorMapsWorkspaceBranchSentinels(t *testing.T) { } } +// TestToAPIError_NotResumable asserts that ErrNotResumable (promptless worker +// with no adapter resume handle) maps to a Conflict with code SESSION_NOT_RESUMABLE. +func TestToAPIError_NotResumable(t *testing.T) { + err := fmt.Errorf("restore mer-1: %w", sessionmanager.ErrNotResumable) + mapped := toAPIError(err) + var e *apierr.Error + if !errors.As(mapped, &e) || e.Kind != apierr.KindConflict || e.Code != "SESSION_NOT_RESUMABLE" { + t.Fatalf("mapped = %v, want Conflict SESSION_NOT_RESUMABLE", mapped) + } +} + // TestSpawnOrchestratorNoCleanReturnsExistingWhenActiveExists is the RED test // for the idempotency fix: when an active orchestrator already exists and // clean=false, SpawnOrchestrator must return that orchestrator without minting diff --git a/backend/internal/session_manager/manager.go b/backend/internal/session_manager/manager.go index 9fb6b1bca1..a569c20498 100644 --- a/backend/internal/session_manager/manager.go +++ b/backend/internal/session_manager/manager.go @@ -36,6 +36,12 @@ var ( // ErrMissingHarness means neither the spawn request nor the project's role // config selected an agent. Worker/orchestrator spawns must be explicit. ErrMissingHarness = errors.New("session: agent harness required") + // ErrNotResumable means a terminated session cannot be relaunched: its adapter + // cannot natively resume it AND it has no prompt to fresh-launch from, and it is + // not an orchestrator (orchestrators are promptless by design and relaunch fresh + // with the system prompt only). Workers without a task and without a native + // session id have nothing meaningful to restore. + ErrNotResumable = errors.New("session: nothing to resume from") ) // Env vars a spawned process reads to learn who it is. @@ -480,10 +486,11 @@ func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Sess if meta.WorkspacePath == "" || meta.Branch == "" { return domain.SessionRecord{}, fmt.Errorf("restore %s: %w", id, ErrIncompleteHandle) } - // Resumability is NOT decided here: a promptless session can still be fully - // resumable when the harness pins a deterministic session id (Claude Code). - // restoreArgv always produces a launch command: adapter resume, saved-prompt - // replay, or fresh launch (empty prompt with system prompt only). Same id. + // Resumability is decided inside restoreArgv, not here. A promptless session + // can still be fully resumable when the harness pins a deterministic session id + // (Claude Code). restoreArgv returns ErrNotResumable only for a promptless, + // unresumable non-orchestrator (a worker with no task and no native id to resume). + // Orchestrators always relaunch fresh with the system prompt only. project, err := m.loadProject(ctx, rec.ProjectID) if err != nil { @@ -514,7 +521,7 @@ func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Sess } // Restore re-applies the project's resolved agent config so a configured // model/permissions carry across a restore, matching fresh spawn. - argv, err := restoreArgv(ctx, agent, id, ws.Path, meta, systemPrompt, effectiveAgentConfig(rec.Kind, project.Config)) + argv, err := restoreArgv(ctx, agent, id, ws.Path, meta, systemPrompt, effectiveAgentConfig(rec.Kind, project.Config), rec.Kind) if err != nil { return domain.SessionRecord{}, fmt.Errorf("restore %s: %w", id, err) } @@ -1213,7 +1220,11 @@ func (m *Manager) prepareWorkspace(ctx context.Context, agent ports.Agent, id do // restoreArgv builds the argv to relaunch a torn-down session: the agent's // native resume command when it can continue the session, else a fresh launch. // The agent signals via ok=false (e.g. no native session id captured yet). -func restoreArgv(ctx context.Context, agent ports.Agent, id domain.SessionID, workspacePath string, meta domain.SessionMetadata, systemPrompt string, agentConfig ports.AgentConfig) ([]string, error) { +// Returns ErrNotResumable only for a promptless, unresumable non-orchestrator: +// a worker with no prompt and no native session id has nothing to restore from. +// Orchestrators are promptless by design and always relaunch fresh with the +// system prompt only. +func restoreArgv(ctx context.Context, agent ports.Agent, id domain.SessionID, workspacePath string, meta domain.SessionMetadata, systemPrompt string, agentConfig ports.AgentConfig, kind domain.SessionKind) ([]string, error) { ref := ports.SessionRef{ ID: string(id), WorkspacePath: workspacePath, @@ -1226,10 +1237,13 @@ func restoreArgv(ctx context.Context, agent ports.Agent, id domain.SessionID, wo if ok { return cmd, nil } - // The adapter cannot resume the session (no native or derivable session id). - // Relaunch fresh via GetLaunchCommand: a saved prompt is replayed, an empty - // prompt (e.g. an orchestrator) launches fresh with the system prompt only. - // Same id, same workspace - no new session is minted. + // Adapter cannot resume. A saved prompt is replayed fresh. An orchestrator is + // promptless by design and relaunches with the system prompt only. A promptless + // WORKER has no task and no session id to restore from: do not blank-relaunch it. + if meta.Prompt == "" && kind != domain.KindOrchestrator { + return nil, ErrNotResumable + } + // Fall through to GetLaunchCommand (replays meta.Prompt; empty for an orchestrator). argv, err := agent.GetLaunchCommand(ctx, ports.LaunchConfig{ SessionID: string(id), WorkspacePath: workspacePath, diff --git a/backend/internal/session_manager/manager_test.go b/backend/internal/session_manager/manager_test.go index fd94e4fe45..82ff2c491a 100644 --- a/backend/internal/session_manager/manager_test.go +++ b/backend/internal/session_manager/manager_test.go @@ -968,6 +968,39 @@ func TestRestore_PromptlessUnresumableRelaunchesFresh(t *testing.T) { } } +// TestRestore_PromptlessWorkerNotResumable is the RED test for the promptless-worker +// fix: a KindWorker session with no prompt and no captured AgentSessionID (so the +// adapter returns ok=false) must NOT be blank-relaunched. The session had no task +// to replay and no native id to resume from, so relaunching fresh would silently +// drop its work. Restore must return ErrNotResumable and leave the session terminated +// (runtime.Create must NOT be called). +func TestRestore_PromptlessWorkerNotResumable(t *testing.T) { + st := newFakeStore() + st.sessions["mer-1"] = domain.SessionRecord{ + ID: "mer-1", ProjectID: "mer", Kind: domain.KindWorker, IsTerminated: true, + // No AgentSessionID, no Prompt: promptless worker with no resume handle. + Metadata: domain.SessionMetadata{WorkspacePath: "/ws/mer-1", Branch: "ao/mer-1/root"}, + Activity: domain.Activity{State: domain.ActivityExited}, + } + rt := &fakeRuntime{} + lookPath := func(string) (string, error) { return "/bin/true", nil } + // fakeAgents resolves to fakeAgent, whose GetRestoreCommand returns ok=false + // when there is no AgentSessionID. With a KindWorker and empty Prompt, this + // must produce ErrNotResumable instead of a blank relaunch. + m := New(Deps{Runtime: rt, Agents: fakeAgents{}, Workspace: &fakeWorkspace{}, Store: st, Messenger: &fakeMessenger{}, Lifecycle: &fakeLCM{store: st}, LookPath: lookPath}) + + _, err := m.Restore(ctx, "mer-1") + if !errors.Is(err, ErrNotResumable) { + t.Fatalf("promptless unresumable worker must return ErrNotResumable, got %v", err) + } + if rt.created != 0 { + t.Fatalf("runtime.Create = %d, want 0 (must not relaunch a promptless worker)", rt.created) + } + if !st.sessions["mer-1"].IsTerminated { + t.Error("session must remain terminated after ErrNotResumable") + } +} + // TestRestore_WorkerPointsAtCurrentOrchestrator: a restored worker's // coordination hint must reference the orchestrator active at restore time, // not the one from its original spawn. From cc0309bfe9f1e9beac136a41245f1af87251fec7 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari Date: Thu, 25 Jun 2026 20:13:07 +0530 Subject: [PATCH 21/23] fix(desktop): re-link supervisor on attach for app-owned daemons (close lingering-daemon gap) - runfile.Info gets Owner field (omitempty): "app" when Electron spawned, empty for headless `ao start`. server.go reads AO_OWNER env to set it. - daemonEnv() injects AO_OWNER=app so spawned daemons self-identify. - Extracted establishSupervisorLink() from inline reportBoundPort code. Spawn path calls it unconditionally; attach paths call it only when shouldLinkOnAttach(owner) is true (owner === "app"). - Both attach paths (inspectExistingDaemon, resolveDaemonFromPort) now read the owner from the run-file and re-link when app-owned, preventing a lingering app-spawned daemon from self-stopping mid-session. - Headless ao start daemons stay unlinked: persistent across app quit. - New daemon-owner.ts + daemon-owner.test.ts (4 vitest cases, all pass). - Go: 9 tests pass with -race; vet clean. Co-Authored-By: Claude Sonnet 4.6 --- backend/internal/httpd/server.go | 1 + backend/internal/runfile/runfile.go | 5 ++ backend/internal/runfile/runfile_test.go | 36 ++++++++ frontend/src/main.ts | 107 ++++++++++++++++------- frontend/src/main/daemon-owner.test.ts | 21 +++++ frontend/src/main/daemon-owner.ts | 9 ++ frontend/src/shared/daemon-discovery.ts | 13 ++- 7 files changed, 157 insertions(+), 35 deletions(-) create mode 100644 frontend/src/main/daemon-owner.test.ts create mode 100644 frontend/src/main/daemon-owner.ts diff --git a/backend/internal/httpd/server.go b/backend/internal/httpd/server.go index 95ff7c5c93..e770db9b87 100644 --- a/backend/internal/httpd/server.go +++ b/backend/internal/httpd/server.go @@ -89,6 +89,7 @@ func (s *Server) Run(ctx context.Context) error { PID: os.Getpid(), Port: s.boundPort(), StartedAt: time.Now().UTC(), + Owner: os.Getenv("AO_OWNER"), } if err := runfile.Write(s.cfg.RunFilePath, info); err != nil { _ = s.listen.Close() diff --git a/backend/internal/runfile/runfile.go b/backend/internal/runfile/runfile.go index 92718d34c1..f2383044df 100644 --- a/backend/internal/runfile/runfile.go +++ b/backend/internal/runfile/runfile.go @@ -24,6 +24,11 @@ type Info struct { Port int `json:"port"` // StartedAt is when the daemon came up (RFC 3339). StartedAt time.Time `json:"startedAt"` + // Owner is "app" when the desktop Electron app spawned this daemon; empty + // for a headless `ao start` daemon. Used by the app to decide whether to + // hold a supervisor link on attach (app-owned: re-link; headless: skip so + // the daemon stays persistent across app quit). + Owner string `json:"owner,omitempty"` } // Write atomically writes running.json at path, creating parent directories diff --git a/backend/internal/runfile/runfile_test.go b/backend/internal/runfile/runfile_test.go index 6a926874b1..87b62b320f 100644 --- a/backend/internal/runfile/runfile_test.go +++ b/backend/internal/runfile/runfile_test.go @@ -30,6 +30,42 @@ func TestWriteReadRoundTrip(t *testing.T) { // running.json from a crashed predecessor must be replaced cleanly. POSIX // rename(2) handles this natively; Windows needs MoveFileEx with // MOVEFILE_REPLACE_EXISTING — atomicReplace gives us both. +func TestWriteReadRoundTripOwner(t *testing.T) { + path := filepath.Join(t.TempDir(), "running.json") + + // app-owned daemon: Owner round-trips as "app". + want := Info{PID: 1, Port: 3001, Owner: "app"} + if err := Write(path, want); err != nil { + t.Fatalf("Write: %v", err) + } + got, err := Read(path) + if err != nil { + t.Fatalf("Read: %v", err) + } + if got == nil { + t.Fatal("Read returned nil for an existing file") + } + if got.Owner != "app" { + t.Errorf("Owner round trip: got %q, want %q", got.Owner, "app") + } + + // headless daemon: Owner is empty (omitempty), round-trips as "". + headless := Info{PID: 2, Port: 3002} + if err := Write(path, headless); err != nil { + t.Fatalf("Write headless: %v", err) + } + got, err = Read(path) + if err != nil { + t.Fatalf("Read headless: %v", err) + } + if got == nil { + t.Fatal("Read returned nil for headless file") + } + if got.Owner != "" { + t.Errorf("headless Owner round trip: got %q, want %q", got.Owner, "") + } +} + func TestWriteOverwritesExisting(t *testing.T) { path := filepath.Join(t.TempDir(), "running.json") diff --git a/frontend/src/main.ts b/frontend/src/main.ts index bed1b70d8b..7881ffcb22 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -34,6 +34,7 @@ import { DEFAULT_POSTHOG_HOST, DEFAULT_POSTHOG_PROJECT_KEY } from "./shared/post import { buildTelemetryBootstrap } from "./shared/telemetry"; import { createBrowserViewHost, type BrowserViewHost } from "./main/browser-view-host"; import { connectSupervisor, type SupervisorLinkHandle } from "./main/supervisor-link"; +import { shouldLinkOnAttach } from "./main/daemon-owner"; // Globals injected at compile time by @electron-forge/plugin-vite. declare const MAIN_WINDOW_VITE_DEV_SERVER_URL: string | undefined; @@ -303,11 +304,15 @@ function ensureShellEnv(): Promise { } function daemonEnv(): NodeJS.ProcessEnv { + // AO_OWNER=app marks this daemon as app-spawned so the app can re-link the + // supervisor on attach (headless `ao start` daemons get no AO_OWNER and stay + // unlinked, preserving their persistence across app quit). + const ownerTag = { AO_OWNER: "app" }; // Windows keeps the old behavior exactly: no shell probe, no unix PATH floor. if (process.platform === "win32") { - return { ...process.env, ...telemetryOverrides() }; + return { ...process.env, ...telemetryOverrides(), ...ownerTag }; } - return buildDaemonEnv(process.env, cachedShellEnv, telemetryOverrides()); + return buildDaemonEnv(process.env, cachedShellEnv, { ...telemetryOverrides(), ...ownerTag }); } function pathKey(value: string): string { @@ -374,7 +379,38 @@ function daemonIdentityError(launch: DaemonLaunchSpec, probe: DaemonProbe): stri return null; } -async function inspectExistingDaemon(launch: DaemonLaunchSpec): Promise { +/** + * Establish (or re-establish) the OS-native liveness link to the daemon's + * supervisor socket. Holding this connection keeps the daemon alive: when + * Electron exits for any reason (Cmd+Q, crash, SIGKILL), the OS closes the fd + * and the daemon detects EOF, then self-stops after its ~5s grace period. + * + * Called unconditionally on the spawn path (we always own that daemon). + * Called on the attach path only when the daemon is app-owned (owner === "app"); + * headless `ao start` daemons stay unlinked so they remain persistent after + * app quit. + */ +function establishSupervisorLink(): void { + const rfp = runFilePath(); + const addr = + process.platform === "win32" + ? "\\\\.\\pipe\\ao-supervise" + : rfp + ? path.join(path.dirname(rfp), "supervise.sock") + : null; + if (addr) { + supervisorLink?.dispose(); + supervisorLink = connectSupervisor(addr, { + log: (msg) => console.log(`AO: ${msg}`), + }); + } else { + console.warn("AO: supervisor link skipped; run-file path unavailable"); + } +} + +async function inspectExistingDaemon( + launch: DaemonLaunchSpec, +): Promise<{ status: DaemonStatus; owner: string | undefined } | null> { const handshakePath = runFilePath(); let runFileContents: string | null = null; if (handshakePath) { @@ -384,12 +420,15 @@ async function inspectExistingDaemon(launch: DaemonLaunchSpec): Promise daemonIdentityError(launch, probe), }); + if (!status) return null; + const owner = runFileContents ? (parseRunFile(runFileContents)?.owner ?? undefined) : undefined; + return { status, owner }; } async function refreshDaemonStatus(): Promise { @@ -406,7 +445,7 @@ async function refreshDaemonStatus(): Promise { if (!launch) return daemonStatus; const existing = await inspectExistingDaemon(launch); if (existing) { - setDaemonStatus(existing); + setDaemonStatus(existing.status); } else if ( daemonStatus.state === "ready" || (daemonStatus.state === "error" && (daemonStatus.pid || daemonStatus.port)) @@ -463,7 +502,13 @@ async function startDaemonInner(startEpoch: number): Promise { return daemonStatus; } if (existing) { - setDaemonStatus(existing); + setDaemonStatus(existing.status); + // Re-link the supervisor only when attaching to an app-owned daemon (one we + // previously spawned). Headless `ao start` daemons (owner unset) stay unlinked + // so they remain persistent after app quit. + if (shouldLinkOnAttach(existing.owner)) { + establishSupervisorLink(); + } return daemonStatus; } @@ -485,6 +530,20 @@ async function startDaemonInner(startEpoch: number): Promise { } if (directDaemon) { setDaemonStatus(directDaemon); + // Re-link iff the daemon is app-owned. Read the run-file for the owner tag; + // if unavailable (run-file absent or unreadable), treat as headless and skip. + const rfp = runFilePath(); + let portAttachOwner: string | undefined; + if (rfp) { + try { + portAttachOwner = parseRunFile(await readFile(rfp, "utf8"))?.owner ?? undefined; + } catch { + // run-file absent or unreadable: treat as headless, skip link. + } + } + if (shouldLinkOnAttach(portAttachOwner)) { + establishSupervisorLink(); + } return daemonStatus; } @@ -597,34 +656,14 @@ async function startDaemonInner(startEpoch: number): Promise { stopDiscovery(); setDaemonStatus({ state: "ready", port }); - // Establish (or re-establish) the OS-native liveness link to the daemon's - // supervisor socket. Holding this connection keeps the daemon alive. When - // Electron exits for any reason (Cmd+Q, crash, SIGKILL), the OS closes the - // fd; the daemon detects EOF and self-stops after its ~5s grace period, - // leaving tmux/ConPTY sessions alive for the next launch to adopt. - // - // Scope: this runs only on the path where WE spawned the daemon (it is the - // bound-port discovery callback). The attach path (connecting to a daemon - // that was already running) intentionally does NOT link, to preserve - // headless safety: a daemon started via `ao start` must stay persistent and - // must not be self-stopped when the app quits. A daemon left lingering by a - // prior app instance self-stops via its own grace; a relaunch within that - // window respawns and adopts the live sessions (Task 1), so no work is lost. - const rfp = runFilePath(); - const addr = - process.platform === "win32" - ? "\\\\.\\pipe\\ao-supervise" - : rfp - ? path.join(path.dirname(rfp), "supervise.sock") - : null; - if (addr) { - supervisorLink?.dispose(); - supervisorLink = connectSupervisor(addr, { - log: (msg) => console.log(`AO: ${msg}`), - }); - } else { - console.warn("AO: supervisor link skipped; run-file path unavailable"); - } + // Establish the OS-native liveness link unconditionally: this callback fires + // only on the spawn path (we own this daemon). Holding the connection keeps + // the daemon alive; when Electron exits for any reason, the OS closes the fd + // and the daemon detects EOF, then self-stops after its ~5s grace period. + // The attach paths link only when the daemon is app-owned (see + // establishSupervisorLink + shouldLinkOnAttach); headless `ao start` daemons + // stay unlinked so they remain persistent across app quit. + establishSupervisorLink(); }; // One scanner per stream: each keeps its own partial-line buffer. diff --git a/frontend/src/main/daemon-owner.test.ts b/frontend/src/main/daemon-owner.test.ts new file mode 100644 index 0000000000..806dc4ba05 --- /dev/null +++ b/frontend/src/main/daemon-owner.test.ts @@ -0,0 +1,21 @@ +// @vitest-environment node +import { describe, it, expect } from "vitest"; +import { shouldLinkOnAttach } from "./daemon-owner"; + +describe("shouldLinkOnAttach", () => { + it('returns true when owner is "app"', () => { + expect(shouldLinkOnAttach("app")).toBe(true); + }); + + it("returns false when owner is undefined (headless ao start)", () => { + expect(shouldLinkOnAttach(undefined)).toBe(false); + }); + + it('returns false when owner is "" (empty string)', () => { + expect(shouldLinkOnAttach("")).toBe(false); + }); + + it('returns false when owner is "cli"', () => { + expect(shouldLinkOnAttach("cli")).toBe(false); + }); +}); diff --git a/frontend/src/main/daemon-owner.ts b/frontend/src/main/daemon-owner.ts new file mode 100644 index 0000000000..771060165e --- /dev/null +++ b/frontend/src/main/daemon-owner.ts @@ -0,0 +1,9 @@ +/** + * Whether the app should hold a supervisor link to a daemon it ATTACHED to + * (did not spawn). Only re-link app-owned daemons (owner === "app"); leave + * headless `ao start` daemons (owner unset or empty) unlinked so they stay + * persistent across app quit. + */ +export function shouldLinkOnAttach(owner: string | undefined): boolean { + return owner === "app"; +} diff --git a/frontend/src/shared/daemon-discovery.ts b/frontend/src/shared/daemon-discovery.ts index 16f407cb68..9e9f608429 100644 --- a/frontend/src/shared/daemon-discovery.ts +++ b/frontend/src/shared/daemon-discovery.ts @@ -66,6 +66,11 @@ export type RunFileInfo = { port: number; /** startedAt in epoch ms; 0 when missing/unparseable. */ startedAtMs: number; + /** + * Daemon ownership tag. "app" when the desktop app spawned this daemon; + * undefined/empty for a headless `ao start` daemon. + */ + owner?: string; }; /** Parse running.json contents. Returns null for malformed JSON or an invalid port. */ @@ -77,13 +82,19 @@ export function parseRunFile(contents: string): RunFileInfo | null { return null; } if (typeof raw !== "object" || raw === null) return null; - const { pid, port, startedAt } = raw as { pid?: unknown; port?: unknown; startedAt?: unknown }; + const { pid, port, startedAt, owner } = raw as { + pid?: unknown; + port?: unknown; + startedAt?: unknown; + owner?: unknown; + }; if (typeof port !== "number" || !Number.isInteger(port) || port < 1 || port > 65535) return null; const startedAtMs = typeof startedAt === "string" ? Date.parse(startedAt) : NaN; return { pid: typeof pid === "number" && Number.isInteger(pid) ? pid : 0, port, startedAtMs: Number.isNaN(startedAtMs) ? 0 : startedAtMs, + owner: typeof owner === "string" ? owner : undefined, }; } From f09bea28ecd798f87be85e26894ce8a490ee870f Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 20:17:56 +0530 Subject: [PATCH 22/23] fix: quiet expected ErrNotResumable log in RestoreAll; note attach TOCTOU Review polish on #2185: a promptless worker left terminated on boot is expected, not an operational error, so log it at Warn not Error. Document the narrow run-file re-read TOCTOU on the port-attach re-link path (worst case is linking a headless daemon; the dispose-idempotency guard prevents any leak). Co-Authored-By: Claude Opus 4.8 --- backend/internal/session_manager/manager.go | 9 ++++++++- frontend/src/main.ts | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/backend/internal/session_manager/manager.go b/backend/internal/session_manager/manager.go index a569c20498..48d93fe26f 100644 --- a/backend/internal/session_manager/manager.go +++ b/backend/internal/session_manager/manager.go @@ -828,7 +828,14 @@ func (m *Manager) RestoreAll(ctx context.Context) error { // Step 3: relaunch via the existing single-session Restore method. if _, err := m.Restore(ctx, rec.ID); err != nil { - m.logger.Error("restore-all: relaunch failed", "sessionID", rec.ID, "error", err) + // A promptless, unresumable worker is intentionally left terminated + // (ErrNotResumable): expected, not an operational failure, so log it + // quietly rather than as an error. + if errors.Is(err, ErrNotResumable) { + m.logger.Warn("restore-all: session left terminated (nothing to resume)", "sessionID", rec.ID) + } else { + m.logger.Error("restore-all: relaunch failed", "sessionID", rec.ID, "error", err) + } } } return nil diff --git a/frontend/src/main.ts b/frontend/src/main.ts index 7881ffcb22..0c00e2a0c0 100644 --- a/frontend/src/main.ts +++ b/frontend/src/main.ts @@ -532,6 +532,11 @@ async function startDaemonInner(startEpoch: number): Promise { setDaemonStatus(directDaemon); // Re-link iff the daemon is app-owned. Read the run-file for the owner tag; // if unavailable (run-file absent or unreadable), treat as headless and skip. + // ponytail: narrow TOCTOU here (the port was probed live, then the run-file + // is read separately), so in theory a headless daemon could have replaced an + // app-owned one in the gap. Acceptable: the window is tiny, the worst case is + // linking a headless daemon, and establishSupervisorLink disposes any prior + // link so nothing leaks. const rfp = runFilePath(); let portAttachOwner: string | undefined; if (rfp) { From 5cc9b74a45659ffb642d5c6fbeff790551ee2986 Mon Sep 17 00:00:00 2001 From: Harshit Singh Bhandari Date: Thu, 25 Jun 2026 20:29:55 +0530 Subject: [PATCH 23/23] style: gofmt the review-fix files (CI format/lint check) Co-Authored-By: Claude Opus 4.8 --- backend/internal/daemon/supervisor/supervisor.go | 2 +- backend/internal/daemon/wiring_test.go | 1 - backend/internal/service/session/service_test.go | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/backend/internal/daemon/supervisor/supervisor.go b/backend/internal/daemon/supervisor/supervisor.go index faa4439552..72c3a7acd7 100644 --- a/backend/internal/daemon/supervisor/supervisor.go +++ b/backend/internal/daemon/supervisor/supervisor.go @@ -40,7 +40,7 @@ type Supervisor struct { mu sync.Mutex liveCount int - armed bool // true once any connection has been accepted + armed bool // true once any connection has been accepted pendingTimer *time.Timer // non-nil while grace countdown is running fireOnce sync.Once diff --git a/backend/internal/daemon/wiring_test.go b/backend/internal/daemon/wiring_test.go index 72f8eb8945..c5a24402d7 100644 --- a/backend/internal/daemon/wiring_test.go +++ b/backend/internal/daemon/wiring_test.go @@ -430,4 +430,3 @@ func TestWiring_SessionLifecycleInterfaceInvokedByDaemon(t *testing.T) { t.Fatal("RestoreAll was not called through the interface") } } - diff --git a/backend/internal/service/session/service_test.go b/backend/internal/service/session/service_test.go index e8c1c24227..84a4849909 100644 --- a/backend/internal/service/session/service_test.go +++ b/backend/internal/service/session/service_test.go @@ -913,4 +913,3 @@ func containsString(values []string, want string) bool { } return false } -