Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
36bc132
feat(ha): Add Postgres advisory lock leader election for singleton wo…
jackthepunished Mar 10, 2026
07ec0bf
feat(ha): Add durable task queue with Redis Streams and execution ledger
jackthepunished Mar 10, 2026
909e78c
feat(resilience): Add circuit breaker enhancements, bulkhead, and ret…
jackthepunished Mar 10, 2026
86b6fdb
feat(resilience): Add resilient adapter wrappers for infrastructure b…
jackthepunished Mar 10, 2026
92b02fb
feat(ha): Wire resilient backends and update workers for HA
jackthepunished Mar 10, 2026
821e78e
test(ha): Add failure drills and release gate tests
jackthepunished Mar 10, 2026
0bac69c
feat(ha): fix CodeRabbit suggestions and race conditions in tests
jackthepunished Mar 11, 2026
1241bc0
fix(ha): harden worker idempotency and test synchronization
jackthepunished Mar 12, 2026
077a9fb
fix(resilience): propagate role errors and harden platform primitives
jackthepunished Apr 14, 2026
67435d2
fix(ha): tighten ledger transitions and durable queue acknowledgements
jackthepunished Apr 14, 2026
297c4c0
fix(workers): improve context propagation and queue/ledger observability
jackthepunished Apr 14, 2026
128cafa
fix(ci): align durable queue wiring and stabilize benchmark PR runs
jackthepunished Apr 14, 2026
61cfba8
fix(workers): log repo Update/Delete failures instead of silent ignores
poyrazK Apr 20, 2026
748fbb6
fix(workers): log pipeline repo Update/Append failures instead of sil…
poyrazK Apr 20, 2026
5da7617
fix(platform): update mock compute to match updated AttachVolume/Deta…
poyrazK Apr 21, 2026
bba166a
fix(lint): resolve all lint errors from CI
poyrazK Apr 21, 2026
25ca95e
fix(lint): properly avoid G115 integer overflow in uint64->int64 conv…
poyrazK Apr 21, 2026
c6c6cf1
fix(lint): truly avoid G115 with no lint directives
poyrazK Apr 21, 2026
01ce374
fix(lint): use crypto/rand.Int for safe random int64 without G115
poyrazK Apr 21, 2026
117194e
fix: address PR #154 review items
poyrazK Apr 23, 2026
c99f0a4
revert: remove fast path from randomInt64 to fix G115 lint error
poyrazK Apr 23, 2026
10da6e3
fix: extend E2E timeouts and add scaling group check to VPC deletion
poyrazK Apr 23, 2026
0fc15fe
fix: further increase E2E VPC deletion timeouts
poyrazK Apr 24, 2026
e9fb55e
fix: increase VPC deletion timeouts to accommodate async LB/ASG cleanup
poyrazK Apr 24, 2026
e322d11
fix: fail checkDeleteDependencies on error instead of silently ignoring
poyrazK Apr 24, 2026
a2ed862
fix: handle DELETING scaling groups and LB cleanup in VPC deletion
poyrazK Apr 24, 2026
b59e6f8
fix: skip DELETED LBs in VPC deletion dependency check
poyrazK Apr 24, 2026
83640ad
test: add status code logging to VPC deletion retry loops
poyrazK Apr 24, 2026
79945c8
test: log every VPC delete attempt status code in E2E
poyrazK Apr 24, 2026
edb7922
test: add diagnostic logging to VPC deletion
poyrazK Apr 24, 2026
98c5229
Revert E2E test changes to diagnose failures
poyrazK Apr 24, 2026
8e51eea
fix: properly check scaling groups and handle errors in VPC deletion
poyrazK Apr 24, 2026
fa0378e
ci: force retest
poyrazK Apr 24, 2026
57ef5fd
test: increase VPC deletion timeouts in E2E tests
poyrazK Apr 25, 2026
de807e0
test: further increase E2E VPC deletion timeouts
poyrazK Apr 25, 2026
5e59b15
feat: add force delete option for VPC deletion
poyrazK Apr 25, 2026
59c6b41
fix: cascade delete dependencies when force=true
poyrazK Apr 25, 2026
c679086
test: handle race condition in TerminateInstance after chaos tests
poyrazK Apr 25, 2026
c1fedbb
fix: resolve rebase conflicts in dependencies.go and resilient_comput…
poyrazK Apr 25, 2026
fe8554b
fix: update all DeleteVPC mock calls to include force parameter
poyrazK Apr 25, 2026
02e2ee1
fix: update DeleteVPC mock expectations to use force=true in stack tests
poyrazK Apr 25, 2026
c066ad8
docs: regenerate swagger docs to include force parameter
poyrazK Apr 25, 2026
1cc75ed
fix: add panic recovery to heartbeat and improve cascade delete error…
poyrazK Apr 25, 2026
3d51def
fix: address critical Copilot review issues
poyrazK Apr 25, 2026
3605d4a
ci: force trigger
poyrazK Apr 25, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,22 @@ jobs:
| grep -E '^(goos:|goarch:|pkg:|cpu:|Benchmark|PASS$|ok\s)' \
| tee bench.txt

- name: Store Benchmark Result
- name: Store Benchmark Result (PR)
if: github.event_name == 'pull_request'
uses: benchmark-action/github-action-benchmark@v1
with:
name: Go Benchmarks
tool: 'go'
output-file-path: bench.txt
# On PRs, publishing to gh-pages is not allowed in all permission models.
auto-push: false
# Alert (without failing the job, see fail-on-alert) when a benchmark is more than 2x slower than the baseline
alert-threshold: '200%'
comment-on-alert: false
fail-on-alert: false
Comment on lines +39 to +42
Copy link

Copilot AI Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inline comment says "Fail if performance drops by more than 50%", but alert-threshold: '200%' corresponds to allowing up to a 2x regression. If you intend a 50% regression threshold, this likely should be 150% (or update the comment to match the chosen threshold).

Copilot uses AI. Check for mistakes.

- name: Store Benchmark Result (main)
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: benchmark-action/github-action-benchmark@v1
with:
name: Go Benchmarks
Expand Down
45 changes: 33 additions & 12 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,28 @@ func run() error {
defer db.Close()
defer func() { _ = rdb.Close() }()

compute, storage, network, lbProxy, err := initBackends(deps, cfg, logger, db, rdb)
rawCompute, rawStorage, rawNetwork, rawLBProxy, err := initBackends(deps, cfg, logger, db, rdb)
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

initBackends constructs rawLBProxy using compute from InitComputeBackend (per context excerpt). With the current order, the LB proxy's internal compute calls will still go through the raw compute backend, bypassing the compute bulkhead/circuit breaker/timeout protections. Consider initializing the LB proxy with the resilient compute backend (e.g., wrap compute first, then pass it into InitLBProxy) so that LB operations get the same compute-level resilience guarantees.

Copilot uses AI. Check for mistakes.
if err != nil {
logger.Error("backend initialization failed", "error", err)
return err
}

// Wrap raw backends with resilience decorators (circuit breaker, bulkhead, timeouts).
compute := platform.NewResilientCompute(rawCompute, logger, platform.ResilientComputeOpts{})
storage := platform.NewResilientStorage(rawStorage, logger, platform.ResilientStorageOpts{})
network := platform.NewResilientNetwork(rawNetwork, logger, platform.ResilientNetworkOpts{})
lbProxy := platform.NewResilientLB(rawLBProxy, logger, platform.ResilientLBOpts{})

repos := deps.InitRepositories(db, rdb)

// Create leader elector for singleton worker coordination.
// When multiple worker replicas run, only one will hold leadership per key.
leaderElector := postgres.NewPgLeaderElector(db, logger)

svcs, workers, err := deps.InitServices(setup.ServiceConfig{
Config: cfg, Repos: repos, Compute: compute, Storage: storage,
Network: network, LBProxy: lbProxy, DB: db, RDB: rdb, Logger: logger,
LeaderElector: leaderElector,
})
if err != nil {
logger.Error("service initialization failed", "error", err)
Expand All @@ -154,52 +166,61 @@ func run() error {
r.Use(otelgin.Middleware("compute-api"))
}

runApplication(deps, cfg, logger, r, workers)
return nil
return runApplication(deps, cfg, logger, r, workers)
}

func runApplication(deps AppDeps, cfg *platform.Config, logger *slog.Logger, r *gin.Engine, workers *setup.Workers) {
role := os.Getenv("APP_ROLE")
func runApplication(deps AppDeps, cfg *platform.Config, logger *slog.Logger, r *gin.Engine, workers *setup.Workers) error {
role := os.Getenv("ROLE")
if role == "" {
role = "all"
}
Comment on lines +172 to 176
Copy link

Copilot AI Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This changes the runtime configuration environment variable from APP_ROLE to ROLE. If existing deployments/tools still set APP_ROLE, they will silently fall back to all and start unexpected components. To avoid a breaking change, consider supporting both (e.g., prefer ROLE, fall back to APP_ROLE with a deprecation warning) or explicitly document and enforce the new variable at startup.

Copilot uses AI. Check for mistakes.

validRoles := map[string]bool{"api": true, "worker": true, "all": true}
if !validRoles[role] {
logger.Error("invalid ROLE value, must be one of: api, worker, all", "role", role)
return fmt.Errorf("invalid ROLE value %q, must be one of: api, worker, all", role)
}
logger.Info("starting with role", "role", role)

wg := &sync.WaitGroup{}
workerCtx, workerCancel := context.WithCancel(context.Background())

if role == "worker" || role == "all" {
runWorkers(workerCtx, wg, workers)
}

srv := deps.NewHTTPServer(":"+cfg.Port, r)

var srv *http.Server
if role == "api" || role == "all" {
srv = deps.NewHTTPServer(":"+cfg.Port, r)
go func() {
logger.Info("starting compute-api", "port", cfg.Port)
if err := deps.StartHTTPServer(srv); err != nil && !stdlib_errors.Is(err, http.ErrServerClosed) {
logger.Error("failed to start server", "error", err)
}
}()
} else {
logger.Info("running in worker-only mode")
logger.Info("running in worker-only mode, HTTP server disabled")
}

quit := make(chan os.Signal, 1)
deps.NotifySignals(quit, syscall.SIGINT, syscall.SIGTERM)
<-quit

logger.Info("shutting down server...")
logger.Info("shutting down...")

ctx, cancel := context.WithTimeout(context.Background(), defaultShutdownTimeout)
defer cancel()

if err := deps.ShutdownHTTPServer(ctx, srv); err != nil {
logger.Error("server forced to shutdown", "error", err)
if srv != nil {
if err := deps.ShutdownHTTPServer(ctx, srv); err != nil {
logger.Error("server forced to shutdown", "error", err)
}
}

workerCancel()
wg.Wait()
logger.Info("server exited")
logger.Info("shutdown complete")
return nil
}

type runner interface {
Expand Down
105 changes: 104 additions & 1 deletion cmd/api/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,10 @@ func TestRunApplicationApiRoleStartsAndShutsDown(t *testing.T) {
}()
}

runApplication(deps, &platform.Config{Port: "0"}, logger, gin.New(), &setup.Workers{})
err := runApplication(deps, &platform.Config{Port: "0"}, logger, gin.New(), &setup.Workers{})
if err != nil {
t.Fatalf("runApplication returned error: %v", err)
}

select {
case <-shutdownCalled:
Expand All @@ -167,6 +170,106 @@ func TestRunApplicationApiRoleStartsAndShutsDown(t *testing.T) {
}
}

// TestRunApplicationWorkerRoleDoesNotStartHTTP checks that with ROLE=worker
// runApplication returns without error after a shutdown signal and never
// creates, starts, or shuts down an HTTP server.
func TestRunApplicationWorkerRoleDoesNotStartHTTP(t *testing.T) {
	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
	t.Setenv("ROLE", "worker")

	deps := DefaultDeps()

	deps.NewHTTPServer = func(string, http.Handler) *http.Server {
		t.Fatalf("NewHTTPServer should not be called in worker-only mode")
		return nil
	}
	deps.StartHTTPServer = func(*http.Server) error {
		// runApplication calls StartHTTPServer from a goroutine it spawns.
		// Fatalf (FailNow) is only valid on the goroutine running the test,
		// so record the failure with Errorf instead.
		t.Errorf("StartHTTPServer should not be called in worker-only mode")
		return nil
	}
	deps.ShutdownHTTPServer = func(context.Context, *http.Server) error {
		t.Fatalf("ShutdownHTTPServer should not be called in worker-only mode")
		return nil
	}
	deps.NotifySignals = func(c chan<- os.Signal, _ ...os.Signal) {
		go func() {
			// Give workers a moment to start, then signal shutdown.
			time.Sleep(50 * time.Millisecond)
			c <- syscall.SIGTERM
		}()
	}

	err := runApplication(deps, &platform.Config{Port: "0"}, logger, gin.New(), &setup.Workers{})
	if err != nil {
		t.Fatalf("runApplication returned error: %v", err)
	}
	// Reaching this point with no recorded failure means no HTTP server hook was touched.
}

// TestRunApplicationDefaultsToAllRole checks that an empty ROLE makes
// runApplication start the HTTP server and shut it down again on SIGTERM.
func TestRunApplicationDefaultsToAllRole(t *testing.T) {
	// Explicitly empty to verify the default role.
	t.Setenv("ROLE", "")

	var (
		discardLog   = slog.New(slog.NewTextHandler(io.Discard, nil))
		serverUp     = make(chan struct{})
		shutdownSeen = make(chan struct{})
	)

	deps := DefaultDeps()
	deps.NewHTTPServer = func(addr string, handler http.Handler) *http.Server {
		srv := &http.Server{ReadHeaderTimeout: 10 * time.Second}
		srv.Addr = addr
		srv.Handler = handler
		return srv
	}
	deps.StartHTTPServer = func(*http.Server) error {
		// Announce the start, then report the normal "closed" sentinel.
		close(serverUp)
		return http.ErrServerClosed
	}
	deps.ShutdownHTTPServer = func(context.Context, *http.Server) error {
		close(shutdownSeen)
		return nil
	}
	deps.NotifySignals = func(c chan<- os.Signal, _ ...os.Signal) {
		// Deliver the termination signal only once the server stub has run.
		go func() {
			<-serverUp
			c <- syscall.SIGTERM
		}()
	}

	cfg := &platform.Config{Port: "0"}
	if err := runApplication(deps, cfg, discardLog, gin.New(), &setup.Workers{}); err != nil {
		t.Fatalf("runApplication returned error: %v", err)
	}

	timeout := time.After(time.Second)
	select {
	case <-timeout:
		t.Fatalf("expected server shutdown to be called when ROLE defaults to 'all'")
	case <-shutdownSeen:
	}
}

// TestRunApplicationInvalidRoleReturnsEarly checks that an unrecognised ROLE
// makes runApplication return an error without creating or starting the HTTP
// server and without registering signal handlers.
func TestRunApplicationInvalidRoleReturnsEarly(t *testing.T) {
	logger := slog.New(slog.NewTextHandler(io.Discard, nil))
	t.Setenv("ROLE", "invalid")

	deps := DefaultDeps()

	deps.NewHTTPServer = func(string, http.Handler) *http.Server {
		t.Fatalf("NewHTTPServer should not be called for invalid role")
		return nil
	}
	deps.StartHTTPServer = func(*http.Server) error {
		// runApplication calls StartHTTPServer from a goroutine it spawns.
		// Fatalf (FailNow) is only valid on the goroutine running the test,
		// so record the failure with Errorf instead.
		t.Errorf("StartHTTPServer should not be called for invalid role")
		return nil
	}
	deps.NotifySignals = func(c chan<- os.Signal, _ ...os.Signal) {
		t.Fatalf("NotifySignals should not be called for invalid role")
	}

	// Should return immediately without starting anything.
	err := runApplication(deps, &platform.Config{Port: "0"}, logger, gin.New(), &setup.Workers{})
	if err == nil {
		t.Fatalf("expected error for invalid role")
	}
}

// Stub helpers below keep main.go testable without altering production behavior.

type stubDB struct{ closed bool }
Expand Down
6 changes: 6 additions & 0 deletions docs/swagger/docs.go
Original file line number Diff line number Diff line change
Expand Up @@ -7862,6 +7862,12 @@ const docTemplate = `{
"name": "id",
"in": "path",
"required": true
},
{
"type": "boolean",
"description": "Force delete even with dependent resources (for async cleanup)",
"name": "force",
"in": "query"
}
],
"responses": {
Expand Down
6 changes: 6 additions & 0 deletions docs/swagger/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -7854,6 +7854,12 @@
"name": "id",
"in": "path",
"required": true
},
{
"type": "boolean",
"description": "Force delete even with dependent resources (for async cleanup)",
"name": "force",
"in": "query"
}
],
"responses": {
Expand Down
4 changes: 4 additions & 0 deletions docs/swagger/swagger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7420,6 +7420,10 @@ paths:
name: id
required: true
type: string
- description: Force delete even with dependent resources (for async cleanup)
in: query
name: force
type: boolean
produces:
- application/json
responses:
Expand Down
Loading
Loading