Skip to content

Commit 58c3867

Browse files
committed
fix(controlplane): bound CAS checker lock-hold and release durations
Cap how long a single tick can hold the distributed lock (25 min) so a hung validation doesn't pin the lock past one tick; the next tick retries. Also bound the pg_advisory_unlock call (5 s) so a stuck session can't hang the release path — Postgres will release the lock on session disconnect anyway.

Assisted-by: Claude Code
Signed-off-by: Miguel Martinez Trivino <miguel@chainloop.dev>
Chainloop-Trace-Sessions: 052e8b56-72b5-4c6c-8d82-ab2d00728889
1 parent 3581a5a commit 58c3867

2 files changed

Lines changed: 19 additions & 1 deletion

File tree

app/controlplane/pkg/biz/casbackend_checker.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ var casBackendCheckerTracer = otelx.Tracer("chainloop-controlplane", "biz/casbac
3030
const (
3131
defaultInterval = 30 * time.Minute
3232
defaultValidationTimeout = 10 * time.Second
33+
// Upper bound on how long a single tick is allowed to hold the
34+
// distributed lock. Defends against a hung validation pinning the lock
35+
// past one tick; the next tick will retry.
36+
defaultMaxTickDuration = 25 * time.Minute
3337

3438
// Separate keys per scope so the two checker goroutines (defaults vs all backends)
3539
// don't block each other.
@@ -152,6 +156,11 @@ func (c *CASBackendChecker) checkBackends(ctx context.Context, defaultsOrFallbac
152156
}
153157
defer release()
154158

159+
// Cap how long we can hold the lock. If validations hang, the next tick
160+
// retries instead of one stuck pod pinning the lock indefinitely.
161+
ctx, cancel := context.WithTimeout(ctx, defaultMaxTickDuration)
162+
defer cancel()
163+
155164
ctx, span := otelx.Start(ctx, casBackendCheckerTracer, "CASBackendChecker.checkBackends")
156165
defer span.End()
157166

app/controlplane/pkg/data/lock.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,17 @@ import (
2020
"database/sql"
2121
"fmt"
2222
"hash/fnv"
23+
"time"
2324

2425
"github.com/chainloop-dev/chainloop/app/controlplane/pkg/biz"
2526
"github.com/go-kratos/kratos/v2/log"
2627
)
2728

29+
// Cap on how long the release path may block. Defends against a stuck
30+
// session: if pg_advisory_unlock can't return, we drop the connection
31+
// and let Postgres release the lock on session disconnect.
32+
const advisoryUnlockTimeout = 5 * time.Second
33+
2834
// PostgresLock implements biz.DistributedLock using Postgres session-level
2935
// advisory locks (pg_try_advisory_lock / pg_advisory_unlock).
3036
//
@@ -65,7 +71,10 @@ func (l *PostgresLock) TryAcquire(ctx context.Context, key string) (bool, func()
6571
release := func() {
6672
// pg_advisory_unlock must run on the same session that took the lock,
6773
// and must run even if the caller's context was cancelled (e.g. shutdown).
68-
if _, err := conn.ExecContext(context.Background(), "SELECT pg_advisory_unlock($1)", intKey); err != nil {
74+
// Bounded so a stuck session can't hang the release path.
75+
releaseCtx, cancel := context.WithTimeout(context.Background(), advisoryUnlockTimeout)
76+
defer cancel()
77+
if _, err := conn.ExecContext(releaseCtx, "SELECT pg_advisory_unlock($1)", intKey); err != nil {
6978
l.log.Warnw("msg", "failed to release advisory lock", "key", key, "error", err)
7079
}
7180
if err := conn.Close(); err != nil {

0 commit comments

Comments
 (0)