From f18801505d44922fa613253ea01a5c4721307253 Mon Sep 17 00:00:00 2001 From: Evrard-Nil Daillet Date: Mon, 15 Jun 2026 12:14:43 +0200 Subject: [PATCH] fix(db): auto-reap idle sessions to prevent connection-pool exhaustion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add migration V0062 setting idle_session_timeout=300s and idle_in_transaction_session_timeout=60s on the database so Postgres closes sessions orphaned by crashed/recycled cloud-api instances, instead of letting them accumulate to max_connections. Incident 2026-06-15: after a prod deploy, crash-looping/recycled instances left dozens of idle client backends (no server idle timeout; dead-peer TCP backends linger). The leader filled to max_connections and even a single healthy instance got 'FATAL: sorry, too many clients already'. Safe with deadpool: Fast recycling re-validates on checkout, so a warm pooled connection reaped after going idle is discarded and re-created transparently. Version-guarded (idle_session_timeout is PG14+, cluster is PG16) so it can never wedge startup on an older node. NOTE: this is the durable/recurrence fix. It does NOT clear an active pileup (it runs after the initial pool connect) and existing sessions are unaffected — pair with a manual pg_terminate_backend of idle backends and a DATABASE_MAX_CONNECTIONS reduction. 
--- .../sql/V0062__set_idle_session_timeout.sql | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 crates/database/src/migrations/sql/V0062__set_idle_session_timeout.sql diff --git a/crates/database/src/migrations/sql/V0062__set_idle_session_timeout.sql b/crates/database/src/migrations/sql/V0062__set_idle_session_timeout.sql new file mode 100644 index 000000000..4e4e6b5eb --- /dev/null +++ b/crates/database/src/migrations/sql/V0062__set_idle_session_timeout.sql @@ -0,0 +1,37 @@ +-- Auto-reap idle client sessions at the server so connections orphaned by a +-- crashed or recycled cloud-api instance do not accumulate and exhaust the +-- cluster's max_connections. +-- +-- Incident 2026-06-15: after a prod deploy, crash-looping/recycled instances +-- left behind dozens of `idle` client backends (no server-side idle timeout + +-- dead-peer TCP backends linger for minutes). The leader filled to +-- max_connections, so even a single healthy instance could not acquire a +-- connection ("FATAL: sorry, too many clients already"). These settings make +-- Postgres close abandoned sessions on its own. +-- +-- Safe with our pooling: deadpool re-validates a connection on checkout +-- (Fast recycling -> is_closed()), so a warm pooled connection that the server +-- reaps after going idle is simply discarded and re-created on next use — no +-- error surfaces to the app. 300s is well above normal inter-query gaps under +-- load, so only genuinely-abandoned sessions get reaped. +-- +-- idle_session_timeout requires PostgreSQL >= 14 (the cluster is PG16/Spilo-16); +-- the version guard skips only that setting on an older node (the +-- idle-in-transaction timeout below still applies), so it cannot wedge startup. 
+DO $$
+DECLARE
+    db_name    text := current_database();  -- database the migration runs in
+    server_ver int  := current_setting('server_version_num')::int;
+BEGIN
+    -- idle_session_timeout only exists on PG14+; older servers skip it.
+    IF server_ver >= 140000 THEN
+        EXECUTE format('ALTER DATABASE %I SET idle_session_timeout = %L',
+                       db_name, '300s');
+    END IF;
+
+    -- In core since PG 9.6, hence unguarded: a tighter limit for sessions
+    -- stuck idle inside an open transaction.
+    EXECUTE format('ALTER DATABASE %I SET idle_in_transaction_session_timeout = %L',
+                   db_name, '60s');
+END
+$$;