From 8bfbf39a70977934d1937c2af88a1ed081573a37 Mon Sep 17 00:00:00 2001
From: Muhammad Usama
Date: Fri, 20 Feb 2026 17:45:11 +0500
Subject: [PATCH 1/3] Disable idle_in_transaction_session_timeout on metadata worker connections during shard moves

In block_writes mode, LockShardListMetadataOnWorkers() opens coordinated
transactions on all metadata workers to hold advisory shard metadata locks.
These connections remain open for the entire duration of the shard move, but
workers not involved in the data copy have no commands to execute and they
sit idle-in-transaction until the coordinated transaction commits. For large
shards, the data copy can take hours, easily exceeding common
idle_in_transaction_session_timeout values. When the timeout fires on an
uninvolved worker, PostgreSQL terminates the connection and the move fails.

Fix by sending SET LOCAL idle_in_transaction_session_timeout = 0 on each
metadata worker connection before acquiring locks. SET LOCAL scopes the
change to the current transaction only, so normal sessions are unaffected.
---
 src/backend/distributed/utils/resource_lock.c | 12 ++-
 .../shard_move_constraints_blocking.out | 84 +++++++++++++++++++
 .../sql/shard_move_constraints_blocking.sql | 38 +++++++++
 3 files changed, 133 insertions(+), 1 deletion(-)

diff --git a/src/backend/distributed/utils/resource_lock.c b/src/backend/distributed/utils/resource_lock.c
index 1dbc84c42b2..9edfc4943a9 100644
--- a/src/backend/distributed/utils/resource_lock.c
+++ b/src/backend/distributed/utils/resource_lock.c
@@ -405,7 +405,17 @@ LockShardListMetadataOnWorkers(LOCKMODE lockmode, List *shardIntervalList)
 	appendStringInfo(lockCommand, "])");

-	SendCommandToWorkersWithMetadata(lockCommand->data);
+	/*
+	 * Disable idle_in_transaction_session_timeout on metadata workers before
+	 * acquiring locks. In block_writes mode, these connections stay open for
+	 * the entire shard copy which can take hours for large shards.
Without
+	 * this, the timeout would kill the connection and fail the move.
+	 * SET LOCAL scopes the change to this transaction only.
+	 */
+	List *commandList = list_make2(
+		"SET LOCAL idle_in_transaction_session_timeout = 0",
+		lockCommand->data);
+	SendCommandListToWorkersWithMetadata(commandList);
 }

diff --git a/src/test/regress/expected/shard_move_constraints_blocking.out b/src/test/regress/expected/shard_move_constraints_blocking.out
index 66dec069e7a..61dbf41ec04 100644
--- a/src/test/regress/expected/shard_move_constraints_blocking.out
+++ b/src/test/regress/expected/shard_move_constraints_blocking.out
@@ -399,3 +399,87 @@ drop cascades to table "blocking shard Move Fkeys Indexes".reference_table
 drop cascades to table "blocking shard Move Fkeys Indexes".reference_table_8970028
 drop cascades to table "blocking shard Move Fkeys Indexes".index_backed_rep_identity
 DROP ROLE mx_rebalancer_blocking_role_ent;
+-- Test: block_writes shard move succeeds even when workers have a low
+-- idle_in_transaction_session_timeout. LockShardListMetadataOnWorkers opens
+-- coordinated transactions on ALL metadata workers before the data copy.
+-- Workers not involved in the copy sit idle-in-transaction for the entire
+-- duration. Without the SET LOCAL override, the timeout would kill those
+-- connections and fail the move.
+SET citus.next_shard_id TO 8980000;
+SET citus.shard_count TO 4;
+SET citus.shard_replication_factor TO 1;
+CREATE SCHEMA blocking_move_idle_timeout;
+SET search_path TO blocking_move_idle_timeout;
+-- set a very low idle_in_transaction_session_timeout on all nodes
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM SET idle_in_transaction_session_timeout = ''1s''');
+ ?column?
+---------------------------------------------------------------------
+ 1
+ 1
+ 1
+(3 rows)
+
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+ ?column?
+---------------------------------------------------------------------
+ 1
+ 1
+ 1
+(3 rows)
+
+-- allow the reload to take effect
+SELECT pg_sleep(0.5);
+ pg_sleep
+---------------------------------------------------------------------
+
+(1 row)
+
+CREATE TABLE test_move(id int PRIMARY KEY, val text);
+SELECT create_distributed_table('test_move', 'id');
+ create_distributed_table
+---------------------------------------------------------------------
+
+(1 row)
+
+INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
+-- move a shard using block_writes; should succeed despite the 1s timeout
+SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
+ citus_move_shard_placement
+---------------------------------------------------------------------
+
+(1 row)
+
+SELECT public.wait_for_resource_cleanup();
+ wait_for_resource_cleanup
+---------------------------------------------------------------------
+
+(1 row)
+
+-- verify data integrity after move
+SELECT count(*) FROM test_move;
+ count
+---------------------------------------------------------------------
+ 100
+(1 row)
+
+-- cleanup: restore idle_in_transaction_session_timeout
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM RESET idle_in_transaction_session_timeout');
+ ?column?
+---------------------------------------------------------------------
+ 1
+ 1
+ 1
+(3 rows)
+
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+ ?column?
+---------------------------------------------------------------------
+ 1
+ 1
+ 1
+(3 rows)
+
+DROP SCHEMA blocking_move_idle_timeout CASCADE;
+NOTICE: drop cascades to table test_move
diff --git a/src/test/regress/sql/shard_move_constraints_blocking.sql b/src/test/regress/sql/shard_move_constraints_blocking.sql
index 66b58f42b9c..acbaca76ab9 100644
--- a/src/test/regress/sql/shard_move_constraints_blocking.sql
+++ b/src/test/regress/sql/shard_move_constraints_blocking.sql
@@ -222,3 +222,41 @@ ALTER TABLE sensors_2020_01_01 DROP CONSTRAINT fkey_from_child_to_child;
 \c - postgres - :master_port
 DROP SCHEMA "blocking shard Move Fkeys Indexes" CASCADE;
 DROP ROLE mx_rebalancer_blocking_role_ent;
+
+-- Test: block_writes shard move succeeds even when workers have a low
+-- idle_in_transaction_session_timeout. LockShardListMetadataOnWorkers opens
+-- coordinated transactions on ALL metadata workers before the data copy.
+-- Workers not involved in the copy sit idle-in-transaction for the entire
+-- duration. Without the SET LOCAL override, the timeout would kill those
+-- connections and fail the move.
+SET citus.next_shard_id TO 8980000;
+SET citus.shard_count TO 4;
+SET citus.shard_replication_factor TO 1;
+
+CREATE SCHEMA blocking_move_idle_timeout;
+SET search_path TO blocking_move_idle_timeout;
+
+-- set a very low idle_in_transaction_session_timeout on all nodes
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM SET idle_in_transaction_session_timeout = ''1s''');
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+-- allow the reload to take effect
+SELECT pg_sleep(0.5);
+
+CREATE TABLE test_move(id int PRIMARY KEY, val text);
+SELECT create_distributed_table('test_move', 'id');
+INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
+
+-- move a shard using block_writes; should succeed despite the 1s timeout
+SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
+SELECT public.wait_for_resource_cleanup();
+
+-- verify data integrity after move
+SELECT count(*) FROM test_move;
+
+-- cleanup: restore idle_in_transaction_session_timeout
+SELECT 1 FROM run_command_on_all_nodes(
+    'ALTER SYSTEM RESET idle_in_transaction_session_timeout');
+SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
+
+DROP SCHEMA blocking_move_idle_timeout CASCADE;

From 898d94cb003a93335d7a018caaa6eb71d0924483 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 25 Feb 2026 13:43:13 +0000
Subject: [PATCH 2/3] Initial plan

From 24bd54c329bc90e078c26f74badf6c302a8ab8f1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 25 Feb 2026 14:07:07 +0000
Subject: [PATCH 3/3] Add per-row delay to make idle timeout regression test deterministic

Use a NOT VALID check constraint that calls pg_sleep() per row so that
COPY (which fires check constraints) reliably takes > 1s during the
block_writes shard move.
This ensures
the idle_in_transaction timeout fires on uninvolved metadata worker
connections, making the regression test effective at detecting the
original failure mode.

Co-authored-by: emelsimsek <13130350+emelsimsek@users.noreply.github.com>
---
 .../shard_move_constraints_blocking.out | 24 ++++++++++++++++++-
 .../sql/shard_move_constraints_blocking.sql | 20 ++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/src/test/regress/expected/shard_move_constraints_blocking.out b/src/test/regress/expected/shard_move_constraints_blocking.out
index 61dbf41ec04..2d82c0e8278 100644
--- a/src/test/regress/expected/shard_move_constraints_blocking.out
+++ b/src/test/regress/expected/shard_move_constraints_blocking.out
@@ -435,6 +435,17 @@ SELECT pg_sleep(0.5);

 (1 row)

+-- Helper that sleeps for the given number of seconds and returns TRUE.
+-- Used in a NOT VALID check constraint below so that COPY (which fires
+-- check constraints) introduces a per-row delay during the shard move,
+-- making the data-copy phase reliably exceed the 1s timeout.
+CREATE FUNCTION sleep_and_true(float8) RETURNS boolean LANGUAGE plpgsql AS $$
+BEGIN
+    PERFORM pg_sleep($1);
+    RETURN true;
+END;
+$$;
+CREATE FUNCTION
 CREATE TABLE test_move(id int PRIMARY KEY, val text);
 SELECT create_distributed_table('test_move', 'id');
  create_distributed_table
@@ -443,6 +454,15 @@ SELECT create_distributed_table('test_move', 'id');

 (1 row)

 INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;
+-- Add a per-row delay constraint after inserting data (NOT VALID skips
+-- checking existing rows). COPY during the shard move fires check constraints,
+-- so each copied row will sleep for 0.1 s, ensuring the copy takes > 1 s and
+-- the idle timeout would kill uninvolved metadata workers without the fix.
+-- Use the schema-qualified function name so the constraint propagates to
+-- workers correctly regardless of their search_path.
+ALTER TABLE test_move ADD CONSTRAINT slow_copy
+    CHECK (blocking_move_idle_timeout.sleep_and_true(0.1)) NOT VALID;
+ALTER TABLE
 -- move a shard using block_writes; should succeed despite the 1s timeout
 SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes');
  citus_move_shard_placement
@@ -482,4 +502,6 @@ SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
 (3 rows)

 DROP SCHEMA blocking_move_idle_timeout CASCADE;
-NOTICE: drop cascades to table test_move
+NOTICE: drop cascades to 2 other objects
+DETAIL: drop cascades to function sleep_and_true(double precision)
+drop cascades to table test_move
diff --git a/src/test/regress/sql/shard_move_constraints_blocking.sql b/src/test/regress/sql/shard_move_constraints_blocking.sql
index acbaca76ab9..af8312ac76e 100644
--- a/src/test/regress/sql/shard_move_constraints_blocking.sql
+++ b/src/test/regress/sql/shard_move_constraints_blocking.sql
@@ -243,10 +243,30 @@ SELECT 1 FROM run_command_on_all_nodes('SELECT pg_reload_conf()');
 -- allow the reload to take effect
 SELECT pg_sleep(0.5);

+-- Helper that sleeps for the given number of seconds and returns TRUE.
+-- Used in a NOT VALID check constraint below so that COPY (which fires
+-- check constraints) introduces a per-row delay during the shard move,
+-- making the data-copy phase reliably exceed the 1s timeout.
+CREATE FUNCTION sleep_and_true(float8) RETURNS boolean LANGUAGE plpgsql AS $$
+BEGIN
+    PERFORM pg_sleep($1);
+    RETURN true;
+END;
+$$;
+
 CREATE TABLE test_move(id int PRIMARY KEY, val text);
 SELECT create_distributed_table('test_move', 'id');
 INSERT INTO test_move SELECT i, 'val_' || i FROM generate_series(1, 100) i;

+-- Add a per-row delay constraint after inserting data (NOT VALID skips
+-- checking existing rows).
COPY during the shard move fires check constraints, +-- so each copied row will sleep for 0.1 s, ensuring the copy takes > 1 s and +-- the idle timeout would kill uninvolved metadata workers without the fix. +-- Use the schema-qualified function name so the constraint propagates to +-- workers correctly regardless of their search_path. +ALTER TABLE test_move ADD CONSTRAINT slow_copy + CHECK (blocking_move_idle_timeout.sleep_and_true(0.1)) NOT VALID; + -- move a shard using block_writes; should succeed despite the 1s timeout SELECT citus_move_shard_placement(8980000, 'localhost', :worker_1_port, 'localhost', :worker_2_port, shard_transfer_mode:='block_writes'); SELECT public.wait_for_resource_cleanup();