From 9f6e191d92aff22a1b1192990e9b44c4b9cedf85 Mon Sep 17 00:00:00 2001 From: Raghavendran Gopalakrishnan Date: Tue, 5 May 2026 17:31:20 +0530 Subject: [PATCH 1/7] D1060213: Retry and Dead-letter handling to clean up the expended task tables https://internal.almoctane.com/ui/entity-navigation?p=131002/6001&entityType=work_item&id=1060213 --- .../containertests/JobServiceIT.java | 196 ++++++++++++++++++ .../V9__add_delete_log_retry_tracking.sql | 52 +++++ .../internal/R__internal__insertDeleteLog.sql | 4 +- .../public/R__dropDeletedTaskTables.sql | 35 +++- 4 files changed, 280 insertions(+), 7 deletions(-) create mode 100644 job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql diff --git a/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java b/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java index 10df8a562..e62f1a484 100644 --- a/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java +++ b/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java @@ -1427,12 +1427,140 @@ public void testDeleteLog() throws SQLException assertEquals(foundTables.size(), 0); // assert number of rows in delete_log to be 0. assertEquals(getRowsInDeleteLog(dbConnection), 0); + assertEquals(getRowsInDeleteLogFailed(dbConnection), 0, + "delete_log_failed should be empty after all tables are dropped successfully"); } catch (final Exception e) { LOG.error(e.getMessage(), e); Assert.fail(); } } + @Test + public void testDeleteLogBacklogWhenDropTableFails() throws SQLException + { + final String tablePrefix = "task_delete_log_lock_" + System.currentTimeMillis(); + final String parentTableName = tablePrefix; + + // lockAcquired[0]: set by lock thread once the lock is held — gates the procedure call. + // releaseLock[0]: set by main thread after procedure returns — signals lock thread to release. + final boolean[] lockAcquired = {false}; + final boolean[] releaseLock = {false}; + + Thread lockThread = null; + final String firstChildTable = parentTableName + ".1"; + try (final java.sql.Connection dbConnection = JobServiceConnectionUtil.getDbConnection()) + { + // Create a hierarchy and queue it for asynchronous deletion. + createTaskTable(dbConnection, parentTableName); + insertRecordsInTaskTable(dbConnection, parentTableName, 20); + createAndPopulateChildTables(dbConnection, parentTableName); + insertTableNameIntoParentTableLog(dbConnection, parentTableName); + + // Acquire an EXCLUSIVE lock on the first child table before the procedure starts. + lockThread = new Thread(() -> { + System.out.println("Lock thread started"); + try (final java.sql.Connection lockConn = JobServiceConnectionUtil.getDbConnection()) { + lockConn.setAutoCommit(false); + try (final PreparedStatement lockStmt = lockConn.prepareStatement( + "LOCK TABLE \"" + firstChildTable + "\" IN EXCLUSIVE MODE")) { + lockStmt.execute(); + } + // Signal the main thread that the lock is held. + synchronized (lockAcquired) { + lockAcquired[0] = true; + lockAcquired.notifyAll(); + } + System.out.println("Lock acquired on " + firstChildTable + ", holding until procedure completes"); + // Hold the lock until the main thread signals that the procedure has finished. + final long deadline = System.currentTimeMillis() + 15_000L; + synchronized (releaseLock) { + while (!releaseLock[0] && System.currentTimeMillis() < deadline) { + releaseLock.wait(500); + } + } + lockConn.rollback(); + System.out.println("Lock released on " + firstChildTable); + } catch (final Exception ex) { + LOG.warn("Lock thread error", ex); + } + }); + lockThread.start(); + + // Wait until the lock thread confirms the lock is held before calling the procedure. + synchronized (lockAcquired) { + final long deadline = System.currentTimeMillis() + 10_000L; + while (!lockAcquired[0] && System.currentTimeMillis() < deadline) { + lockAcquired.wait(500); + } + } + if (!lockAcquired[0]) { + Assert.fail("Lock thread did not acquire the lock within 10 seconds"); + } + + // Make the failing DROP deterministic and fast. + setLockTimeout(dbConnection, "1s"); + + // Procedure now handles DROP failures internally — it no longer throws to the caller. + System.out.println("Calling drop_deleted_task_tables procedure which should encounter lock and handle it internally"); + callDropDeletedTaskTables(dbConnection); + System.out.println("drop_deleted_task_tables procedure completed"); + + // Procedure has returned — signal the lock thread to release, then assert. + synchronized (releaseLock) { + releaseLock[0] = true; + releaseLock.notifyAll(); + } + // The procedure's internal WHILE loop runs until no retryable entries remain, so within + // a single call firstChildTable is attempted max_retries (3) times, exhausted, and evicted + // to delete_log_failed. It is no longer in delete_log at this point. + assertFalse(entryInDeleteLog(dbConnection, firstChildTable), + "firstChildTable should have been evicted from delete_log after exhausting max_retries"); + // Verify the entry landed in the dead-letter table with a captured error message. + final String lastError = getLastErrorFromDeleteLogFailed(dbConnection, firstChildTable); + assertNotNull(lastError, + "last_error in delete_log_failed should be set after failed drop attempts"); + assertFalse(lastError.isEmpty(), + "last_error in delete_log_failed should not be empty"); + // delete_log must be empty — all other tables were dropped, firstChildTable was evicted. + assertEquals(getRowsInDeleteLog(dbConnection), 0, + "delete_log should be empty after procedure completes"); + // The physical table must still exist — it could not be dropped. + final List tablesAfterDropAttempt = getTablesByPrefix(dbConnection, firstChildTable); + assertFalse(tablesAfterDropAttempt.isEmpty(), + "firstChildTable should still physically exist since DROP was blocked"); + + } catch (final Exception e) { + LOG.error(e.getMessage(), e); + Assert.fail(); + } finally { + // Ensure the lock thread is always unblocked even if the try block threw. + synchronized (releaseLock) { + releaseLock[0] = true; + releaseLock.notifyAll(); + } + if (lockThread != null) { + try { + lockThread.join(5000); + } catch (InterruptedException ignored) { + Thread.currentThread().interrupt(); + } + } + // Best-effort cleanup so later tests are not affected by leftover task tables. + try (final java.sql.Connection cleanupConnection = JobServiceConnectionUtil.getDbConnection()) { + setLockTimeout(cleanupConnection, "30s"); + for (int i = 0; i < 5; i++) { + callDropDeletedTaskTables(cleanupConnection); + if (getTablesByPrefix(cleanupConnection, tablePrefix).isEmpty()) { + break; + } + Thread.sleep(1000); + } + } catch (final Exception cleanupError) { + LOG.warn("Cleanup after testDeleteLogBacklogWhenDropTableFails did not fully complete", cleanupError); + } + } + } + @Test public void testUpdateJobProgress() { @@ -1619,6 +1747,74 @@ private int getRowsInDeleteLog(final java.sql.Connection dbConnection) throws SQ return 0; } + private int getRowsInDeleteLogFailed(final java.sql.Connection dbConnection) throws SQLException + { + try (final PreparedStatement stmt = dbConnection.prepareStatement( + "SELECT COUNT(*) FROM delete_log_failed"); + final ResultSet rs = stmt.executeQuery()) { + if (rs.next()) { + return rs.getInt(1); + } + } + return 0; + } + + private String getLastErrorFromDeleteLogFailed(final java.sql.Connection dbConnection, final String tableName) throws SQLException + { + try (final PreparedStatement stmt = dbConnection.prepareStatement( + "SELECT last_error FROM delete_log_failed WHERE table_name = ?")) { + stmt.setString(1, tableName); + try (final ResultSet rs = stmt.executeQuery()) { + if (rs.next()) { + return rs.getString(1); + } + } + } + return null; + } + + private boolean entryInDeleteLog(final java.sql.Connection dbConnection, final String tableName) throws SQLException + { + try (final PreparedStatement stmt = dbConnection.prepareStatement( + "SELECT 1 FROM delete_log WHERE table_name = ?")) { + stmt.setString(1, tableName); + try (final ResultSet rs = stmt.executeQuery()) { + return rs.next(); + } + } + } + + private List getTablesByPrefix(final java.sql.Connection dbConnection, final String tablePrefix) throws SQLException + { + final List foundTables = new ArrayList(); + try (final PreparedStatement stmt = dbConnection.prepareStatement( + "SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename LIKE ?")) { + stmt.setString(1, tablePrefix + "%"); + try (final ResultSet rs = stmt.executeQuery()) { + while (rs.next()) { + foundTables.add(rs.getString(1)); + } + } + } + return foundTables; + } + + private void callDropDeletedTaskTables(final java.sql.Connection dbConnection) throws SQLException + { + try (final CallableStatement dropDeletedTables = dbConnection.prepareCall("call drop_deleted_task_tables()")) { + dropDeletedTables.execute(); + } + } + + private void setLockTimeout(final java.sql.Connection dbConnection, final String timeout) throws SQLException + { + try (final PreparedStatement lockTimeoutStmt = dbConnection.prepareStatement( + "SELECT set_config('lock_timeout', ?, false)")) { + lockTimeoutStmt.setString(1, timeout); + lockTimeoutStmt.execute(); + } + } + /** * Retrieve a single message from a queue. Call this before triggering the message publish. * diff --git a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql new file mode 100644 index 000000000..b1157344c --- /dev/null +++ b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql @@ -0,0 +1,52 @@ +-- +-- Copyright 2016-2026 Open Text. +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +/* + ************************************************************** + ** Add retry tracking columns to delete_log and create the ** + ** delete_log_failed dead-letter table. ** + ** ** + ** delete_log table now records how many times a ** + ** deletion has been attempted, the last error encountered, ** + ** and when it was last tried. Entries that exceed the ** + ** maximum retry threshold are promoted to delete_log_failed ** + ** so that operators can inspect and remediate them without ** + ** blocking ongoing cleanup work. ** + ************************************************************** + */ + +-- Add retry tracking columns to the existing delete_log table. +ALTER TABLE public.delete_log + ADD COLUMN IF NOT EXISTS retry_count INTEGER NOT NULL DEFAULT 0; + +ALTER TABLE public.delete_log + ADD COLUMN IF NOT EXISTS last_error TEXT; + +ALTER TABLE public.delete_log + ADD COLUMN IF NOT EXISTS last_attempted_at TIMESTAMPTZ; + +-- Dead-letter table for delete_log entries that have exhausted all retries. +CREATE TABLE IF NOT EXISTS public.delete_log_failed +( + table_name VARCHAR(63) NOT NULL, + last_error TEXT, + first_failed_at TIMESTAMPTZ NOT NULL DEFAULT now(), + last_attempted_at TIMESTAMPTZ NOT NULL DEFAULT now(), + CONSTRAINT delete_log_failed_pkey PRIMARY KEY (table_name) +); + +-- Note: the PRIMARY KEY constraint above already creates a btree index on table_name; +-- no additional secondary index is required. diff --git a/job-service-db/src/main/resources/db/migration/functions/internal/R__internal__insertDeleteLog.sql b/job-service-db/src/main/resources/db/migration/functions/internal/R__internal__insertDeleteLog.sql index 66196d396..cfc4ebe15 100644 --- a/job-service-db/src/main/resources/db/migration/functions/internal/R__internal__insertDeleteLog.sql +++ b/job-service-db/src/main/resources/db/migration/functions/internal/R__internal__insertDeleteLog.sql @@ -1,5 +1,5 @@ -- --- Copyright 2016-2022 Micro Focus or one of its affiliates. +-- Copyright 2016-2026 Micro Focus or one of its affiliates. -- -- Licensed under the Apache License, Version 2.0 (the "License"); -- you may not use this file except in compliance with the License. @@ -27,6 +27,6 @@ RETURNS VOID LANGUAGE plpgsql VOLATILE AS $$ BEGIN - INSERT INTO public.delete_log VALUES (task_table_name); + INSERT INTO public.delete_log (table_name) VALUES (task_table_name); END $$; diff --git a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql index c9bd2aae2..d24481f40 100644 --- a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql +++ b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql @@ -20,8 +20,13 @@ * Description: * This procedure reads parent task table names from deleted_parent_table_log table and populates delete_log table with names of * parent as well as child task tables to be dropped. After populating the tables, it then reads the table names from delete_log table - * and drops them. + * and drops them. * All the above is done through batch commits. The batch is defined by commit_limit variable. Default batch size being 10. + * + * DROP loop includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times (default 3). + * On failure the retry_count, last_error, and last_attempted_at columns of delete_log are updated. Once a row reaches + * max_retries without a successful DROP it is evicted to the dead-letter table delete_log_failed (and removed from + * delete_log) before the batch COMMIT so that it does not block subsequent processing. */ CREATE OR REPLACE PROCEDURE drop_deleted_task_tables() LANGUAGE plpgsql @@ -32,6 +37,8 @@ DECLARE commit_limit INTEGER:=10; parent_table_log_rec RECORD; rec RECORD; + max_retries CONSTANT INTEGER := 3; + drop_error TEXT; BEGIN -- insert table names into delete_log @@ -48,15 +55,33 @@ BEGIN COMMIT; END LOOP; - selected_table_names := $q$SELECT table_name FROM delete_log LIMIT $q$ || commit_limit || $q$ FOR UPDATE SKIP LOCKED$q$; + selected_table_names := $q$SELECT table_name FROM delete_log WHERE retry_count < $q$ || max_retries || $q$ LIMIT $q$ || commit_limit || $q$ FOR UPDATE SKIP LOCKED$q$; - WHILE EXISTS (SELECT 1 FROM delete_log) + WHILE EXISTS (SELECT 1 FROM delete_log WHERE retry_count < max_retries) LOOP FOR rec IN EXECUTE selected_table_names LOOP - EXECUTE 'DROP TABLE IF EXISTS ' || quote_ident(rec.table_name); - DELETE FROM delete_log WHERE table_name = rec.table_name; + BEGIN + EXECUTE 'DROP TABLE IF EXISTS ' || quote_ident(rec.table_name); + DELETE FROM delete_log WHERE table_name = rec.table_name; + EXCEPTION WHEN OTHERS THEN + GET STACKED DIAGNOSTICS drop_error = MESSAGE_TEXT; + UPDATE delete_log + SET retry_count = retry_count + 1, + last_error = drop_error, + last_attempted_at = now() + WHERE table_name = rec.table_name; + END; END LOOP; + -- Move exhausted entries to dead-letter table + INSERT INTO delete_log_failed (table_name, last_error, last_attempted_at) + SELECT table_name, last_error, last_attempted_at + FROM delete_log + WHERE retry_count >= max_retries + ON CONFLICT (table_name) DO UPDATE + SET last_error = EXCLUDED.last_error, + last_attempted_at = EXCLUDED.last_attempted_at; + DELETE FROM delete_log WHERE retry_count >= max_retries; COMMIT; END LOOP; END From 07650fcc1f1018c57135593aefaec027074a7d4d Mon Sep 17 00:00:00 2001 From: Raghavendran Gopalakrishnan Date: Wed, 6 May 2026 17:43:43 +0530 Subject: [PATCH 2/7] Updated release notes --- release-notes-10.1.2.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/release-notes-10.1.2.md b/release-notes-10.1.2.md index b36546391..bb5fb027d 100644 --- a/release-notes-10.1.2.md +++ b/release-notes-10.1.2.md @@ -5,4 +5,7 @@ ${version-number} #### New Features +#### Bug fixes +- **D1060213**: Expended `task_` tables are not getting cleaned up if db error happens in stored procedure. + #### Known Issues From a0c94c90c0acb3b269e248779fc9b5039d9d24e5 Mon Sep 17 00:00:00 2001 From: Raghavendran Gopalakrishnan Date: Wed, 6 May 2026 17:46:08 +0530 Subject: [PATCH 3/7] minor --- .../db/migration/V9__add_delete_log_retry_tracking.sql | 3 --- 1 file changed, 3 deletions(-) diff --git a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql index b1157344c..27b5b48ea 100644 --- a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql +++ b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql @@ -47,6 +47,3 @@ CREATE TABLE IF NOT EXISTS public.delete_log_failed last_attempted_at TIMESTAMPTZ NOT NULL DEFAULT now(), CONSTRAINT delete_log_failed_pkey PRIMARY KEY (table_name) ); - --- Note: the PRIMARY KEY constraint above already creates a btree index on table_name; --- no additional secondary index is required. From f180c9ce8b2c36654fc4387d7293a19e773cd237 Mon Sep 17 00:00:00 2001 From: Raghavendran Gopalakrishnan Date: Thu, 14 May 2026 19:08:51 +0530 Subject: [PATCH 4/7] Add error handling during population of delete_log table --- .../containertests/JobServiceIT.java | 163 ++++++++++++++++-- .../V9__add_delete_log_retry_tracking.sql | 17 +- .../public/R__dropDeletedTaskTables.sql | 32 +++- 3 files changed, 188 insertions(+), 24 deletions(-) diff --git a/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java b/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java index e62f1a484..dff284b05 100644 --- a/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java +++ b/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java @@ -67,6 +67,7 @@ import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; +import java.sql.Statement; import java.text.SimpleDateFormat; import java.time.Duration; import java.time.Instant; @@ -1427,8 +1428,8 @@ public void testDeleteLog() throws SQLException assertEquals(foundTables.size(), 0); // assert number of rows in delete_log to be 0. assertEquals(getRowsInDeleteLog(dbConnection), 0); - assertEquals(getRowsInDeleteLogFailed(dbConnection), 0, - "delete_log_failed should be empty after all tables are dropped successfully"); + assertEquals(getRowsInTableCleanupFailed(dbConnection), 0, + "table_cleanup_failed should be empty after all tables are dropped successfully"); } catch (final Exception e) { LOG.error(e.getMessage(), e); Assert.fail(); @@ -1512,15 +1513,15 @@ public void testDeleteLogBacklogWhenDropTableFails() throws SQLException } // The procedure's internal WHILE loop runs until no retryable entries remain, so within // a single call firstChildTable is attempted max_retries (3) times, exhausted, and evicted - // to delete_log_failed. It is no longer in delete_log at this point. + // to table_cleanup_failed. It is no longer in delete_log at this point. assertFalse(entryInDeleteLog(dbConnection, firstChildTable), "firstChildTable should have been evicted from delete_log after exhausting max_retries"); // Verify the entry landed in the dead-letter table with a captured error message. - final String lastError = getLastErrorFromDeleteLogFailed(dbConnection, firstChildTable); + final String lastError = getLastErrorFromTableCleanupFailed(dbConnection, firstChildTable); assertNotNull(lastError, - "last_error in delete_log_failed should be set after failed drop attempts"); + "last_error in table_cleanup_failed should be set after failed drop attempts"); assertFalse(lastError.isEmpty(), - "last_error in delete_log_failed should not be empty"); + "last_error in table_cleanup_failed should not be empty"); // delete_log must be empty — all other tables were dropped, firstChildTable was evicted. assertEquals(getRowsInDeleteLog(dbConnection), 0, "delete_log should be empty after procedure completes"); @@ -1561,6 +1562,137 @@ public void testDeleteLogBacklogWhenDropTableFails() throws SQLException } } + @Test + public void testPopulateFailureDoesNotLeaveParentEntryStuck() throws SQLException + { + final String tablePrefix = "task_populate_fail_" + System.currentTimeMillis(); + final String parentTableName = tablePrefix; + + // lockAcquired[0]: set by lock thread once the lock is held — gates the procedure call. + // releaseLock[0]: set by main thread after procedure returns — signals lock thread to release. + final boolean[] lockAcquired = {false}; + final boolean[] releaseLock = {false}; + + Thread lockThread = null; + try (final java.sql.Connection dbConnection = JobServiceConnectionUtil.getDbConnection()) + { + // Create the parent table (no child tables needed — the SELECT on the parent is enough + // to trigger the failure). + createTaskTable(dbConnection, parentTableName); + insertRecordsInTaskTable(dbConnection, parentTableName, 5); + insertTableNameIntoParentTableLog(dbConnection, parentTableName); + + // Hold ACCESS EXCLUSIVE on the parent task table. This blocks the ACCESS SHARE + // (SELECT) issued by internal_populate_delete_log_table when it reads subtask rows. + lockThread = new Thread(() -> { + try (java.sql.Connection lockConn = JobServiceConnectionUtil.getDbConnection()) { + lockConn.setAutoCommit(false); + try (final PreparedStatement lockStmt = lockConn.prepareStatement( + "LOCK TABLE \"" + parentTableName + "\" IN ACCESS EXCLUSIVE MODE")) { + lockStmt.execute(); + } + synchronized (lockAcquired) { + lockAcquired[0] = true; + lockAcquired.notifyAll(); + } + System.out.println("ACCESS EXCLUSIVE lock acquired on " + parentTableName); + final long deadline = System.currentTimeMillis() + 60_000L; + synchronized (releaseLock) { + while (!releaseLock[0] && System.currentTimeMillis() < deadline) { + releaseLock.wait(500); + } + } + lockConn.rollback(); + System.out.println("ACCESS EXCLUSIVE lock released on " + parentTableName); + } catch (final Exception ex) { + LOG.warn("Lock thread error", ex); + } + }); + lockThread.start(); + + // Wait until the lock thread confirms the lock is held before calling the procedure. + synchronized (lockAcquired) { + final long deadline = System.currentTimeMillis() + 10_000L; + while (!lockAcquired[0] && System.currentTimeMillis() < deadline) { + lockAcquired.wait(500); + } + } + if (!lockAcquired[0]) { + Assert.fail("Lock thread did not acquire ACCESS EXCLUSIVE lock within 10 seconds"); + } + + // Short lock_timeout so the SELECT inside populate fails quickly and deterministically. + setLockTimeout(dbConnection, "1s"); + + // populate fails once due to lock timeout, entry immediately evicted to dead-letter + System.out.println("Calling drop_deleted_task_tables — populate should fail due to lock timeout"); + callDropDeletedTaskTables(dbConnection); + System.out.println("drop_deleted_task_tables completed"); + + // Signal the lock thread to release before asserting. + synchronized (releaseLock) { + releaseLock[0] = true; + releaseLock.notifyAll(); + } + + // Entry must be immediately in table_cleanup_failed + assertTrue(getRowsInTableCleanupFailed(dbConnection) > 0, + "table_cleanup_failed should have failed populate entry after first failure (Phase 1 fail-fast)"); + final String lastError = getLastErrorFromTableCleanupFailed(dbConnection, parentTableName); + assertTrue(lastError != null && lastError.contains("populate failed:"), + "dead-letter entry must have error prefixed with 'populate failed:', got: " + lastError); + + // Entry must be removed from deleted_parent_table_log (evicted to dead-letter). + assertFalse(entryInDeletedParentTableLog(dbConnection, parentTableName), + "deleted_parent_table_log entry must be removed after eviction to dead-letter"); + + // delete_log must be empty — populate failed, nothing was queued for DROP. + assertEquals(getRowsInDeleteLog(dbConnection), 0, + "delete_log should be empty because populate failed"); + + // The physical table must still exist — it was never queued for deletion. + assertFalse(getTablesByPrefix(dbConnection, parentTableName).isEmpty(), + "parent task table should still exist since populate never succeeded"); + + } catch (final Exception e) { + LOG.error(e.getMessage(), e); + Assert.fail(); + } finally { + synchronized (releaseLock) { + releaseLock[0] = true; + releaseLock.notifyAll(); + } + if (lockThread != null) { + try { + lockThread.join(5000); + } catch (final InterruptedException ignored) { + Thread.currentThread().interrupt(); + } + } + + try (final java.sql.Connection cleanupConnection = JobServiceConnectionUtil.getDbConnection()) { + setLockTimeout(cleanupConnection, "30s"); + try (final PreparedStatement stmt = cleanupConnection.prepareStatement( + "DELETE FROM table_cleanup_failed WHERE table_name = ?")) { + stmt.setString(1, parentTableName); + stmt.executeUpdate(); + } catch (final Exception deadLetterCleanupError) { + LOG.warn("Could not clean up table_cleanup_failed entry for {}", parentTableName, deadLetterCleanupError); + } + // Drop the physical table(s) — each drop is independently guarded. + for (final String table : getTablesByPrefix(cleanupConnection, tablePrefix)) { + try (final Statement stmt = cleanupConnection.createStatement()) { + stmt.execute("DROP TABLE IF EXISTS \"" + table + "\""); + } catch (final Exception dropError) { + LOG.warn("Could not drop table {} during cleanup", table, dropError); + } + } + } catch (final Exception cleanupError) { + LOG.warn("Cleanup after testPopulateFailureDoesNotLeaveParentEntryStuck did not fully complete", cleanupError); + } + } + } + @Test public void testUpdateJobProgress() { @@ -1747,10 +1879,10 @@ private int getRowsInDeleteLog(final java.sql.Connection dbConnection) throws SQ return 0; } - private int getRowsInDeleteLogFailed(final java.sql.Connection dbConnection) throws SQLException + private int getRowsInTableCleanupFailed(final java.sql.Connection dbConnection) throws SQLException { try (final PreparedStatement stmt = dbConnection.prepareStatement( - "SELECT COUNT(*) FROM delete_log_failed"); + "SELECT COUNT(*) FROM table_cleanup_failed"); final ResultSet rs = stmt.executeQuery()) { if (rs.next()) { return rs.getInt(1); @@ -1759,10 +1891,10 @@ private int getRowsInDeleteLogFailed(final java.sql.Connection dbConnection) thr return 0; } - private String getLastErrorFromDeleteLogFailed(final java.sql.Connection dbConnection, final String tableName) throws SQLException + private String getLastErrorFromTableCleanupFailed(final java.sql.Connection dbConnection, final String tableName) throws SQLException { try (final PreparedStatement stmt = dbConnection.prepareStatement( - "SELECT last_error FROM delete_log_failed WHERE table_name = ?")) { + "SELECT last_error FROM table_cleanup_failed WHERE table_name = ?")) { stmt.setString(1, tableName); try (final ResultSet rs = stmt.executeQuery()) { if (rs.next()) { @@ -1784,6 +1916,17 @@ private boolean entryInDeleteLog(final java.sql.Connection dbConnection, final S } } + private boolean entryInDeletedParentTableLog(final java.sql.Connection dbConnection, final String tableName) throws SQLException + { + try (final PreparedStatement stmt = dbConnection.prepareStatement( + "SELECT 1 FROM deleted_parent_table_log WHERE table_name = ?")) { + stmt.setString(1, tableName); + try (final ResultSet rs = stmt.executeQuery()) { + return rs.next(); + } + } + } + private List getTablesByPrefix(final java.sql.Connection dbConnection, final String tablePrefix) throws SQLException { final List foundTables = new ArrayList(); diff --git a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql index 27b5b48ea..ecd5220eb 100644 --- a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql +++ b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql @@ -17,14 +17,17 @@ /* ************************************************************** ** Add retry tracking columns to delete_log and create the ** - ** delete_log_failed dead-letter table. ** + ** table_cleanup_failed dead-letter table. ** ** ** ** delete_log table now records how many times a ** ** deletion has been attempted, the last error encountered, ** ** and when it was last tried. Entries that exceed the ** - ** maximum retry threshold are promoted to delete_log_failed ** - ** so that operators can inspect and remediate them without ** - ** blocking ongoing cleanup work. ** + ** maximum retry threshold are promoted to ** + ** table_cleanup_failed so that operators can inspect and ** + ** remediate them without blocking ongoing cleanup work. ** + ** ** + ** table_cleanup_failed is also used in case of any transient** + ** failures during population of delete_log entries. ** ************************************************************** */ @@ -38,12 +41,12 @@ ALTER TABLE public.delete_log ALTER TABLE public.delete_log ADD COLUMN IF NOT EXISTS last_attempted_at TIMESTAMPTZ; --- Dead-letter table for delete_log entries that have exhausted all retries. -CREATE TABLE IF NOT EXISTS public.delete_log_failed +-- Dead-letter table for entries fails during cleanup procedure. +CREATE TABLE IF NOT EXISTS public.table_cleanup_failed ( table_name VARCHAR(63) NOT NULL, last_error TEXT, first_failed_at TIMESTAMPTZ NOT NULL DEFAULT now(), last_attempted_at TIMESTAMPTZ NOT NULL DEFAULT now(), - CONSTRAINT delete_log_failed_pkey PRIMARY KEY (table_name) + CONSTRAINT table_cleanup_failed_pkey PRIMARY KEY (table_name) ); diff --git a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql index d24481f40..3a35684aa 100644 --- a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql +++ b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql @@ -23,9 +23,14 @@ * and drops them. * All the above is done through batch commits. The batch is defined by commit_limit variable. Default batch size being 10. * - * DROP loop includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times (default 3). + * POPULATE phase (Phase 1) uses immediate failure-to-dead-letter: if populate fails (lock timeout, table corruption, etc.), + * the entry is immediately evicted to table_cleanup_failed for operator inspection and remediation. Retrying Phase 1 is unsafe + * because the recursive procedure with intermediate commits may have partially inserted rows into delete_log already; retrying + * would cause duplicates or silent conflicts. Instead, operators investigate the root cause, fix it, and re-run the procedure. + * + * DROP loop (Phase 2) includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times (default 3). * On failure the retry_count, last_error, and last_attempted_at columns of delete_log are updated. Once a row reaches - * max_retries without a successful DROP it is evicted to the dead-letter table delete_log_failed (and removed from + * max_retries without a successful DROP it is evicted to the dead-letter table table_cleanup_failed (and removed from * delete_log) before the batch COMMIT so that it does not block subsequent processing. */ CREATE OR REPLACE PROCEDURE drop_deleted_task_tables() @@ -39,18 +44,31 @@ DECLARE rec RECORD; max_retries CONSTANT INTEGER := 3; drop_error TEXT; + populate_error TEXT; BEGIN - -- insert table names into delete_log + -- if populate fails, entry is immediately evicted selected_parent_table_names := $q$SELECT table_name FROM deleted_parent_table_log LIMIT $q$ || commit_limit || $q$ FOR UPDATE SKIP LOCKED$q$; WHILE EXISTS(SELECT 1 FROM deleted_parent_table_log) LOOP FOR parent_table_log_rec IN EXECUTE selected_parent_table_names LOOP - CALL internal_populate_delete_log_table(parent_table_log_rec.table_name, 0); - -- delete the parent table name from parent table. - DELETE FROM deleted_parent_table_log WHERE table_name = parent_table_log_rec.table_name; + BEGIN + CALL internal_populate_delete_log_table(parent_table_log_rec.table_name, 0); + -- Successfully populated — delete the parent table entry. + DELETE FROM deleted_parent_table_log WHERE table_name = parent_table_log_rec.table_name; + EXCEPTION WHEN OTHERS THEN + -- Populate failed — immediately evict to dead-letter for operator investigation. + GET STACKED DIAGNOSTICS populate_error = MESSAGE_TEXT; + INSERT INTO table_cleanup_failed (table_name, last_error, last_attempted_at) + VALUES (parent_table_log_rec.table_name, 'populate failed: ' || populate_error, now()) + ON CONFLICT (table_name) DO UPDATE + SET last_error = EXCLUDED.last_error, + last_attempted_at = EXCLUDED.last_attempted_at; + -- Delete the parent entry so future calls don't re-attempt. + DELETE FROM deleted_parent_table_log WHERE table_name = parent_table_log_rec.table_name; + END; END LOOP; COMMIT; END LOOP; @@ -74,7 +92,7 @@ BEGIN END; END LOOP; -- Move exhausted entries to dead-letter table - INSERT INTO delete_log_failed (table_name, last_error, last_attempted_at) + INSERT INTO table_cleanup_failed (table_name, last_error, last_attempted_at) SELECT table_name, last_error, last_attempted_at FROM delete_log WHERE retry_count >= max_retries From 5a47a69d24e96c9869b57591dbf26274032e9754 Mon Sep 17 00:00:00 2001 From: Raghavendran Gopalakrishnan Date: Thu, 14 May 2026 20:50:21 +0530 Subject: [PATCH 5/7] Revert "Add error handling during population of delete_log table" This reverts commit f180c9ce8b2c36654fc4387d7293a19e773cd237. --- .../containertests/JobServiceIT.java | 163 ++---------------- .../V9__add_delete_log_retry_tracking.sql | 17 +- .../public/R__dropDeletedTaskTables.sql | 32 +--- 3 files changed, 24 insertions(+), 188 deletions(-) diff --git a/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java b/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java index dff284b05..e62f1a484 100644 --- a/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java +++ b/job-service-container-tests/src/test/java/com/github/jobservice/containertests/JobServiceIT.java @@ -67,7 +67,6 @@ import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; -import java.sql.Statement; import java.text.SimpleDateFormat; import java.time.Duration; import java.time.Instant; @@ -1428,8 +1427,8 @@ public void testDeleteLog() throws SQLException assertEquals(foundTables.size(), 0); // assert number of rows in delete_log to be 0. assertEquals(getRowsInDeleteLog(dbConnection), 0); - assertEquals(getRowsInTableCleanupFailed(dbConnection), 0, - "table_cleanup_failed should be empty after all tables are dropped successfully"); + assertEquals(getRowsInDeleteLogFailed(dbConnection), 0, + "delete_log_failed should be empty after all tables are dropped successfully"); } catch (final Exception e) { LOG.error(e.getMessage(), e); Assert.fail(); @@ -1513,15 +1512,15 @@ public void testDeleteLogBacklogWhenDropTableFails() throws SQLException } // The procedure's internal WHILE loop runs until no retryable entries remain, so within // a single call firstChildTable is attempted max_retries (3) times, exhausted, and evicted - // to table_cleanup_failed. It is no longer in delete_log at this point. + // to delete_log_failed. It is no longer in delete_log at this point. assertFalse(entryInDeleteLog(dbConnection, firstChildTable), "firstChildTable should have been evicted from delete_log after exhausting max_retries"); // Verify the entry landed in the dead-letter table with a captured error message. - final String lastError = getLastErrorFromTableCleanupFailed(dbConnection, firstChildTable); + final String lastError = getLastErrorFromDeleteLogFailed(dbConnection, firstChildTable); assertNotNull(lastError, - "last_error in table_cleanup_failed should be set after failed drop attempts"); + "last_error in delete_log_failed should be set after failed drop attempts"); assertFalse(lastError.isEmpty(), - "last_error in table_cleanup_failed should not be empty"); + "last_error in delete_log_failed should not be empty"); // delete_log must be empty — all other tables were dropped, firstChildTable was evicted. assertEquals(getRowsInDeleteLog(dbConnection), 0, "delete_log should be empty after procedure completes"); @@ -1562,137 +1561,6 @@ public void testDeleteLogBacklogWhenDropTableFails() throws SQLException } } - @Test - public void testPopulateFailureDoesNotLeaveParentEntryStuck() throws SQLException - { - final String tablePrefix = "task_populate_fail_" + System.currentTimeMillis(); - final String parentTableName = tablePrefix; - - // lockAcquired[0]: set by lock thread once the lock is held — gates the procedure call. - // releaseLock[0]: set by main thread after procedure returns — signals lock thread to release. - final boolean[] lockAcquired = {false}; - final boolean[] releaseLock = {false}; - - Thread lockThread = null; - try (final java.sql.Connection dbConnection = JobServiceConnectionUtil.getDbConnection()) - { - // Create the parent table (no child tables needed — the SELECT on the parent is enough - // to trigger the failure). - createTaskTable(dbConnection, parentTableName); - insertRecordsInTaskTable(dbConnection, parentTableName, 5); - insertTableNameIntoParentTableLog(dbConnection, parentTableName); - - // Hold ACCESS EXCLUSIVE on the parent task table. This blocks the ACCESS SHARE - // (SELECT) issued by internal_populate_delete_log_table when it reads subtask rows. - lockThread = new Thread(() -> { - try (java.sql.Connection lockConn = JobServiceConnectionUtil.getDbConnection()) { - lockConn.setAutoCommit(false); - try (final PreparedStatement lockStmt = lockConn.prepareStatement( - "LOCK TABLE \"" + parentTableName + "\" IN ACCESS EXCLUSIVE MODE")) { - lockStmt.execute(); - } - synchronized (lockAcquired) { - lockAcquired[0] = true; - lockAcquired.notifyAll(); - } - System.out.println("ACCESS EXCLUSIVE lock acquired on " + parentTableName); - final long deadline = System.currentTimeMillis() + 60_000L; - synchronized (releaseLock) { - while (!releaseLock[0] && System.currentTimeMillis() < deadline) { - releaseLock.wait(500); - } - } - lockConn.rollback(); - System.out.println("ACCESS EXCLUSIVE lock released on " + parentTableName); - } catch (final Exception ex) { - LOG.warn("Lock thread error", ex); - } - }); - lockThread.start(); - - // Wait until the lock thread confirms the lock is held before calling the procedure. - synchronized (lockAcquired) { - final long deadline = System.currentTimeMillis() + 10_000L; - while (!lockAcquired[0] && System.currentTimeMillis() < deadline) { - lockAcquired.wait(500); - } - } - if (!lockAcquired[0]) { - Assert.fail("Lock thread did not acquire ACCESS EXCLUSIVE lock within 10 seconds"); - } - - // Short lock_timeout so the SELECT inside populate fails quickly and deterministically. - setLockTimeout(dbConnection, "1s"); - - // populate fails once due to lock timeout, entry immediately evicted to dead-letter - System.out.println("Calling drop_deleted_task_tables — populate should fail due to lock timeout"); - callDropDeletedTaskTables(dbConnection); - System.out.println("drop_deleted_task_tables completed"); - - // Signal the lock thread to release before asserting. - synchronized (releaseLock) { - releaseLock[0] = true; - releaseLock.notifyAll(); - } - - // Entry must be immediately in table_cleanup_failed - assertTrue(getRowsInTableCleanupFailed(dbConnection) > 0, - "table_cleanup_failed should have failed populate entry after first failure (Phase 1 fail-fast)"); - final String lastError = getLastErrorFromTableCleanupFailed(dbConnection, parentTableName); - assertTrue(lastError != null && lastError.contains("populate failed:"), - "dead-letter entry must have error prefixed with 'populate failed:', got: " + lastError); - - // Entry must be removed from deleted_parent_table_log (evicted to dead-letter). - assertFalse(entryInDeletedParentTableLog(dbConnection, parentTableName), - "deleted_parent_table_log entry must be removed after eviction to dead-letter"); - - // delete_log must be empty — populate failed, nothing was queued for DROP. - assertEquals(getRowsInDeleteLog(dbConnection), 0, - "delete_log should be empty because populate failed"); - - // The physical table must still exist — it was never queued for deletion. - assertFalse(getTablesByPrefix(dbConnection, parentTableName).isEmpty(), - "parent task table should still exist since populate never succeeded"); - - } catch (final Exception e) { - LOG.error(e.getMessage(), e); - Assert.fail(); - } finally { - synchronized (releaseLock) { - releaseLock[0] = true; - releaseLock.notifyAll(); - } - if (lockThread != null) { - try { - lockThread.join(5000); - } catch (final InterruptedException ignored) { - Thread.currentThread().interrupt(); - } - } - - try (final java.sql.Connection cleanupConnection = JobServiceConnectionUtil.getDbConnection()) { - setLockTimeout(cleanupConnection, "30s"); - try (final PreparedStatement stmt = cleanupConnection.prepareStatement( - "DELETE FROM table_cleanup_failed WHERE table_name = ?")) { - stmt.setString(1, parentTableName); - stmt.executeUpdate(); - } catch (final Exception deadLetterCleanupError) { - LOG.warn("Could not clean up table_cleanup_failed entry for {}", parentTableName, deadLetterCleanupError); - } - // Drop the physical table(s) — each drop is independently guarded. - for (final String table : getTablesByPrefix(cleanupConnection, tablePrefix)) { - try (final Statement stmt = cleanupConnection.createStatement()) { - stmt.execute("DROP TABLE IF EXISTS \"" + table + "\""); - } catch (final Exception dropError) { - LOG.warn("Could not drop table {} during cleanup", table, dropError); - } - } - } catch (final Exception cleanupError) { - LOG.warn("Cleanup after testPopulateFailureDoesNotLeaveParentEntryStuck did not fully complete", cleanupError); - } - } - } - @Test public void testUpdateJobProgress() { @@ -1879,10 +1747,10 @@ private int getRowsInDeleteLog(final java.sql.Connection dbConnection) throws SQ return 0; } - private int getRowsInTableCleanupFailed(final java.sql.Connection dbConnection) throws SQLException + private int getRowsInDeleteLogFailed(final java.sql.Connection dbConnection) throws SQLException { try (final PreparedStatement stmt = dbConnection.prepareStatement( - "SELECT COUNT(*) FROM table_cleanup_failed"); + "SELECT COUNT(*) FROM delete_log_failed"); final ResultSet rs = stmt.executeQuery()) { if (rs.next()) { return rs.getInt(1); @@ -1891,10 +1759,10 @@ private int getRowsInTableCleanupFailed(final java.sql.Connection dbConnection) return 0; } - private String getLastErrorFromTableCleanupFailed(final java.sql.Connection dbConnection, final String tableName) throws SQLException + private String getLastErrorFromDeleteLogFailed(final java.sql.Connection dbConnection, final String tableName) throws SQLException { try (final PreparedStatement stmt = dbConnection.prepareStatement( - "SELECT last_error FROM table_cleanup_failed WHERE table_name = ?")) { + "SELECT last_error FROM delete_log_failed WHERE table_name = ?")) { stmt.setString(1, tableName); try (final ResultSet rs = stmt.executeQuery()) { if (rs.next()) { @@ -1916,17 +1784,6 @@ private boolean entryInDeleteLog(final java.sql.Connection dbConnection, final S } } - private boolean entryInDeletedParentTableLog(final java.sql.Connection dbConnection, final String tableName) throws SQLException - { - try (final PreparedStatement stmt = dbConnection.prepareStatement( - "SELECT 1 FROM deleted_parent_table_log WHERE table_name = ?")) { - stmt.setString(1, tableName); - try (final ResultSet rs = stmt.executeQuery()) { - return rs.next(); - } - } - } - private List getTablesByPrefix(final java.sql.Connection dbConnection, final String tablePrefix) throws SQLException { final List foundTables = new ArrayList(); diff --git a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql index ecd5220eb..27b5b48ea 100644 --- a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql +++ b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql @@ -17,17 +17,14 @@ /* ************************************************************** ** Add retry tracking columns to delete_log and create the ** - ** table_cleanup_failed dead-letter table. ** + ** delete_log_failed dead-letter table. ** ** ** ** delete_log table now records how many times a ** ** deletion has been attempted, the last error encountered, ** ** and when it was last tried. Entries that exceed the ** - ** maximum retry threshold are promoted to ** - ** table_cleanup_failed so that operators can inspect and ** - ** remediate them without blocking ongoing cleanup work. ** - ** ** - ** table_cleanup_failed is also used in case of any transient** - ** failures during population of delete_log entries. ** + ** maximum retry threshold are promoted to delete_log_failed ** + ** so that operators can inspect and remediate them without ** + ** blocking ongoing cleanup work. ** ************************************************************** */ @@ -41,12 +38,12 @@ ALTER TABLE public.delete_log ALTER TABLE public.delete_log ADD COLUMN IF NOT EXISTS last_attempted_at TIMESTAMPTZ; --- Dead-letter table for entries fails during cleanup procedure. -CREATE TABLE IF NOT EXISTS public.table_cleanup_failed +-- Dead-letter table for delete_log entries that have exhausted all retries. +CREATE TABLE IF NOT EXISTS public.delete_log_failed ( table_name VARCHAR(63) NOT NULL, last_error TEXT, first_failed_at TIMESTAMPTZ NOT NULL DEFAULT now(), last_attempted_at TIMESTAMPTZ NOT NULL DEFAULT now(), - CONSTRAINT table_cleanup_failed_pkey PRIMARY KEY (table_name) + CONSTRAINT delete_log_failed_pkey PRIMARY KEY (table_name) ); diff --git a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql index 3a35684aa..d24481f40 100644 --- a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql +++ b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql @@ -23,14 +23,9 @@ * and drops them. * All the above is done through batch commits. The batch is defined by commit_limit variable. Default batch size being 10. * - * POPULATE phase (Phase 1) uses immediate failure-to-dead-letter: if populate fails (lock timeout, table corruption, etc.), - * the entry is immediately evicted to table_cleanup_failed for operator inspection and remediation. Retrying Phase 1 is unsafe - * because the recursive procedure with intermediate commits may have partially inserted rows into delete_log already; retrying - * would cause duplicates or silent conflicts. Instead, operators investigate the root cause, fix it, and re-run the procedure. - * - * DROP loop (Phase 2) includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times (default 3). + * DROP loop includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times (default 3). * On failure the retry_count, last_error, and last_attempted_at columns of delete_log are updated. Once a row reaches - * max_retries without a successful DROP it is evicted to the dead-letter table table_cleanup_failed (and removed from + * max_retries without a successful DROP it is evicted to the dead-letter table delete_log_failed (and removed from * delete_log) before the batch COMMIT so that it does not block subsequent processing. */ CREATE OR REPLACE PROCEDURE drop_deleted_task_tables() @@ -44,31 +39,18 @@ DECLARE rec RECORD; max_retries CONSTANT INTEGER := 3; drop_error TEXT; - populate_error TEXT; BEGIN - -- if populate fails, entry is immediately evicted + -- insert table names into delete_log selected_parent_table_names := $q$SELECT table_name FROM deleted_parent_table_log LIMIT $q$ || commit_limit || $q$ FOR UPDATE SKIP LOCKED$q$; WHILE EXISTS(SELECT 1 FROM deleted_parent_table_log) LOOP FOR parent_table_log_rec IN EXECUTE selected_parent_table_names LOOP - BEGIN - CALL internal_populate_delete_log_table(parent_table_log_rec.table_name, 0); - -- Successfully populated — delete the parent table entry. - DELETE FROM deleted_parent_table_log WHERE table_name = parent_table_log_rec.table_name; - EXCEPTION WHEN OTHERS THEN - -- Populate failed — immediately evict to dead-letter for operator investigation. - GET STACKED DIAGNOSTICS populate_error = MESSAGE_TEXT; - INSERT INTO table_cleanup_failed (table_name, last_error, last_attempted_at) - VALUES (parent_table_log_rec.table_name, 'populate failed: ' || populate_error, now()) - ON CONFLICT (table_name) DO UPDATE - SET last_error = EXCLUDED.last_error, - last_attempted_at = EXCLUDED.last_attempted_at; - -- Delete the parent entry so future calls don't re-attempt. - DELETE FROM deleted_parent_table_log WHERE table_name = parent_table_log_rec.table_name; - END; + CALL internal_populate_delete_log_table(parent_table_log_rec.table_name, 0); + -- delete the parent table name from parent table. + DELETE FROM deleted_parent_table_log WHERE table_name = parent_table_log_rec.table_name; END LOOP; COMMIT; END LOOP; @@ -92,7 +74,7 @@ BEGIN END; END LOOP; -- Move exhausted entries to dead-letter table - INSERT INTO table_cleanup_failed (table_name, last_error, last_attempted_at) + INSERT INTO delete_log_failed (table_name, last_error, last_attempted_at) SELECT table_name, last_error, last_attempted_at FROM delete_log WHERE retry_count >= max_retries From 9190e0ab97840d2527feabb6424b0d37f248439f Mon Sep 17 00:00:00 2001 From: Raghavendran Gopalakrishnan Date: Fri, 15 May 2026 15:25:46 +0530 Subject: [PATCH 6/7] Error handling for populating tables --- .../public/R__dropDeletedTaskTables.sql | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql index d24481f40..ff41e10f2 100644 --- a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql +++ b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql @@ -23,9 +23,14 @@ * and drops them. * All the above is done through batch commits. The batch is defined by commit_limit variable. Default batch size being 10. * - * DROP loop includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times (default 3). - * On failure the retry_count, last_error, and last_attempted_at columns of delete_log are updated. Once a row reaches - * max_retries without a successful DROP it is evicted to the dead-letter table delete_log_failed (and removed from + * Populate loop (Phase 1): each call to internal_populate_delete_log_table is wrapped in an exception handler. + * Because that procedure recursively populates delete_log, retrying on failure would produce duplicate entries; + * therefore a failed populate is written directly to table_cleanup_failed (no retry) and the parent entry is + * removed from deleted_parent_table_log so it does not re-enter the loop. + * + * DROP loop (Phase 2) includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times + * (default 3). On failure the retry_count, last_error, and last_attempted_at columns of delete_log are updated. + * Once a row reaches max_retries without a successful DROP it is evicted to delete_log_failed (and removed from * delete_log) before the batch COMMIT so that it does not block subsequent processing. */ CREATE OR REPLACE PROCEDURE drop_deleted_task_tables() @@ -48,8 +53,19 @@ BEGIN LOOP FOR parent_table_log_rec IN EXECUTE selected_parent_table_names LOOP - CALL internal_populate_delete_log_table(parent_table_log_rec.table_name, 0); - -- delete the parent table name from parent table. + BEGIN + CALL internal_populate_delete_log_table(parent_table_log_rec.table_name, 0); + EXCEPTION WHEN OTHERS THEN + -- No retry: re-invoking the recursive procedure would insert duplicate entries into delete_log. + -- Record the failure directly in the dead-letter table and continue processing. + GET STACKED DIAGNOSTICS drop_error = MESSAGE_TEXT; + INSERT INTO delete_log_failed (table_name, last_error, last_attempted_at) + VALUES (parent_table_log_rec.table_name, drop_error, now()) + ON CONFLICT (table_name) DO UPDATE + SET last_error = EXCLUDED.last_error, + last_attempted_at = EXCLUDED.last_attempted_at; + END; + -- Always remove from deleted_parent_table_log (success or failure) to prevent re-processing. DELETE FROM deleted_parent_table_log WHERE table_name = parent_table_log_rec.table_name; END LOOP; COMMIT; From 9a5c6ebd3ad5822c283db9d2f4341ade666a5f91 Mon Sep 17 00:00:00 2001 From: Raghavendran Gopalakrishnan Date: Mon, 25 May 2026 16:25:34 +0530 Subject: [PATCH 7/7] Minor improvs and readme --- .../V9__add_delete_log_retry_tracking.sql | 10 +++---- .../public/R__dropDeletedTaskTables.sql | 26 ++++--------------- release-notes-10.2.0.md | 2 +- 3 files changed, 9 insertions(+), 29 deletions(-) diff --git a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql index 27b5b48ea..b4a81a8da 100644 --- a/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql +++ b/job-service-db/src/main/resources/db/migration/V9__add_delete_log_retry_tracking.sql @@ -30,13 +30,9 @@ -- Add retry tracking columns to the existing delete_log table. ALTER TABLE public.delete_log - ADD COLUMN IF NOT EXISTS retry_count INTEGER NOT NULL DEFAULT 0; - -ALTER TABLE public.delete_log - ADD COLUMN IF NOT EXISTS last_error TEXT; - -ALTER TABLE public.delete_log - ADD COLUMN IF NOT EXISTS last_attempted_at TIMESTAMPTZ; + ADD COLUMN IF NOT EXISTS retry_count INTEGER NOT NULL DEFAULT 0, + ADD COLUMN IF NOT EXISTS last_error TEXT, + ADD COLUMN IF NOT EXISTS last_attempted_at TIMESTAMPTZ; -- Dead-letter table for delete_log entries that have exhausted all retries. CREATE TABLE IF NOT EXISTS public.delete_log_failed diff --git a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql index ff41e10f2..d24481f40 100644 --- a/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql +++ b/job-service-db/src/main/resources/db/migration/procedures/public/R__dropDeletedTaskTables.sql @@ -23,14 +23,9 @@ * and drops them. * All the above is done through batch commits. The batch is defined by commit_limit variable. Default batch size being 10. * - * Populate loop (Phase 1): each call to internal_populate_delete_log_table is wrapped in an exception handler. - * Because that procedure recursively populates delete_log, retrying on failure would produce duplicate entries; - * therefore a failed populate is written directly to table_cleanup_failed (no retry) and the parent entry is - * removed from deleted_parent_table_log so it does not re-enter the loop. - * - * DROP loop (Phase 2) includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times - * (default 3). On failure the retry_count, last_error, and last_attempted_at columns of delete_log are updated. - * Once a row reaches max_retries without a successful DROP it is evicted to delete_log_failed (and removed from + * DROP loop includes retry-and-dead-letter behaviour: each DROP is attempted up to max_retries times (default 3). + * On failure the retry_count, last_error, and last_attempted_at columns of delete_log are updated. Once a row reaches + * max_retries without a successful DROP it is evicted to the dead-letter table delete_log_failed (and removed from * delete_log) before the batch COMMIT so that it does not block subsequent processing. */ CREATE OR REPLACE PROCEDURE drop_deleted_task_tables() @@ -53,19 +48,8 @@ BEGIN LOOP FOR parent_table_log_rec IN EXECUTE selected_parent_table_names LOOP - BEGIN - CALL internal_populate_delete_log_table(parent_table_log_rec.table_name, 0); - EXCEPTION WHEN OTHERS THEN - -- No retry: re-invoking the recursive procedure would insert duplicate entries into delete_log. - -- Record the failure directly in the dead-letter table and continue processing. - GET STACKED DIAGNOSTICS drop_error = MESSAGE_TEXT; - INSERT INTO delete_log_failed (table_name, last_error, last_attempted_at) - VALUES (parent_table_log_rec.table_name, drop_error, now()) - ON CONFLICT (table_name) DO UPDATE - SET last_error = EXCLUDED.last_error, - last_attempted_at = EXCLUDED.last_attempted_at; - END; - -- Always remove from deleted_parent_table_log (success or failure) to prevent re-processing. + CALL internal_populate_delete_log_table(parent_table_log_rec.table_name, 0); + -- delete the parent table name from parent table. DELETE FROM deleted_parent_table_log WHERE table_name = parent_table_log_rec.table_name; END LOOP; COMMIT; diff --git a/release-notes-10.2.0.md b/release-notes-10.2.0.md index 05ce499ed..7f7f7d158 100644 --- a/release-notes-10.2.0.md +++ b/release-notes-10.2.0.md @@ -7,7 +7,7 @@ ${version-number} - **US1122037**: Updated to build with Java 21 #### Bug fixes -- **D1060213**: Expended `task_` tables are not getting cleaned up if db error happens in stored procedure. +- **D1060213**: Proactively handle if any exception happens while dropping `task_` tables. #### Known Issues - None