diff --git a/apiserver/src/main/java/org/dependencytrack/persistence/jdbi/MetricsDao.java b/apiserver/src/main/java/org/dependencytrack/persistence/jdbi/MetricsDao.java index 4b581d3bbe..29c1da7cc7 100644 --- a/apiserver/src/main/java/org/dependencytrack/persistence/jdbi/MetricsDao.java +++ b/apiserver/src/main/java/org/dependencytrack/persistence/jdbi/MetricsDao.java @@ -37,9 +37,20 @@ import java.util.UUID; import java.util.regex.Pattern; -/** - * @since 5.0.0 - */ +/// DAO for interacting with time series metrics. +/// +/// ### Orphans +/// +/// Rows in `DEPENDENCYMETRICS` and `PROJECTMETRICS` may reference +/// deleted `COMPONENT` or `PROJECT` rows. The FKs were dropped to +/// eliminate lock contention between partition maintenance, +/// project / component deletion, and metrics updates (see ADR 029). +/// Orphans are naturally cleaned up as part of partition maintenance. +/// +/// All read paths must therefore be keyed by a still-existing parent IDs. +/// **Do not add joins that expose orphan rows without an explicit existence filter**! +/// +/// @since 5.0.0 public interface MetricsDao extends SqlObject { Pattern VALID_TABLE_IDENTIFIER_PATTERN = Pattern.compile("^\"[A-Z][A-Z0-9_]+\"$"); @@ -654,7 +665,7 @@ default int dropPartitions(final String parentTable, final List partitio } else { getHandle().execute("ALTER TABLE %s DETACH PARTITION %s CONCURRENTLY".formatted(parentTable, partition)); } - getHandle().execute("DROP TABLE IF EXISTS %s CASCADE".formatted(partition)); + getHandle().execute("DROP TABLE IF EXISTS %s".formatted(partition)); deletedCount++; } return deletedCount; diff --git a/apiserver/src/test/java/org/dependencytrack/persistence/jdbi/MetricsDaoTest.java b/apiserver/src/test/java/org/dependencytrack/persistence/jdbi/MetricsDaoTest.java index 7d030a6d7a..bd902e8d38 100644 --- a/apiserver/src/test/java/org/dependencytrack/persistence/jdbi/MetricsDaoTest.java +++ b/apiserver/src/test/java/org/dependencytrack/persistence/jdbi/MetricsDaoTest.java @@ -248,4 +248,53 @@ public void shouldRejectRefreshGlobalPortfolioMetricsOutsideTransaction() { .isInstanceOf(IllegalStateException.class); } + + @Test + public void shouldKeepDependencyMetricsWhenComponentDeleted() { + final var project = qm.createProject("acme-app", null, "1.0.0", null, null, null, null, false); + final var component = new Component(); + component.setProject(project); + component.setName("Acme Component"); + component.setVersion("1.0"); + qm.createComponent(component, false); + + metricsTestDao.createPartitionForDaysAgo("DEPENDENCYMETRICS", 0); + final var metrics = new DependencyMetrics(); + metrics.setProjectId(project.getId()); + metrics.setComponentId(component.getId()); + metrics.setVulnerabilities(1); + metrics.setFirstOccurrence(Date.from(Instant.now())); + metrics.setLastOccurrence(Date.from(Instant.now())); + metricsTestDao.createDependencyMetrics(metrics); + + final long componentId = component.getId(); + jdbiHandle.execute("DELETE FROM \"COMPONENT\" WHERE \"ID\" = ?", componentId); + + final List surviving = + metricsDao.getDependencyMetricsSince( + componentId, Instant.now().minus(Duration.ofDays(1))); + assertThat(surviving).hasSize(1); + } + + @Test + public void shouldKeepProjectMetricsWhenProjectDeleted() { + final var project = qm.createProject("acme-app", null, "1.0.0", null, null, null, null, false); + + metricsTestDao.createPartitionForDaysAgo("PROJECTMETRICS", 0); + final var metrics = new ProjectMetrics(); + metrics.setProjectId(project.getId()); + metrics.setVulnerabilities(1); + metrics.setFirstOccurrence(Date.from(Instant.now())); + metrics.setLastOccurrence(Date.from(Instant.now())); + metricsTestDao.createProjectMetrics(metrics); + + final long projectId = project.getId(); + jdbiHandle.execute("DELETE FROM \"PROJECT\" WHERE \"ID\" = ?", projectId); + + final List surviving = + metricsDao.getProjectMetricsSince( + projectId, Instant.now().minus(Duration.ofDays(1))); + assertThat(surviving).hasSize(1); + } + } diff --git a/apiserver/src/test/java/org/dependencytrack/persistence/jdbi/ProjectDaoTest.java b/apiserver/src/test/java/org/dependencytrack/persistence/jdbi/ProjectDaoTest.java index c90a39030e..d7875518cb 100644 --- a/apiserver/src/test/java/org/dependencytrack/persistence/jdbi/ProjectDaoTest.java +++ b/apiserver/src/test/java/org/dependencytrack/persistence/jdbi/ProjectDaoTest.java @@ -231,10 +231,10 @@ public void testCascadeDeleteProject() { qm.getPersistenceManager().refresh(policy); assertThat(policy.getProjects()).isEmpty(); - // Ensure that metrics have been deleted. + // Metrics are NOT deleted, see ADR 029. MetricsDao dao = jdbiHandle.attach(MetricsDao.class); - assertThat(dao.getProjectMetricsSince(project.getId(), DateUtil.parseShortDate("20250101").toInstant())).isEmpty(); - assertThat(dao.getDependencyMetricsSince(component.getId(), DateUtil.parseShortDate("20250101").toInstant())).isEmpty(); + assertThat(dao.getProjectMetricsSince(project.getId(), DateUtil.parseShortDate("20250101").toInstant())).isNotEmpty(); + assertThat(dao.getDependencyMetricsSince(component.getId(), DateUtil.parseShortDate("20250101").toInstant())).isNotEmpty(); } @Test diff --git a/docs/adr/029-drop-metrics-fks.md b/docs/adr/029-drop-metrics-fks.md new file mode 100644 index 0000000000..f8235d1454 --- /dev/null +++ b/docs/adr/029-drop-metrics-fks.md @@ -0,0 +1,140 @@ +| Status | Date | Author(s) | +|:---------|:-----------|:-------------------------------------| +| Accepted | 2026-06-10 | [@nscuro](https://github.com/nscuro) | + +## Context + +`DEPENDENCYMETRICS` and `PROJECTMETRICS` are partitioned by `LAST_OCCURRENCE`. +Both tables carry `ON DELETE CASCADE` foreign keys to `COMPONENT` and `PROJECT`, +so deleting a parent also deletes its metric rows. + +When a partition is dropped, PostgreSQL also drops the foreign key constraint that the partition inherited. +Dropping the constraint removes the action triggers on the referenced side, +which requires an `ACCESS EXCLUSIVE` lock on `COMPONENT` and `PROJECT` +(see [ALTER TABLE notes][pg-altertable], [lock modes][pg-locking]). Concurrent traffic holds weaker locks on +the same tables: `SELECT` takes `ACCESS SHARE`, and a metric `INSERT` takes `ACCESS SHARE` plus a row-level +`FOR KEY SHARE` while it validates the foreign key. `ACCESS EXCLUSIVE` conflicts with every other lock mode, +so the two sides can deadlock. PostgreSQL aborts the partition drop, and the maintenance task fails. +An occurrence of this was reported in [#6343]. + +The risk is real but small, because the maintenance task runs on startup and then on the cron `1 * * * *`. +The `update-portfolio-metrics` only writes a row for projects that do not yet have one for `$today`. +The `analyze-project` workflow writes a row at the end of each analysis, and the scheduled portfolio +analysis runs daily at 06:00 UTC by default. Most hours, the writers are quiet and the maintenance task +takes the lock without contention. The deadlock becomes likely only under sustained back-to-back BOM uploads, +where re-analysis keeps the writers busy for hours. + +The foreign keys also cost in two other places: + +1. A `DELETE` of a project must visit every metrics partition once per component, + because the partition key is a date and PostgreSQL cannot prune by component or project ID. +2. Every metrics insert takes a read lock on `COMPONENT` and `PROJECT` for foreign key validation, + which blocks exclusive locks taken by deletes. + +Through the API, metrics rows are read only through a parent UUID. +If the parent is gone, the API returns `404` before it touches the metric tables. + +`PORTFOLIOMETRICS_GLOBAL` inner-joins `PROJECT`, so orphan rows drop out at refresh. + +No path in the application reads a metric row without first resolving its parent. + +### Possible Solutions + +#### A: Drop only `ON DELETE CASCADE`, keep the foreign keys + +Change the constraints from `ON DELETE CASCADE` to `ON DELETE NO ACTION`. +Parent deletion no longer cascades into the metric tables. + +*Pro*: + +1. The application can still rely on the foreign key to reject metric inserts that race with a parent delete. + +*Con*: + +1. The action triggers on the referenced side still exist. Dropping a partition still needs `ACCESS EXCLUSIVE` + on `COMPONENT` and `PROJECT`. The deadlock window from [#6343] stays open. +2. Project and component deletion now fails outright if metric rows still reference them. Cleanup must happen + in application code before the parent delete, walking every partition. + +#### B: Replace partition drop with `DELETE` for retention + +Keep the foreign keys. Replace the per-partition `DROP TABLE` with a bulk +`DELETE FROM ... WHERE "LAST_OCCURRENCE" < cutoff`. + +*Pro*: + +1. No DDL, no constraint drop, no `ACCESS EXCLUSIVE` lock on the parent tables. The deadlock window closes. + +*Con*: + +1. Throws away the main benefit of partitioning. Bulk `DELETE` writes dead tuples that autovacuum must reclaim. + Indexes grow rather than shrink. +2. Cascade chain on parent deletion still walks every partition. Same cost as today, with no upside. + +#### C: Schedule the partition drop during a quiet window + +Move the maintenance cron from `1 * * * *` to a known-quiet hour, like overnight. + +*Pro*: + +1. Smallest change. + +*Con*: + +1. Operational, not architectural. A deployment with users across multiple timezones has no globally quiet window. + Scheduled BOM uploads and downstream ingestion can run at any hour. +2. Does not address the cost of cascading deletes on parent removal. + +#### D: Drop the foreign keys + +Drop the three constraints from `DEPENDENCYMETRICS` and `PROJECTMETRICS`. +Partition retention becomes the only mechanism that removes metric rows. + +*Pro*: + +1. The partition drop touches only the metric tables. No lock on `COMPONENT` or `PROJECT`. +2. Parent deletion becomes cheap, with no cross-partition cascade. +3. Metric inserts no longer compete with parent deletes on the row lock. + +*Con*: + +1. Orphan metric rows can outlive their parent until partition retention removes them. +2. A metric insert raced with a parent delete now produces an orphan instead of failing at commit. + +## Decision + +We will follow solution **D**. The three foreign keys from `DEPENDENCYMETRICS` and `PROJECTMETRICS` to `COMPONENT` +and `PROJECT` are dropped. The partition retention task becomes the only mechanism that removes metric rows. +Other tables that reference `COMPONENT` or `PROJECT` are out of scope. + +## Consequences + +The partition drop no longer needs an exclusive lock on `COMPONENT` or `PROJECT`. +The deadlock window closes. Deleting projects or components no longer cascades into the metric tables, +so deleting a large project becomes much faster. Metric inserts no longer validate against the parent tables, +so they stop competing with deletes for the lock. + +Storage cost actually goes *down*. A cascading delete writes dead tuples into every metric partition that holds +a referenced row, and autovacuum has to reclaim them. Dropping a whole partition at retention reclaims the space +at once with no vacuum work. + +Metric rows can outlive their parent. Orphans are not reachable from the API and self-clean when their partition is dropped. + +A race between a metric insert and a parent delete now produces an orphan instead of an aborted insert. +We accept this. + +Reversing the decision later is possible but not free. A re-added foreign key must use `ADD CONSTRAINT ... NOT VALID` +followed by `VALIDATE CONSTRAINT` to avoid the same exclusive lock that motivated this change. + +`NOT VALID` takes the weaker `SHARE ROW EXCLUSIVE` lock on both tables, and `VALIDATE CONSTRAINT` only needs +`SHARE UPDATE EXCLUSIVE` on the referencing table, and `ROW SHARE` on the referenced one +(see [ALTER TABLE notes][pg-altertable]). + +`VALIDATE CONSTRAINT` also fails as soon as it hits an orphan. A reversal must therefore purge orphans first, +with queries like `DELETE FROM "PROJECTMETRICS" WHERE "PROJECT_ID" NOT IN (SELECT "ID" FROM "PROJECT")` +and the corresponding statement for `DEPENDENCYMETRICS`. Each purge writes dead tuples that autovacuum must reclaim, +so the reversal is more disruptive than the original drop. + +[#6343]: https://github.com/DependencyTrack/dependency-track/issues/6343 +[pg-altertable]: https://www.postgresql.org/docs/current/sql-altertable.html#SQL-ALTERTABLE-NOTES +[pg-locking]: https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-TABLES diff --git a/migration/src/main/resources/org/dependencytrack/migration/R__update_component_metrics.sql b/migration/src/main/resources/org/dependencytrack/migration/R__update_component_metrics.sql index 65a45f3d12..fef04421d5 100644 --- a/migration/src/main/resources/org/dependencytrack/migration/R__update_component_metrics.sql +++ b/migration/src/main/resources/org/dependencytrack/migration/R__update_component_metrics.sql @@ -184,38 +184,43 @@ BEGIN , "POLICYVIOLATIONS_SECURITY_UNAUDITED" , "FIRST_OCCURRENCE" , "LAST_OCCURRENCE" - ) VALUES ( - v_component."ID" - , v_component."PROJECT_ID" - , v_vulnerabilities - , v_critical - , v_high - , v_medium - , v_low - , v_unassigned - , v_risk_score - , v_findings_total - , v_findings_audited - , v_findings_unaudited - , v_findings_suppressed - , v_policy_violations_total - , v_policy_violations_fail - , v_policy_violations_warn - , v_policy_violations_info - , v_policy_violations_audited - , v_policy_violations_unaudited - , v_policy_violations_license_total - , v_policy_violations_license_audited - , v_policy_violations_license_unaudited - , v_policy_violations_operational_total - , v_policy_violations_operational_audited - , v_policy_violations_operational_unaudited - , v_policy_violations_security_total - , v_policy_violations_security_audited - , v_policy_violations_security_unaudited - , NOW() - , NOW() - ); + ) + SELECT v_component."ID" + , v_component."PROJECT_ID" + , v_vulnerabilities + , v_critical + , v_high + , v_medium + , v_low + , v_unassigned + , v_risk_score + , v_findings_total + , v_findings_audited + , v_findings_unaudited + , v_findings_suppressed + , v_policy_violations_total + , v_policy_violations_fail + , v_policy_violations_warn + , v_policy_violations_info + , v_policy_violations_audited + , v_policy_violations_unaudited + , v_policy_violations_license_total + , v_policy_violations_license_audited + , v_policy_violations_license_unaudited + , v_policy_violations_operational_total + , v_policy_violations_operational_audited + , v_policy_violations_operational_unaudited + , v_policy_violations_security_total + , v_policy_violations_security_audited + , v_policy_violations_security_unaudited + , NOW() + , NOW() + -- Skip insert if the component was deleted while metrics were being computed. + WHERE EXISTS ( + SELECT 1 + FROM "COMPONENT" + WHERE "ID" = v_component."ID" + ); UPDATE "COMPONENT" SET "LAST_RISKSCORE" = v_risk_score diff --git a/migration/src/main/resources/org/dependencytrack/migration/R__update_project_metrics.sql b/migration/src/main/resources/org/dependencytrack/migration/R__update_project_metrics.sql index 798f205399..57a85db2c2 100644 --- a/migration/src/main/resources/org/dependencytrack/migration/R__update_project_metrics.sql +++ b/migration/src/main/resources/org/dependencytrack/migration/R__update_project_metrics.sql @@ -153,37 +153,43 @@ BEGIN "POLICYVIOLATIONS_SECURITY_UNAUDITED", "FIRST_OCCURRENCE", "LAST_OCCURRENCE") - VALUES ("v_project_id", - "v_components", - "v_vulnerable_components", - "v_vulnerabilities", - "v_critical", - "v_high", - "v_medium", - "v_low", - "v_unassigned", - "v_risk_score", - "v_findings_total", - "v_findings_audited", - "v_findings_unaudited", - "v_findings_suppressed", - "v_policy_violations_total", - "v_policy_violations_fail", - "v_policy_violations_warn", - "v_policy_violations_info", - "v_policy_violations_audited", - "v_policy_violations_unaudited", - "v_policy_violations_license_total", - "v_policy_violations_license_audited", - "v_policy_violations_license_unaudited", - "v_policy_violations_operational_total", - "v_policy_violations_operational_audited", - "v_policy_violations_operational_unaudited", - "v_policy_violations_security_total", - "v_policy_violations_security_audited", - "v_policy_violations_security_unaudited", - NOW(), - NOW()); + SELECT "v_project_id", + "v_components", + "v_vulnerable_components", + "v_vulnerabilities", + "v_critical", + "v_high", + "v_medium", + "v_low", + "v_unassigned", + "v_risk_score", + "v_findings_total", + "v_findings_audited", + "v_findings_unaudited", + "v_findings_suppressed", + "v_policy_violations_total", + "v_policy_violations_fail", + "v_policy_violations_warn", + "v_policy_violations_info", + "v_policy_violations_audited", + "v_policy_violations_unaudited", + "v_policy_violations_license_total", + "v_policy_violations_license_audited", + "v_policy_violations_license_unaudited", + "v_policy_violations_operational_total", + "v_policy_violations_operational_audited", + "v_policy_violations_operational_unaudited", + "v_policy_violations_security_total", + "v_policy_violations_security_audited", + "v_policy_violations_security_unaudited", + NOW(), + NOW() + -- Skip insert if the project was deleted while metrics were being computed. + WHERE EXISTS ( + SELECT 1 + FROM "PROJECT" + WHERE "ID" = "v_project_id" + ); UPDATE "PROJECT" SET "LAST_RISKSCORE" = "v_risk_score" WHERE "ID" = "v_project_id"; end; diff --git a/migration/src/main/resources/org/dependencytrack/migration/V202606101343__drop_metrics_fks.sql b/migration/src/main/resources/org/dependencytrack/migration/V202606101343__drop_metrics_fks.sql new file mode 100644 index 0000000000..cd614a30e7 --- /dev/null +++ b/migration/src/main/resources/org/dependencytrack/migration/V202606101343__drop_metrics_fks.sql @@ -0,0 +1,5 @@ +-- Drop the FKs from DEPENDENCYMETRICS / PROJECTMETRICS to COMPONENT / PROJECT. +-- See ADR 029 for details. +ALTER TABLE "DEPENDENCYMETRICS" DROP CONSTRAINT IF EXISTS "DEPENDENCYMETRICS_COMPONENT_FK"; +ALTER TABLE "DEPENDENCYMETRICS" DROP CONSTRAINT IF EXISTS "DEPENDENCYMETRICS_PROJECT_FK"; +ALTER TABLE "PROJECTMETRICS" DROP CONSTRAINT IF EXISTS "PROJECTMETRICS_PROJECT_FK";