Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,20 @@
import java.util.UUID;
import java.util.regex.Pattern;

/**
* @since 5.0.0
*/
/// DAO for interacting with time series metrics.
///
/// ### Orphans
///
/// Rows in `DEPENDENCYMETRICS` and `PROJECTMETRICS` may reference
/// deleted `COMPONENT` or `PROJECT` rows. The FKs were dropped to
/// eliminate lock contention between partition maintenance,
/// project / component deletion, and metrics updates (see ADR 029).
/// Orphans are naturally cleaned up as part of partition maintenance.
///
/// All read paths must therefore be keyed by a still-existing parent IDs.
/// **Do not add joins that expose orphan rows without an explicit existence filter**!
///
/// @since 5.0.0
public interface MetricsDao extends SqlObject {

Pattern VALID_TABLE_IDENTIFIER_PATTERN = Pattern.compile("^\"[A-Z][A-Z0-9_]+\"$");
Expand Down Expand Up @@ -654,7 +665,7 @@ default int dropPartitions(final String parentTable, final List<String> partitio
} else {
getHandle().execute("ALTER TABLE %s DETACH PARTITION %s CONCURRENTLY".formatted(parentTable, partition));
}
getHandle().execute("DROP TABLE IF EXISTS %s CASCADE".formatted(partition));
getHandle().execute("DROP TABLE IF EXISTS %s".formatted(partition));
deletedCount++;
}
return deletedCount;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,4 +248,53 @@ public void shouldRejectRefreshGlobalPortfolioMetricsOutsideTransaction() {
.isInstanceOf(IllegalStateException.class);
}


@Test
public void shouldKeepDependencyMetricsWhenComponentDeleted() {
final var project = qm.createProject("acme-app", null, "1.0.0", null, null, null, null, false);
final var component = new Component();
component.setProject(project);
component.setName("Acme Component");
component.setVersion("1.0");
qm.createComponent(component, false);

metricsTestDao.createPartitionForDaysAgo("DEPENDENCYMETRICS", 0);
final var metrics = new DependencyMetrics();
metrics.setProjectId(project.getId());
metrics.setComponentId(component.getId());
metrics.setVulnerabilities(1);
metrics.setFirstOccurrence(Date.from(Instant.now()));
metrics.setLastOccurrence(Date.from(Instant.now()));
metricsTestDao.createDependencyMetrics(metrics);

final long componentId = component.getId();
jdbiHandle.execute("DELETE FROM \"COMPONENT\" WHERE \"ID\" = ?", componentId);

final List<DependencyMetrics> surviving =
metricsDao.getDependencyMetricsSince(
componentId, Instant.now().minus(Duration.ofDays(1)));
assertThat(surviving).hasSize(1);
}

@Test
public void shouldKeepProjectMetricsWhenProjectDeleted() {
final var project = qm.createProject("acme-app", null, "1.0.0", null, null, null, null, false);

metricsTestDao.createPartitionForDaysAgo("PROJECTMETRICS", 0);
final var metrics = new ProjectMetrics();
metrics.setProjectId(project.getId());
metrics.setVulnerabilities(1);
metrics.setFirstOccurrence(Date.from(Instant.now()));
metrics.setLastOccurrence(Date.from(Instant.now()));
metricsTestDao.createProjectMetrics(metrics);

final long projectId = project.getId();
jdbiHandle.execute("DELETE FROM \"PROJECT\" WHERE \"ID\" = ?", projectId);

final List<ProjectMetrics> surviving =
metricsDao.getProjectMetricsSince(
projectId, Instant.now().minus(Duration.ofDays(1)));
assertThat(surviving).hasSize(1);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -231,10 +231,10 @@ public void testCascadeDeleteProject() {
qm.getPersistenceManager().refresh(policy);
assertThat(policy.getProjects()).isEmpty();

// Ensure that metrics have been deleted.
// Metrics are NOT deleted, see ADR 029.
MetricsDao dao = jdbiHandle.attach(MetricsDao.class);
assertThat(dao.getProjectMetricsSince(project.getId(), DateUtil.parseShortDate("20250101").toInstant())).isEmpty();
assertThat(dao.getDependencyMetricsSince(component.getId(), DateUtil.parseShortDate("20250101").toInstant())).isEmpty();
assertThat(dao.getProjectMetricsSince(project.getId(), DateUtil.parseShortDate("20250101").toInstant())).isNotEmpty();
assertThat(dao.getDependencyMetricsSince(component.getId(), DateUtil.parseShortDate("20250101").toInstant())).isNotEmpty();
}

@Test
Expand Down
140 changes: 140 additions & 0 deletions docs/adr/029-drop-metrics-fks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
| Status | Date | Author(s) |
|:---------|:-----------|:-------------------------------------|
| Accepted | 2026-06-10 | [@nscuro](https://github.com/nscuro) |

## Context

`DEPENDENCYMETRICS` and `PROJECTMETRICS` are partitioned by `LAST_OCCURRENCE`.
Both tables carry `ON DELETE CASCADE` foreign keys to `COMPONENT` and `PROJECT`,
so deleting a parent also deletes its metric rows.

When a partition is dropped, PostgreSQL also drops the foreign key constraint that the partition inherited.
Dropping the constraint removes the action triggers on the referenced side,
which requires an `ACCESS EXCLUSIVE` lock on `COMPONENT` and `PROJECT`
(see [ALTER TABLE notes][pg-altertable], [lock modes][pg-locking]). Concurrent traffic holds weaker locks on
the same tables: `SELECT` takes `ACCESS SHARE`, and a metric `INSERT` takes `ACCESS SHARE` plus a row-level
`FOR KEY SHARE` while it validates the foreign key. `ACCESS EXCLUSIVE` conflicts with every other lock mode,
so the two sides can deadlock. PostgreSQL aborts the partition drop, and the maintenance task fails.
An occurrence of this was reported in [#6343].

The risk is real but small, because the maintenance task runs on startup and then on the cron `1 * * * *`.
The `update-portfolio-metrics` only writes a row for projects that do not yet have one for `$today`.
The `analyze-project` workflow writes a row at the end of each analysis, and the scheduled portfolio
analysis runs daily at 06:00 UTC by default. Most hours, the writers are quiet and the maintenance task
takes the lock without contention. The deadlock becomes likely only under sustained back-to-back BOM uploads,
where re-analysis keeps the writers busy for hours.

The foreign keys also cost in two other places:

1. A `DELETE` of a project must visit every metrics partition once per component,
because the partition key is a date and PostgreSQL cannot prune by component or project ID.
2. Every metrics insert takes a read lock on `COMPONENT` and `PROJECT` for foreign key validation,
which blocks exclusive locks taken by deletes.

Through the API, metrics rows are read only through a parent UUID.
If the parent is gone, the API returns `404` before it touches the metric tables.

`PORTFOLIOMETRICS_GLOBAL` inner-joins `PROJECT`, so orphan rows drop out at refresh.

No path in the application reads a metric row without first resolving its parent.

### Possible Solutions

#### A: Drop only `ON DELETE CASCADE`, keep the foreign keys

Change the constraints from `ON DELETE CASCADE` to `ON DELETE NO ACTION`.
Parent deletion no longer cascades into the metric tables.

*Pro*:

1. The application can still rely on the foreign key to reject metric inserts that race with a parent delete.

*Con*:

1. The action triggers on the referenced side still exist. Dropping a partition still needs `ACCESS EXCLUSIVE`
on `COMPONENT` and `PROJECT`. The deadlock window from [#6343] stays open.
2. Project and component deletion now fails outright if metric rows still reference them. Cleanup must happen
in application code before the parent delete, walking every partition.

#### B: Replace partition drop with `DELETE` for retention

Keep the foreign keys. Replace the per-partition `DROP TABLE` with a bulk
`DELETE FROM ... WHERE "LAST_OCCURRENCE" < cutoff`.

*Pro*:

1. No DDL, no constraint drop, no `ACCESS EXCLUSIVE` lock on the parent tables. The deadlock window closes.

*Con*:

1. Throws away the main benefit of partitioning. Bulk `DELETE` writes dead tuples that autovacuum must reclaim.
Indexes grow rather than shrink.
2. Cascade chain on parent deletion still walks every partition. Same cost as today, with no upside.

#### C: Schedule the partition drop during a quiet window

Move the maintenance cron from `1 * * * *` to a known-quiet hour, like overnight.

*Pro*:

1. Smallest change.

*Con*:

1. Operational, not architectural. A deployment with users across multiple timezones has no globally quiet window.
Scheduled BOM uploads and downstream ingestion can run at any hour.
2. Does not address the cost of cascading deletes on parent removal.

#### D: Drop the foreign keys

Drop the three constraints from `DEPENDENCYMETRICS` and `PROJECTMETRICS`.
Partition retention becomes the only mechanism that removes metric rows.

*Pro*:

1. The partition drop touches only the metric tables. No lock on `COMPONENT` or `PROJECT`.
2. Parent deletion becomes cheap, with no cross-partition cascade.
3. Metric inserts no longer compete with parent deletes on the row lock.

*Con*:

1. Orphan metric rows can outlive their parent until partition retention removes them.
2. A metric insert raced with a parent delete now produces an orphan instead of failing at commit.

## Decision

We will follow solution **D**. The three foreign keys from `DEPENDENCYMETRICS` and `PROJECTMETRICS` to `COMPONENT`
and `PROJECT` are dropped. The partition retention task becomes the only mechanism that removes metric rows.
Other tables that reference `COMPONENT` or `PROJECT` are out of scope.

## Consequences

The partition drop no longer needs an exclusive lock on `COMPONENT` or `PROJECT`.
The deadlock window closes. Deleting projects or components no longer cascades into the metric tables,
so deleting a large project becomes much faster. Metric inserts no longer validate against the parent tables,
so they stop competing with deletes for the lock.

Storage cost actually goes *down*. A cascading delete writes dead tuples into every metric partition that holds
a referenced row, and autovacuum has to reclaim them. Dropping a whole partition at retention reclaims the space
at once with no vacuum work.

Metric rows can outlive their parent. Orphans are not reachable from the API and self-clean when their partition is dropped.

A race between a metric insert and a parent delete now produces an orphan instead of an aborted insert.
We accept this.

Reversing the decision later is possible but not free. A re-added foreign key must use `ADD CONSTRAINT ... NOT VALID`
followed by `VALIDATE CONSTRAINT` to avoid the same exclusive lock that motivated this change.

`NOT VALID` takes the weaker `SHARE ROW EXCLUSIVE` lock on both tables, and `VALIDATE CONSTRAINT` only needs
`SHARE UPDATE EXCLUSIVE` on the referencing table, and `ROW SHARE` on the referenced one
(see [ALTER TABLE notes][pg-altertable]).

`VALIDATE CONSTRAINT` also fails as soon as it hits an orphan. A reversal must therefore purge orphans first,
with queries like `DELETE FROM "PROJECTMETRICS" WHERE "PROJECT_ID" NOT IN (SELECT "ID" FROM "PROJECT")`
and the corresponding statement for `DEPENDENCYMETRICS`. Each purge writes dead tuples that autovacuum must reclaim,
so the reversal is more disruptive than the original drop.

[#6343]: https://github.com/DependencyTrack/dependency-track/issues/6343
[pg-altertable]: https://www.postgresql.org/docs/current/sql-altertable.html#SQL-ALTERTABLE-NOTES
[pg-locking]: https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-TABLES
Original file line number Diff line number Diff line change
Expand Up @@ -184,38 +184,43 @@ BEGIN
, "POLICYVIOLATIONS_SECURITY_UNAUDITED"
, "FIRST_OCCURRENCE"
, "LAST_OCCURRENCE"
) VALUES (
v_component."ID"
, v_component."PROJECT_ID"
, v_vulnerabilities
, v_critical
, v_high
, v_medium
, v_low
, v_unassigned
, v_risk_score
, v_findings_total
, v_findings_audited
, v_findings_unaudited
, v_findings_suppressed
, v_policy_violations_total
, v_policy_violations_fail
, v_policy_violations_warn
, v_policy_violations_info
, v_policy_violations_audited
, v_policy_violations_unaudited
, v_policy_violations_license_total
, v_policy_violations_license_audited
, v_policy_violations_license_unaudited
, v_policy_violations_operational_total
, v_policy_violations_operational_audited
, v_policy_violations_operational_unaudited
, v_policy_violations_security_total
, v_policy_violations_security_audited
, v_policy_violations_security_unaudited
, NOW()
, NOW()
);
)
SELECT v_component."ID"
, v_component."PROJECT_ID"
, v_vulnerabilities
, v_critical
, v_high
, v_medium
, v_low
, v_unassigned
, v_risk_score
, v_findings_total
, v_findings_audited
, v_findings_unaudited
, v_findings_suppressed
, v_policy_violations_total
, v_policy_violations_fail
, v_policy_violations_warn
, v_policy_violations_info
, v_policy_violations_audited
, v_policy_violations_unaudited
, v_policy_violations_license_total
, v_policy_violations_license_audited
, v_policy_violations_license_unaudited
, v_policy_violations_operational_total
, v_policy_violations_operational_audited
, v_policy_violations_operational_unaudited
, v_policy_violations_security_total
, v_policy_violations_security_audited
, v_policy_violations_security_unaudited
, NOW()
, NOW()
-- Skip insert if the component was deleted while metrics were being computed.
WHERE EXISTS (
SELECT 1
FROM "COMPONENT"
WHERE "ID" = v_component."ID"
);

UPDATE "COMPONENT"
SET "LAST_RISKSCORE" = v_risk_score
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,37 +153,43 @@ BEGIN
"POLICYVIOLATIONS_SECURITY_UNAUDITED",
"FIRST_OCCURRENCE",
"LAST_OCCURRENCE")
VALUES ("v_project_id",
"v_components",
"v_vulnerable_components",
"v_vulnerabilities",
"v_critical",
"v_high",
"v_medium",
"v_low",
"v_unassigned",
"v_risk_score",
"v_findings_total",
"v_findings_audited",
"v_findings_unaudited",
"v_findings_suppressed",
"v_policy_violations_total",
"v_policy_violations_fail",
"v_policy_violations_warn",
"v_policy_violations_info",
"v_policy_violations_audited",
"v_policy_violations_unaudited",
"v_policy_violations_license_total",
"v_policy_violations_license_audited",
"v_policy_violations_license_unaudited",
"v_policy_violations_operational_total",
"v_policy_violations_operational_audited",
"v_policy_violations_operational_unaudited",
"v_policy_violations_security_total",
"v_policy_violations_security_audited",
"v_policy_violations_security_unaudited",
NOW(),
NOW());
SELECT "v_project_id",
"v_components",
"v_vulnerable_components",
"v_vulnerabilities",
"v_critical",
"v_high",
"v_medium",
"v_low",
"v_unassigned",
"v_risk_score",
"v_findings_total",
"v_findings_audited",
"v_findings_unaudited",
"v_findings_suppressed",
"v_policy_violations_total",
"v_policy_violations_fail",
"v_policy_violations_warn",
"v_policy_violations_info",
"v_policy_violations_audited",
"v_policy_violations_unaudited",
"v_policy_violations_license_total",
"v_policy_violations_license_audited",
"v_policy_violations_license_unaudited",
"v_policy_violations_operational_total",
"v_policy_violations_operational_audited",
"v_policy_violations_operational_unaudited",
"v_policy_violations_security_total",
"v_policy_violations_security_audited",
"v_policy_violations_security_unaudited",
NOW(),
NOW()
-- Skip insert if the project was deleted while metrics were being computed.
WHERE EXISTS (
SELECT 1
FROM "PROJECT"
WHERE "ID" = "v_project_id"
);

UPDATE "PROJECT" SET "LAST_RISKSCORE" = "v_risk_score" WHERE "ID" = "v_project_id";
end;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- Drop the FKs from DEPENDENCYMETRICS / PROJECTMETRICS to COMPONENT / PROJECT.
-- See ADR 029 for details.
ALTER TABLE "DEPENDENCYMETRICS" DROP CONSTRAINT IF EXISTS "DEPENDENCYMETRICS_COMPONENT_FK";
ALTER TABLE "DEPENDENCYMETRICS" DROP CONSTRAINT IF EXISTS "DEPENDENCYMETRICS_PROJECT_FK";
ALTER TABLE "PROJECTMETRICS" DROP CONSTRAINT IF EXISTS "PROJECTMETRICS_PROJECT_FK";
Loading