From 525723c670a93f735030eb933dd67cf06acd26be Mon Sep 17 00:00:00 2001 From: Sean Kim Date: Fri, 3 Jul 2026 15:25:03 -0700 Subject: [PATCH] fix(report): keep ReDoS distinct from generic DoS in SARIF class hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_class_keyword` scans `_VULN_CLASS_KEYWORDS` in order and returns the first substring match, so a more specific class must precede any class whose name it contains — an invariant the surrounding comment spells out. "denial of service" was listed before "regex denial of service", so any finding titled "...Regex Denial of Service..." resolved to the generic "denial of service" class instead. For locationless (synthetic-anchored) findings this collapses the `synth_class` component of `_primary_fingerprint`, so a ReDoS finding and a generic DoS finding on the same CWE get the *same* `partialFingerprints.primaryLocationLineHash` — and a SARIF consumer such as GitHub code-scanning silently dedups one genuine finding away. Reorder so "regex denial of service" precedes "denial of service", and add regression tests at both the `_class_keyword` and `write_sarif` levels (the latter asserts the two findings no longer share a fingerprint). --- strix/report/sarif.py | 6 +++++- tests/test_sarif.py | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/strix/report/sarif.py b/strix/report/sarif.py index 8237d7c25..cdd970d31 100644 --- a/strix/report/sarif.py +++ b/strix/report/sarif.py @@ -716,8 +716,12 @@ def _normalise_cwe(value: str) -> str | None: "insecure random", "tls verification", "certificate verification", - "denial of service", + # "regex denial of service" MUST precede "denial of service": it is the + # more specific class and "denial of service" is a substring of it, so + # (first-match-wins) the generic entry would otherwise shadow it and + # collapse ReDoS findings into the generic DoS class hash. "regex denial of service", + "denial of service", "redos", "supply chain", ) diff --git a/tests/test_sarif.py b/tests/test_sarif.py index 849835b0b..aead43803 100644 --- a/tests/test_sarif.py +++ b/tests/test_sarif.py @@ -5,7 +5,7 @@ import json from typing import TYPE_CHECKING, Any -from strix.report.sarif import write_sarif +from strix.report.sarif import _class_keyword, write_sarif if TYPE_CHECKING: @@ -178,6 +178,43 @@ def test_write_sarif_adds_logical_location_for_endpoint(tmp_path: Path) -> None: assert logical == [{"fullyQualifiedName": "GET /api/users/{id}", "kind": "endpoint"}] +def test_class_keyword_prefers_specific_regex_dos_over_generic_dos() -> None: + # "regex denial of service" is the more specific class and *contains* the + # substring "denial of service"; first-match-wins keyword selection must + # return the specific class rather than let the generic entry shadow it. + assert _class_keyword("Regex Denial of Service in email validator") == "regex denial of service" + # A genuinely generic DoS finding still resolves to the generic class. + assert _class_keyword("Denial of Service via unbounded upload") == "denial of service" + + +def test_write_sarif_regex_dos_not_merged_into_generic_dos(tmp_path: Path) -> None: + # Two locationless findings on the same CWE are separated only by their + # vuln-class keyword (the synth_class component of the primary fingerprint). + # If ReDoS is misclassified as generic DoS the two collapse to one + # fingerprint and a SARIF consumer silently dedups a real finding away. + write_sarif( + tmp_path, + [ + _finding( + id="vuln-redos", + title="Regex Denial of Service in email validator", + cwe="CWE-400", + code_locations=None, + ), + _finding( + id="vuln-dos", + title="Denial of Service via unbounded upload", + cwe="CWE-400", + code_locations=None, + ), + ], + ) + results = _read(tmp_path)["runs"][0]["results"] + assert len(results) == 2 + fingerprints = {r["partialFingerprints"]["primaryLocationLineHash"] for r in results} + assert len(fingerprints) == 2 + + def test_write_sarif_synthetic_finding_falls_back_to_resource_logical_location( tmp_path: Path, ) -> None: