From 6ca49aa2ab5bf4f2db6f6805fe10019b9678bef6 Mon Sep 17 00:00:00 2001
From: Michael Kuehl <hello@txs.vc>
Date: Fri, 17 Apr 2026 17:17:09 +0200
Subject: [PATCH] fix(knowledge): validate UUIDs in parse_causal_links (fixes
 #474)

The LLM sometimes returns truncated UUIDs or git SHAs instead of valid
fact UUIDs. These malformed IDs caused DuckDB ConversionException when
cast to UUID in store_causal_links, silently dropping all causal links
for the session. Add uuid.UUID() validation in parse_causal_links to
filter out invalid IDs at the parser layer with a warning log.
---
 agent_fox/knowledge/extraction.py             |  26 +++-
 .../unit/knowledge/test_extraction_causal.py  | 131 ++++++++++++++----
 2 files changed, 123 insertions(+), 34 deletions(-)

diff --git a/agent_fox/knowledge/extraction.py b/agent_fox/knowledge/extraction.py
index ebfa6426..33ed4f5f 100644
--- a/agent_fox/knowledge/extraction.py
+++ b/agent_fox/knowledge/extraction.py
@@ -140,11 +140,20 @@ def enrich_extraction_with_causal(
     return base_prompt + addendum
 
 
+def _is_valid_uuid(s: str) -> bool:
+    """Return True if *s* is a well-formed UUID string."""
+    try:
+        uuid.UUID(s)
+        return True
+    except (ValueError, AttributeError):
+        return False
+
+
 def parse_causal_links(extraction_response: str) -> list[tuple[str, str]]:
     """Parse causal link pairs from the extraction model's response.
 
-    Returns a list of (cause_id, effect_id) tuples. Silently skips
-    malformed entries.
+    Returns a list of (cause_id, effect_id) tuples. Skips entries where
+    either ID is missing, not a string, or not a valid UUID.
     """
     data = extract_json_array(extraction_response, repair_truncated=True)
     if data is None:
@@ -164,10 +173,17 @@ def parse_causal_links(extraction_response: str) -> list[tuple[str, str]]:
             continue
         cause_id = item.get("cause_id")
         effect_id = item.get("effect_id")
-        if isinstance(cause_id, str) and isinstance(effect_id, str):
-            links.append((cause_id, effect_id))
-        else:
+        if not isinstance(cause_id, str) or not isinstance(effect_id, str):
             logger.debug("Skipping malformed causal link entry: %s", item)
+            continue
+        if not _is_valid_uuid(cause_id) or not _is_valid_uuid(effect_id):
+            logger.warning(
+                "Skipping causal link with malformed UUID: cause_id=%s, effect_id=%s",
+                cause_id,
+                effect_id,
+            )
+            continue
+        links.append((cause_id, effect_id))
     return links
 
 
diff --git a/tests/unit/knowledge/test_extraction_causal.py b/tests/unit/knowledge/test_extraction_causal.py
index cdaa83e0..3f6d79ee 100644
--- a/tests/unit/knowledge/test_extraction_causal.py
+++ b/tests/unit/knowledge/test_extraction_causal.py
@@ -11,6 +11,15 @@
     parse_causal_links,
 )
 
+# Valid UUIDs for use in tests
+_UUID_A = "11111111-1111-1111-1111-111111111111"
+_UUID_B = "22222222-2222-2222-2222-222222222222"
+_UUID_C = "33333333-3333-3333-3333-333333333333"
+_UUID_D = "44444444-4444-4444-4444-444444444444"
+_UUID_E = "55555555-5555-5555-5555-555555555555"
+_UUID_X1 = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
+_UUID_X2 = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"
+
 
 class TestEnrichExtractionPrompt:
     """TS-13-14: Enrich extraction prompt includes prior facts.
@@ -20,27 +29,27 @@ class TestEnrichExtractionPrompt:
 
     def test_enriched_prompt_contains_base(self) -> None:
         """The enriched prompt includes the original base prompt."""
-        prior = [{"id": "aaa", "content": "User.email nullable"}]
+        prior = [{"id": _UUID_A, "content": "User.email nullable"}]
         result = enrich_extraction_with_causal("Extract facts:", prior)
         assert "Extract facts:" in result
 
     def test_enriched_prompt_contains_causal_section(self) -> None:
         """The enriched prompt includes the Causal Relationships section."""
-        prior = [{"id": "aaa", "content": "User.email nullable"}]
+        prior = [{"id": _UUID_A, "content": "User.email nullable"}]
         result = enrich_extraction_with_causal("Extract facts:", prior)
         assert "Causal Relationships" in result
 
     def test_enriched_prompt_contains_prior_fact_content(self) -> None:
         """The enriched prompt includes prior fact content."""
-        prior = [{"id": "aaa", "content": "User.email nullable"}]
+        prior = [{"id": _UUID_A, "content": "User.email nullable"}]
         result = enrich_extraction_with_causal("Extract facts:", prior)
         assert "User.email nullable" in result
 
     def test_enriched_prompt_with_multiple_prior_facts(self) -> None:
         """Multiple prior facts are all included in the enriched prompt."""
         prior = [
-            {"id": "aaa", "content": "First fact"},
-            {"id": "bbb", "content": "Second fact"},
+            {"id": _UUID_A, "content": "First fact"},
+            {"id": _UUID_B, "content": "Second fact"},
         ]
         result = enrich_extraction_with_causal("Base:", prior)
         assert "First fact" in result
@@ -60,12 +69,15 @@ class TestParseCausalLinks:
     """
 
     def test_parses_valid_links(self) -> None:
-        """Valid JSON causal links are parsed correctly."""
-        response = '[{"cause_id": "aaa", "effect_id": "bbb"}, {"cause_id": "ccc", "effect_id": "ddd"}]'
+        """Valid JSON causal links with proper UUIDs are parsed correctly."""
+        response = (
+            f'[{{"cause_id": "{_UUID_A}", "effect_id": "{_UUID_B}"}}, '
+            f'{{"cause_id": "{_UUID_C}", "effect_id": "{_UUID_D}"}}]'
+        )
         links = parse_causal_links(response)
         assert len(links) == 2
-        assert links[0] == ("aaa", "bbb")
-        assert links[1] == ("ccc", "ddd")
+        assert links[0] == (_UUID_A, _UUID_B)
+        assert links[1] == (_UUID_C, _UUID_D)
 
 
 class TestParseCausalLinksMalformed:
@@ -76,10 +88,13 @@ class TestParseCausalLinksMalformed:
 
     def test_skips_malformed_entries(self) -> None:
         """Malformed entries are silently skipped, valid ones returned."""
-        response = '[{"cause_id": "aaa", "effect_id": "bbb"}, {"bad": "entry"}, "not json"]'
+        response = (
+            f'[{{"cause_id": "{_UUID_A}", "effect_id": "{_UUID_B}"}}, '
+            '{"bad": "entry"}, "not json"]'
+        )
         links = parse_causal_links(response)
         assert len(links) == 1
-        assert links[0] == ("aaa", "bbb")
+        assert links[0] == (_UUID_A, _UUID_B)
 
 
 class TestParseCausalLinksEmpty:
@@ -99,17 +114,17 @@ class TestParseCausalLinksMarkdownFences:
 
     def test_parses_json_inside_code_fence(self) -> None:
         """JSON wrapped in ```json fences is parsed correctly."""
-        response = '```json\n[{"cause_id": "aaa", "effect_id": "bbb"}]\n```'
+        response = f'```json\n[{{"cause_id": "{_UUID_A}", "effect_id": "{_UUID_B}"}}]\n```'
         links = parse_causal_links(response)
         assert len(links) == 1
-        assert links[0] == ("aaa", "bbb")
+        assert links[0] == (_UUID_A, _UUID_B)
 
     def test_parses_json_inside_plain_fence(self) -> None:
         """JSON wrapped in ``` fences (no language tag) is parsed correctly."""
-        response = '```\n[{"cause_id": "x1", "effect_id": "x2"}]\n```'
+        response = f'```\n[{{"cause_id": "{_UUID_X1}", "effect_id": "{_UUID_X2}"}}]\n```'
         links = parse_causal_links(response)
         assert len(links) == 1
-        assert links[0] == ("x1", "x2")
+        assert links[0] == (_UUID_X1, _UUID_X2)
 
 
 class TestParseCausalLinksWithEchoedRefs:
@@ -118,16 +133,16 @@ class TestParseCausalLinksWithEchoedRefs:
     def test_parses_links_after_echoed_uuid_references(self) -> None:
         """JSON causal links are parsed when LLM echoes [uuid] refs in prose."""
         response = (
-            "Looking at [aaa-111] and [bbb-222], I see a causal chain:\n\n"
-            '[{"cause_id": "aaa-111", "effect_id": "bbb-222"}]'
+            f"Looking at [{_UUID_A}] and [{_UUID_B}], I see a causal chain:\n\n"
+            f'[{{"cause_id": "{_UUID_A}", "effect_id": "{_UUID_B}"}}]'
         )
         links = parse_causal_links(response)
         assert len(links) == 1
-        assert links[0] == ("aaa-111", "bbb-222")
+        assert links[0] == (_UUID_A, _UUID_B)
 
     def test_parses_empty_array_after_echoed_refs(self) -> None:
         """Empty JSON array is parsed when LLM echoes [uuid] refs in prose."""
-        response = "Reviewing [fact-1] and [fact-2], no causal relationship found.\n\n[]"
+        response = f"Reviewing [{_UUID_A}] and [{_UUID_B}], no causal relationship found.\n\n[]"
         links = parse_causal_links(response)
         assert len(links) == 0
 
@@ -145,7 +160,7 @@ def test_invalid_json_returns_empty_list(self) -> None:
 
     def test_partial_json_no_complete_entries_returns_empty(self) -> None:
         """Truncated JSON with no complete entries returns an empty list."""
-        links = parse_causal_links('[{"cause_id": "aaa", "effect_id":')
+        links = parse_causal_links(f'[{{"cause_id": "{_UUID_A}", "effect_id":')
         assert len(links) == 0
 
 
@@ -155,25 +170,83 @@ class TestParseCausalLinksTruncatedRecovery:
     def test_recovers_complete_entries_from_truncated_array(self) -> None:
         """Complete entries before the truncation point are recovered."""
         response = (
-            '[{"cause_id": "aaa", "effect_id": "bbb"}, '
-            '{"cause_id": "ccc", "effect_id": "ddd"}, '
-            '{"cause_id": "eee", "effect_'
+            f'[{{"cause_id": "{_UUID_A}", "effect_id": "{_UUID_B}"}}, '
+            f'{{"cause_id": "{_UUID_C}", "effect_id": "{_UUID_D}"}}, '
+            f'{{"cause_id": "{_UUID_E}", "effect_'
         )
         links = parse_causal_links(response)
         assert len(links) == 2
-        assert links[0] == ("aaa", "bbb")
-        assert links[1] == ("ccc", "ddd")
+        assert links[0] == (_UUID_A, _UUID_B)
+        assert links[1] == (_UUID_C, _UUID_D)
 
     def test_recovers_from_truncated_fenced_response(self) -> None:
         """Truncated ```json fenced response recovers valid entries."""
-        response = '```json\n[{"cause_id": "x1", "effect_id": "x2"}, {"cause_id": "y1"'
+        response = (
+            f'```json\n[{{"cause_id": "{_UUID_X1}", "effect_id": "{_UUID_X2}"}}, '
+            f'{{"cause_id": "{_UUID_A}"'
+        )
         links = parse_causal_links(response)
         assert len(links) == 1
-        assert links[0] == ("x1", "x2")
+        assert links[0] == (_UUID_X1, _UUID_X2)
 
     def test_single_complete_entry_before_truncation(self) -> None:
         """A single complete entry followed by truncation is recovered."""
-        response = '[{"cause_id": "a", "effect_id": "b"}, {"cause_id":'
+        response = f'[{{"cause_id": "{_UUID_A}", "effect_id": "{_UUID_B}"}}, {{"cause_id":'
         links = parse_causal_links(response)
         assert len(links) == 1
-        assert links[0] == ("a", "b")
+        assert links[0] == (_UUID_A, _UUID_B)
+
+
+# ---------------------------------------------------------------------------
+# Regression: malformed UUID filtering (fixes #474)
+# ---------------------------------------------------------------------------
+
+
+class TestParseCausalLinksUUIDValidation:
+    """Verify parse_causal_links filters out malformed UUIDs from LLM output."""
+
+    def test_truncated_uuid_filtered(self) -> None:
+        """Truncated UUID (missing a segment) is filtered out."""
+        truncated = "bcdd143f-4363-a85f-77b6748add6c"
+        response = f'[{{"cause_id": "{_UUID_A}", "effect_id": "{truncated}"}}]'
+        links = parse_causal_links(response)
+        assert len(links) == 0
+
+    def test_git_sha_filtered(self) -> None:
+        """40-char git SHA (not a UUID) is filtered out."""
+        git_sha = "b7f2ab9cf46b4552d505a3fac075a1935a653b22"
+        response = f'[{{"cause_id": "{git_sha}", "effect_id": "{_UUID_A}"}}]'
+        links = parse_causal_links(response)
+        assert len(links) == 0
+
+    def test_valid_uuid_passes(self) -> None:
+        """Valid UUID v4 strings pass validation."""
+        response = f'[{{"cause_id": "{_UUID_A}", "effect_id": "{_UUID_B}"}}]'
+        links = parse_causal_links(response)
+        assert len(links) == 1
+        assert links[0] == (_UUID_A, _UUID_B)
+
+    def test_mixed_valid_and_invalid_keeps_only_valid(self) -> None:
+        """Only links with both IDs as valid UUIDs are kept."""
+        git_sha = "b7f2ab9cf46b4552d505a3fac075a1935a653b22"
+        response = (
+            f'[{{"cause_id": "{_UUID_A}", "effect_id": "{_UUID_B}"}}, '
+            f'{{"cause_id": "{git_sha}", "effect_id": "{_UUID_C}"}}, '
+            f'{{"cause_id": "{_UUID_C}", "effect_id": "{_UUID_D}"}}]'
+        )
+        links = parse_causal_links(response)
+        assert len(links) == 2
+        assert links[0] == (_UUID_A, _UUID_B)
+        assert links[1] == (_UUID_C, _UUID_D)
+
+    def test_plain_string_filtered(self) -> None:
+        """Plain non-hex strings are filtered out."""
+        response = f'[{{"cause_id": "not-a-uuid", "effect_id": "{_UUID_A}"}}]'
+        links = parse_causal_links(response)
+        assert len(links) == 0
+
+    def test_empty_string_filtered(self) -> None:
+        """Empty string IDs are filtered out."""
+        response = f'[{{"cause_id": "", "effect_id": "{_UUID_A}"}}]'
+        links = parse_causal_links(response)
+        assert len(links) == 0