From 53e17d53d7956350d577b17d6c78eb8b49991ef6 Mon Sep 17 00:00:00 2001 From: Michael Kuehl Date: Thu, 23 Apr 2026 12:47:09 +0200 Subject: [PATCH] feat(engine): restore eager pre-review with retry-predecessor (fixes #519) Add retry_predecessor=True to pre-review mode so critical findings convert to coder retries instead of permanent blocks. Lower the default pre_review_block_threshold from 3 to 1 and change the comparison from > to >= so a single confirmed critical triggers the feedback loop. Fix group-0 reviewer coder_node_id to correctly target group 1. --- agent_fox/archetypes.py | 1 + agent_fox/core/config.py | 2 +- agent_fox/engine/blocking.py | 5 +- agent_fox/engine/result_handler.py | 93 ++++- agent_fox/session/convergence.py | 4 +- .../unit/core/test_reviewer_consolidation.py | 4 +- tests/unit/engine/test_block_budget.py | 50 ++- .../engine/test_review_retry_predecessor.py | 380 ++++++++++++++++++ tests/unit/session/test_convergence.py | 2 +- 9 files changed, 520 insertions(+), 21 deletions(-) create mode 100644 tests/unit/engine/test_review_retry_predecessor.py diff --git a/agent_fox/archetypes.py b/agent_fox/archetypes.py index 57c4fec2..072b9192 100644 --- a/agent_fox/archetypes.py +++ b/agent_fox/archetypes.py @@ -91,6 +91,7 @@ class ArchetypeEntry: "pre-review": ModeConfig( injection="auto_pre", allowlist=[], # no shell access + retry_predecessor=True, ), "drift-review": ModeConfig( injection="auto_pre", diff --git a/agent_fox/core/config.py b/agent_fox/core/config.py index 895aa666..81e96468 100644 --- a/agent_fox/core/config.py +++ b/agent_fox/core/config.py @@ -399,7 +399,7 @@ class ReviewerConfig(BaseModel): model_config = ConfigDict(extra="ignore") pre_review_block_threshold: Annotated[int, Clamped(ge=0)] = Field( - default=3, + default=1, description="Finding count to block merge for pre-review mode", ) drift_review_block_threshold: int | None = Field( diff --git a/agent_fox/engine/blocking.py b/agent_fox/engine/blocking.py index 60004efb..b7e3be55 100644 --- a/agent_fox/engine/blocking.py +++ b/agent_fox/engine/blocking.py @@ -111,7 +111,8 @@ def evaluate_review_blocking( parsed = parse_node_id(record.node_id) spec_name = parsed.spec_name - task_group = str(parsed.group_number) if parsed.group_number else "1" + # Group-0 nodes are auto_pre reviewers; the first coder group is always 1 + task_group = "1" if parsed.group_number == 0 else str(parsed.group_number) coder_node_id = f"{spec_name}:{task_group}" # Display label for log messages @@ -182,7 +183,7 @@ def evaluate_review_blocking( return BlockDecision(should_block=False) configured_threshold = rc.drift_review_block_threshold - blocked = critical_count > configured_threshold + blocked = critical_count >= configured_threshold if blocked: reason = _format_block_reason( diff --git a/agent_fox/engine/result_handler.py b/agent_fox/engine/result_handler.py index ed45b9a7..a7901cd7 100644 --- a/agent_fox/engine/result_handler.py +++ b/agent_fox/engine/result_handler.py @@ -114,10 +114,99 @@ def check_skeptic_blocking( sink=self._sink, run_id=self._run_id, ) - if decision.should_block: - self._block_task(decision.coder_node_id, state, decision.reason) + if not decision.should_block: + return False + + node_archetype = self._get_node_archetype(record.node_id) + node_mode = self._get_node_mode(record.node_id) + archetype_entry = get_archetype(node_archetype) + if node_mode is not None: + from agent_fox.archetypes import resolve_effective_config + + archetype_entry = resolve_effective_config(archetype_entry, node_mode) + + if archetype_entry.retry_predecessor: + return self._retry_on_review_block(record, decision, state) + + self._block_task(decision.coder_node_id, state, decision.reason) + self._generate_errata(record) + return True + + def _retry_on_review_block( + self, + record: SessionRecord, + decision: Any, + state: ExecutionState, + ) -> bool: + """Convert a review block to a coder retry when retry_predecessor is set. + + Instead of permanently blocking the coder, lets it proceed with review + findings injected as context. Uses the coder's escalation ladder to + prevent infinite retries; permanently blocks when the ladder is exhausted. + + Returns True if the coder was permanently blocked (ladder exhausted), + False if converted to a retry. + """ + from agent_fox.routing.escalation import EscalationLadder + + coder_node_id = decision.coder_node_id + + pred_ladder = self._routing_ladders.get(coder_node_id) + if pred_ladder is None: + coder_archetype = self._get_node_archetype(coder_node_id) + coder_entry = get_archetype(coder_archetype) + pred_ladder = EscalationLadder( + starting_tier=ModelTier(coder_entry.default_model_tier), + tier_ceiling=ModelTier.ADVANCED, + retries_before_escalation=self._retries_before_escalation, + ) + self._routing_ladders[coder_node_id] = pred_ladder + + pred_ladder.record_failure() + + if pred_ladder.is_exhausted: + logger.warning( + "Review retry-predecessor exhausted for %s, permanently blocking", + coder_node_id, + ) + self._block_task(coder_node_id, state, decision.reason) self._generate_errata(record) return True + + logger.info( + "Review blocking converted to retry for %s (findings injected as context)", + coder_node_id, + ) + coder_status = self._graph_sync.node_states.get(coder_node_id) + if coder_status == "completed": + self._graph_sync.node_states[coder_node_id] = "pending" + self._graph_sync.node_states[record.node_id] = "pending" + + emit_audit_event( + self._sink, + self._run_id, + AuditEventType.TASK_STATUS_CHANGE, + node_id=record.node_id, + payload={ + "from_status": "completed", + "to_status": "retry_predecessor", + "reason": decision.reason, + "coder_node_id": coder_node_id, + }, + ) + + if self._task_callback is not None: + self._task_callback( + TaskEvent( + node_id=record.node_id, + status="disagreed", + duration_s=0, + archetype=self._get_node_archetype(record.node_id), + predecessor_node=coder_node_id, + ) + ) + + self._generate_errata(record) return False def _generate_errata(self, record: SessionRecord) -> None: diff --git a/agent_fox/session/convergence.py b/agent_fox/session/convergence.py index cc98c87c..bb38177f 100644 --- a/agent_fox/session/convergence.py +++ b/agent_fox/session/convergence.py @@ -97,7 +97,7 @@ def converge_skeptic( if severity == "critical" and count >= majority_threshold: majority_critical_count += 1 - blocked = majority_critical_count > block_threshold + blocked = majority_critical_count > 0 and majority_critical_count >= block_threshold return merged, blocked @@ -191,7 +191,7 @@ def converge_skeptic_records( if severity == "critical" and count >= majority_threshold: majority_critical_count += 1 - blocked = majority_critical_count > block_threshold + blocked = majority_critical_count > 0 and majority_critical_count >= block_threshold return merged, blocked diff --git a/tests/unit/core/test_reviewer_consolidation.py b/tests/unit/core/test_reviewer_consolidation.py index afdcb1c7..717a52eb 100644 --- a/tests/unit/core/test_reviewer_consolidation.py +++ b/tests/unit/core/test_reviewer_consolidation.py @@ -307,8 +307,8 @@ def test_reviewer_config(self) -> None: from agent_fox.core.config import ReviewerConfig rc = ReviewerConfig() - assert rc.pre_review_block_threshold == 3, ( - f"pre_review_block_threshold should be 3, got {rc.pre_review_block_threshold}" + assert rc.pre_review_block_threshold == 1, ( + f"pre_review_block_threshold should be 1, got {rc.pre_review_block_threshold}" ) assert rc.drift_review_block_threshold is None, ( f"drift_review_block_threshold should be None (advisory), got {rc.drift_review_block_threshold}" diff --git a/tests/unit/engine/test_block_budget.py b/tests/unit/engine/test_block_budget.py index 2467e8e3..ebeacfed 100644 --- a/tests/unit/engine/test_block_budget.py +++ b/tests/unit/engine/test_block_budget.py @@ -204,22 +204,50 @@ class TestSkepticBlocking: async def test_reviewer_blocks_coder_on_critical_findings( self, ) -> None: - """When reviewer:pre-review finds criticals above threshold, coder is blocked.""" - db_conn = _chain_with_reviewer() + """When skeptic finds criticals above threshold, coder is blocked. + + Uses the legacy skeptic archetype (no retry_predecessor) to test + permanent blocking. Pre-review mode now has retry_predecessor=True + and converts blocks to retries instead. + """ + db_conn = write_plan_to_db( + nodes={ + "spec_a:1:skeptic": { + "title": "Skeptic Review", + "spec_name": "spec_a", + "group_number": 1, + "archetype": "skeptic", + }, + "spec_a:1": { + "title": "Implement spec_a", + "spec_name": "spec_a", + "group_number": 1, + "archetype": "coder", + }, + "spec_a:1:verifier": { + "title": "Verify spec_a", + "spec_name": "spec_a", + "group_number": 1, + "archetype": "verifier", + }, + }, + edges=[ + {"source": "spec_a:1", "target": "spec_a:1:skeptic", "kind": "intra_spec"}, + {"source": "spec_a:1", "target": "spec_a:1:verifier", "kind": "intra_spec"}, + ], + order=["spec_a:1", "spec_a:1:skeptic", "spec_a:1:verifier"], + ) runner = MockSessionRunner() runner.configure( - "spec_a:0:reviewer:pre-review", - [ - MockSessionOutcome( - "spec_a:0:reviewer:pre-review", - "completed", - archetype="reviewer", - ) - ], + "spec_a:1", + [MockSessionOutcome("spec_a:1", "completed", archetype="coder")], + ) + runner.configure( + "spec_a:1:skeptic", + [MockSessionOutcome("spec_a:1:skeptic", "completed", archetype="skeptic")], ) - # Mock review findings with 4 criticals (threshold default is 3) mock_findings = [] for i in range(4): finding = MagicMock() diff --git a/tests/unit/engine/test_review_retry_predecessor.py b/tests/unit/engine/test_review_retry_predecessor.py new file mode 100644 index 00000000..4ca50dcc --- /dev/null +++ b/tests/unit/engine/test_review_retry_predecessor.py @@ -0,0 +1,380 @@ +"""Tests for review retry-predecessor on blocking (issue #519). + +Validates that when a reviewer with retry_predecessor=True finds blocking-level +critical findings, the coder is not permanently blocked but instead allowed to +proceed (or retried) with findings as context. Also tests the threshold change +from > to >= and the group-0 coder_node_id fix. +""" + +from __future__ import annotations + +import uuid +from unittest.mock import MagicMock + +import duckdb + +from agent_fox.engine.blocking import BlockDecision, evaluate_review_blocking +from agent_fox.engine.result_handler import SessionResultHandler +from agent_fox.engine.state import ExecutionState, SessionRecord +from agent_fox.graph.types import Edge, Node, TaskGraph +from agent_fox.knowledge.review_store import ReviewFinding, insert_findings + + +def _make_finding( + *, + severity: str = "critical", + description: str = "Test finding", + spec_name: str = "test_spec", + task_group: str = "1", + session_id: str = "test_spec:1:1", + category: str | None = None, +) -> ReviewFinding: + return ReviewFinding( + id=str(uuid.uuid4()), + severity=severity, + description=description, + requirement_ref=None, + spec_name=spec_name, + task_group=task_group, + session_id=session_id, + category=category, + ) + + +def _make_session_record( + node_id: str = "test_spec:1", + archetype: str = "skeptic", + attempt: int = 1, +) -> SessionRecord: + return SessionRecord( + node_id=node_id, + archetype=archetype, + attempt=attempt, + status="completed", + input_tokens=0, + output_tokens=0, + cost=0.0, + duration_ms=0, + error_message=None, + timestamp="2026-01-01T00:00:00", + ) + + +def _make_archetypes_config(block_threshold: int = 1): + config = MagicMock() + config.reviewer_config.pre_review_block_threshold = block_threshold + config.reviewer_config.drift_review_block_threshold = block_threshold + return config + + +class TestThresholdGteComparison: + """Threshold comparison uses >= so threshold=1 blocks on 1 critical.""" + + def test_single_critical_blocks_at_threshold_1( + self, knowledge_conn: duckdb.DuckDBPyConnection + ) -> None: + finding = _make_finding( + severity="critical", + description="Missing error handling", + session_id="test_spec:1:1", + ) + insert_findings(knowledge_conn, [finding]) + + record = _make_session_record() + config = _make_archetypes_config(block_threshold=1) + + decision = evaluate_review_blocking(record, config, knowledge_conn) + + assert decision.should_block is True + + def test_single_critical_does_not_block_at_threshold_2( + self, knowledge_conn: duckdb.DuckDBPyConnection + ) -> None: + finding = _make_finding( + severity="critical", + description="Missing error handling", + session_id="test_spec:1:1", + ) + insert_findings(knowledge_conn, [finding]) + + record = _make_session_record() + config = _make_archetypes_config(block_threshold=2) + + decision = evaluate_review_blocking(record, config, knowledge_conn) + + assert decision.should_block is False + + def test_two_criticals_block_at_threshold_2( + self, knowledge_conn: duckdb.DuckDBPyConnection + ) -> None: + findings = [ + _make_finding(description=f"Critical issue {i}", session_id="test_spec:1:1") + for i in range(2) + ] + insert_findings(knowledge_conn, findings) + + record = _make_session_record() + config = _make_archetypes_config(block_threshold=2) + + decision = evaluate_review_blocking(record, config, knowledge_conn) + + assert decision.should_block is True + + +class TestGroup0CoderNodeId: + """Group-0 reviewers target coder group 1, not group 0.""" + + def test_group_0_reviewer_targets_group_1( + self, knowledge_conn: duckdb.DuckDBPyConnection + ) -> None: + finding = _make_finding( + severity="critical", + description="Command injection", + spec_name="spec_07", + task_group="1", + session_id="spec_07:0:reviewer:pre-review:1", + category="security", + ) + insert_findings(knowledge_conn, [finding]) + + record = _make_session_record( + node_id="spec_07:0:reviewer:pre-review", + archetype="reviewer", + ) + config = _make_archetypes_config(block_threshold=1) + + decision = evaluate_review_blocking( + record, config, knowledge_conn, mode="pre-review" + ) + + assert decision.should_block is True + assert decision.coder_node_id == "spec_07:1" + + def test_non_group_0_reviewer_keeps_own_group( + self, knowledge_conn: duckdb.DuckDBPyConnection + ) -> None: + finding = _make_finding( + severity="critical", + description="Missing validation", + spec_name="spec_07", + task_group="3", + session_id="spec_07:3:1", + category="security", + ) + insert_findings(knowledge_conn, [finding]) + + record = _make_session_record( + node_id="spec_07:3", + archetype="skeptic", + ) + config = _make_archetypes_config(block_threshold=1) + + decision = evaluate_review_blocking(record, config, knowledge_conn) + + assert decision.should_block is True + assert decision.coder_node_id == "spec_07:3" + + +class TestPreReviewRetryPredecessor: + """Pre-review with retry_predecessor=True converts block to retry.""" + + def test_pre_review_has_retry_predecessor(self) -> None: + from agent_fox.archetypes import get_archetype, resolve_effective_config + + entry = get_archetype("reviewer") + resolved = resolve_effective_config(entry, "pre-review") + assert resolved.retry_predecessor is True + + def test_audit_review_has_retry_predecessor(self) -> None: + from agent_fox.archetypes import get_archetype, resolve_effective_config + + entry = get_archetype("reviewer") + resolved = resolve_effective_config(entry, "audit-review") + assert resolved.retry_predecessor is True + + def test_drift_review_does_not_have_retry_predecessor(self) -> None: + from agent_fox.archetypes import get_archetype, resolve_effective_config + + entry = get_archetype("reviewer") + resolved = resolve_effective_config(entry, "drift-review") + assert resolved.retry_predecessor is False + + +class TestRetryOnReviewBlock: + """Result handler converts block to retry when retry_predecessor is set.""" + + def _make_handler_with_graph( + self, + knowledge_conn: duckdb.DuckDBPyConnection, + ) -> tuple[SessionResultHandler, ExecutionState, MagicMock]: + node_states = { + "test_spec:0:reviewer:pre-review": "completed", + "test_spec:1": "pending", + } + edges_dict = { + "test_spec:0:reviewer:pre-review": [], + "test_spec:1": ["test_spec:0:reviewer:pre-review"], + } + graph_sync = MagicMock() + graph_sync.node_states = node_states + graph_sync.predecessors = lambda nid: edges_dict.get(nid, []) + + graph = TaskGraph( + nodes={ + "test_spec:0:reviewer:pre-review": Node( + id="test_spec:0:reviewer:pre-review", + spec_name="test_spec", + group_number=0, + title="Pre-review", + optional=False, + archetype="reviewer", + mode="pre-review", + ), + "test_spec:1": Node( + id="test_spec:1", + spec_name="test_spec", + group_number=1, + title="Coder", + optional=False, + archetype="coder", + ), + }, + edges=[ + Edge( + source="test_spec:0:reviewer:pre-review", + target="test_spec:1", + kind="intra_spec", + ) + ], + order=["test_spec:0:reviewer:pre-review", "test_spec:1"], + ) + + block_task_fn = MagicMock() + + archetypes_config = _make_archetypes_config(block_threshold=1) + + handler = SessionResultHandler( + graph_sync=graph_sync, + routing_ladders={}, + retries_before_escalation=2, + max_retries=3, + task_callback=None, + sink=None, + run_id="test-run", + graph=graph, + archetypes_config=archetypes_config, + knowledge_db_conn=knowledge_conn, + block_task_fn=block_task_fn, + check_block_budget_fn=MagicMock(), + ) + + state = ExecutionState( + plan_hash="test", + node_states=node_states, + started_at="2026-01-01", + updated_at="2026-01-01", + ) + + return handler, state, block_task_fn + + def test_pre_review_block_converts_to_retry( + self, knowledge_conn: duckdb.DuckDBPyConnection + ) -> None: + """Pre-review blocking with retry_predecessor does NOT permanently block.""" + finding = _make_finding( + severity="critical", + description="Command injection vulnerability", + session_id="test_spec:0:reviewer:pre-review:1", + category="security", + ) + insert_findings(knowledge_conn, [finding]) + + handler, state, block_task_fn = self._make_handler_with_graph(knowledge_conn) + + record = _make_session_record( + node_id="test_spec:0:reviewer:pre-review", + archetype="reviewer", + ) + + blocked = handler.check_skeptic_blocking(record, state) + + assert blocked is False + block_task_fn.assert_not_called() + + def test_drift_review_block_is_permanent( + self, knowledge_conn: duckdb.DuckDBPyConnection + ) -> None: + """Drift-review blocking without retry_predecessor permanently blocks.""" + finding = _make_finding( + severity="critical", + description="Missing validation", + session_id="test_spec:1:1", + ) + insert_findings(knowledge_conn, [finding]) + + node_states = { + "test_spec:1": "completed", + } + graph_sync = MagicMock() + graph_sync.node_states = node_states + + graph = TaskGraph( + nodes={ + "test_spec:1": Node( + id="test_spec:1", + spec_name="test_spec", + group_number=1, + title="Coder", + optional=False, + archetype="reviewer", + mode="drift-review", + ), + }, + edges=[], + order=["test_spec:1"], + ) + + block_task_fn = MagicMock() + archetypes_config = _make_archetypes_config(block_threshold=1) + + handler = SessionResultHandler( + graph_sync=graph_sync, + routing_ladders={}, + retries_before_escalation=2, + max_retries=3, + task_callback=None, + sink=None, + run_id="test-run", + graph=graph, + archetypes_config=archetypes_config, + knowledge_db_conn=knowledge_conn, + block_task_fn=block_task_fn, + check_block_budget_fn=MagicMock(), + ) + + state = ExecutionState( + plan_hash="test", + node_states=node_states, + started_at="2026-01-01", + updated_at="2026-01-01", + ) + + record = _make_session_record( + node_id="test_spec:1", + archetype="skeptic", + ) + + blocked = handler.check_skeptic_blocking(record, state) + + assert blocked is True + block_task_fn.assert_called_once() + + +class TestDefaultThreshold: + """Default pre_review_block_threshold is 1.""" + + def test_default_threshold_is_1(self) -> None: + from agent_fox.core.config import ReviewerConfig + + rc = ReviewerConfig() + assert rc.pre_review_block_threshold == 1 diff --git a/tests/unit/session/test_convergence.py b/tests/unit/session/test_convergence.py index 837dd392..ad5f90d3 100644 --- a/tests/unit/session/test_convergence.py +++ b/tests/unit/session/test_convergence.py @@ -280,7 +280,7 @@ def test_prop_blocking_formula(self, n_criticals: int, threshold: int) -> None: findings = [Finding(severity="critical", description=f"Issue {i}") for i in range(n_criticals)] # Use 1 instance so all findings pass majority gate (1/1 >= ceil(1/2)) _, blocked = converge_skeptic([findings], block_threshold=threshold) - assert blocked == (n_criticals > threshold) + assert blocked == (n_criticals > 0 and n_criticals >= threshold) # ---------------------------------------------------------------------------