diff --git a/docker/docker-compose.tif-phase1.yml b/docker/docker-compose.tif-phase1.yml index 5ef383f..95024d3 100644 --- a/docker/docker-compose.tif-phase1.yml +++ b/docker/docker-compose.tif-phase1.yml @@ -1,6 +1,6 @@ services: dakera: - image: ${DAKERA_IMAGE:-ghcr.io/dakera-ai/dakera:0.11.81} + image: ${DAKERA_IMAGE:-ghcr.io/dakera-ai/dakera:0.11.90} ports: - "127.0.0.1:3200:3000" - "127.0.0.1:51051:50051" diff --git a/examples/tif-provenance/README.md b/examples/tif-provenance/README.md new file mode 100644 index 0000000..9b10f43 --- /dev/null +++ b/examples/tif-provenance/README.md @@ -0,0 +1,102 @@ +# T-I-F Feedback Provenance Phase 2 + +This example validates Phase 2 of the Dakera T-I-F (truth, indeterminacy, falsity) decision provenance RFC: + +https://github.com/Dakera-AI/dakera-deploy/issues/161 + +Phase 1 proved that `metadata.reliability` survives store and recall and can +change agent-side decisions. Phase 2 tests the next maintainer-requested +question: can T-I-F scores be derived from real agent interaction signals and +used in a session-scoped decision trace? + +## What This Tests + +The example uses Dakera's public REST API only: + +- `POST /v1/memory/store` +- `POST /v1/memory/recall` +- `POST /v1/memories/{memory_id}/feedback` +- `GET /v1/memories/{memory_id}/feedback` +- `POST /v1/sessions/start` +- `GET /v1/sessions/{session_id}/memories` +- `POST /v1/memories/{memory_id}/links` + +The validation remains agent-side. Dakera stores memories, feedback, sessions, +and links. The local script computes T-I-F from feedback and stores a decision +trace under `metadata.decision_provenance`. + +Dakera `v0.11.90` requires `agent_id` when submitting feedback, reading +feedback history, and creating memory links. The validator keeps those +requirements explicit instead of hiding them behind an SDK. 
+ +## Feedback-Derived T-I-F Rules + +```text +upvote: t + 0.10, i - 0.03, f - 0.05 +downvote: t - 0.10, i + 0.05, f + 0.15 +flag: t - 0.05, i + 0.20, f + 0.10 +``` + +Scores are clamped to `[0.0, 1.0]`. + +Decision priority: + +```text +f >= 0.50 -> surface_contradiction +i >= 0.50 -> ask_clarification +t >= 0.70 and i <= 0.35 and f <= 0.35 -> reuse_confidently +otherwise -> reuse_with_caveat +``` + +These thresholds are validation rules only. They are not proposed as Dakera +engine behavior. + +## Scenarios + +The fixture covers three developer-recognizable workflows: + +| Scenario | Purpose | +|---|---| +| `coding-assistant` | feedback corrects an obsolete endpoint decision | +| `research-agent` | weak-source feedback raises indeterminacy | +| `customer-support` | outdated policy is surfaced as contradiction evidence | + +Each scenario records: + +- baseline importance-only decision; +- feedback-derived T-I-F decision; +- decision trace memory; +- session ID; +- linked evidence memory IDs; +- associated recall proof. + +## Start Dakera + +The shared T-I-F compose file defaults to Dakera `v0.11.90`, binds to +`127.0.0.1`, and disables auth only for local validation. Do not run it on a +shared or internet-facing host. + +```bash +docker compose -f docker/docker-compose.tif-phase1.yml up -d +``` + +Stop: + +```bash +docker compose -f docker/docker-compose.tif-phase1.yml down +``` + +## Run Self-Test + +```bash +python examples/tif-provenance/validate_tif_provenance.py --self-test +``` + +## Run Runtime Validation + +```bash +python examples/tif-provenance/validate_tif_provenance.py --api http://localhost:3200 --request-timeout 240 +``` + +The script fails if feedback history, session trace storage, or associated +recall proof is missing. 
diff --git a/examples/tif-provenance/VALIDATION_RESULTS.md b/examples/tif-provenance/VALIDATION_RESULTS.md new file mode 100644 index 0000000..55a821d --- /dev/null +++ b/examples/tif-provenance/VALIDATION_RESULTS.md @@ -0,0 +1,152 @@ +# Phase 2 Validation Results + +Date: 2026-06-12 17:07:47 -04:00 + +Status: passed local runtime validation. + +## Target Runtime + +```text +Dakera image: ghcr.io/dakera-ai/dakera:0.11.90 +REST: http://127.0.0.1:3200 +gRPC: 127.0.0.1:51051 +Storage: in-memory +Auth: disabled for local validation only +``` + +The validation compose binds ports to localhost only. + +## Commands + +```powershell +python -m py_compile examples\tif-provenance\validate_tif_provenance.py +python examples\tif-provenance\validate_tif_provenance.py --self-test +docker compose -f docker\docker-compose.tif-phase1.yml down +docker compose -f docker\docker-compose.tif-phase1.yml up -d +python examples\tif-provenance\validate_tif_provenance.py --api http://localhost:3200 --request-timeout 240 +docker compose -f docker\docker-compose.tif-phase1.yml down +``` + +## Acceptance Criteria + +- all three scenarios pass; +- feedback endpoints accept `upvote`, `downvote`, and `flag`; +- feedback history is readable; +- feedback-derived T-I-F changes at least one decision per scenario; +- decision trace memory is stored with `metadata.decision_provenance`; +- session memories include the trace and evidence memories; +- associated recall returns linked evidence or contradiction memories; +- no engine code is modified; +- no first-class recall filters are added. + +## Result Summary + +All three scenarios passed against Dakera `0.11.90`. 
+ +Runtime health reported: + +```json +{ + "ready": true, + "version": "0.11.90", + "checks": { + "embedding_engine": "ok", + "storage": "ok", + "tiered_engine": "disabled" + } +} +``` + +Scenario outcomes: + +| Scenario | Baseline action | Feedback-derived T-I-F action | Decision changed | Session proof | Associated recall proof | +| --- | --- | --- | --- | --- | --- | +| coding-assistant | `reuse_top_memory` | `surface_contradiction` | yes | yes | yes | +| research-agent | `reuse_top_memory` | `ask_clarification` | yes | yes | yes | +| customer-support | `reuse_top_memory` | `surface_contradiction` | yes | yes | yes | + +The runtime accepted feedback signals `upvote`, `downvote`, and `flag`; feedback history was readable for every seeded memory; each scenario stored a decision trace with `metadata.decision_provenance`; session memory listing included the trace and evidence memories; associated recall returned linked evidence memories when recalling the decision trace with `include_associated=true` and `associated_memories_depth=1`. + +Runtime contract notes observed on Dakera `0.11.90`: + +- `POST /v1/sessions/start` returns the session id as `session.id`. +- `POST /v1/memories/{memory_id}/feedback` requires `agent_id`. +- `GET /v1/memories/{memory_id}/feedback` requires `agent_id` as a query parameter. +- `POST /v1/memories/{memory_id}/links` requires `agent_id`. + +No engine code was modified. No first-class recall filters were added. + +## Review Correction Rerun + +Date: 2026-06-12 17:20:58 -04:00 + +Corrections after fork review: + +- healthcheck now requires `ready: true` before runtime validation proceeds; +- unsupported feedback signals now produce a clear validation error instead of a raw `KeyError`; +- Phase 1 recall normalization was reviewed and already handles list, dict, and nested `memory` response shapes. 
+ +Rerun commands: + +```powershell +python -m py_compile examples\tif-provenance\validate_tif_provenance.py examples\tif-reliability\validate_tif_reliability.py +python examples\tif-provenance\validate_tif_provenance.py --self-test +python examples\tif-reliability\validate_tif_reliability.py --self-test +docker compose -f docker\docker-compose.tif-phase1.yml down +docker compose -f docker\docker-compose.tif-phase1.yml up -d +python examples\tif-provenance\validate_tif_provenance.py --api http://localhost:3200 --request-timeout 240 +docker compose -f docker\docker-compose.tif-phase1.yml down +``` + +Result: passed. + +## Codex Review Correction Rerun + +Date: 2026-06-12 18:02:19 -04:00 + +Additional Codex review findings corrected: + +- runtime decisions now use the normalized `/v1/memory/recall` response for each scenario query before choosing the baseline and feedback-aware memory; +- each scenario records `scenario_recall_proof` and the recalled fixture/runtime memory IDs; +- associated recall proof now verifies that every linked evidence memory appears in the full associated recall response and reports `associated_recall_missing_ids`; +- runtime validation was rerun with PowerShell preserving the validator exit code before Docker cleanup. + +Rerun commands: + +```powershell +python -m py_compile examples\tif-provenance\validate_tif_provenance.py examples\tif-reliability\validate_tif_reliability.py +python examples\tif-provenance\validate_tif_provenance.py --self-test +python examples\tif-reliability\validate_tif_reliability.py --self-test +docker compose -f docker\docker-compose.tif-phase1.yml down +docker compose -f docker\docker-compose.tif-phase1.yml up -d +python examples\tif-provenance\validate_tif_provenance.py --api http://localhost:3200 --request-timeout 240 +$validationExit = $LASTEXITCODE +docker compose -f docker\docker-compose.tif-phase1.yml down +exit $validationExit +``` + +Result: passed. 
All three scenarios returned `scenario_recall_proof: true`, `associated_recall_missing_ids: []`, `associated_recall_proof: true`, and `passed: true`. + +## Second Review Correction Rerun + +Date: 2026-06-12 17:40:38 -04:00 + +Additional Qodo findings corrected: + +- runtime `changed_decision` now mirrors the self-test logic and treats same-memory `reuse_confidently` as unchanged reuse; +- runtime memory metadata is deep-copied before adding derived reliability, and malformed or missing `metadata.reliability` now fails with a clear validation error; +- associated recall keeps a single read-only retry to tolerate cold reranker startup without retrying mutating endpoints. + +Rerun commands: + +```powershell +python -m py_compile examples\tif-provenance\validate_tif_provenance.py examples\tif-reliability\validate_tif_reliability.py +python examples\tif-provenance\validate_tif_provenance.py --self-test +python examples\tif-reliability\validate_tif_reliability.py --self-test +docker compose -f docker\docker-compose.tif-phase1.yml down +docker compose -f docker\docker-compose.tif-phase1.yml up -d +python examples\tif-provenance\validate_tif_provenance.py --api http://localhost:3200 --request-timeout 240 +docker compose -f docker\docker-compose.tif-phase1.yml down +``` + +Result: passed. 
diff --git a/examples/tif-provenance/phase2_scenarios.json b/examples/tif-provenance/phase2_scenarios.json new file mode 100644 index 0000000..ca8d301 --- /dev/null +++ b/examples/tif-provenance/phase2_scenarios.json @@ -0,0 +1,128 @@ +{ + "agent_id": "dakera-tif-phase2", + "scenarios": [ + { + "id": "coding-assistant", + "title": "Coding assistant review correction", + "query": "Which Dakera REST endpoint should the coding assistant use for storing memory with reliability metadata?", + "expected_action": "surface_contradiction", + "expected_changed_decision": true, + "expected_direct_memory": "coding-obsolete-endpoint", + "expected_safe_memory": "coding-current-endpoint", + "memories": [ + { + "id": "coding-current-endpoint", + "content": "Dakera memory store examples should use POST /v1/memory/store for the current public REST API.", + "importance": 0.84, + "feedback": ["upvote"], + "metadata": { + "reliability": { + "t": 0.66, + "i": 0.14, + "f": 0.10, + "basis": "Phase 1 runtime validation and maintainer review", + "source": "phase2_seed" + } + } + }, + { + "id": "coding-obsolete-endpoint", + "content": "Dakera examples should use POST /v1/memories when storing agent memories.", + "importance": 0.93, + "feedback": ["downvote", "downvote"], + "metadata": { + "reliability": { + "t": 0.38, + "i": 0.20, + "f": 0.34, + "basis": "obsolete quickstart assumption superseded by current API behavior", + "source": "phase2_seed" + } + } + } + ] + }, + { + "id": "research-agent", + "title": "Research agent source conflict", + "query": "Should the research agent cite an unsupported secondary note as confirmed evidence?", + "expected_action": "ask_clarification", + "expected_changed_decision": true, + "expected_direct_memory": "research-uncertain-source", + "expected_safe_memory": "research-source-backed", + "memories": [ + { + "id": "research-source-backed", + "content": "A research agent should prefer source-backed claims and cite the primary evidence when summarizing 
technical decisions.", + "importance": 0.80, + "feedback": ["upvote"], + "metadata": { + "reliability": { + "t": 0.68, + "i": 0.16, + "f": 0.08, + "basis": "primary-source research discipline", + "source": "phase2_seed" + } + } + }, + { + "id": "research-uncertain-source", + "content": "A research agent can treat an uncited secondary note as confirmed evidence when it sounds plausible.", + "importance": 0.92, + "feedback": ["flag", "flag"], + "metadata": { + "reliability": { + "t": 0.44, + "i": 0.18, + "f": 0.18, + "basis": "weak-source pattern flagged during review", + "source": "phase2_seed" + } + } + } + ] + }, + { + "id": "customer-support", + "title": "Customer support outdated policy", + "query": "Which customer support policy should the agent reuse when an old process conflicts with the current escalation rule?", + "expected_action": "surface_contradiction", + "expected_changed_decision": true, + "expected_direct_memory": "support-outdated-policy", + "expected_safe_memory": "support-current-policy", + "memories": [ + { + "id": "support-current-policy", + "content": "Customer support agents should follow the current escalation policy and ask for verification when a prior policy conflicts.", + "importance": 0.83, + "feedback": ["upvote"], + "metadata": { + "reliability": { + "t": 0.67, + "i": 0.12, + "f": 0.07, + "basis": "current support process", + "source": "phase2_seed" + } + } + }, + { + "id": "support-outdated-policy", + "content": "Customer support agents should always use the old refund process without checking for newer escalation rules.", + "importance": 0.91, + "feedback": ["downvote", "flag"], + "metadata": { + "reliability": { + "t": 0.42, + "i": 0.18, + "f": 0.30, + "basis": "outdated policy deliberately retained as contradiction evidence", + "source": "phase2_seed" + } + } + } + ] + } + ] +} diff --git a/examples/tif-provenance/validate_tif_provenance.py b/examples/tif-provenance/validate_tif_provenance.py new file mode 100644 index 
0000000..1f3114f --- /dev/null +++ b/examples/tif-provenance/validate_tif_provenance.py @@ -0,0 +1,586 @@ +#!/usr/bin/env python3 +"""Phase 2 T-I-F provenance validation for Dakera memories. + +This script uses only Python's standard library and Dakera's public REST API. +It validates that T-I-F reliability can be derived from feedback signals, then +recorded in session-scoped decision traces linked to evidence memories. +""" + +from __future__ import annotations + +import argparse +import copy +import json +import sys +import time +import urllib.error +import urllib.parse +import urllib.request +from pathlib import Path +from typing import Any + + +DEFAULT_API = "http://localhost:3200" +DEFAULT_FIXTURE = Path(__file__).with_name("phase2_scenarios.json") +DEFAULT_REQUEST_TIMEOUT = 120 +REQUEST_TIMEOUT = DEFAULT_REQUEST_TIMEOUT + +FEEDBACK_DELTAS = { + "upvote": {"t": 0.10, "i": -0.03, "f": -0.05}, + "downvote": {"t": -0.10, "i": 0.05, "f": 0.15}, + "flag": {"t": -0.05, "i": 0.20, "f": 0.10}, +} + + +def load_fixture(path: Path) -> dict[str, Any]: + with path.open("r", encoding="utf-8") as handle: + return json.load(handle) + + +def request_json(method: str, url: str, payload: dict[str, Any] | None = None) -> Any: + body = None + headers = {"Accept": "application/json"} + if payload is not None: + body = json.dumps(payload).encode("utf-8") + headers["Content-Type"] = "application/json" + + request = urllib.request.Request(url, data=body, headers=headers, method=method) + try: + with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response: + raw = response.read().decode("utf-8") + if not raw: + return {} + return json.loads(raw) + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + raise RuntimeError(f"{method} {url} failed: HTTP {exc.code}: {detail}") from exc + except urllib.error.URLError as exc: + raise RuntimeError(f"{method} {url} failed: {exc}") from exc + + +def healthcheck(api_base: str, retries: int = 
120, delay: float = 2.0) -> Any: + last_error: Exception | None = None + for _ in range(retries): + try: + response = request_json("GET", f"{api_base}/health/ready") + if isinstance(response, dict) and response.get("ready") is True: + return response + last_error = RuntimeError(f"health endpoint is not ready: {response!r}") + except Exception as exc: # noqa: BLE001 - report final connection failure. + last_error = exc + time.sleep(delay) + raise RuntimeError(f"Dakera healthcheck failed after {retries} attempts: {last_error}") + + +def clamp(value: float) -> float: + return max(0.0, min(1.0, round(value, 4))) + + +def initial_reliability(memory: dict[str, Any]) -> dict[str, float]: + reliability = memory.get("metadata", {}).get("reliability", {}) + return { + "t": float(reliability.get("t", 0.0) or 0.0), + "i": float(reliability.get("i", 0.0) or 0.0), + "f": float(reliability.get("f", 0.0) or 0.0), + } + + +def derive_reliability(memory: dict[str, Any], feedback_signals: list[str] | None = None) -> dict[str, Any]: + derived = initial_reliability(memory) + signals = list(feedback_signals if feedback_signals is not None else memory.get("feedback", [])) + for signal in signals: + if signal not in FEEDBACK_DELTAS: + raise ValueError(f"unsupported feedback signal for T-I-F derivation: {signal!r}") + delta = FEEDBACK_DELTAS[signal] + derived["t"] = clamp(derived["t"] + delta["t"]) + derived["i"] = clamp(derived["i"] + delta["i"]) + derived["f"] = clamp(derived["f"] + delta["f"]) + return { + "t": derived["t"], + "i": derived["i"], + "f": derived["f"], + "basis": "derived from Dakera memory feedback signals", + "source": "feedback_derived_tif", + "signals": signals, + } + + +def classify_reliability(reliability: dict[str, Any]) -> dict[str, Any]: + t = float(reliability.get("t", 0.0) or 0.0) + i = float(reliability.get("i", 0.0) or 0.0) + f = float(reliability.get("f", 0.0) or 0.0) + + if f >= 0.50: + action = "surface_contradiction" + reason = "feedback-derived falsity 
makes this contradiction evidence" + elif i >= 0.50: + action = "ask_clarification" + reason = "feedback-derived indeterminacy makes reuse unresolved" + elif t >= 0.70 and i <= 0.35 and f <= 0.35: + action = "reuse_confidently" + reason = "feedback-derived truth is high with low uncertainty and contradiction" + else: + action = "reuse_with_caveat" + reason = "feedback-derived reliability is mixed" + + return {"action": action, "reason": reason, "t": t, "i": i, "f": f} + + +def choose_baseline(memories: list[dict[str, Any]]) -> dict[str, Any] | None: + if not memories: + return None + return max(memories, key=lambda item: float(item.get("importance", 0.0) or 0.0)) + + +def choose_feedback_aware(memories: list[dict[str, Any]]) -> tuple[dict[str, Any] | None, dict[str, Any]]: + if not memories: + return None, {"action": "no_memory", "reason": "no recalled memories"} + + enriched = [(memory, classify_reliability(derive_reliability(memory))) for memory in memories] + contradictions = [item for item in enriched if item[1]["action"] == "surface_contradiction"] + if contradictions: + return max(contradictions, key=lambda item: item[1]["f"]) + + unresolved = [item for item in enriched if item[1]["action"] == "ask_clarification"] + if unresolved: + return max(unresolved, key=lambda item: item[1]["i"]) + + confident = [item for item in enriched if item[1]["action"] == "reuse_confidently"] + if confident: + return max(confident, key=lambda item: item[1]["t"]) + + baseline = choose_baseline(memories) + assert baseline is not None + return baseline, classify_reliability(derive_reliability(baseline)) + + +def memory_label(memory: dict[str, Any] | None) -> str | None: + if memory is None: + return None + value = memory.get("id") or memory.get("fixture_id") or memory.get("memory_id") + if isinstance(value, str): + return value + return str(memory.get("content", ""))[:72] + + +def normalize_store_response(response: Any) -> dict[str, Any]: + if isinstance(response, dict) and 
isinstance(response.get("memory"), dict): + return response["memory"] + if isinstance(response, dict): + return response + raise RuntimeError(f"unexpected store response: {response!r}") + + +def normalize_recall_response(response: Any) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + memories = normalize_memory_list(response, ("memories", "results", "items", "data")) + associated = normalize_memory_list(response, ("associated_memories", "associated", "linked_memories")) + return memories, associated + + +def normalize_memory_list(response: Any, keys: tuple[str, ...]) -> list[dict[str, Any]]: + if isinstance(response, list): + return [normalize_memory_item(item) for item in response if isinstance(item, dict)] + if not isinstance(response, dict): + return [] + + for key in keys: + value = response.get(key) + if isinstance(value, list): + return [normalize_memory_item(item) for item in value if isinstance(item, dict)] + + if "content" in response: + return [response] + return [] + + +def normalize_memory_item(item: dict[str, Any]) -> dict[str, Any]: + nested = item.get("memory") + if isinstance(nested, dict): + merged = dict(nested) + for key in ("score", "weighted_score", "smart_score", "depth"): + if key in item: + merged[key] = item[key] + return merged + return item + + +def start_session(api_base: str, agent_id: str, scenario: dict[str, Any]) -> str: + response = request_json( + "POST", + f"{api_base}/v1/sessions/start", + { + "agent_id": agent_id, + "metadata": { + "phase": "phase2_tif_provenance", + "scenario": scenario["id"], + "source": "tif_decision_provenance", + }, + }, + ) + for key in ("session_id", "id"): + value = response.get(key) if isinstance(response, dict) else None + if isinstance(value, str): + return value + nested = response.get("session") if isinstance(response, dict) else None + if isinstance(nested, dict) and isinstance(nested.get("id"), str): + return nested["id"] + raise RuntimeError(f"session start did not return a session id: 
{response!r}") + + +def end_session(api_base: str, session_id: str, summary: str) -> Any: + return request_json("POST", f"{api_base}/v1/sessions/{session_id}/end", {"summary": summary}) + + +def session_memories(api_base: str, session_id: str) -> list[dict[str, Any]]: + response = request_json("GET", f"{api_base}/v1/sessions/{session_id}/memories") + return normalize_memory_list(response, ("memories", "results", "items", "data")) + + +def store_memory(api_base: str, agent_id: str, session_id: str, memory: dict[str, Any]) -> dict[str, Any]: + metadata = copy.deepcopy(memory.get("metadata", {})) + if not isinstance(metadata, dict): + raise ValueError(f"memory {memory.get('id', '')} metadata must be an object") + reliability = metadata.get("reliability") + if not isinstance(reliability, dict): + raise ValueError(f"memory {memory.get('id', '')} requires metadata.reliability") + metadata["fixture_id"] = memory["id"] + reliability["derived"] = derive_reliability(memory) + + response = request_json( + "POST", + f"{api_base}/v1/memory/store", + { + "agent_id": agent_id, + "content": memory["content"], + "memory_type": "semantic", + "importance": memory.get("importance", 0.5), + "metadata": metadata, + "session_id": session_id, + "tags": ["tif-phase2", memory["id"]], + }, + ) + stored = normalize_store_response(response) + stored["fixture_id"] = memory["id"] + stored["feedback"] = list(memory.get("feedback", [])) + return stored + + +def submit_feedback(api_base: str, agent_id: str, memory_id: str, signal: str) -> Any: + return request_json( + "POST", + f"{api_base}/v1/memories/{memory_id}/feedback", + {"agent_id": agent_id, "signal": signal}, + ) + + +def get_feedback(api_base: str, agent_id: str, memory_id: str) -> Any: + query = urllib.parse.urlencode({"agent_id": agent_id}) + return request_json("GET", f"{api_base}/v1/memories/{memory_id}/feedback?{query}") + + +def extract_feedback_signals(history: Any) -> list[str]: + if isinstance(history, dict): + raw_entries = 
history.get("entries") or history.get("feedback") or history.get("history") or [] + elif isinstance(history, list): + raw_entries = history + else: + raw_entries = [] + + signals = [] + for entry in raw_entries: + if not isinstance(entry, dict): + continue + value = entry.get("signal") + if isinstance(value, str) and value in FEEDBACK_DELTAS: + signals.append(value) + return signals + + +def link_memory(api_base: str, agent_id: str, memory_id: str, target_id: str) -> Any: + return request_json( + "POST", + f"{api_base}/v1/memories/{memory_id}/links", + {"agent_id": agent_id, "target_id": target_id}, + ) + + +def recall_associated(api_base: str, agent_id: str, query: str) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: + payload = { + "agent_id": agent_id, + "query": query, + "top_k": 1, + "include_associated": True, + "associated_memories_depth": 1, + } + try: + response = request_json("POST", f"{api_base}/v1/memory/recall", payload) + except TimeoutError: + # Recall is read-only. Retry once to tolerate cold ONNX/reranker startup. + response = request_json("POST", f"{api_base}/v1/memory/recall", payload) + return normalize_recall_response(response) + + +def recall_scenario_memories(api_base: str, agent_id: str, query: str, top_k: int = 8) -> list[dict[str, Any]]: + payload = {"agent_id": agent_id, "query": query, "top_k": top_k} + try: + response = request_json("POST", f"{api_base}/v1/memory/recall", payload) + except TimeoutError: + # Recall is read-only. Retry once to tolerate cold ONNX/reranker startup. 
+ response = request_json("POST", f"{api_base}/v1/memory/recall", payload) + memories, _associated = normalize_recall_response(response) + return memories + + +def enrich_recalled_memories( + recalled: list[dict[str, Any]], + scenario: dict[str, Any], + stored_by_fixture: dict[str, dict[str, Any]], + feedback_history: dict[str, Any], +) -> list[dict[str, Any]]: + fixture_by_id = {memory["id"]: memory for memory in scenario["memories"]} + fixture_by_stored_id = { + stored["id"]: fixture_id + for fixture_id, stored in stored_by_fixture.items() + if isinstance(stored.get("id"), str) + } + enriched = [] + for item in recalled: + runtime_memory = copy.deepcopy(item) + metadata = runtime_memory.get("metadata") + fixture_id = metadata.get("fixture_id") if isinstance(metadata, dict) else None + if not isinstance(fixture_id, str): + fixture_id = fixture_by_stored_id.get(runtime_memory.get("id")) + if fixture_id not in fixture_by_id: + continue + + source_memory = fixture_by_id[fixture_id] + if not isinstance(metadata, dict): + metadata = {} + runtime_memory["metadata"] = metadata + if not isinstance(metadata.get("reliability"), dict): + metadata["reliability"] = copy.deepcopy(source_memory.get("metadata", {}).get("reliability", {})) + runtime_memory["fixture_id"] = fixture_id + runtime_memory["feedback"] = extract_feedback_signals(feedback_history[fixture_id]) + enriched.append(runtime_memory) + return enriched + + +def store_decision_trace( + api_base: str, + agent_id: str, + session_id: str, + scenario: dict[str, Any], + direct_memory: dict[str, Any], + safe_memory: dict[str, Any] | None, + decision: dict[str, Any], + evidence_ids: list[str], +) -> dict[str, Any]: + provenance = { + "scenario": scenario["id"], + "decision": decision["action"], + "reason": decision["reason"], + "evidence_memory_ids": evidence_ids, + "direct_memory_id": direct_memory["id"], + "safe_memory_id": safe_memory["id"] if safe_memory else None, + "reliability": { + "direct": 
derive_reliability(direct_memory, direct_memory.get("feedback", [])), + "safe": derive_reliability(safe_memory, safe_memory.get("feedback", [])) if safe_memory else None, + }, + "source": "phase2_feedback_derived_tif", + } + response = request_json( + "POST", + f"{api_base}/v1/memory/store", + { + "agent_id": agent_id, + "content": ( + f"Decision trace for {scenario['id']}: {decision['action']} because {decision['reason']}." + ), + "memory_type": "semantic", + "importance": 0.88, + "metadata": {"decision_provenance": provenance}, + "session_id": session_id, + "tags": ["tif-phase2", "decision-trace", scenario["id"]], + }, + ) + return normalize_store_response(response) + + +def evaluate_static_scenario(scenario: dict[str, Any]) -> dict[str, Any]: + memories = scenario["memories"] + baseline = choose_baseline(memories) + direct, decision = choose_feedback_aware(memories) + safe = next((item for item in memories if item["id"] == scenario["expected_safe_memory"]), None) + changed = bool(baseline and direct) and memory_label(baseline) != memory_label(direct) + if decision["action"] != "reuse_confidently": + changed = True + return { + "scenario": scenario["id"], + "baseline_memory": memory_label(baseline), + "feedback_aware_memory": memory_label(direct), + "safe_memory": memory_label(safe), + "baseline_action": "reuse_top_memory" if baseline else "no_memory", + "feedback_tif_action": decision["action"], + "changed_decision": changed, + "passed": changed == scenario["expected_changed_decision"] + and decision["action"] == scenario["expected_action"] + and memory_label(direct) == scenario["expected_direct_memory"], + } + + +def run_self_test(fixture: dict[str, Any]) -> list[dict[str, Any]]: + return [evaluate_static_scenario(scenario) for scenario in fixture["scenarios"]] + + +def run_runtime_scenario(api_base: str, agent_id: str, scenario: dict[str, Any]) -> dict[str, Any]: + session_id = start_session(api_base, agent_id, scenario) + stored_by_fixture = {} + 
feedback_history = {} + try: + for memory in scenario["memories"]: + stored = store_memory(api_base, agent_id, session_id, memory) + stored_by_fixture[memory["id"]] = stored + memory_id = stored["id"] + for signal in memory.get("feedback", []): + submit_feedback(api_base, agent_id, memory_id, signal) + feedback_history[memory["id"]] = get_feedback(api_base, agent_id, memory_id) + + recalled = recall_scenario_memories(api_base, agent_id, scenario["query"], top_k=8) + runtime_memories = enrich_recalled_memories(recalled, scenario, stored_by_fixture, feedback_history) + recalled_fixture_ids = {item.get("fixture_id") for item in runtime_memories} + expected_fixture_ids = {scenario["expected_direct_memory"], scenario["expected_safe_memory"]} + recall_selection_ok = expected_fixture_ids.issubset(recalled_fixture_ids) + + baseline = choose_baseline(runtime_memories) + direct, decision = choose_feedback_aware(runtime_memories) + safe = next( + (item for item in runtime_memories if item.get("fixture_id") == scenario["expected_safe_memory"]), + None, + ) + assert direct is not None + evidence_ids = [direct["id"]] + if safe is not None and safe["id"] not in evidence_ids: + evidence_ids.append(safe["id"]) + trace = store_decision_trace(api_base, agent_id, session_id, scenario, direct, safe, decision, evidence_ids) + for evidence_id in evidence_ids: + link_memory(api_base, agent_id, trace["id"], evidence_id) + + direct_recall, associated = recall_associated( + api_base, + agent_id, + f"Decision trace for {scenario['id']} {scenario['title']}", + ) + associated_ids = {item.get("id") for item in associated} + direct_ids = {item.get("id") for item in direct_recall} + session_ids = {item.get("id") for item in session_memories(api_base, session_id)} + associated_response_ids = associated_ids | direct_ids + associated_missing_ids = sorted(set(evidence_ids) - associated_response_ids) + associated_ok = not associated_missing_ids + session_ok = trace["id"] in session_ids and 
set(evidence_ids).issubset(session_ids) + baseline_action = "reuse_top_memory" if baseline else "no_memory" + same_memory = ( + baseline is not None + and baseline.get("fixture_id") == direct.get("fixture_id") + ) + equivalent_reuse = same_memory and decision["action"] == "reuse_confidently" + changed = baseline_action != decision["action"] and not equivalent_reuse + + return { + "scenario": scenario["id"], + "session_id": session_id, + "baseline_action": baseline_action, + "baseline_memory": baseline["id"] if baseline else None, + "feedback_tif_action": decision["action"], + "feedback_tif_reason": decision["reason"], + "feedback_aware_memory": direct["id"], + "safe_memory": safe["id"] if safe else None, + "scenario_recall_memory_ids": sorted( + item["id"] for item in runtime_memories if isinstance(item.get("id"), str) + ), + "scenario_recall_fixture_ids": sorted( + item for item in recalled_fixture_ids if isinstance(item, str) + ), + "scenario_recall_proof": recall_selection_ok, + "changed_decision": changed, + "decision_trace_memory_id": trace["id"], + "evidence_memory_ids": evidence_ids, + "feedback_history_signals": { + fixture_id: extract_feedback_signals(history) + for fixture_id, history in feedback_history.items() + }, + "associated_recall_memory_ids": sorted(item for item in associated_ids if isinstance(item, str)), + "associated_recall_missing_ids": associated_missing_ids, + "direct_recall_memory_ids": sorted(item for item in direct_ids if isinstance(item, str)), + "session_memory_ids": sorted(item for item in session_ids if isinstance(item, str)), + "associated_recall_proof": associated_ok, + "session_trace_proof": session_ok, + "passed": changed == scenario["expected_changed_decision"] + and decision["action"] == scenario["expected_action"] + and recall_selection_ok + and associated_ok + and session_ok, + } + finally: + try: + end_session(api_base, session_id, f"Phase 2 scenario {scenario['id']} completed.") + except Exception: + pass + + +def 
run_runtime_validation(api_base: str, fixture: dict[str, Any], agent_id: str) -> dict[str, Any]: + health = healthcheck(api_base) + scenarios = [run_runtime_scenario(api_base, agent_id, scenario) for scenario in fixture["scenarios"]] + return {"agent_id": agent_id, "health": health, "scenarios": scenarios} + + +def print_report(results: list[dict[str, Any]]) -> None: + print("scenario,baseline_action,feedback_tif_action,changed,trace_or_safe,passed") + for result in results: + print( + ",".join( + [ + result["scenario"], + result["baseline_action"], + result["feedback_tif_action"], + str(result["changed_decision"]).lower(), + str(result.get("decision_trace_memory_id") or result.get("safe_memory")), + str(result["passed"]).lower(), + ] + ) + ) + + +def main() -> int: + parser = argparse.ArgumentParser(description="Validate Dakera feedback-derived T-I-F provenance.") + parser.add_argument("--api", default=DEFAULT_API, help="Dakera REST API base URL.") + parser.add_argument("--fixture", type=Path, default=DEFAULT_FIXTURE, help="Path to Phase 2 fixture JSON.") + parser.add_argument( + "--agent-id", + help="Agent ID for runtime validation. 
Defaults to a unique ID derived from the fixture.", + ) + parser.add_argument( + "--request-timeout", + type=int, + default=DEFAULT_REQUEST_TIMEOUT, + help="HTTP request timeout in seconds.", + ) + parser.add_argument("--self-test", action="store_true", help="Run local evaluator test without Dakera.") + args = parser.parse_args() + + global REQUEST_TIMEOUT + REQUEST_TIMEOUT = args.request_timeout + + fixture = load_fixture(args.fixture) + if args.self_test: + results = run_self_test(fixture) + print_report(results) + return 0 if all(result["passed"] for result in results) else 1 + + agent_id = args.agent_id or f"{fixture['agent_id']}-{int(time.time())}" + runtime = run_runtime_validation(args.api.rstrip("/"), fixture, agent_id) + print(json.dumps(runtime, indent=2, sort_keys=True)) + return 0 if all(item["passed"] for item in runtime["scenarios"]) else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/tif-reliability/validate_tif_reliability.py b/examples/tif-reliability/validate_tif_reliability.py index 8adf515..d7c09ab 100644 --- a/examples/tif-reliability/validate_tif_reliability.py +++ b/examples/tif-reliability/validate_tif_reliability.py @@ -55,10 +55,13 @@ def healthcheck(api_base: str, retries: int = 120, delay: float = 2.0) -> Any: last_error: Exception | None = None for _ in range(retries): try: - return request_json("GET", f"{api_base}/health/ready") + response = request_json("GET", f"{api_base}/health/ready") + if isinstance(response, dict) and response.get("ready") is True: + return response + last_error = RuntimeError(f"health endpoint is not ready: {response!r}") except Exception as exc: # noqa: BLE001 - report final connection failure. last_error = exc - time.sleep(delay) + time.sleep(delay) raise RuntimeError(f"Dakera healthcheck failed after {retries} attempts: {last_error}")