import json
import sqlite3
from typing import Any


# Candidate key paths for every surfaced field, tried in order. A path is a
# tuple of dict keys to follow, so ("model", "name") reads data["model"]["name"].
_COMPARE_METADATA_PATHS: dict[str, list[tuple[str, ...]]] = {
    "browser_use_version": [
        ("browser_use_version",),
        ("browseruse_version",),
    ],
    "browsertrace_version": [
        ("browsertrace_version",),
    ],
    "model_provider": [
        ("model_provider",),
        ("provider",),
        ("llm_provider",),
        ("model", "provider"),
    ],
    "model": [
        ("model",),
        ("model_name",),
        ("llm_model",),
        ("model", "name"),
    ],
    "prompt_template_version": [
        ("prompt_template_version",),
        ("prompt_version",),
        ("template_version",),
        ("prompt", "template_version"),
    ],
}


def _parse_json_field(value: Any) -> Any:
    """Decode a stored JSON payload, returning ``None`` when it is unusable.

    ``None`` and ``""`` yield ``None``. Dicts and lists are returned unchanged.
    Bytes are decoded as UTF-8 (undecodable bytes are dropped) and then parsed
    like strings. Strings are parsed with ``json.loads``; a parse failure
    yields ``None``. Any other type yields ``None``.
    """
    if value is None or value == "":
        return None
    if isinstance(value, (dict, list)):
        return value
    if isinstance(value, (bytes, bytearray)):
        value = value.decode("utf-8", errors="ignore")
    if not isinstance(value, str):
        return None
    try:
        return json.loads(value)
    except (TypeError, ValueError):
        # Best-effort: a malformed payload is treated as "no data".
        return None


def _deep_get(data: dict[str, Any], paths: list[tuple[str, ...]]) -> str:
    """Return the first usable scalar found by following *paths* into *data*.

    Paths are tried in order. A path matches when every key exists along a
    chain of nested dicts. The matched value is returned as ``str``.

    A match is ignored when the value is ``None``, ``""``, or a container
    (dict/list/tuple). Skipping containers matters because the path tables
    list both ``("model",)`` and ``("model", "name")``: when ``model`` is an
    object, the shorter path must not win with the object's ``repr``.

    Returns ``""`` when no path yields a usable value.
    """
    for path in paths:
        cursor: Any = data
        for key in path:
            if not (isinstance(cursor, dict) and key in cursor):
                break
            cursor = cursor[key]
        else:
            # Every key resolved; decide whether the leaf is usable.
            if isinstance(cursor, (dict, list, tuple)):
                continue
            if cursor is not None and cursor != "":
                return str(cursor)
    return ""


def compare_metadata(steps: list[sqlite3.Row]) -> dict[str, str]:
    """Extract run-comparison metadata from step metadata/model_input payloads.

    This is best-effort and intentionally narrow: only keys that help determine
    whether two runs are reasonably comparable are surfaced.

    Each step is read through ``step["metadata"]`` and ``step["model_input"]``.
    For every field the sources are searched in this order: the metadata
    payload, a dict nested under its ``"metadata"`` key, then the model_input
    payload. The first non-empty value seen across the steps is kept, and
    scanning stops once every field has a value.

    Returns a dict with the keys ``browser_use_version``,
    ``browsertrace_version``, ``model_provider``, ``model`` and
    ``prompt_template_version``; fields that were not found are ``""``.
    """
    fields: dict[str, str] = dict.fromkeys(_COMPARE_METADATA_PATHS, "")

    for step in steps:
        metadata = _parse_json_field(step["metadata"])
        model_input = _parse_json_field(step["model_input"])

        metadata_dict = metadata if isinstance(metadata, dict) else {}
        nested_metadata = metadata_dict.get("metadata")
        candidates = [
            metadata_dict,
            nested_metadata if isinstance(nested_metadata, dict) else {},
            model_input if isinstance(model_input, dict) else {},
        ]

        for field, paths in _COMPARE_METADATA_PATHS.items():
            if fields[field]:
                continue  # an earlier step already supplied this field
            for source in candidates:
                value = _deep_get(source, paths)
                if value:
                    fields[field] = value
                    break

        if all(fields.values()):
            break

    return fields
def _seed_compare_runs_with_metadata(tmp_path):
    """Record two runs whose first steps carry comparison metadata.

    The first run attaches its values through ``metadata=``; the second
    attaches them through ``model_input=`` and differs in
    ``browser_use_version``, ``model`` and ``prompt_template_version``.

    Returns ``(failed_id, success_id)``.
    """
    tracer = Tracer(home=tmp_path)

    success_metadata = {
        "browser_use_version": "0.1.45",
        "browsertrace_version": "0.1.19",
        "model_provider": "openai",
        "model": "gpt-4o-mini",
        "prompt_template_version": "prompt-v3",
    }
    with tracer.run("browser-use success metadata") as ok_run:
        ok_run.step(
            action="navigate",
            url="https://example.com/start",
            metadata=success_metadata,
        )
        ok_run.step(
            action="click(selector=#checkout)",
            url="https://example.com/done",
        )
        success_id = ok_run.id

    failure_model_input = {
        "browser_use_version": "0.1.46",
        "browsertrace_version": "0.1.19",
        "model_provider": "openai",
        "model": "gpt-4.1-mini",
        "prompt_template_version": "prompt-v4",
    }
    try:
        with tracer.run("browser-use failure metadata") as bad_run:
            bad_run.step(
                action="navigate",
                url="https://example.com/start",
                model_input=failure_model_input,
            )
            bad_run.step(
                action="click(selector=#cancel)",
                url="https://example.com/cart",
                status="error",
                error="wrong target",
            )
            failed_id = bad_run.id
            # Raised inside the run context; presumably this is what makes the
            # tracer record the run as failed -- confirm against Tracer.run.
            raise RuntimeError("wrong target")
    except RuntimeError:
        pass

    return failed_id, success_id
def test_cli_compare_json_includes_compare_metadata_and_differences(cli):
    """``compare --json`` reports both runs' metadata and only the differing keys."""
    cli_mod, tmp_path = cli
    failed_id, success_id = _seed_compare_runs_with_metadata(tmp_path)

    captured = StringIO()
    with redirect_stdout(captured):
        # Runs are addressed by the first eight characters of their ids.
        rc = cli_mod.main(["compare", failed_id[:8], success_id[:8], "--json"])

    payload = json.loads(captured.getvalue())

    expected_left = {
        "browser_use_version": "0.1.46",
        "browsertrace_version": "0.1.19",
        "model_provider": "openai",
        "model": "gpt-4.1-mini",
        "prompt_template_version": "prompt-v4",
    }
    expected_right = {
        "browser_use_version": "0.1.45",
        "browsertrace_version": "0.1.19",
        "model_provider": "openai",
        "model": "gpt-4o-mini",
        "prompt_template_version": "prompt-v3",
    }
    # Keys with equal values on both sides must not appear here.
    expected_differences = {
        "browser_use_version": {"left": "0.1.46", "right": "0.1.45"},
        "model": {"left": "gpt-4.1-mini", "right": "gpt-4o-mini"},
        "prompt_template_version": {"left": "prompt-v4", "right": "prompt-v3"},
    }

    assert rc == 0
    assert payload["compare_metadata"]["left"] == expected_left
    assert payload["compare_metadata"]["right"] == expected_right
    assert payload["compare_metadata"]["differences"] == expected_differences