diff --git a/.github/workflows/python-bench.yml b/.github/workflows/python-bench.yml index 6e62e20..986cdf6 100644 --- a/.github/workflows/python-bench.yml +++ b/.github/workflows/python-bench.yml @@ -74,7 +74,7 @@ jobs: - name: Run parse benchmark run: | - poetry run python -m tests.benchmarks.bench_parse \ + poetry run python -m pathable.benchmarks.bench_parse \ --output "reports/bench-parse.${{ inputs.suffix }}.json" \ ${{ inputs.quick && '--quick' || '' }} \ --repeats "${{ inputs.repeats }}" \ @@ -82,7 +82,7 @@ jobs: - name: Run lookup benchmark run: | - poetry run python -m tests.benchmarks.bench_lookup \ + poetry run python -m pathable.benchmarks.bench_lookup \ --output "reports/bench-lookup.${{ inputs.suffix }}.json" \ ${{ inputs.quick && '--quick' || '' }} \ --repeats "${{ inputs.repeats }}" \ diff --git a/README.md b/README.md index bc1b830..0559570 100644 --- a/README.md +++ b/README.md @@ -144,32 +144,49 @@ pip install -e git+https://github.com/p1c2u/pathable.git#egg=pathable ## Benchmarks -Benchmarks live in `tests/benchmarks/` and produce JSON reports. +Benchmark tooling is shipped in the package and exposed as `pathable-bench`. -Local run (recommended as modules): +Install (core package): ```console -poetry run python -m tests.benchmarks.bench_parse --output reports/bench-parse.json -poetry run python -m tests.benchmarks.bench_lookup --output reports/bench-lookup.json +pip install pathable +``` + +Optional benchmark extra (reserved for benchmark-specific deps): + +```console +pip install "pathable[bench]" +``` + +Run all benchmark scenarios for an implementation: + +```console +poetry run pathable-bench run --impl pathable.LookupPath --output reports/bench-lookup.json ``` Quick sanity run: ```console -poetry run python -m tests.benchmarks.bench_parse --quick --output reports/bench-parse.quick.json -poetry run python -m tests.benchmarks.bench_lookup --quick --output reports/bench-lookup.quick.json +poetry run pathable-bench run --impl pathable.LookupPath --quick --output reports/bench-lookup.quick.json ``` -Compare two results (fails if candidate is >20% slower in any scenario): +Compare two results (compares overlapping scenarios only; fails if candidate is >20% slower in any compared scenario): ```console -poetry run python -m tests.benchmarks.compare_results \ +poetry run pathable-bench compare \ --baseline reports/bench-before.json \ --candidate reports/bench-after.json \ --tolerance 0.20 ``` +Deprecated compatibility wrappers still exist for now: + +```console +poetry run python -m tests.benchmarks.bench_lookup --output reports/bench-lookup.json +poetry run python -m tests.benchmarks.bench_parse --output reports/bench-parse.json +poetry run python -m tests.benchmarks.compare_results --baseline a.json --candidate b.json +``` + CI (on-demand): - GitHub Actions workflow `Benchmarks` runs via `workflow_dispatch` and uploads the JSON artifacts. 
- diff --git a/pathable/benchmarks/__init__.py b/pathable/benchmarks/__init__.py new file mode 100644 index 0000000..5ef569a --- /dev/null +++ b/pathable/benchmarks/__init__.py @@ -0,0 +1 @@ +"""Public benchmark toolkit for pathable.""" diff --git a/pathable/benchmarks/bench_lookup.py b/pathable/benchmarks/bench_lookup.py new file mode 100644 index 0000000..bec84a1 --- /dev/null +++ b/pathable/benchmarks/bench_lookup.py @@ -0,0 +1,49 @@ +"""Standalone lookup benchmark command.""" + +import argparse +from typing import Iterable +from typing import cast + +from pathable.benchmarks.core import add_common_args +from pathable.benchmarks.core import default_meta +from pathable.benchmarks.core import results_to_json +from pathable.benchmarks.core import write_json +from pathable.benchmarks.registry import resolve_impl +from pathable.benchmarks.scenarios.lookup import run_lookup_scenarios +from pathable.paths import AccessorPath + + +def main(argv: Iterable[str] | None = None) -> int: + parser = argparse.ArgumentParser() + add_common_args(parser) + parser.add_argument( + "--impl", + default="pathable.LookupPath", + help="AccessorPath implementation target (default: pathable.LookupPath).", + ) + args = parser.parse_args(list(argv) if argv is not None else None) + + impl = resolve_impl(args.impl) + if not issubclass(impl, AccessorPath): + raise TypeError( + "lookup benchmark requires an AccessorPath implementation" + ) + accessor_impl = cast(type[AccessorPath[object, object, object]], impl) + + results = run_lookup_scenarios( + accessor_impl, + quick=args.quick, + repeats=args.repeats, + warmup_loops=args.warmup_loops, + ) + + meta = default_meta() + meta["impl"] = f"{impl.__module__}.{impl.__qualname__}" + + payload = results_to_json(results=results, meta=meta) + write_json(args.output, payload) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pathable/benchmarks/bench_parse.py b/pathable/benchmarks/bench_parse.py new file mode 100644 index 0000000..37c569e --- /dev/null +++ b/pathable/benchmarks/bench_parse.py @@ -0,0 +1,29 @@ +"""Standalone parse benchmark command.""" + +import argparse +from typing import Iterable + +from pathable.benchmarks.core import add_common_args +from pathable.benchmarks.core import results_to_json +from pathable.benchmarks.core import write_json +from pathable.benchmarks.scenarios.parse import run_parse_scenarios + + +def main(argv: Iterable[str] | None = None) -> int: + parser = argparse.ArgumentParser() + add_common_args(parser) + args = parser.parse_args(list(argv) if argv is not None else None) + + results = run_parse_scenarios( + quick=args.quick, + repeats=args.repeats, + warmup_loops=args.warmup_loops, + ) + + payload = results_to_json(results=results) + write_json(args.output, payload) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pathable/benchmarks/cli.py b/pathable/benchmarks/cli.py new file mode 100644 index 0000000..3f1c1da --- /dev/null +++ b/pathable/benchmarks/cli.py @@ -0,0 +1,80 @@ +"""CLI entrypoint for pathable benchmarks.""" + +import argparse +from typing import Iterable + +from pathable.benchmarks.compare import main as compare_main +from pathable.benchmarks.core import write_json +from pathable.benchmarks.run import run_all + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog="pathable-bench") + subparsers = parser.add_subparsers(dest="command", required=True) + + run_parser = subparsers.add_parser("run", help="Run benchmark scenarios") + 
run_parser.add_argument( + "--impl", + required=True, + help="Implementation target, e.g. pathable.LookupPath", + ) + run_parser.add_argument("--output", required=True) + run_parser.add_argument("--quick", action="store_true") + run_parser.add_argument("--repeats", type=int, default=5) + run_parser.add_argument("--warmup-loops", type=int, default=1) + run_parser.add_argument( + "--scenario", + action="append", + choices=["parse", "lookup"], + help=( + "Run only selected scenario groups. Repeat flag to select multiple; " + "defaults to both parse and lookup." + ), + ) + + compare_parser = subparsers.add_parser( + "compare", + help="Compare benchmark JSON reports", + ) + compare_parser.add_argument("--baseline", required=True) + compare_parser.add_argument("--candidate", required=True) + compare_parser.add_argument("--tolerance", type=float, default=0.20) + return parser + + +def main(argv: Iterable[str] | None = None) -> int: + parser = _build_parser() + args = parser.parse_args(list(argv) if argv is not None else None) + + if args.command == "run": + scenarios = ( + tuple(args.scenario) if args.scenario else ("parse", "lookup") + ) + payload = run_all( + impl_target=args.impl, + quick=args.quick, + repeats=args.repeats, + warmup_loops=args.warmup_loops, + scenarios=scenarios, + ) + write_json(args.output, payload) + return 0 + + if args.command == "compare": + return compare_main( + [ + "--baseline", + args.baseline, + "--candidate", + args.candidate, + "--tolerance", + str(args.tolerance), + ] + ) + + parser.error("unknown command") + return 2 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pathable/benchmarks/compare.py b/pathable/benchmarks/compare.py new file mode 100644 index 0000000..d7eedb7 --- /dev/null +++ b/pathable/benchmarks/compare.py @@ -0,0 +1,149 @@ +"""Compare benchmark JSON results.""" + +import argparse +import json +from dataclasses import dataclass +from typing import Any +from typing import Iterable +from typing import Mapping +from typing import cast + + +@dataclass(frozen=True) +class ScenarioComparison: + name: str + baseline_ops: float + candidate_ops: float + ratio: float + + +@dataclass(frozen=True) +class CompareResult: + comparisons: list[ScenarioComparison] + regressions: list[ScenarioComparison] + baseline_only: list[str] + candidate_only: list[str] + + +def _load(path: str) -> Mapping[str, Any]: + with open(path, "r", encoding="utf-8") as f: + data_any = json.load(f) + if not isinstance(data_any, dict): + raise ValueError("Invalid report: expected top-level JSON object") + return cast(dict[str, Any], data_any) + + +def _extract_ops(report: Mapping[str, Any]) -> dict[str, float]: + benchmarks = report.get("benchmarks") + if not isinstance(benchmarks, dict): + raise ValueError("Invalid report: missing 'benchmarks' dict") + + benchmarks_d = cast(dict[str, Any], benchmarks) + + out: dict[str, float] = {} + for name, payload in benchmarks_d.items(): + if not isinstance(payload, dict): + continue + payload_d = cast(dict[str, Any], payload) + ops_any = payload_d.get("median_ops_per_sec") + ops = ops_any if isinstance(ops_any, (int, float)) else None + if ops is not None: + out[name] = float(ops) + return out + + +def compare( + *, + baseline: Mapping[str, Any], + candidate: Mapping[str, Any], + tolerance: float, +) -> CompareResult: + if tolerance < 0: + raise ValueError("tolerance must be >= 0") + + b = _extract_ops(baseline) + c = _extract_ops(candidate) + + b_names = set(b) + c_names = set(c) + common_names = sorted(b_names & c_names) + 
+ comparisons: list[ScenarioComparison] = [] + for name in common_names: + bops = b[name] + cops = c[name] + ratio = cops / bops if bops > 0 else float("inf") + comparisons.append( + ScenarioComparison( + name=name, + baseline_ops=bops, + candidate_ops=cops, + ratio=ratio, + ) + ) + + floor_ratio = 1.0 - tolerance + regressions = [x for x in comparisons if x.ratio < floor_ratio] + + return CompareResult( + comparisons=comparisons, + regressions=regressions, + baseline_only=sorted(b_names - c_names), + candidate_only=sorted(c_names - b_names), + ) + + +def main(argv: Iterable[str] | None = None) -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--baseline", required=True) + parser.add_argument("--candidate", required=True) + parser.add_argument( + "--tolerance", + type=float, + default=0.20, + help="Allowed slowdown (e.g. 0.20 means 20% slower allowed).", + ) + args = parser.parse_args(list(argv) if argv is not None else None) + + baseline = _load(args.baseline) + candidate = _load(args.candidate) + + result = compare( + baseline=baseline, + candidate=candidate, + tolerance=args.tolerance, + ) + + common_count = len(result.comparisons) + print( + "scenarios: " + f"common={common_count} " + f"baseline_only={len(result.baseline_only)} " + f"candidate_only={len(result.candidate_only)}" + ) + + if common_count == 0: + print("ERROR: no overlapping scenarios between reports") + return 1 + + print("scenario\tbaseline_ops/s\tcandidate_ops/s\tratio") + for row in result.comparisons: + print( + f"{row.name}\t{row.baseline_ops:.2f}\t{row.candidate_ops:.2f}\t{row.ratio:.3f}" + ) + + if result.regressions: + print("\nREGRESSIONS:") + for row in result.regressions: + print( + f"- {row.name}: {row.ratio:.3f}x " + f"(baseline {row.baseline_ops:.2f} ops/s, " + f"candidate {row.candidate_ops:.2f} ops/s)" + ) + return 1 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/pathable/benchmarks/core.py b/pathable/benchmarks/core.py new file mode 100644 index 0000000..8649dc4 --- /dev/null +++ b/pathable/benchmarks/core.py @@ -0,0 +1,159 @@ +"""Minimal benchmark utilities (dependency-free).""" + +import argparse +import json +import os +import platform +import statistics +import sys +import time +from dataclasses import dataclass +from typing import Any +from typing import Callable +from typing import Iterable +from typing import Mapping +from typing import MutableMapping + + +@dataclass(frozen=True) +class BenchmarkResult: + name: str + loops: int + repeats: int + warmup_loops: int + times_s: tuple[float, ...] 
+ + @property + def total_s_median(self) -> float: + return statistics.median(self.times_s) + + @property + def per_loop_s_median(self) -> float: + if self.loops <= 0: + return float("inf") + return self.total_s_median / self.loops + + @property + def ops_per_sec_median(self) -> float: + per = self.per_loop_s_median + if per <= 0: + return float("inf") + return 1.0 / per + + +def _safe_int_env(name: str) -> int | None: + value = os.environ.get(name) + if value is None: + return None + try: + return int(value) + except ValueError: + return None + + +def default_meta() -> dict[str, Any]: + return { + "python": sys.version, + "python_implementation": platform.python_implementation(), + "platform": platform.platform(), + "machine": platform.machine(), + "processor": platform.processor(), + "pythondotorg": platform.python_build(), + "py_hash_seed": os.environ.get("PYTHONHASHSEED"), + "github_sha": os.environ.get("GITHUB_SHA"), + "github_ref": os.environ.get("GITHUB_REF"), + "ci": os.environ.get("CI"), + } + + +def run_benchmark( + name: str, + func: Callable[[], Any], + *, + loops: int, + repeats: int = 5, + warmup_loops: int = 1, +) -> BenchmarkResult: + if loops <= 0: + raise ValueError("loops must be > 0") + if repeats <= 0: + raise ValueError("repeats must be > 0") + if warmup_loops < 0: + raise ValueError("warmup_loops must be >= 0") + + for _ in range(warmup_loops): + for __ in range(loops): + func() + + times: list[float] = [] + for _ in range(repeats): + start = time.perf_counter() + for __ in range(loops): + func() + end = time.perf_counter() + times.append(end - start) + + return BenchmarkResult( + name=name, + loops=loops, + repeats=repeats, + warmup_loops=warmup_loops, + times_s=tuple(times), + ) + + +def results_to_json( + *, + results: Iterable[BenchmarkResult], + meta: Mapping[str, Any] | None = None, +) -> dict[str, Any]: + out: dict[str, Any] = { + "meta": dict(meta or default_meta()), + "benchmarks": {}, + } + + bench: MutableMapping[str, Any] = out["benchmarks"] + for r in results: + bench[r.name] = { + "loops": r.loops, + "repeats": r.repeats, + "warmup_loops": r.warmup_loops, + "times_s": list(r.times_s), + "median_total_s": r.total_s_median, + "median_per_loop_s": r.per_loop_s_median, + "median_ops_per_sec": r.ops_per_sec_median, + } + + return out + + +def add_common_args(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "--output", + required=True, + help="Write JSON results to this file.", + ) + parser.add_argument( + "--quick", + action="store_true", + help="Run fewer iterations for a fast sanity check.", + ) + parser.add_argument( + "--repeats", + type=int, + default=_safe_int_env("PATHABLE_BENCH_REPEATS") or 5, + help="Number of repeats per scenario (median is reported).", + ) + parser.add_argument( + "--warmup-loops", + type=int, + default=_safe_int_env("PATHABLE_BENCH_WARMUP") or 1, + help="Warmup passes before timing.", + ) + + +def write_json(path: str, payload: Mapping[str, Any]) -> None: + os.makedirs(os.path.dirname(path) or ".", exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, sort_keys=True) + f.write("\n") diff --git a/pathable/benchmarks/registry.py b/pathable/benchmarks/registry.py new file mode 100644 index 0000000..07532f6 --- /dev/null +++ b/pathable/benchmarks/registry.py @@ -0,0 +1,48 @@ +"""Implementation resolution for benchmark CLI.""" + +import importlib +from typing import Any + +from pathable.paths import BasePath + + +def _resolve_qualname(module_name: str, qualname: str) -> Any: 
+ module = importlib.import_module(module_name) + obj: Any = module + for part in qualname.split("."): + obj = getattr(obj, part) + return obj + + +def resolve_impl(target: str) -> type[BasePath]: + """Resolve an implementation target into a BasePath subclass. + + Supported formats: + - ``module.path.ClassName`` + - ``module.path:ClassName`` + """ + if not target: + raise ValueError("implementation target must be non-empty") + + obj: Any + if ":" in target: + module_name, qualname = target.split(":", 1) + obj = _resolve_qualname(module_name, qualname) + else: + if "." not in target: + raise ValueError( + "implementation target must be dotted path or module:qualname" + ) + module_name, qualname = target.rsplit(".", 1) + obj = _resolve_qualname(module_name, qualname) + + if not isinstance(obj, type): + raise TypeError( + f"implementation target must resolve to a class: {target}" + ) + if not issubclass(obj, BasePath): + raise TypeError( + "implementation target must resolve to BasePath subclass: " + f"{target}" + ) + return obj diff --git a/pathable/benchmarks/run.py b/pathable/benchmarks/run.py new file mode 100644 index 0000000..f5096ff --- /dev/null +++ b/pathable/benchmarks/run.py @@ -0,0 +1,64 @@ +"""Benchmark runner orchestration.""" + +from typing import Any + +from pathable.benchmarks.core import BenchmarkResult +from pathable.benchmarks.core import default_meta +from pathable.benchmarks.core import results_to_json +from pathable.benchmarks.registry import resolve_impl +from pathable.benchmarks.scenarios.lookup import run_lookup_scenarios +from pathable.benchmarks.scenarios.parse import run_parse_scenarios +from pathable.paths import AccessorPath + + +def run_all( + *, + impl_target: str, + quick: bool, + repeats: int, + warmup_loops: int, + scenarios: tuple[str, ...] 
= ("parse", "lookup"), +) -> dict[str, Any]: + impl = resolve_impl(impl_target) + + results: list[BenchmarkResult] = [] + skipped: list[str] = [] + + wanted = set(scenarios) + valid = {"parse", "lookup"} + unknown = sorted(wanted - valid) + if unknown: + raise ValueError("unknown scenarios requested: " + ", ".join(unknown)) + + if "parse" in wanted: + results.extend( + run_parse_scenarios( + quick=quick, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + if "lookup" in wanted: + if issubclass(impl, AccessorPath): + results.extend( + run_lookup_scenarios( + impl, + quick=quick, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + else: + skipped.append("lookup") + + meta = default_meta() + meta["impl"] = f"{impl.__module__}.{impl.__qualname__}" + meta["quick"] = quick + meta["repeats"] = repeats + meta["warmup_loops"] = warmup_loops + meta["requested_scenarios"] = sorted(wanted) + if skipped: + meta["skipped_scenarios"] = skipped + + return results_to_json(results=results, meta=meta) diff --git a/pathable/benchmarks/scenarios/__init__.py b/pathable/benchmarks/scenarios/__init__.py new file mode 100644 index 0000000..32729f5 --- /dev/null +++ b/pathable/benchmarks/scenarios/__init__.py @@ -0,0 +1 @@ +"""Benchmark scenarios.""" diff --git a/pathable/benchmarks/scenarios/lookup.py b/pathable/benchmarks/scenarios/lookup.py new file mode 100644 index 0000000..e7834c8 --- /dev/null +++ b/pathable/benchmarks/scenarios/lookup.py @@ -0,0 +1,282 @@ +"""Benchmarks for AccessorPath lookup-style operations.""" + +from typing import Any + +from pathable.accessors import LookupAccessor +from pathable.benchmarks.core import BenchmarkResult +from pathable.benchmarks.core import run_benchmark +from pathable.paths import AccessorPath + + +def _build_deep_tree(depth: int) -> dict[str, Any]: + node: dict[str, Any] = {"value": 1} + for i in range(depth - 1, -1, -1): + node = {f"k{i}": node} + return node + + +def _deep_keys(depth: int) -> tuple[str, ...]: + return tuple(f"k{i}" for i in range(depth)) + + +def _make_deep_path(root: AccessorPath[Any, Any, Any], depth: int) -> Any: + p: Any = root + for k in _deep_keys(depth): + p = p / k + return p + + +def _build_mapping(size: int) -> dict[str, int]: + return {f"k{i}": i for i in range(size)} + + +def _from_lookup( + impl: type[AccessorPath[Any, Any, Any]], lookup: Any, *parts: Any +) -> AccessorPath[Any, Any, Any]: + ctor = getattr(impl, "from_lookup", None) + if ctor is None or not callable(ctor): + raise TypeError( + f"{impl.__module__}.{impl.__qualname__} does not provide from_lookup()" + ) + path = ctor(lookup, *parts) + if not isinstance(path, AccessorPath): + raise TypeError("from_lookup() must return AccessorPath instance") + return path + + +def _benchmark_lookup( + impl: type[AccessorPath[Any, Any, Any]], + *, + quick: bool, + repeats: int, + warmup_loops: int, +) -> list[BenchmarkResult]: + results: list[BenchmarkResult] = [] + depth = 25 if not quick else 10 + loops_hit = 200_000 if not quick else 20_000 + loops_miss = 80_000 if not quick else 10_000 + + data = _build_deep_tree(depth) + root = _from_lookup(impl, data) + deep = _make_deep_path(root, depth) + + results.append( + run_benchmark( + f"lookup.read_value.cache_hit.depth{depth}", + deep.read_value, + loops=loops_hit, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + leaf_parent = _from_lookup( + impl, + {"root": {"branch": {"leaf": "value"}}}, + "root", + "branch", + ) + + def getitem_leaf(_p: AccessorPath[Any, Any, Any] = leaf_parent) -> None: + _ = _p["leaf"] + + 
loops_getitem_leaf = 200_000 if not quick else 20_000 + results.append( + run_benchmark( + "lookup.getitem.leaf", + getitem_leaf, + loops=loops_getitem_leaf, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + branch_parent = _from_lookup( + impl, + {"root": {"branch": {"child": {"x": 1}}}}, + "root", + "branch", + ) + + def getitem_branch( + _p: AccessorPath[Any, Any, Any] = branch_parent, + ) -> None: + _ = _p["child"] + + loops_getitem_branch = 200_000 if not quick else 20_000 + results.append( + run_benchmark( + "lookup.getitem.branch", + getitem_branch, + loops=loops_getitem_branch, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + deep_accessor = deep.accessor + if not isinstance(deep_accessor, LookupAccessor): + raise TypeError( + "lookup scenarios require LookupAccessor-backed implementation" + ) + deep_accessor.disable_cache() + results.append( + run_benchmark( + f"lookup.read_value.cache_disabled.depth{depth}", + deep.read_value, + loops=loops_miss, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + data2 = { + "a": _build_deep_tree(depth), + "x": _build_deep_tree(depth), + } + root2 = _from_lookup(impl, data2) + a_path = _make_deep_path(root2 / "a", depth) + x_path = _make_deep_path(root2 / "x", depth) + + root2_accessor = root2.accessor + if not isinstance(root2_accessor, LookupAccessor): + raise TypeError( + "lookup scenarios require LookupAccessor-backed implementation" + ) + root2_accessor.enable_cache(maxsize=1) + + toggle = {"i": 0} + + def read_alternating() -> None: + if toggle["i"] & 1: + x_path.read_value() + else: + a_path.read_value() + toggle["i"] += 1 + + loops_eviction = 120_000 if not quick else 15_000 + results.append( + run_benchmark( + f"lookup.read_value.eviction_alternate.maxsize1.depth{depth}", + read_alternating, + loops=loops_eviction, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + sizes = [10, 1_000, 50_000] if not quick else [10, 1_000] + for size in sizes: + mapping = _build_mapping(size) + p = _from_lookup(impl, {"root": mapping}) / "root" + + loops_keys = 5_000 if size <= 1_000 else 200 + if quick: + loops_keys = min(loops_keys, 500) + + results.append( + run_benchmark( + f"lookup.keys.mapping.size{size}", + p.keys, + loops=loops_keys, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + probe_key = f"k{size - 1}" if size else "k0" + loops_contains = 20_000 if size <= 1_000 else 500 + if quick: + loops_contains = min(loops_contains, 2_000) + + def contains_probe( + _p: AccessorPath[Any, Any, Any] = p, + _key: str = probe_key, + ) -> None: + _ = _key in _p + + results.append( + run_benchmark( + f"lookup.contains.mapping.size{size}", + contains_probe, + loops=loops_contains, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + loops_floordiv = 20_000 if size <= 1_000 else 500 + if quick: + loops_floordiv = min(loops_floordiv, 2_000) + + def floordiv_probe( + _p: AccessorPath[Any, Any, Any] = p, + _key: str = probe_key, + ) -> None: + _ = _p // _key + + results.append( + run_benchmark( + f"lookup.floordiv.mapping.size{size}", + floordiv_probe, + loops=loops_floordiv, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + missing_key = "missing" + + def floordiv_missing_probe( + _p: AccessorPath[Any, Any, Any] = p, + _key: str = missing_key, + ) -> None: + try: + _ = _p // _key + except KeyError: + return + + results.append( + run_benchmark( + f"lookup.floordiv_missing.mapping.size{size}", + floordiv_missing_probe, + loops=loops_floordiv, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + 
+ loops_iter = 500 if size <= 1_000 else 3 + if quick: + loops_iter = min(loops_iter, 50) + + def iter_children(_p: AccessorPath[Any, Any, Any] = p) -> None: + for _ in _p: + pass + + results.append( + run_benchmark( + f"lookup.iter_children.mapping.size{size}", + iter_children, + loops=loops_iter, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + return results + + +def run_lookup_scenarios( + impl: type[AccessorPath[Any, Any, Any]], + *, + quick: bool, + repeats: int, + warmup_loops: int, +) -> list[BenchmarkResult]: + return _benchmark_lookup( + impl, + quick=quick, + repeats=repeats, + warmup_loops=warmup_loops, + ) diff --git a/pathable/benchmarks/scenarios/parse.py b/pathable/benchmarks/scenarios/parse.py new file mode 100644 index 0000000..cecb283 --- /dev/null +++ b/pathable/benchmarks/scenarios/parse.py @@ -0,0 +1,68 @@ +"""Benchmarks for parsing and BasePath construction.""" + +from pathable.benchmarks.core import BenchmarkResult +from pathable.benchmarks.core import run_benchmark +from pathable.paths import BasePath + + +def _build_args(n: int) -> list[object]: + out: list[object] = [] + for i in range(n): + if i % 11 == 0: + out.append(".") + elif i % 11 == 1: + out.append(b"bytes") + elif i % 11 == 2: + out.append(i) + elif i % 11 == 3: + out.append(f"a/{i}/b") + else: + out.append(f"seg{i}") + return out + + +def run_parse_scenarios( + *, quick: bool, repeats: int, warmup_loops: int +) -> list[BenchmarkResult]: + results: list[BenchmarkResult] = [] + sizes = [10, 100, 1_000] if not quick else [10, 100] + + for n in sizes: + inputs = _build_args(n) + inputs_t = tuple(inputs) + + loops_parse = 80_000 if n <= 100 else 10_000 + if quick: + loops_parse = min(loops_parse, 10_000) + + def do_parse(_inputs: tuple[object, ...] = inputs_t) -> None: + BasePath._parse_args(_inputs) + + results.append( + run_benchmark( + f"parse.BasePath._parse_args.size{n}", + do_parse, + loops=loops_parse, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + loops_basepath = 60_000 if n <= 100 else 3_000 + if quick: + loops_basepath = min(loops_basepath, 5_000) + + def do_basepath(_inputs: tuple[object, ...] 
= inputs_t) -> None: + BasePath(*_inputs) + + results.append( + run_benchmark( + f"paths.BasePath.constructor.size{n}", + do_basepath, + loops=loops_basepath, + repeats=repeats, + warmup_loops=warmup_loops, + ) + ) + + return results diff --git a/poetry.lock b/poetry.lock index 23d7ac6..2e4cb44 100644 --- a/poetry.lock +++ b/poetry.lock @@ -936,7 +936,10 @@ six = ">=1.9.0,<2" docs = ["proselint (>=0.10.2)", "sphinx (>=3)", "sphinx-argparse (>=0.2.5)", "sphinx-rtd-theme (>=0.4.3)", "towncrier (>=21.3)"] testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", "packaging (>=20.0) ; python_version > \"3.4\"", "pytest (>=4)", "pytest-env (>=0.6.2)", "pytest-freezegun (>=0.4.1)", "pytest-mock (>=2)", "pytest-randomly (>=1)", "pytest-timeout (>=1)"] +[extras] +bench = [] + [metadata] lock-version = "2.1" python-versions = ">=3.10,<4.0" -content-hash = "fef5732c8b443ab4a7d7f997abb08cc050886e42b3e5c882df3f5b3ecf3e7314" +content-hash = "5ee48c773806e8267b1f69c691f987f8a5e3ff9798638a244687ecd8ac4a593f" diff --git a/pyproject.toml b/pyproject.toml index ef91984..e09a65e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,12 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.10,<4.0" +[tool.poetry.extras] +bench = [] + +[tool.poetry.scripts] +pathable-bench = "pathable.benchmarks.cli:main" + [tool.poetry.group.dev.dependencies] tbump = "^6.11.0" pre-commit = "*" diff --git a/tests/benchmarks/__init__.py b/tests/benchmarks/__init__.py index 8b67c9f..51d3ac7 100644 --- a/tests/benchmarks/__init__.py +++ b/tests/benchmarks/__init__.py @@ -1,7 +1,7 @@ -"""Benchmark scripts for pathable. +"""Deprecated compatibility wrappers for pathable benchmarks. -Run as modules for the most reliable import behavior: +Prefer: -- `python -m tests.benchmarks.bench_lookup --output bench-lookup.json` -- `python -m tests.benchmarks.bench_parse --output bench-parse.json` +- `pathable-bench run --impl pathable.LookupPath --output bench.json` +- `pathable-bench compare --baseline a.json --candidate b.json` """ diff --git a/tests/benchmarks/bench_lookup.py b/tests/benchmarks/bench_lookup.py index 294f0b3..b10f2a2 100644 --- a/tests/benchmarks/bench_lookup.py +++ b/tests/benchmarks/bench_lookup.py @@ -1,276 +1,28 @@ -"""Benchmarks for LookupPath / LookupAccessor hot paths. 
+"""Compatibility wrapper for deprecated tests.benchmarks command.""" -These benchmarks avoid filesystem I/O (too noisy for CI) and focus on: -- traversal cost (cache disabled) -- cache hit speed -- LRU eviction patterns -- keys/contains/iter overhead on large mappings -""" - -import argparse -from typing import Any +import sys +import warnings +from pathlib import Path from typing import Iterable -from pathable.accessors import LookupAccessor -from pathable.paths import LookupPath - try: - # Prefer module execution: `python -m tests.benchmarks.bench_lookup ...` - from .bench_utils import BenchmarkResult - from .bench_utils import add_common_args - from .bench_utils import results_to_json - from .bench_utils import run_benchmark - from .bench_utils import write_json -except ImportError: # pragma: no cover - # Allow direct execution: `python tests/benchmarks/bench_lookup.py ...` - from bench_utils import BenchmarkResult # type: ignore[no-redef] - from bench_utils import add_common_args # type: ignore[no-redef] - from bench_utils import results_to_json # type: ignore[no-redef] - from bench_utils import run_benchmark # type: ignore[no-redef] - from bench_utils import write_json # type: ignore[no-redef] - - -def _build_deep_tree(depth: int) -> dict[str, Any]: - node: dict[str, Any] = {"value": 1} - for i in range(depth - 1, -1, -1): - node = {f"k{i}": node} - return node - - -def _deep_keys(depth: int) -> tuple[str, ...]: - return tuple(f"k{i}" for i in range(depth)) + from pathable.benchmarks.bench_lookup import main as _main +except ModuleNotFoundError as exc: + if exc.name != "pathable": + raise + sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + from pathable.benchmarks.bench_lookup import main as _main - -def _make_deep_path(root: LookupPath, depth: int) -> LookupPath: - p = root - for k in _deep_keys(depth): - p = p / k - return p - - -def _build_mapping(size: int) -> dict[str, int]: - return {f"k{i}": i for i in range(size)} +_MESSAGE = ( + "tests.benchmarks.bench_lookup is deprecated and will be removed in a " + "future release; use `pathable-bench run --impl pathable.LookupPath ...` " + "or `python -m pathable.benchmarks.bench_lookup ...` instead." +) def main(argv: Iterable[str] | None = None) -> int: - parser = argparse.ArgumentParser() - add_common_args(parser) - args = parser.parse_args(list(argv) if argv is not None else None) - - repeats: int = args.repeats - warmup_loops: int = args.warmup_loops - - results: list[BenchmarkResult] = [] - - # --- Lookup read benchmarks --- - depth = 25 if not args.quick else 10 - loops_hit = 200_000 if not args.quick else 20_000 - loops_miss = 80_000 if not args.quick else 10_000 - - data = _build_deep_tree(depth) - root = LookupPath.from_lookup(data) - deep = _make_deep_path(root, depth) - - # Cache hit: repeated reads of the same path. - results.append( - run_benchmark( - f"lookup.read_value.cache_hit.depth{depth}", - deep.read_value, - loops=loops_hit, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - # __getitem__ leaf read: should return value for non-traversable child. 
- leaf_parent = LookupPath.from_lookup( - {"root": {"branch": {"leaf": "value"}}}, "root", "branch" - ) - - def getitem_leaf(_p: LookupPath = leaf_parent) -> None: - _ = _p["leaf"] - - loops_getitem_leaf = 200_000 if not args.quick else 20_000 - results.append( - run_benchmark( - "lookup.getitem.leaf", - getitem_leaf, - loops=loops_getitem_leaf, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - # __getitem__ branch read: should return child path for traversable child. - branch_parent = LookupPath.from_lookup( - {"root": {"branch": {"child": {"x": 1}}}}, "root", "branch" - ) - - def getitem_branch(_p: LookupPath = branch_parent) -> None: - _ = _p["child"] - - loops_getitem_branch = 200_000 if not args.quick else 20_000 - results.append( - run_benchmark( - "lookup.getitem.branch", - getitem_branch, - loops=loops_getitem_branch, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - # Cache miss cost: disable cache and repeatedly read. - deep_accessor = deep.accessor - if not isinstance(deep_accessor, LookupAccessor): - raise TypeError("Expected LookupPath.accessor to be LookupAccessor") - deep_accessor.disable_cache() - results.append( - run_benchmark( - f"lookup.read_value.cache_disabled.depth{depth}", - deep.read_value, - loops=loops_miss, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - # LRU eviction: alternate two distinct deep paths with maxsize=1. - data2 = { - "a": _build_deep_tree(depth), - "x": _build_deep_tree(depth), - } - root2 = LookupPath.from_lookup(data2) - a_path = _make_deep_path(root2 / "a", depth) - x_path = _make_deep_path(root2 / "x", depth) - - root2_accessor = root2.accessor - if not isinstance(root2_accessor, LookupAccessor): - raise TypeError("Expected LookupPath.accessor to be LookupAccessor") - root2_accessor.enable_cache(maxsize=1) - - toggle = {"i": 0} - - def read_alternating() -> None: - if toggle["i"] & 1: - x_path.read_value() - else: - a_path.read_value() - toggle["i"] += 1 - - loops_eviction = 120_000 if not args.quick else 15_000 - results.append( - run_benchmark( - f"lookup.read_value.eviction_alternate.maxsize1.depth{depth}", - read_alternating, - loops=loops_eviction, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - # --- Large mapping operations --- - sizes = [10, 1_000, 50_000] if not args.quick else [10, 1_000] - for size in sizes: - mapping = _build_mapping(size) - p = LookupPath.from_lookup({"root": mapping}) / "root" - - # keys() materializes a list in LookupAccessor.keys for mappings. - loops_keys = 5_000 if size <= 1_000 else 200 - if args.quick: - loops_keys = min(loops_keys, 500) - - results.append( - run_benchmark( - f"lookup.keys.mapping.size{size}", - p.keys, - loops=loops_keys, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - # contains: AccessorPath.__contains__ calls keys() then `in`. - probe_key = f"k{size - 1}" if size else "k0" - loops_contains = 20_000 if size <= 1_000 else 500 - if args.quick: - loops_contains = min(loops_contains, 2_000) - - def contains_probe(_p: LookupPath = p, _key: str = probe_key) -> None: - _ = _key in _p - - results.append( - run_benchmark( - f"lookup.contains.mapping.size{size}", - contains_probe, - loops=loops_contains, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - # floordiv (`//`): strict child assertion. - # Previously this was implemented via keys()+membership, which - # materializes all keys for mappings. 
- loops_floordiv = 20_000 if size <= 1_000 else 500 - if args.quick: - loops_floordiv = min(loops_floordiv, 2_000) - - def floordiv_probe(_p: LookupPath = p, _key: str = probe_key) -> None: - _ = _p // _key - - results.append( - run_benchmark( - f"lookup.floordiv.mapping.size{size}", - floordiv_probe, - loops=loops_floordiv, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - missing_key = "missing" - - def floordiv_missing_probe( - _p: LookupPath = p, _key: str = missing_key - ) -> None: - try: - _ = _p // _key - except KeyError: - return - - results.append( - run_benchmark( - f"lookup.floordiv_missing.mapping.size{size}", - floordiv_missing_probe, - loops=loops_floordiv, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - # iterating children: should call keys() once and yield child paths. - loops_iter = 500 if size <= 1_000 else 3 - if args.quick: - loops_iter = min(loops_iter, 50) - - def iter_children(_p: LookupPath = p) -> None: - for _ in _p: - pass - - results.append( - run_benchmark( - f"lookup.iter_children.mapping.size{size}", - iter_children, - loops=loops_iter, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - payload = results_to_json(results=results) - write_json(args.output, payload) - return 0 + warnings.warn(_MESSAGE, DeprecationWarning, stacklevel=2) + return _main(argv) if __name__ == "__main__": diff --git a/tests/benchmarks/bench_parse.py b/tests/benchmarks/bench_parse.py index 89e4967..b5921d9 100644 --- a/tests/benchmarks/bench_parse.py +++ b/tests/benchmarks/bench_parse.py @@ -1,96 +1,28 @@ -"""Benchmarks for parsing and BasePath construction.""" +"""Compatibility wrapper for deprecated tests.benchmarks command.""" -import argparse +import sys +import warnings +from pathlib import Path from typing import Iterable -from pathable.paths import BasePath - try: - # Prefer module execution: `python -m tests.benchmarks.bench_parse ...` - from .bench_utils import BenchmarkResult - from .bench_utils import add_common_args - from .bench_utils import results_to_json - from .bench_utils import run_benchmark - from .bench_utils import write_json -except ImportError: # pragma: no cover - # Allow direct execution: `python tests/benchmarks/bench_parse.py ...` - from bench_utils import BenchmarkResult # type: ignore[no-redef] - from bench_utils import add_common_args # type: ignore[no-redef] - from bench_utils import results_to_json # type: ignore[no-redef] - from bench_utils import run_benchmark # type: ignore[no-redef] - from bench_utils import write_json # type: ignore[no-redef] - + from pathable.benchmarks.bench_parse import main as _main +except ModuleNotFoundError as exc: + if exc.name != "pathable": + raise + sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + from pathable.benchmarks.bench_parse import main as _main -def _build_args(n: int) -> list[object]: - # Mix in segments that exercise splitting, filtering, bytes decode, and ints. - out: list[object] = [] - for i in range(n): - if i % 11 == 0: - out.append(".") - elif i % 11 == 1: - out.append(b"bytes") - elif i % 11 == 2: - out.append(i) - elif i % 11 == 3: - out.append(f"a/{i}/b") - else: - out.append(f"seg{i}") - return out +_MESSAGE = ( + "tests.benchmarks.bench_parse is deprecated and will be removed in a " + "future release; use `pathable-bench run --impl pathable.LookupPath ...` " + "or `python -m pathable.benchmarks.bench_parse ...` instead." 
+) def main(argv: Iterable[str] | None = None) -> int: - parser = argparse.ArgumentParser() - add_common_args(parser) - args = parser.parse_args(list(argv) if argv is not None else None) - - repeats: int = args.repeats - warmup_loops: int = args.warmup_loops - - results: list[BenchmarkResult] = [] - - sizes = [10, 100, 1_000] if not args.quick else [10, 100] - - for n in sizes: - inputs = _build_args(n) - inputs_t = tuple(inputs) - - loops_parse = 80_000 if n <= 100 else 10_000 - if args.quick: - loops_parse = min(loops_parse, 10_000) - - def do_parse(_inputs: tuple[object, ...] = inputs_t) -> None: - BasePath._parse_args(_inputs) - - results.append( - run_benchmark( - f"parse.BasePath._parse_args.size{n}", - do_parse, - loops=loops_parse, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - loops_basepath = 60_000 if n <= 100 else 3_000 - if args.quick: - loops_basepath = min(loops_basepath, 5_000) - - def do_basepath(_inputs: tuple[object, ...] = inputs_t) -> None: - BasePath(*_inputs) - - results.append( - run_benchmark( - f"paths.BasePath.constructor.size{n}", - do_basepath, - loops=loops_basepath, - repeats=repeats, - warmup_loops=warmup_loops, - ) - ) - - payload = results_to_json(results=results) - write_json(args.output, payload) - return 0 + warnings.warn(_MESSAGE, DeprecationWarning, stacklevel=2) + return _main(argv) if __name__ == "__main__": diff --git a/tests/benchmarks/bench_utils.py b/tests/benchmarks/bench_utils.py index 5942187..9101685 100644 --- a/tests/benchmarks/bench_utils.py +++ b/tests/benchmarks/bench_utils.py @@ -1,163 +1,39 @@ -"""Minimal benchmark utilities (dependency-free). +"""Compatibility exports for deprecated benchmark utility module.""" -This module is intentionally simple to keep benchmarks stable and easy to run -locally and in CI. -""" - -import argparse -import json -import os -import platform -import statistics import sys -import time -from dataclasses import dataclass -from typing import Any -from typing import Callable -from typing import Iterable -from typing import Mapping -from typing import MutableMapping - - -@dataclass(frozen=True) -class BenchmarkResult: - name: str - loops: int - repeats: int - warmup_loops: int - times_s: tuple[float, ...] 
- - @property - def total_s_median(self) -> float: - return statistics.median(self.times_s) - - @property - def per_loop_s_median(self) -> float: - if self.loops <= 0: - return float("inf") - return self.total_s_median / self.loops - - @property - def ops_per_sec_median(self) -> float: - per = self.per_loop_s_median - if per <= 0: - return float("inf") - return 1.0 / per - - -def _safe_int_env(name: str) -> int | None: - value = os.environ.get(name) - if value is None: - return None - try: - return int(value) - except ValueError: - return None - - -def default_meta() -> dict[str, Any]: - return { - "python": sys.version, - "python_implementation": platform.python_implementation(), - "platform": platform.platform(), - "machine": platform.machine(), - "processor": platform.processor(), - "pythondotorg": platform.python_build(), - "py_hash_seed": os.environ.get("PYTHONHASHSEED"), - "github_sha": os.environ.get("GITHUB_SHA"), - "github_ref": os.environ.get("GITHUB_REF"), - "ci": os.environ.get("CI"), - } - - -def run_benchmark( - name: str, - func: Callable[[], Any], - *, - loops: int, - repeats: int = 5, - warmup_loops: int = 1, -) -> BenchmarkResult: - if loops <= 0: - raise ValueError("loops must be > 0") - if repeats <= 0: - raise ValueError("repeats must be > 0") - if warmup_loops < 0: - raise ValueError("warmup_loops must be >= 0") - - for _ in range(warmup_loops): - for __ in range(loops): - func() - - times: list[float] = [] - for _ in range(repeats): - start = time.perf_counter() - for __ in range(loops): - func() - end = time.perf_counter() - times.append(end - start) - - return BenchmarkResult( - name=name, - loops=loops, - repeats=repeats, - warmup_loops=warmup_loops, - times_s=tuple(times), - ) - - -def results_to_json( - *, - results: Iterable[BenchmarkResult], - meta: Mapping[str, Any] | None = None, -) -> dict[str, Any]: - out: dict[str, Any] = { - "meta": dict(meta or default_meta()), - "benchmarks": {}, - } - - bench: MutableMapping[str, Any] = out["benchmarks"] - for r in results: - bench[r.name] = { - "loops": r.loops, - "repeats": r.repeats, - "warmup_loops": r.warmup_loops, - "times_s": list(r.times_s), - "median_total_s": r.total_s_median, - "median_per_loop_s": r.per_loop_s_median, - "median_ops_per_sec": r.ops_per_sec_median, - } - - return out - - -def add_common_args(parser: argparse.ArgumentParser) -> None: - parser.add_argument( - "--output", - required=True, - help="Write JSON results to this file.", - ) - parser.add_argument( - "--quick", - action="store_true", - help="Run fewer iterations for a fast sanity check.", - ) - parser.add_argument( - "--repeats", - type=int, - default=_safe_int_env("PATHABLE_BENCH_REPEATS") or 5, - help="Number of repeats per scenario (median is reported).", - ) - parser.add_argument( - "--warmup-loops", - type=int, - default=_safe_int_env("PATHABLE_BENCH_WARMUP") or 1, - help="Warmup passes before timing.", - ) - - -def write_json(path: str, payload: Mapping[str, Any]) -> None: - os.makedirs(os.path.dirname(path) or ".", exist_ok=True) - with open(path, "w", encoding="utf-8") as f: - json.dump(payload, f, indent=2, sort_keys=True) - f.write("\n") +import warnings +from pathlib import Path + +try: + from pathable.benchmarks.core import BenchmarkResult + from pathable.benchmarks.core import add_common_args + from pathable.benchmarks.core import default_meta + from pathable.benchmarks.core import results_to_json + from pathable.benchmarks.core import run_benchmark + from pathable.benchmarks.core import write_json +except 
ModuleNotFoundError as exc: + if exc.name != "pathable": + raise + sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + from pathable.benchmarks.core import BenchmarkResult + from pathable.benchmarks.core import add_common_args + from pathable.benchmarks.core import default_meta + from pathable.benchmarks.core import results_to_json + from pathable.benchmarks.core import run_benchmark + from pathable.benchmarks.core import write_json + +warnings.warn( + "tests.benchmarks.bench_utils is deprecated and will be removed in a " + "future release; use pathable.benchmarks.core instead.", + DeprecationWarning, + stacklevel=2, +) + +__all__ = [ + "BenchmarkResult", + "add_common_args", + "default_meta", + "results_to_json", + "run_benchmark", + "write_json", +] diff --git a/tests/benchmarks/compare_results.py b/tests/benchmarks/compare_results.py index 08529ee..05a73ae 100644 --- a/tests/benchmarks/compare_results.py +++ b/tests/benchmarks/compare_results.py @@ -1,157 +1,28 @@ -"""Compare two pathable benchmark JSON results. +"""Compatibility wrapper for deprecated tests.benchmarks command.""" -Exits non-zero if candidate regresses beyond the configured tolerance. - -This is meant for local regression checking and optional CI gating. -""" - -import argparse -import json -from dataclasses import dataclass -from typing import Any +import sys +import warnings +from pathlib import Path from typing import Iterable -from typing import Mapping -from typing import cast - - -@dataclass(frozen=True) -class ScenarioComparison: - name: str - baseline_ops: float - candidate_ops: float - ratio: float - baseline_scenario: str - candidate_scenario: str - - -def _canonicalize_scenario_name(name: str) -> str: - """Return a stable scenario identifier across benchmark renames. - - This keeps `compare_results.py` compatible with older JSON reports. - """ - aliases: tuple[tuple[str, str], ...] = ( - # Historical rename in bench_parse: - # parse.parse_args.sizeN -> parse.BasePath._parse_args.sizeN - # Canonicalize both to parse.args.sizeN. 
- ("parse.parse_args.", "parse.args."), - ("parse.BasePath._parse_args.", "parse.args."), - ) - - for prefix, replacement in aliases: - if name.startswith(prefix): - return replacement + name[len(prefix) :] - return name - - -def _load(path: str) -> Mapping[str, Any]: - with open(path, "r", encoding="utf-8") as f: - data_any = json.load(f) - if not isinstance(data_any, dict): - raise ValueError("Invalid report: expected top-level JSON object") - return cast(dict[str, Any], data_any) - - -def _extract_ops(report: Mapping[str, Any]) -> dict[str, float]: - benchmarks = report.get("benchmarks") - if not isinstance(benchmarks, dict): - raise ValueError("Invalid report: missing 'benchmarks' dict") - - benchmarks_d = cast(dict[str, Any], benchmarks) - out: dict[str, float] = {} - for name, payload in benchmarks_d.items(): - if not isinstance(payload, dict): - continue - payload_d = cast(dict[str, Any], payload) - ops_any = payload_d.get("median_ops_per_sec") - ops = ops_any if isinstance(ops_any, (int, float)) else None - if ops is not None: - out[name] = float(ops) - return out +try: + from pathable.benchmarks.compare import main as _main +except ModuleNotFoundError as exc: + if exc.name != "pathable": + raise + sys.path.insert(0, str(Path(__file__).resolve().parents[2])) + from pathable.benchmarks.compare import main as _main - -def compare( - *, - baseline: Mapping[str, Any], - candidate: Mapping[str, Any], - tolerance: float, -) -> tuple[list[ScenarioComparison], list[ScenarioComparison]]: - if tolerance < 0: - raise ValueError("tolerance must be >= 0") - - b_raw = _extract_ops(baseline) - c_raw = _extract_ops(candidate) - - b: dict[str, tuple[str, float]] = {} - c: dict[str, tuple[str, float]] = {} - - for name, ops in b_raw.items(): - canon = _canonicalize_scenario_name(name) - b.setdefault(canon, (name, ops)) - - for name, ops in c_raw.items(): - canon = _canonicalize_scenario_name(name) - c.setdefault(canon, (name, ops)) - - comparisons: list[ScenarioComparison] = [] - for name in sorted(set(b) & set(c)): - b_name, bops = b[name] - c_name, cops = c[name] - ratio = cops / bops if bops > 0 else float("inf") - comparisons.append( - ScenarioComparison( - name=name, - baseline_ops=bops, - candidate_ops=cops, - ratio=ratio, - baseline_scenario=b_name, - candidate_scenario=c_name, - ) - ) - - # Regression if candidate is slower by more than tolerance: - # candidate_ops < baseline_ops * (1 - tolerance) - floor_ratio = 1.0 - tolerance - regressions = [x for x in comparisons if x.ratio < floor_ratio] - return comparisons, regressions +_MESSAGE = ( + "tests.benchmarks.compare_results is deprecated and will be removed in a " + "future release; use `pathable-bench compare ...` or " + "`python -m pathable.benchmarks.compare ...` instead." +) def main(argv: Iterable[str] | None = None) -> int: - parser = argparse.ArgumentParser() - parser.add_argument("--baseline", required=True) - parser.add_argument("--candidate", required=True) - parser.add_argument( - "--tolerance", - type=float, - default=0.20, - help="Allowed slowdown (e.g. 
0.20 means 20% slower allowed).", - ) - args = parser.parse_args(list(argv) if argv is not None else None) - - baseline = _load(args.baseline) - candidate = _load(args.candidate) - - comparisons, regressions = compare( - baseline=baseline, - candidate=candidate, - tolerance=args.tolerance, - ) - - print("scenario\tbaseline_ops/s\tcandidate_ops/s\tratio") - for c in comparisons: - print( - f"{c.name}\t{c.baseline_ops:.2f}\t{c.candidate_ops:.2f}\t{c.ratio:.3f}" - ) - - if regressions: - print("\nREGRESSIONS:") - for r in regressions: - print( - f"- {r.name}: {r.ratio:.3f}x (baseline {r.baseline_ops:.2f} ops/s, candidate {r.candidate_ops:.2f} ops/s)" - ) - return 1 - - return 0 + warnings.warn(_MESSAGE, DeprecationWarning, stacklevel=2) + return _main(argv) if __name__ == "__main__": diff --git a/tests/unit/test_benchmarks.py b/tests/unit/test_benchmarks.py new file mode 100644 index 0000000..c457b37 --- /dev/null +++ b/tests/unit/test_benchmarks.py @@ -0,0 +1,93 @@ +import json +from pathlib import Path + +import pytest + +from pathable.benchmarks.compare import compare +from pathable.benchmarks.compare import main as compare_main +from pathable.benchmarks.registry import resolve_impl +from pathable.benchmarks.run import run_all +from tests.benchmarks import bench_lookup +from tests.benchmarks import bench_parse +from tests.benchmarks import compare_results + + +def test_resolve_impl_by_dotted_name() -> None: + impl = resolve_impl("pathable.LookupPath") + assert impl.__name__ == "LookupPath" + + +def test_compare_uses_only_overlapping_scenarios() -> None: + baseline = { + "benchmarks": { + "same": {"median_ops_per_sec": 100.0}, + "baseline_only": {"median_ops_per_sec": 100.0}, + } + } + candidate = { + "benchmarks": { + "same": {"median_ops_per_sec": 85.0}, + "candidate_only": {"median_ops_per_sec": 200.0}, + } + } + + result = compare( + baseline=baseline, + candidate=candidate, + tolerance=0.10, + ) + + assert [x.name for x in result.comparisons] == ["same"] + assert [x.name for x in result.regressions] == ["same"] + assert result.baseline_only == ["baseline_only"] + assert result.candidate_only == ["candidate_only"] + + +def test_compare_main_fails_on_no_overlap(tmp_path: Path) -> None: + baseline_path = tmp_path / "baseline.json" + candidate_path = tmp_path / "candidate.json" + + baseline_path.write_text( + json.dumps({"benchmarks": {"a": {"median_ops_per_sec": 1.0}}}), + encoding="utf-8", + ) + candidate_path.write_text( + json.dumps({"benchmarks": {"b": {"median_ops_per_sec": 1.0}}}), + encoding="utf-8", + ) + + code = compare_main( + [ + "--baseline", + str(baseline_path), + "--candidate", + str(candidate_path), + ] + ) + assert code == 1 + + +def test_run_all_can_limit_to_lookup_scenarios() -> None: + payload = run_all( + impl_target="pathable.LookupPath", + quick=True, + repeats=1, + warmup_loops=0, + scenarios=("lookup",), + ) + benchmarks = payload["benchmarks"] + assert benchmarks + assert all(name.startswith("lookup.") for name in benchmarks) + + +@pytest.mark.parametrize( + "module", + [bench_lookup, bench_parse, compare_results], +) +def test_compat_wrapper_emits_deprecation_warning(module: object) -> None: + with pytest.warns(DeprecationWarning): + # Delegate may fail due to missing required args; warning is what matters. + try: + getattr(module, "main")([]) + except SystemExit: + pass