From 044ed632fb121dd51cb0bd5b58e80e61f52c5f10 Mon Sep 17 00:00:00 2001 From: Meftun Akarsu Date: Sun, 21 Jun 2026 15:07:36 +0300 Subject: [PATCH] =?UTF-8?q?feat(okf):=20add=20a=20`validate`=20subcommand?= =?UTF-8?q?=20checking=20a=20bundle=20against=20SPEC=20=C2=A79?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A credential-free, offline conformance runner for OKF bundles (addresses #62). Adds reference_agent/bundle/conformance.py and a `validate` CLI subcommand that check an on-disk bundle against the machine-checkable SPEC §9 rules: §9.1 (parseable YAML frontmatter), §9.2 (non-empty `type`), and the hard index.md rule (§6/§11). Prints one `path: [rule] message` line per violation and exits non-zero, so it drops into CI. `--strict` additionally enforces the producer-level recommended keys (§4.1). log.md date headings (§7) and index.md body sections (§6) are out of scope (they need a full CommonMark parser), so success reports exactly what was checked rather than asserting full v0.1 conformance. Reuses OKFDocument.parse, adds no new dependencies. Tests cover all three committed bundles plus a fixture per violation type; full okf suite: 54 passed. --- okf/README.md | 44 +++++ okf/src/reference_agent/bundle/conformance.py | 173 +++++++++++++++++ okf/src/reference_agent/cli.py | 42 +++++ okf/tests/test_conformance.py | 174 ++++++++++++++++++ 4 files changed, 433 insertions(+) create mode 100644 okf/src/reference_agent/bundle/conformance.py create mode 100644 okf/tests/test_conformance.py diff --git a/okf/README.md b/okf/README.md index cf466e7..60694fe 100644 --- a/okf/README.md +++ b/okf/README.md @@ -205,6 +205,50 @@ The HTML embeds the bundle as a JSON blob and uses both loaded from a CDN. No data leaves the page; the bundle is parsed once at generation time and serialized into the file. 
+## Validate + +The `validate` subcommand checks an OKF bundle against the +**[SPEC §9](SPEC.md) conformance rules** — no credentials, no model, no +network required: + +``` +.venv/bin/python -m reference_agent validate --bundle ./bundles/ga4 +``` + +It verifies that: + +- every non-reserved `.md` file has a **parseable YAML frontmatter + block** (§9.1), +- every frontmatter block has a **non-empty `type`** (§9.2), and +- reserved `index.md` files carry no frontmatter except an optional + bundle-root `okf_version` (§6/§11). + +The remaining §9.3 structure — `log.md` date headings (§7) and +`index.md` body sections (§6) — is **not** validated: telling a real +`##` heading apart from fenced-code content needs a full CommonMark +parser, which is out of scope for this dependency-light checker. The +command therefore reports *no violations in the checked rules* rather +than asserting full v0.1 conformance. + +One line is printed per violation as `path: [rule] message`, and the +command exits non-zero if any are found — so it drops cleanly into CI to +keep bundles conformant as they evolve. A clean bundle prints +`OK: … — no OKF v0.1 violations found (§9.1, §9.2, index.md §6/§11)` and +exits `0`; all three bundles in [`bundles/`](bundles/) pass. + +Add `--strict` to additionally require the producer-level recommended +keys (`title`, `description`, `timestamp`) from §4.1 on every concept: + +``` +.venv/bin/python -m reference_agent validate --bundle ./bundles/ga4 --strict +``` + +The sample bundles already populate those keys, so they pass `--strict` +unchanged; the extra `§4.1` violations surface on bundles that omit them. + +It is also usable as a library — `from reference_agent.bundle.conformance +import check_bundle` returns a list of `Violation`s for programmatic use.
+ ## Tests ``` diff --git a/okf/src/reference_agent/bundle/conformance.py b/okf/src/reference_agent/bundle/conformance.py new file mode 100644 index 0000000..b733ef7 --- /dev/null +++ b/okf/src/reference_agent/bundle/conformance.py @@ -0,0 +1,173 @@ +"""OKF v0.1 conformance checking (SPEC §9). + +A small, dependency-light validator that checks an on-disk OKF bundle against +the three conformance rules in SPEC §9: + + 1. Every non-reserved ``.md`` file contains a parseable YAML frontmatter + block (§9.1). + 2. Every frontmatter block contains a non-empty ``type`` field (§9.2). + 3. Reserved files (``index.md``, ``log.md``) follow their prescribed shape + when present (§9.3). This checks the one hard ``index.md`` rule — + frontmatter is permitted only in a bundle-root ``index.md`` and only the + ``okf_version`` key (§6/§11). + +Note that §9.3 also covers ``log.md`` date-heading structure (§7) and +``index.md`` body sections (§6). Those are *not* validated here: faithfully +deciding whether a ``##`` line is a real heading or fenced-code content +requires a full CommonMark parser, which is out of scope for this +dependency-light checker, so a ``log.md`` is accepted as-is. Because of that +gap the tool reports "no violations in the checked rules" rather than +asserting full v0.1 conformance. + +This is intentionally *stricter to detect* than the permissive consumption +model SPEC §9 mandates for consumers: a validator's job is to surface +problems, while a consumer must tolerate them. For the producer-level bar +(``type``/``title``/``description``/``timestamp``), pass ``strict=True``. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from reference_agent.bundle.document import ( + REQUIRED_FRONTMATTER_KEYS, + OKFDocument, +) + +# Reserved filenames (SPEC §3.1). +INDEX_NAME = "index.md" +LOG_NAME = "log.md" + +_FRONTMATTER_DELIM = "---" +# SPEC §11: the only frontmatter key permitted in a bundle-root index.md. 
+_ROOT_INDEX_ALLOWED_KEYS = frozenset({"okf_version"}) + + +@dataclass(frozen=True) +class Violation: + """A single OKF v0.1 conformance violation.""" + + path: str # bundle-relative POSIX path of the offending file + rule: str # SPEC reference, e.g. "§9.1", "§9.2", "§6", "§11" + message: str + + def __str__(self) -> str: + return f"{self.path}: [{self.rule}] {self.message}" + + +def _has_frontmatter_block(text: str) -> bool: + """Whether ``text`` opens with a frontmatter block. + + Mirrors ``OKFDocument.parse``: a block exists iff the first line is the + ``---`` delimiter. ``parse`` silently returns an empty mapping when no + block is present, so presence must be checked separately to enforce §9.1. + """ + lines = text.splitlines() + return bool(lines) and lines[0].strip() == _FRONTMATTER_DELIM + + +def _check_concept(rel: str, text: str, *, strict: bool) -> list[Violation]: + out: list[Violation] = [] + # §9.1 — a parseable YAML frontmatter block must be present. + if not _has_frontmatter_block(text): + out.append(Violation(rel, "§9.1", "missing YAML frontmatter block")) + return out + try: + doc = OKFDocument.parse(text) + except ValueError as exc: + # OKFDocumentError (unterminated / non-mapping frontmatter) is a + # ValueError subclass; PyYAML's implicit resolvers also raise a *bare* + # ValueError for e.g. an out-of-range timestamp (`2026-13-45`). Both + # mean the frontmatter cannot be parsed, so both are §9.1. + out.append(Violation(rel, "§9.1", f"unparseable frontmatter: {exc}")) + return out + # §9.2 — frontmatter must contain a non-empty `type`. + type_val = doc.frontmatter.get("type") + if not (isinstance(type_val, str) and type_val.strip()): + out.append(Violation(rel, "§9.2", "missing or empty 'type' field")) + # --strict — also enforce the producer-level recommended keys (§4.1). 
+ if strict: + missing = [ + k + for k in REQUIRED_FRONTMATTER_KEYS + if k != "type" and not doc.frontmatter.get(k) + ] + if missing: + out.append( + Violation( + rel, + "§4.1", + f"strict: missing recommended key(s): {', '.join(missing)}", + ) + ) + return out + + +def _check_index(rel: str, text: str, *, is_root: bool) -> list[Violation]: + # §6 — index.md carries no frontmatter; §11 permits only `okf_version` + # in the bundle-root index.md. + if not _has_frontmatter_block(text): + return [] + if not is_root: + return [Violation(rel, "§6", "index.md must not contain frontmatter")] + try: + doc = OKFDocument.parse(text) + except ValueError as exc: # see _check_concept: covers OKFDocumentError + bare ValueError + return [Violation(rel, "§6", f"unparseable index frontmatter: {exc}")] + extra = sorted(k for k in doc.frontmatter if k not in _ROOT_INDEX_ALLOWED_KEYS) + if extra: + return [ + Violation( + rel, + "§11", + "root index.md frontmatter may only contain 'okf_version'; " + f"found: {', '.join(extra)}", + ) + ] + return [] + + +def check_bundle(root, *, strict: bool = False) -> list[Violation]: + """Check an OKF bundle directory against the SPEC v0.1 §9 conformance rules. + + Args: + root: Path to the bundle root directory. + strict: Also enforce the producer-level recommended keys (``title``, + ``description``, ``timestamp``) from SPEC §4.1, on top of §9. + + Returns: + A list of :class:`Violation`, ordered by file path; an empty list + means no violation of the checked rules, not full v0.1 conformance. + + Raises: + FileNotFoundError: if ``root`` is not a directory. + """ + root = Path(root) + if not root.is_dir(): + raise FileNotFoundError(f"Bundle directory not found: {root}") + root_resolved = root.resolve() + + violations: list[Violation] = [] + for md_path in sorted(root.rglob("*.md")): + # rglob also matches directories and broken symlinks whose name ends in + # '.md'; only regular files are concept/reserved documents (§3/§9).
+ if not md_path.is_file(): + continue + rel = md_path.relative_to(root).as_posix() + try: + text = md_path.read_text(encoding="utf-8") + except (UnicodeDecodeError, OSError) as exc: + violations.append(Violation(rel, "read", f"cannot read file: {exc}")) + continue + name = md_path.name + if name == INDEX_NAME: + is_root = md_path.parent.resolve() == root_resolved + violations.extend(_check_index(rel, text, is_root=is_root)) + elif name == LOG_NAME: + # Reserved file (§7). Its date-heading structure is not validated + # (see the module docstring); the file is accepted as-is. + continue + else: + violations.extend(_check_concept(rel, text, strict=strict)) + return violations diff --git a/okf/src/reference_agent/cli.py b/okf/src/reference_agent/cli.py index 5b75a97..e364e65 100644 --- a/okf/src/reference_agent/cli.py +++ b/okf/src/reference_agent/cli.py @@ -158,6 +158,20 @@ def _parser() -> argparse.ArgumentParser: "--name", default=None, help="Display name for the bundle (default: bundle directory name).", ) + + validate = sub.add_parser( + "validate", + help="Check an OKF bundle against the SPEC v0.1 §9 conformance rules.", + ) + validate.add_argument( + "--bundle", required=True, type=Path, + help="Path to the bundle root directory.", + ) + validate.add_argument( + "--strict", action="store_true", + help="Also enforce the producer-level recommended keys " + "(title, description, timestamp) from §4.1, on top of §9.", + ) return p @@ -185,6 +199,34 @@ def main(argv: list[str] | None = None) -> int: ) return 0 + if args.command == "validate": + from reference_agent.bundle.conformance import check_bundle + try: + violations = check_bundle(args.bundle, strict=args.strict) + except FileNotFoundError as exc: + # A bad --bundle path is a usage error, not a conformance failure; + # report it cleanly (exit 2) instead of dumping a traceback. 
+ print(f"Error: {exc}", file=sys.stderr) + return 2 + for v in violations: + print(str(v), file=sys.stderr) + if violations: + print( + f"FAIL: {len(violations)} conformance violation(s) in " + f"{args.bundle}", + file=sys.stderr, + ) + return 1 + # Scoped to the rules check_bundle enforces (§9.1, §9.2, index.md + # §6/§11); log.md §7 and index.md body structure are not validated, so + # this does not assert full v0.1 conformance. + print( + f"OK: {args.bundle} — no OKF v0.1 violations found " + "(§9.1, §9.2, index.md §6/§11)", + file=sys.stderr, + ) + return 0 + if args.command == "enrich": source = _build_source(args.source, args) seeds = _collect_seeds(args) diff --git a/okf/tests/test_conformance.py b/okf/tests/test_conformance.py new file mode 100644 index 0000000..40db83a --- /dev/null +++ b/okf/tests/test_conformance.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from reference_agent.bundle.conformance import Violation, check_bundle + +_BUNDLES_DIR = Path(__file__).resolve().parents[1] / "bundles" + + +def _write(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(text, encoding="utf-8") + + +# --- the committed sample bundles must be conformant ---------------------- + + +@pytest.mark.parametrize("name", ["crypto_bitcoin", "ga4", "stackoverflow"]) +def test_committed_bundles_are_conformant(name: str): + bundle = _BUNDLES_DIR / name + if not bundle.is_dir(): + pytest.skip(f"bundle '{name}' not present") + violations = check_bundle(bundle) + assert violations == [], "\n".join(str(v) for v in violations) + + +# --- §9.1: frontmatter block present and parseable ------------------------ + + +def test_missing_frontmatter_block_is_9_1(tmp_path: Path): + _write(tmp_path / "tables" / "x.md", "No frontmatter here.\n") + violations = check_bundle(tmp_path) + assert [(v.path, v.rule) for v in violations] == [("tables/x.md", "§9.1")] + + +def 
test_unterminated_frontmatter_is_9_1(tmp_path: Path): + _write(tmp_path / "t.md", "---\ntype: BigQuery Table\nstill in frontmatter\n") + violations = check_bundle(tmp_path) + assert len(violations) == 1 and violations[0].rule == "§9.1" + + +def test_non_mapping_frontmatter_is_9_1(tmp_path: Path): + _write(tmp_path / "t.md", "---\n- a\n- b\n---\nbody\n") + violations = check_bundle(tmp_path) + assert len(violations) == 1 and violations[0].rule == "§9.1" + + +def test_out_of_range_timestamp_is_9_1_not_a_crash(tmp_path: Path): + # PyYAML's implicit resolver raises a *bare* ValueError on an out-of-range + # date; the checker must report §9.1, not propagate the exception. + _write( + tmp_path / "t.md", + "---\ntype: BigQuery Table\ntimestamp: 2026-13-45\n---\nbody\n", + ) + violations = check_bundle(tmp_path) + assert len(violations) == 1 and violations[0].rule == "§9.1" + + +# --- §9.2: non-empty type ------------------------------------------------- + + +def test_missing_type_is_9_2(tmp_path: Path): + _write(tmp_path / "t.md", "---\ntitle: No type here\n---\nbody\n") + violations = check_bundle(tmp_path) + assert len(violations) == 1 and violations[0].rule == "§9.2" + + +def test_empty_type_is_9_2(tmp_path: Path): + _write(tmp_path / "t.md", "---\ntype: ''\ntitle: Empty type\n---\nbody\n") + violations = check_bundle(tmp_path) + assert len(violations) == 1 and violations[0].rule == "§9.2" + + +def test_non_string_type_is_9_2(tmp_path: Path): + # SPEC §4.1: `type` is "a short string". A non-string value (here an int) + # does not satisfy §9.2 even though it is technically non-empty. 
+ _write(tmp_path / "t.md", "---\ntype: 123\ntitle: T\n---\nbody\n") + violations = check_bundle(tmp_path) + assert len(violations) == 1 and violations[0].rule == "§9.2" + + +def test_valid_concept_passes(tmp_path: Path): + _write(tmp_path / "t.md", "---\ntype: BigQuery Table\ntitle: T\n---\nbody\n") + assert check_bundle(tmp_path) == [] + + +# --- §6 / §11: index.md frontmatter -------------------------------------- + + +def test_subdir_index_with_frontmatter_is_6(tmp_path: Path): + _write(tmp_path / "tables" / "index.md", "---\ntype: Index\n---\n# Tables\n") + violations = check_bundle(tmp_path) + assert len(violations) == 1 and violations[0].rule == "§6" + + +def test_index_without_frontmatter_passes(tmp_path: Path): + _write(tmp_path / "index.md", "# Bundle\n- tables/x\n") + _write(tmp_path / "tables" / "index.md", "# Tables\n- x\n") + assert check_bundle(tmp_path) == [] + + +def test_root_index_okf_version_only_passes(tmp_path: Path): + _write(tmp_path / "index.md", "---\nokf_version: '0.1'\n---\n# Bundle\n") + assert check_bundle(tmp_path) == [] + + +def test_root_index_extra_key_is_11(tmp_path: Path): + _write( + tmp_path / "index.md", + "---\nokf_version: '0.1'\ntype: Bundle\n---\n# Bundle\n", + ) + violations = check_bundle(tmp_path) + assert len(violations) == 1 and violations[0].rule == "§11" + + +# --- reserved log.md is accepted as-is ------------------------------------ + + +def test_log_md_is_not_checked(tmp_path: Path): + # log.md is reserved (§7). Its date-heading structure is intentionally not + # validated (that needs a full markdown parser), so even a non-ISO heading + # is accepted as-is and never reported as a concept (§9.1/§9.2) violation. 
+ _write(tmp_path / "log.md", "# Log\n\n## not-a-date\nan entry\n") + assert check_bundle(tmp_path) == [] + + +# --- --strict producer-level keys (§4.1) ---------------------------------- + + +def test_strict_flags_missing_recommended_keys(tmp_path: Path): + _write(tmp_path / "t.md", "---\ntype: BigQuery Table\n---\nbody\n") + # §9 alone passes (type is present)... + assert check_bundle(tmp_path) == [] + # ...but strict mode flags the missing recommended keys. + violations = check_bundle(tmp_path, strict=True) + assert len(violations) == 1 and violations[0].rule == "§4.1" + assert "title" in violations[0].message + assert "description" in violations[0].message + assert "timestamp" in violations[0].message + + +# --- misc ----------------------------------------------------------------- + + +def test_missing_bundle_raises(tmp_path: Path): + with pytest.raises(FileNotFoundError): + check_bundle(tmp_path / "does-not-exist") + + +def test_cli_missing_bundle_exits_2_without_traceback(tmp_path: Path, capsys): + # A bad --bundle path must produce a clean one-line error (exit 2), not a + # raw Python traceback. + from reference_agent.cli import main + + rc = main(["validate", "--bundle", str(tmp_path / "nope")]) + err = capsys.readouterr().err + assert rc == 2 + assert err.startswith("Error:") + assert "Traceback" not in err + + +def test_directory_named_md_is_skipped(tmp_path: Path): + # rglob("*.md") also matches a directory whose name ends in '.md'; it must + # not be read as a file and reported as a spurious violation. + (tmp_path / "weird.md").mkdir() + _write(tmp_path / "weird.md" / "child.md", "---\ntype: Table\n---\nbody\n") + assert check_bundle(tmp_path) == [] + + +def test_violation_str(): + v = Violation("tables/x.md", "§9.2", "missing or empty 'type' field") + assert str(v) == "tables/x.md: [§9.2] missing or empty 'type' field"