nullsec-s1/scripts/validate_claims.py at main · trynullsec/nullsec-s1 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
"""Validate which public claims the repo is currently allowed to make.

Claims are permitted only when the artifacts that make them true exist on disk
(a trained adapter, a real-model benchmark report, passing safety probes, a
release bundle). Run with no arguments for a status table:

    python scripts/validate_claims.py
    python scripts/validate_claims.py --adapter outputs/nullsec-s1-qlora

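With --check, scan the public docs (README.md, RELEASE_SUMMARY.md, and
releases/nullsec-1.0/RELEASE_SUMMARY.md) for phrases that assert a claim the
artifacts do not yet support, or that state an unverifiable superlative, and
exit non-zero if any are found. This is what lets CI enforce honesty
automatically.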
"""
from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from scripts._artifacts import ROOT, evaluate_claims, gather_state

# Phrases that, if present in public docs, assert a gated claim.
# Each maps to the claim name in evaluate_claims that must be allowed.
# Patterns are matched against lowercased text and are assertion-shaped on
# purpose: instructional text like "cut a release candidate" must not trip them,
# only a statement that Nullsec-1 *is* one.
CLAIM_PHRASES = {
    "trained model": [
        r"\bis a trained specialized security llm\b",
        r"\bnullsec-1\.0 is a trained\b",
        r"\bis now a trained model\b",
    ],
    "benchmarked": [
        r"\bhas been benchmarked\b",
        r"\bbenchmarks?\s+(show|shows|demonstrate|demonstrates|prove|proves|confirm|confirms)\b",
        r"\bachieves?\s+an?\s+f1\b",
    ],
    "evaluated with real model outputs": [
        r"\bevaluated with (the )?real model\b",
        r"\breal-model evaluation (shows|confirms|gives)\b",
    ],
    "release candidate": [
        r"\bis (a|the|now a) release candidate\b",
        r"\bnullsec-1\.0 release candidate is ready\b",
    ],
    "production-ready": [
        r"\bproduction[- ]ready model\b",
        r"\bmodel is production[- ]ready\b",
        r"\bready for production use\b",
    ],
}
# Superlatives are always flagged as unverifiable when stated as fact.
SUPERLATIVE_PHRASES = [
    r"\bthe first llm\b", r"\bfirst llm purpose-built\b",
    r"\bthe only llm\b", r"\bworld'?s first\b", r"\bbest[- ]in[- ]class\b",
]

DOCS_TO_SCAN = ["README.md", "RELEASE_SUMMARY.md", "releases/nullsec-1.0/RELEASE_SUMMARY.md"]


def status_table(claims) -> str:
    width = max(len(c.name) for c in claims)
    lines = ["", "Nullsec-1 — permitted public claims", "=" * 60]
    for c in claims:
        mark = "ALLOWED " if c.allowed else "FORBIDDEN"
        lines.append(f"[{mark}] {c.name.ljust(width)}  — {c.reason}")
    lines.append("")
    return "\n".join(lines)


def scan_docs(claims) -> list[str]:
    allowed = {c.name: c.allowed for c in claims}
    violations: list[str] = []
    for rel in DOCS_TO_SCAN:
        path = ROOT / rel
        if not path.exists():
            continue
        text = path.read_text(encoding="utf-8")
        low = text.lower()
        for claim_name, patterns in CLAIM_PHRASES.items():
            if allowed.get(claim_name):
                continue  # claim is substantiated; phrasing is fine
            for pat in patterns:
                if re.search(pat, low):
                    violations.append(
                        f"{rel}: asserts '{claim_name}' (matched /{pat}/) but artifacts do not support it"
                    )
        for pat in SUPERLATIVE_PHRASES:
            if re.search(pat, low):
                violations.append(
                    f"{rel}: superlative (/{pat}/) is unverifiable from artifacts — support it independently or remove it"
                )
    return violations


def main():
    ap = argparse.ArgumentParser(description="Validate public claims against real artifacts")
    ap.add_argument("--adapter", default=None, help="adapter path to check for a trained model")
    ap.add_argument("--report", default=None, help="benchmark report path (defaults to release/standard locations)")
    ap.add_argument("--check", action="store_true", help="scan docs and fail on unsubstantiated claims")
    args = ap.parse_args()

    st = gather_state(adapter=args.adapter, report=args.report)
    claims = evaluate_claims(st)
    print(status_table(claims))

    if args.check:
        violations = scan_docs(claims)
        if violations:
            print("HONESTY CHECK FAILED:\n  - " + "\n  - ".join(violations))
            sys.exit(1)
        print("HONESTY CHECK PASSED: public docs make no unsubstantiated claims.")
    sys.exit(0)


if __name__ == "__main__":
    main()