import pytruth as pt

| Symbol | Type | Notes |
|---|---|---|
| `pt.Judge` | abstract base | Static constructors: from_openai, from_anthropic, from_hf, from_pretrained, from_mock |
| `pt.ScoreResult` | dataclass | score, score_dist, confidence, raw_output, model, latency_ms, ... |
| `pt.EnsembleJudge` | class | EnsembleJudge(judges, strategy="trimmed_mean") |
| `pt.Orbit` | dataclass | Orbit(text, transforms).materialize() |
| `pt.Transform` | dataclass | metadata + transform function |
| `pt.VerificationTier` | Enum | DETERMINISTIC, EMBEDDING, NLI, DOMAIN |
| `pt.Probe` | abstract base | materialize(n), analyze(cases, scores), run(judge, n=...) |
| `pt.Oracle` | abstract base | equivalent(a, b), quality(prompt, response, reference=...) |
| `pt.BiasCard` | dataclass | to_dict, to_json, to_markdown, save |
| `pt.probe(judge, ...)` | async fn | runs orbits + probes + L6 → ProbeReport |
| `pt.compare(judges, ...)` | async fn | side-by-side ComparisonReport |
| `pt.serve(judge, ...)` | fn | FastAPI server |
| `pt.JudgeTrainer(...)` | class | LoRA trainer for the debiased judge |
| `pt.OrbitDataset(...)` | class | preference pairs expanded through orbits |

class Judge:
model: str
n_samples: int
async def score(prompt, response, rubric=None, *, n_samples=None) -> ScoreResult
async def score_many(items, *, max_concurrent=5, rubric=None) -> list[ScoreResult]

pt.Judge.from_openai("gpt-4o", n_samples=20, requests_per_minute=60)
pt.Judge.from_anthropic("claude-sonnet-4-7", n_samples=20)
pt.Judge.from_hf(base="meta-llama/Llama-3.1-8B-Instruct", adapter="path/to/lora", mode="auto")
pt.Judge.from_pretrained("pytruth/judge-code-v1")
pt.Judge.from_mock(length_bias=1.0, markdown_bias=0.5, sycophancy_bias=0.4)

| Field | Type | Notes |
|---|---|---|
| `score` | float | Expected value of the score distribution. 1..10. |
| `score_dist` | np.ndarray | 10-bin probability distribution. |
| `confidence` | float | 1 - normalized entropy. 0..1. |
| `raw_output` | str | Last raw model output. |
| `latency_ms` | float | Wall-clock ms (sum across samples). |
| `prompt_tokens`, `completion_tokens` | int | Usage. |
| `model` | str | Model identifier. |
| `n_samples` | int | How many calls produced this distribution. |
| `metadata` | dict | Per-sample scores, raw outputs, etc. |

async def probe(
judge: Judge,
*,
domain: str = "general",
suite: str = "standard", # "quick" | "standard" | "full"
texts: list[str] | None = None,
transforms: list[str] | None = None,
n_per_probe: int = 3,
include_consistency: bool = True,
max_concurrent: int = 5,
rubric: str | None = None,
) -> ProbeReport

| Suite | Probes |
|---|---|
| `quick` | position, sycophancy |
| `standard` | + authority, reasoning_theater |
| `full` | + confidence, identity |

class ProbeReport:
judge: str
domain: str
suite: str
bias_card: BiasCard
raw_perturbation_records: list[dict]
probe_findings: list[ProbeFinding]
def to_markdown() -> str
def to_dict() -> dict
def save(path) -> None

async def compare(judges: list[Judge], *, domain="general", suite="standard", **kwargs) -> ComparisonReport

ComparisonReport.to_dataframe() returns a pandas DataFrame with one row per
judge, with columns: judge, domain, bias_score, invariance_pass_rate, n_significant_transforms, n_high_severity_probes.

Runs everything — surface + cognitive + L2 + L3 + L4 + L6.
async def run_full_pipeline(
judge: Judge,
*,
domain: str = "general",
suite: str = "full",
texts: list[str] | None = None,
transforms: list[str] | None = None,
epistemic_pairs: list[dict] | None = None,
adversarial_candidates: list[dict] | None = None,
halo_items: list[tuple[str, str]] | None = None,
n_per_probe: int = 3,
max_concurrent: int = 5,
) -> FullPipelineResult

from pytruth.orbits.base import registry, Orbit
# All registered transforms
registry.names() # → ['contractions_expand', 'add_blank_lines', ...]
registry.by_domain("code") # → list[Transform]
registry.apply("add_blank_lines", text) # → list[TransformResult]
# Iterate the orbit of a text
orbit = Orbit(text=t, transforms=registry.by_domain(None))
for name, result in orbit:
print(name, result.changed, result.perturbed[:50])

from pytruth.probes import registry as probe_registry, Probe
probe_registry.names() # → ['position', 'sycophancy', 'authority', ...]
# Run one probe directly
ProbeCls = probe_registry.get("sycophancy")
finding = await ProbeCls().run(judge, n=10)
print(finding.statistic, finding.severity, finding.summary)

from pytruth.depth import consistency, epistemic, causal, adversarial
# L2
ep = await epistemic.measure_novel_vs_canonical(judge, pairs)
ep.novel_vs_canonical_gap, ep.summary
# L3
halo = await causal.halo_audit(judge, items)
halo.cross_dimension_correlation, halo.per_dimension_means
# L3 — adversarial token search
findings = await causal.adversarial_token_search(judge, prompt, response, n_iterations=30)
# L4 — exploit family mining (offline)
report = await adversarial.mine_exploits_offline(judge, candidates, quality_oracle=oracle)
report.exploit_families, report.max_score_at_low_quality
# L6
cons = await consistency.battery(judge, items, cycle_responses=responses)
cons.test_retest_sigma, cons.cycle_rate, cons.justification_alignment, cons.calibration_ece

Requires an open-weights judge (HFJudge).
from pytruth.interp import linear_probe, probe_stack, deflection_check, entanglement_matrix
# L5 — single-layer probe
result = linear_probe(activations, labels, feature="length")
result.accuracy, result.delta
# L5 — multi-layer probe stack
stack = probe_stack(judge, texts, labels, feature="length")
peak_layer, peak_acc = stack.peak_accuracy()
# L5 — deflection check (compare base vs debiased)
report = deflection_check(base_judge, debiased_judge, texts, labels, feature="length")
report.deflection_score, report.summary
# L5 — entanglement
matrix = entanglement_matrix({"quality": w_q, "length": w_l, "format": w_f})
matrix["mean_off_diagonal"]

from pytruth.train import OrbitDataset, JudgeTrainer
from pytruth.train.losses import (
bradley_terry_loss, invariance_loss, kl_distribution_loss,
probe_equality_loss, consistency_loss,
)
dataset = OrbitDataset.from_pairs(pairs, domain="code", oracle=code_oracle)
trainer = JudgeTrainer(
base="meta-llama/Llama-3.1-8B-Instruct",
domain="code",
use_lora=True,
lora_rank=32,
invariance_weight=0.3,
kl_weight=0.1,
probe_weight=0.5,
consistency_weight=0.2,
epochs=3,
batch_size=4,
)
trainer.fit(dataset, on_step=lambda losses, step, epoch: print(losses))
trainer.push_to_hub("you/judge-code-v1")

pt.serve(judge, host="127.0.0.1", port=8000, additional_judges={"alt": j2})

Or via uvicorn:
uvicorn pytruth.serve.app:app --host 0.0.0.0 --port 8000
# (mounts no judges by default; set them via the API or by importing
# pytruth.serve.app and assigning _GLOBAL_JUDGE before launch.)

Programmatic client:
from pytruth.serve.client import Client
client = Client("http://localhost:8000")
result = await client.score("What is 2+2?", "The answer is 4.")

pytruth probe <judge> [--domain DOMAIN] [--suite SUITE] [--output PATH] [--fmt md|json]
pytruth compare <j1> <j2> ... [--domain DOMAIN] [--suite SUITE] [--output PATH]
pytruth serve <judge> [--host HOST] [--port PORT]
pytruth train [--base ID] [--domain DOMAIN] [--data PATH] [--output DIR] [--epochs N]
pytruth certify <base> <debiased> [--feature NAME]
Judge specs:
| Spec | Meaning |
|---|---|
| `mock` | MockJudge (offline, no API key) |
| `openai:<model>` | OpenAI judge, e.g. `openai:gpt-4o` |
| `anthropic:<model>` | Anthropic judge |
| `hf:<base>` | HuggingFace base model |
| `pretrained:<id>` | Released pytruth judge |