Skip to content

Latest commit

 

History

History
277 lines (218 loc) · 8.31 KB

File metadata and controls

277 lines (218 loc) · 8.31 KB

API reference

Top-level

import pytruth as pt
| Symbol | Type | Notes |
| --- | --- | --- |
| `pt.Judge` | abstract base | Static constructors: `from_openai`, `from_anthropic`, `from_hf`, `from_pretrained`, `from_mock` |
| `pt.ScoreResult` | dataclass | `score`, `score_dist`, `confidence`, `raw_output`, `model`, `latency_ms`, ... |
| `pt.EnsembleJudge` | class | `EnsembleJudge(judges, strategy="trimmed_mean")` |
| `pt.Orbit` | dataclass | `Orbit(text, transforms).materialize()` |
| `pt.Transform` | dataclass | metadata + transform function |
| `pt.VerificationTier` | Enum | `DETERMINISTIC`, `EMBEDDING`, `NLI`, `DOMAIN` |
| `pt.Probe` | abstract base | `materialize(n)`, `analyze(cases, scores)`, `run(judge, n=...)` |
| `pt.Oracle` | abstract base | `equivalent(a, b)`, `quality(prompt, response, reference=...)` |
| `pt.BiasCard` | dataclass | `to_dict`, `to_json`, `to_markdown`, `save` |
| `pt.probe(judge, ...)` | async fn | runs orbits + probes + L6 (consistency battery) → `ProbeReport` |
| `pt.compare(judges, ...)` | async fn | side-by-side `ComparisonReport` |
| `pt.serve(judge, ...)` | fn | FastAPI server |
| `pt.JudgeTrainer(...)` | class | LoRA trainer for the debiased judge |
| `pt.OrbitDataset(...)` | class | preference pairs expanded through orbits |

pt.Judge

class Judge:
    model: str
    n_samples: int

    async def score(prompt, response, rubric=None, *, n_samples=None) -> ScoreResult
    async def score_many(items, *, max_concurrent=5, rubric=None) -> list[ScoreResult]

Constructors

pt.Judge.from_openai("gpt-4o", n_samples=20, requests_per_minute=60)
pt.Judge.from_anthropic("claude-sonnet-4-7", n_samples=20)
pt.Judge.from_hf(base="meta-llama/Llama-3.1-8B-Instruct", adapter="path/to/lora", mode="auto")
pt.Judge.from_pretrained("pytruth/judge-code-v1")
pt.Judge.from_mock(length_bias=1.0, markdown_bias=0.5, sycophancy_bias=0.4)

ScoreResult

| Field | Type | Notes |
| --- | --- | --- |
| `score` | float | Expected value of the score distribution. 1..10. |
| `score_dist` | np.ndarray | 10-bin probability distribution. |
| `confidence` | float | 1 - normalized entropy. 0..1. |
| `raw_output` | str | Last raw model output. |
| `latency_ms` | float | Wall-clock ms (sum across samples). |
| `prompt_tokens`, `completion_tokens` | int | Token usage. |
| `model` | str | Model identifier. |
| `n_samples` | int | How many calls produced this distribution. |
| `metadata` | dict | Per-sample scores, raw outputs, etc. |

pt.api.probe

async def probe(
    judge: Judge,
    *,
    domain: str = "general",
    suite: str = "standard",     # "quick" | "standard" | "full"
    texts: list[str] | None = None,
    transforms: list[str] | None = None,
    n_per_probe: int = 3,
    include_consistency: bool = True,
    max_concurrent: int = 5,
    rubric: str | None = None,
) -> ProbeReport

Suites

| Suite | Probes |
| --- | --- |
| `quick` | position, sycophancy |
| `standard` | everything in `quick` + authority, reasoning_theater |
| `full` | everything in `standard` + confidence, identity |

ProbeReport

class ProbeReport:
    judge: str
    domain: str
    suite: str
    bias_card: BiasCard
    raw_perturbation_records: list[dict]
    probe_findings: list[ProbeFinding]

    def to_markdown() -> str
    def to_dict() -> dict
    def save(path) -> None

pt.api.compare

async def compare(judges: list[Judge], *, domain="general", suite="standard", **kwargs) -> ComparisonReport

ComparisonReport.to_dataframe() returns a pandas DataFrame with one row per judge and the following columns: judge, domain, bias_score, invariance_pass_rate, n_significant_transforms, n_high_severity_probes.

pt.api.run_full_pipeline

Runs everything — surface + cognitive + L2 (epistemic) + L3 (causal) + L4 (adversarial) + L6 (consistency).

async def run_full_pipeline(
    judge: Judge,
    *,
    domain: str = "general",
    suite: str = "full",
    texts: list[str] | None = None,
    transforms: list[str] | None = None,
    epistemic_pairs: list[dict] | None = None,
    adversarial_candidates: list[dict] | None = None,
    halo_items: list[tuple[str, str]] | None = None,
    n_per_probe: int = 3,
    max_concurrent: int = 5,
) -> FullPipelineResult

Orbits — pt.orbits

from pytruth.orbits.base import registry, Orbit

# All registered transforms
registry.names()                       # → ['contractions_expand', 'add_blank_lines', ...]
registry.by_domain("code")             # → list[Transform]
registry.apply("add_blank_lines", text) # → list[TransformResult]

# Iterate the orbit of a text
orbit = Orbit(text=t, transforms=registry.by_domain(None))
for name, result in orbit:
    print(name, result.changed, result.perturbed[:50])

Probes — pt.probes

from pytruth.probes import registry as probe_registry, Probe

probe_registry.names()  # → ['position', 'sycophancy', 'authority', ...]

# Run one probe directly
ProbeCls = probe_registry.get("sycophancy")
finding = await ProbeCls().run(judge, n=10)
print(finding.statistic, finding.severity, finding.summary)

Depth stack — pt.depth

from pytruth.depth import consistency, epistemic, causal, adversarial

# L2 — epistemic (novel vs. canonical gap)
ep = await epistemic.measure_novel_vs_canonical(judge, pairs)
ep.novel_vs_canonical_gap, ep.summary

# L3 — halo audit
halo = await causal.halo_audit(judge, items)
halo.cross_dimension_correlation, halo.per_dimension_means

# L3 — adversarial token search
findings = await causal.adversarial_token_search(judge, prompt, response, n_iterations=30)

# L4 — exploit family mining (offline)
report = await adversarial.mine_exploits_offline(judge, candidates, quality_oracle=oracle)
report.exploit_families, report.max_score_at_low_quality

# L6 — consistency battery
cons = await consistency.battery(judge, items, cycle_responses=responses)
cons.test_retest_sigma, cons.cycle_rate, cons.justification_alignment, cons.calibration_ece

Mechanistic interp — pt.interp

Requires an open-weights judge (HFJudge).

from pytruth.interp import linear_probe, probe_stack, deflection_check, entanglement_matrix

# L5 — single-layer probe
result = linear_probe(activations, labels, feature="length")
result.accuracy, result.delta

# L5 — multi-layer probe stack
stack = probe_stack(judge, texts, labels, feature="length")
peak_layer, peak_acc = stack.peak_accuracy()

# L5 — deflection check (compare base vs debiased)
report = deflection_check(base_judge, debiased_judge, texts, labels, feature="length")
report.deflection_score, report.summary

# L5 — entanglement
matrix = entanglement_matrix({"quality": w_q, "length": w_l, "format": w_f})
matrix["mean_off_diagonal"]

Training — pt.train

from pytruth.train import OrbitDataset, JudgeTrainer
from pytruth.train.losses import (
    bradley_terry_loss, invariance_loss, kl_distribution_loss,
    probe_equality_loss, consistency_loss,
)

dataset = OrbitDataset.from_pairs(pairs, domain="code", oracle=code_oracle)

trainer = JudgeTrainer(
    base="meta-llama/Llama-3.1-8B-Instruct",
    domain="code",
    use_lora=True,
    lora_rank=32,
    invariance_weight=0.3,
    kl_weight=0.1,
    probe_weight=0.5,
    consistency_weight=0.2,
    epochs=3,
    batch_size=4,
)
trainer.fit(dataset, on_step=lambda losses, step, epoch: print(losses))
trainer.push_to_hub("you/judge-code-v1")

HTTP — pt.serve

pt.serve(judge, host="127.0.0.1", port=8000, additional_judges={"alt": j2})

Or via uvicorn:

uvicorn pytruth.serve.app:app --host 0.0.0.0 --port 8000
# (mounts no judges by default; set them via the API or by importing
#  pytruth.serve.app and assigning _GLOBAL_JUDGE before launch.)

Programmatic client:

from pytruth.serve.client import Client
client = Client("http://localhost:8000")
result = await client.score("What is 2+2?", "The answer is 4.")

CLI — pytruth

pytruth probe <judge>         [--domain DOMAIN] [--suite SUITE] [--output PATH] [--fmt md|json]
pytruth compare <j1> <j2> ... [--domain DOMAIN] [--suite SUITE] [--output PATH]
pytruth serve <judge>         [--host HOST] [--port PORT]
pytruth train                 [--base ID] [--domain DOMAIN] [--data PATH] [--output DIR] [--epochs N]
pytruth certify <base> <debiased> [--feature NAME]

Judge specs:

| Spec | Meaning |
| --- | --- |
| `mock` | MockJudge (offline, no API key) |
| `openai:<model>` | OpenAI judge, e.g. `openai:gpt-4o` |
| `anthropic:<model>` | Anthropic judge |
| `hf:<base>` | HuggingFace base model |
| `pretrained:<id>` | Released pytruth judge |