From fa417387076e6dba901a032c5018f5cd2f746924 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Thu, 30 Apr 2026 06:01:09 -0700
Subject: [PATCH 01/26] feat: conventionality evaluator

---
 scripts/generate_settings.py                  | 535 ++++++++++++++++++
 sdks/python/Makefile                          |  34 +-
 .../learning_commons_evaluators/__init__.py   |  12 +
 .../evaluators/__init__.py                    |   6 +
 .../evaluators/conventionality.py             | 119 ++++
 .../schemas/__init__.py                       |   6 +
 .../schemas/conventionality.py                |  32 ++
 .../_generated_conventionality_settings.py    | 104 ++++
 sdks/python/tests/conftest.py                 |  13 +-
 sdks/python/tests/evaluators/test_base.py     | 368 ++++++++++++
 .../tests/evaluators/test_conventionality.py  | 143 +++++
 .../tests/schemas/test_evaluator_schemas.py   | 256 +++++++++
 .../tests/settings/test_load_settings.py      | 332 +++++++++++
 sdks/python/tests/test_package_imports.py     |   2 +
 sdks/settings/conventionality/settings.toml   |  78 +++
 15 files changed, 2024 insertions(+), 16 deletions(-)
 create mode 100644 scripts/generate_settings.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/conventionality.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py
 create mode 100644 sdks/python/tests/evaluators/test_base.py
 create mode 100644 sdks/python/tests/evaluators/test_conventionality.py
 create mode 100644 sdks/python/tests/schemas/test_evaluator_schemas.py
 create mode 100644 sdks/python/tests/settings/test_load_settings.py
 create mode 100644 sdks/settings/conventionality/settings.toml
diff --git a/scripts/generate_settings.py b/scripts/generate_settings.py
new file mode 100644
index 0000000..1a5aa5a
--- /dev/null
+++ b/scripts/generate_settings.py
@@ -0,0 +1,535 @@
+#!/usr/bin/env python3
+"""Generate pre-built settings modules from evaluator TOML files.
+
+Each evaluator's settings TOML is parsed once at *build time* and serialised
+as a plain Python module.  The evaluator modules then import that generated
+module at import time — no file I/O, no TOML parsing on every process start.
+
+Each evaluator directory under sdks/settings/ contains two files::
+
+    sdks/settings/<evaluator>/settings.toml   — prompts, model, temperature
+    sdks/settings/<evaluator>/contracts.toml  — captured LLM interactions for contract tests
+
+Usage::
+
+    # Regenerate all evaluators:
+    python scripts/generate_settings.py
+
+    # Check whether generated files are stale (exits 1 if any differ):
+    python scripts/generate_settings.py --check
+
+    # Copy contracts.toml from sdks/settings/ → bundled package:
+    python scripts/generate_settings.py --sync
+
+    # Verify bundled contracts.toml matches canonical sdks/settings/:
+    python scripts/generate_settings.py --check-sync
+
+Typical CI configuration::
+
+    - name: Check settings are up to date
+      run: python scripts/generate_settings.py --check
+    - name: Check contracts are in sync
+      run: python scripts/generate_settings.py --check-sync
+
+When to regenerate::
+
+    Any time you edit settings.toml under sdks/settings/<evaluator>/,
+    run this script and commit the updated generated file alongside the TOML.
+"""
+
+from __future__ import annotations
+
+import argparse
+import difflib
+import os
+import sys
+import types
+from dataclasses import MISSING, fields, is_dataclass
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+# ---------------------------------------------------------------------------
+# Path setup — resolve repo root and add SDK src to sys.path so we can import
+# the SDK without a full install.
+# ---------------------------------------------------------------------------
+
+_SCRIPT_DIR = Path(__file__).resolve().parent
+_REPO_ROOT = _SCRIPT_DIR.parent
+_SDK_SRC = _REPO_ROOT / "sdks" / "python" / "src"
+_SETTINGS_DIR = _REPO_ROOT / "sdks" / "settings"
+_GENERATED_DIR = _SDK_SRC / "learning_commons_evaluators" / "settings"
+
+sys.path.insert(0, str(_SDK_SRC))
+
+# Default load_settings to the canonical settings directory so the generator reads the
+# source-of-truth TOML, not the bundled copy (a pre-set EVALUATORS_SETTINGS_DIR still wins).
+os.environ.setdefault("EVALUATORS_SETTINGS_DIR", str(_SETTINGS_DIR))
+
+# Pre-register the package without running learning_commons_evaluators/__init__.py.
+# That __init__ imports evaluators, which import generated settings — invalid while
+# this script is regenerating those files. Submodules (schemas, settings, …) load
+# normally via __path__.
+if "learning_commons_evaluators" not in sys.modules:
+    _lce_pkg = types.ModuleType("learning_commons_evaluators")
+    _lce_pkg.__path__ = [str(_SDK_SRC / "learning_commons_evaluators")]
+    sys.modules["learning_commons_evaluators"] = _lce_pkg
+
+# ---------------------------------------------------------------------------
+# SDK imports (after path setup)
+# ---------------------------------------------------------------------------
+
+from pydantic import BaseModel  # noqa: E402
+from pydantic_core import PydanticUndefined  # noqa: E402
+
+from learning_commons_evaluators.schemas.config import (  # noqa: E402
+    EvaluationSettings,
+)
+from learning_commons_evaluators.schemas.conventionality import (  # noqa: E402
+    ConventionalityEvaluationSettings,
+)
+from learning_commons_evaluators.schemas.vocabulary import (  # noqa: E402
+    VocabularyEvaluationSettings,
+)
+from learning_commons_evaluators.settings.load_settings import (  # noqa: E402
+    EvaluatorSettingsResult,
+    load_evaluator_settings,
+)
+
+# ---------------------------------------------------------------------------
+# Import map: class name → (module path, symbol name)
+# Used to build the import block of generated files.
+# ---------------------------------------------------------------------------
+
+_IMPORT_MAP: dict[str, tuple[str, str]] = {
+    "ConventionalityEvaluationSettings": (
+        "learning_commons_evaluators.schemas.conventionality",
+        "ConventionalityEvaluationSettings",
+    ),
+    "EvaluatorMaturity": (
+        "learning_commons_evaluators.schemas.metadata",
+        "EvaluatorMaturity",
+    ),
+    "EvaluatorMetadata": (
+        "learning_commons_evaluators.schemas.metadata",
+        "EvaluatorMetadata",
+    ),
+    "EvaluatorSettingsResult": (
+        "learning_commons_evaluators.settings.load_settings",
+        "EvaluatorSettingsResult",
+    ),
+    "GradeInputSpec": (
+        "learning_commons_evaluators.schemas.input_specs",
+        "GradeInputSpec",
+    ),
+    "LlmProvider": (
+        "learning_commons_evaluators.schemas.config",
+        "LlmProvider",
+    ),
+    "PromptSettings": (
+        "learning_commons_evaluators.schemas.config",
+        "PromptSettings",
+    ),
+    "TextInputSpec": (
+        "learning_commons_evaluators.schemas.input_specs",
+        "TextInputSpec",
+    ),
+    "VocabularyEvaluationSettings": (
+        "learning_commons_evaluators.schemas.vocabulary",
+        "VocabularyEvaluationSettings",
+    ),
+}
+
+# ---------------------------------------------------------------------------
+# Value emitter
+# ---------------------------------------------------------------------------
+
+
+def _emit_string(s: str) -> str:
+    """Emit a string literal; triple-quoted (backslash/quote-safe) when multiline or long."""
+    if "\n" not in s and len(s) <= 88:
+        return repr(s)
+    body = s.replace("\\", "\\\\")  # keep literal backslashes (`\t`, `\\` text) intact
+    body = body[:-1] + '\\"' if body.endswith('"') else body  # final `"` must not join closer
+    return '"""' + body.replace('"""', '""\\"') + '"""'
+
+
+def _emit_value(obj: Any, indent: int = 0) -> str:
+    """Return Python source text for *obj* at *indent*; raise TypeError if unsupported."""
+    pad = "    " * indent
+    inner = "    " * (indent + 1)
+
+    if obj is None:
+        return "None"
+    if isinstance(obj, bool):
+        return "True" if obj else "False"
+    # str- and int-backed enums (e.g. LlmProvider(str, Enum)) must be handled before
+    # str/int or we emit repr() and get invalid syntax like <LlmProvider.GOOGLE: 'google'>.
+    if isinstance(obj, Enum):
+        return f"{type(obj).__name__}.{obj.name}"
+    if isinstance(obj, int):
+        return repr(obj)
+    if isinstance(obj, float):
+        return f"{obj:.1f}" if obj == int(obj) else repr(obj)  # integral floats as e.g. 1.0
+    if isinstance(obj, str):
+        return _emit_string(obj)
+    if isinstance(obj, list):
+        if not obj:
+            return "[]"
+        items = [_emit_value(v, indent + 1) for v in obj]
+        single = f"[{', '.join(items)}]"
+        if len(single) <= 88 - len(pad) and "\n" not in single:  # fits on one line
+            return single
+        body = "\n".join(f"{inner}{item}," for item in items)
+        return f"[\n{body}\n{pad}]"
+    if isinstance(obj, dict):
+        if not obj:
+            return "{}"
+        pairs = [(repr(k), _emit_value(v, indent + 1)) for k, v in obj.items()]
+        single = "{" + ", ".join(f"{k}: {v}" for k, v in pairs) + "}"
+        if len(single) <= 88 - len(pad) and "\n" not in single:  # fits on one line
+            return single
+        body = "\n".join(f"{inner}{k}: {v}," for k, v in pairs)
+        return f"{{\n{body}\n{pad}}}"
+    if is_dataclass(obj) and not isinstance(obj, type):
+        return _emit_dataclass(obj, indent)
+    if isinstance(obj, BaseModel):
+        return _emit_model(obj, indent)
+    raise TypeError(f"Cannot emit {type(obj).__name__}: {obj!r}")
+
+
+def _emit_model(obj: BaseModel, indent: int = 0) -> str:
+    """Render a Pydantic model instance as a constructor call.
+
+    Optional ``type`` discriminator fields and fields equal to their declared
+    default are omitted.  The call is kept on one line when it fits in 88
+    columns at this *indent*; otherwise each keyword argument gets its own line.
+    """
+    model_cls = type(obj)
+    name = model_cls.__name__
+    outer_pad = "    " * indent
+    kwarg_pad = "    " * (indent + 1)
+
+    kwargs: list[tuple[str, str]] = []
+    for field_name, info in model_cls.model_fields.items():
+        # Skip Literal discriminators (e.g. type="TextInputField").
+        if field_name == "type" and not info.is_required():
+            continue
+
+        value = getattr(obj, field_name)
+        # Emit only fields that differ from their default — keeps generated code clean.
+        if info.default is PydanticUndefined or value != info.default:
+            kwargs.append((field_name, _emit_value(value, indent + 1)))
+
+    if not kwargs:
+        return f"{name}()"
+
+    one_line = f"{name}({', '.join(f'{k}={v}' for k, v in kwargs)})"
+    if "\n" in one_line or len(one_line) > 88 - len(outer_pad):
+        rows = "\n".join(f"{kwarg_pad}{k}={v}," for k, v in kwargs)
+        return f"{name}(\n{rows}\n{outer_pad})"
+    return one_line
+
+
+def _emit_dataclass(obj: Any, indent: int = 0) -> str:
+    """Render a stdlib dataclass instance (e.g. PromptSettings) as a constructor call.
+
+    Fields equal to their plain ``default`` are omitted; long calls get one kwarg per line.
+    """
+    name = type(obj).__name__
+    outer_pad = "    " * indent
+    kwarg_pad = "    " * (indent + 1)
+
+    kwargs: list[tuple[str, str]] = []
+    for spec in fields(obj):
+        value = getattr(obj, spec.name)
+        if spec.default is MISSING or value != spec.default:
+            kwargs.append((spec.name, _emit_value(value, indent + 1)))
+
+    if not kwargs:
+        return f"{name}()"
+
+    one_line = f"{name}({', '.join(f'{k}={v}' for k, v in kwargs)})"
+    if "\n" in one_line or len(one_line) > 88 - len(outer_pad):
+        rows = "\n".join(f"{kwarg_pad}{k}={v}," for k, v in kwargs)
+        return f"{name}(\n{rows}\n{outer_pad})"
+    return one_line
+
+
+# ---------------------------------------------------------------------------
+# Import-block builder
+# ---------------------------------------------------------------------------
+
+
+def _collect_class_names(obj: Any, names: set[str]) -> None:
+    """Add to *names* the class name of every enum, dataclass or model reachable from *obj*."""
+    children: Any = ()
+    if isinstance(obj, Enum):
+        names.add(type(obj).__name__)
+    elif is_dataclass(obj) and not isinstance(obj, type):
+        names.add(type(obj).__name__)
+        children = [getattr(obj, spec.name) for spec in fields(obj)]
+    elif isinstance(obj, BaseModel):
+        names.add(type(obj).__name__)
+        children = [getattr(obj, attr) for attr in type(obj).model_fields]
+    elif isinstance(obj, dict):
+        children = obj.values()
+    elif isinstance(obj, list):
+        children = obj
+    # Enums and other scalars leave `children` empty; everything else recurses.
+    for child in children:
+        _collect_class_names(child, names)
+
+
+def _build_import_block(config: EvaluatorSettingsResult, settings_cls_name: str) -> str:
+    """Return the ``from ... import ...`` lines required by the generated module.
+
+    Names without an ``_IMPORT_MAP`` entry are skipped; output is sorted by
+    module, then by symbol.
+    """
+    needed: set[str] = {settings_cls_name, "EvaluatorSettingsResult"}
+    for root in (config.evaluator_metadata, config.evaluation_settings):
+        _collect_class_names(root, needed)
+
+    symbols_by_module: dict[str, list[str]] = {}
+    for cls_name in needed:
+        entry = _IMPORT_MAP.get(cls_name)
+        if entry is not None:
+            module_path, symbol = entry
+            symbols_by_module.setdefault(module_path, []).append(symbol)
+
+    return "\n".join(
+        f"from {module_path} import {', '.join(sorted(symbols))}"
+        for module_path, symbols in sorted(symbols_by_module.items())
+    )
+
+
+# ---------------------------------------------------------------------------
+# File generator
+# ---------------------------------------------------------------------------
+
+
+def generate_module(
+    evaluator_name: str,
+    toml_path: Path,
+    settings_cls: type[EvaluationSettings],
+) -> str:
+    """Parse *toml_path* and return the generated module source (*evaluator_name* is unused)."""
+    config = load_evaluator_settings(toml_path, settings_cls)
+    settings_cls_name = settings_cls.__name__
+
+    imports = _build_import_block(config, settings_cls_name)
+    metadata_code = _emit_model(config.evaluator_metadata)
+    prompts_code = _emit_value(config.prompts)
+    settings_code = _emit_model(config.evaluation_settings)
+    # POSIX separators keep the "# Source:" header (and --check) identical on every platform.
+    rel_toml = toml_path.relative_to(_REPO_ROOT).as_posix()
+
+    # Emit flush-left Python only. Do not wrap this in textwrap.dedent() while
+    # interpolating multi-line fragments (imports, *_code): continuation lines
+    # from those values start at column 0, which would make dedent's common
+    # margin zero and leave the header indented — IndentationError at import.
+    return f"""# !! AUTO-GENERATED — do not edit directly.
+# Source: {rel_toml}
+# Regenerate : python scripts/generate_settings.py
+# Staleness check: python scripts/generate_settings.py --check
+
+from __future__ import annotations
+
+{imports}
+
+# ── Evaluator metadata ────────────────────────────────────────────────────────
+
+_EVALUATOR_METADATA = {metadata_code}
+
+# ── Prompt templates ──────────────────────────────────────────────────────────
+
+_PROMPTS: dict[str, str] = {prompts_code}
+
+# ── Evaluation settings ───────────────────────────────────────────────────────
+
+_EVALUATION_SETTINGS = {settings_code}
+
+# ── Public config object (imported by evaluator modules) ──────────────────────
+
+CONFIG: EvaluatorSettingsResult[{settings_cls_name}] = EvaluatorSettingsResult(
+    evaluator_metadata=_EVALUATOR_METADATA,
+    evaluation_settings=_EVALUATION_SETTINGS,
+    prompts=_PROMPTS,
+)
+"""
+
+
+# ---------------------------------------------------------------------------
+# Evaluator registry
+# ---------------------------------------------------------------------------
+
+_EVALUATORS: list[dict] = [
+    {
+        "name": "conventionality",
+        "settings_cls": ConventionalityEvaluationSettings,
+        "output": _GENERATED_DIR / "_generated_conventionality_settings.py",
+    },
+    {
+        "name": "vocabulary",
+        "settings_cls": VocabularyEvaluationSettings,
+        "output": _GENERATED_DIR / "_generated_vocabulary_settings.py",
+    },
+]
+
+
+def _settings_toml(evaluator_name: str) -> Path:
+    """Return ``sdks/settings/<evaluator>/settings.toml``."""
+    return _SETTINGS_DIR / evaluator_name / "settings.toml"
+
+
+def _contracts_toml(evaluator_name: str) -> Path:
+    """Return the canonical ``sdks/settings/<evaluator>/contracts.toml``."""
+    return _SETTINGS_DIR / evaluator_name / "contracts.toml"
+
+
+# ---------------------------------------------------------------------------
+# Commands
+# ---------------------------------------------------------------------------
+
+
+def cmd_generate() -> None:
+    """Regenerate every registered evaluator's settings module and write it to disk."""
+    for ev in _EVALUATORS:
+        source = generate_module(ev["name"], _settings_toml(ev["name"]), ev["settings_cls"])
+        ev["output"].write_text(source, encoding="utf-8")
+        print(f"  generated  {ev['output'].relative_to(_REPO_ROOT)}")
+    print("Done.")
+
+
+def cmd_check() -> int:
+    """Print a diff for each stale generated module; return 1 if any is stale, else 0."""
+    stale: list[str] = []
+    for ev in _EVALUATORS:
+        rel_out = ev["output"].relative_to(_REPO_ROOT)
+        expected = generate_module(ev["name"], _settings_toml(ev["name"]), ev["settings_cls"])
+        actual = ev["output"].read_text(encoding="utf-8") if ev["output"].exists() else ""
+        if expected == actual:
+            continue
+        diff_lines = difflib.unified_diff(
+            actual.splitlines(keepends=True),
+            expected.splitlines(keepends=True),
+            fromfile=str(rel_out),
+            tofile="(regenerated)",
+            n=3,
+        )
+        print(f"STALE: {rel_out}\n{''.join(diff_lines)}")
+        stale.append(ev["name"])
+
+    if not stale:
+        print("All generated settings are up to date.")
+        return 0
+    print(f"\nStale evaluators: {stale}")
+    print("Run:  python scripts/generate_settings.py")
+    return 1
+
+
+def cmd_sync() -> None:
+    """Mirror each evaluator's canonical ``contracts.toml`` into the bundled package.
+
+    A ``pip install``-ed package has no access to the monorepo ``sdks/settings/``
+    directory, so a copy is shipped for the contract tests.  Evaluators without
+    a canonical file are reported and skipped.
+    """
+    for ev in _EVALUATORS:
+        source = _contracts_toml(ev["name"])
+        target = _GENERATED_DIR / ev["name"] / "contracts.toml"
+        if source.exists():
+            target.parent.mkdir(parents=True, exist_ok=True)
+            target.write_bytes(source.read_bytes())
+            shown = f"{source.relative_to(_REPO_ROOT)} → {target.relative_to(_REPO_ROOT)}"
+            print(f"  copied   {shown}")
+        else:
+            print(f"  WARNING: canonical {source.relative_to(_REPO_ROOT)} not found — skipping")
+    print("Done.")
+
+
+def cmd_check_sync() -> int:
+    """Compare every bundled ``contracts.toml`` with its canonical sdks/settings/ source.
+
+    A problem is recorded when the canonical file is missing, when the bundled
+    copy is missing, or when the two files differ byte-for-byte.
+
+    Returns:
+        1 after printing one line per problem (with a ``--sync`` hint where it
+        applies), or 0 when everything matches.
+    """
+    fix_hint = "(run: python scripts/generate_settings.py --sync)"
+    problems: list[str] = []
+    for ev in _EVALUATORS:
+        name = ev["name"]
+        canonical = _contracts_toml(name)
+        bundled = _GENERATED_DIR / name / "contracts.toml"
+        if not canonical.exists():
+            # Without a canonical file the bundled copy is not inspected at all.
+            problems.append(f"MISSING canonical: {canonical.relative_to(_REPO_ROOT)}")
+        elif not bundled.exists():
+            problems.append(f"MISSING bundled: {bundled.relative_to(_REPO_ROOT)} {fix_hint}")
+        elif canonical.read_bytes() != bundled.read_bytes():
+            problems.append(f"OUT OF SYNC: {name}/contracts.toml {fix_hint}")
+
+    if not problems:
+        print("All contracts TOMLs are in sync.")
+        return 0
+
+    for line in problems:
+        print(line)
+    return 1
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Parse CLI flags and run one command; with no flag, regenerate the settings modules."""
+    parser = argparse.ArgumentParser(
+        description=(
+            "Generate (or check) pre-built settings modules from evaluator TOML files, "
+            "and sync bundled contracts TOMLs."
+        )
+    )
+    mode = parser.add_mutually_exclusive_group()  # reject combos like --check --check-sync
+    mode.add_argument(
+        "--check",
+        action="store_true",
+        help=(
+            "Check generated _generated_*_settings.py files for staleness; "
+            "exit 1 if any differ from what would be generated."
+        ),
+    )
+    mode.add_argument(
+        "--sync",
+        action="store_true",
+        help=(
+            "Copy contracts.toml from sdks/settings/ into the bundled package "
+            "directory so contract tests work after pip install."
+        ),
+    )
+    mode.add_argument(
+        "--check-sync",
+        action="store_true",
+        help=(
+            "Verify bundled contracts.toml files match the canonical sdks/settings/ "
+            "copies; exit 1 if any are missing or differ."
+        ),
+    )
+    args = parser.parse_args()
+    if args.check:
+        sys.exit(cmd_check())
+    elif args.sync:
+        cmd_sync()
+    elif args.check_sync:
+        sys.exit(cmd_check_sync())
+    else:
+        cmd_generate()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sdks/python/Makefile b/sdks/python/Makefile
index bf97dfe..742abd1 100644
--- a/sdks/python/Makefile
+++ b/sdks/python/Makefile
@@ -4,32 +4,43 @@ PYTEST  := $(VENV)/bin/pytest
 RUFF    := $(VENV)/bin/ruff
 MYPY    := $(VENV)/bin/mypy
 
-.PHONY: help test unit-test lint format format-check typecheck pip-check verify coverage
+# Bundled settings directory inside the package.
+SETTINGS_DST := src/learning_commons_evaluators/settings
+
+.PHONY: help test unit-test generate-settings check-generated \
+        lint format format-check typecheck pip-check verify coverage
 
 help:
 	@echo "Usage: make <target>"
 	@echo ""
-	@echo "  lint               Ruff linter (src, tests)"
+	@echo "  lint               Ruff linter (src, tests, settings generator script)"
 	@echo "  format             Apply Ruff formatter"
 	@echo "  format-check       Fail if Ruff would reformat any file"
 	@echo "  typecheck          Mypy on src package + tests"
 	@echo "  pip-check          pip dependency consistency check"
-	@echo "  verify             lint + format-check + typecheck + pip-check + unit-test"
+	@echo "  verify             check-generated + lint + format-check + typecheck + pip-check + unit-test"
 	@echo "  coverage           Unit tests with coverage report (requires dev install)"
 	@echo ""
 	@echo "  test               Run the full test suite"
 	@echo "  unit-test          Run unit tests only"
+	@echo ""
+	@echo "  generate-settings  Re-generate _generated_*_settings.py from sdks/settings/"
+	@echo "  check-generated    Verify generated .py files match sdks/settings/ (exits 1 if stale)"
+	@echo ""
+	@echo "Typical workflows:"
+	@echo "  After editing evaluator settings TOML:  make generate-settings && make test"
+	@echo "  CI:                                     make verify"
 
 # ── Static checks (Ruff + Mypy) ───────────────────────────────────────────────
 
 lint:
-	$(RUFF) check src tests
+	$(RUFF) check src tests ../../scripts/generate_settings.py
 
 format:
-	$(RUFF) format src tests
+	$(RUFF) format src tests ../../scripts/generate_settings.py
 
 format-check:
-	$(RUFF) format --check src tests
+	$(RUFF) format --check src tests ../../scripts/generate_settings.py
 
 typecheck:
 	$(MYPY) src/learning_commons_evaluators tests
@@ -37,7 +48,7 @@ typecheck:
 pip-check:
 	$(PYTHON) -m pip check
 
-verify: lint format-check typecheck pip-check unit-test
+verify: check-generated lint format-check typecheck pip-check unit-test
 
 coverage:
 	$(PYTEST) tests/ -q --ignore=tests/contract_tests --cov=learning_commons_evaluators --cov-report=term-missing
@@ -49,3 +60,12 @@ test:
 
 unit-test:
 	$(PYTEST) tests/ -v --ignore=tests/contract_tests
+
+# ── Code generation ───────────────────────────────────────────────────────────
+# Re-run when an evaluator settings TOML under sdks/settings/ changes.
+
+generate-settings:
+	$(PYTHON) ../../scripts/generate_settings.py
+
+check-generated:
+	$(PYTHON) ../../scripts/generate_settings.py --check
diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py
index 0b96e44..5dab77b 100644
--- a/sdks/python/src/learning_commons_evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/__init__.py
@@ -32,9 +32,13 @@
 # Evaluators
 from learning_commons_evaluators.evaluators import (
     BaseEvaluator,
+    ConventionalityEvaluator,
     InputT,
     OutputT,
 )
+from learning_commons_evaluators.evaluators.conventionality import (
+    ConventionalityEvaluationInput,
+)
 
 # Logger (uses Python standard logging)
 from learning_commons_evaluators.logger import (
@@ -49,6 +53,10 @@
     TextInputField,
 )
 from learning_commons_evaluators.schemas.config import EvaluationSettings, LlmProvider
+from learning_commons_evaluators.schemas.conventionality import (
+    ConventionalityEvaluationSettings,
+    ConventionalityOutput,
+)
 
 # Schemas (core)
 from learning_commons_evaluators.schemas.evaluator import (
@@ -83,6 +91,10 @@
     "AuthenticationError",
     "BaseEvaluator",
     "ConfigurationError",
+    "ConventionalityEvaluationInput",
+    "ConventionalityEvaluationSettings",
+    "ConventionalityEvaluator",
+    "ConventionalityOutput",
     "EvaluationAnswer",
     "EvaluationExplanation",
     "EvaluationInput",
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
index a22ec2b..50821c6 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
@@ -7,9 +7,15 @@
 """
 
 from learning_commons_evaluators.evaluators.base import BaseEvaluator, InputT, OutputT
+from learning_commons_evaluators.evaluators.conventionality import (
+    ConventionalityEvaluationInput,
+    ConventionalityEvaluator,
+)
 
 __all__ = [
     "BaseEvaluator",
+    "ConventionalityEvaluationInput",
+    "ConventionalityEvaluator",
     "InputT",
     "OutputT",
 ]
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
new file mode 100644
index 0000000..c7fbcce
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
@@ -0,0 +1,119 @@
+"""Conventionality evaluator: evaluates text for conventionality of language."""
+
+from __future__ import annotations
+
+from typing import ClassVar
+
+import textstat
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from pydantic import Field
+
+from learning_commons_evaluators.evaluators.base import BaseEvaluator
+from learning_commons_evaluators.schemas.common_inputs import (
+    GradeInputField,
+    TextInputField,
+)
+from learning_commons_evaluators.schemas.conventionality import (
+    ConventionalityEvaluationSettings,
+    ConventionalityOutput,
+)
+from learning_commons_evaluators.schemas.evaluator import (
+    EvaluationExplanation,
+    EvaluationInput,
+)
+from learning_commons_evaluators.schemas.metadata import (
+    EvaluationMetadata,
+    EvaluatorMetadata,
+)
+from learning_commons_evaluators.schemas.text_complexity import (
+    TextComplexityAnswer,
+    TextComplexityResult,
+)
+from learning_commons_evaluators.settings._generated_conventionality_settings import (
+    CONFIG as _CONVENTIONALITY_CONFIG,
+)
+
+_INPUT_SETTINGS = _CONVENTIONALITY_CONFIG.evaluator_metadata.inputs
+
+
+class ConventionalityEvaluationInput(EvaluationInput):
+    """Input for a conventionality evaluation.
+
+    Constraints (min/max text length, allowed grades) are sourced from
+    ``[[evaluator_metadata.inputs]]`` in evaluator settings and applied
+    automatically — callers supply raw values, not field objects.
+
+    Example::
+
+        inp = ConventionalityEvaluationInput(text="The quick brown fox...", grade=5)
+    """
+
+    _input_settings: ClassVar[dict] = _INPUT_SETTINGS
+
+    text: TextInputField = Field(description="The text to evaluate.")
+    grade: GradeInputField = Field(description="The grade level of the text.")
+
+    def __init__(self, *, text: str, grade: int, **kwargs):
+        super().__init__(text=text, grade=grade, **kwargs)
+
+
+class ConventionalityEvaluator(
+    BaseEvaluator[
+        ConventionalityEvaluationInput,
+        TextComplexityResult,
+        ConventionalityEvaluationSettings,
+    ]
+):
+    """Evaluates text for conventionality (idioms, metaphors, implied meaning) relative to grade."""
+
+    metadata: EvaluatorMetadata = _CONVENTIONALITY_CONFIG.evaluator_metadata
+    default_evaluation_settings: ConventionalityEvaluationSettings = (
+        _CONVENTIONALITY_CONFIG.evaluation_settings
+    )
+
+    def evaluate_impl(
+        self,
+        input: ConventionalityEvaluationInput,
+        evaluation_settings: ConventionalityEvaluationSettings,
+        evaluation_metadata: EvaluationMetadata,
+    ) -> TextComplexityResult:
+        """Run conventionality evaluation. Returns TextComplexityResult with answer, explanation, metadata."""
+        ps_main = evaluation_settings.prompt_settings_step_main
+        assert ps_main is not None
+
+        fk_score = round(textstat.flesch_kincaid_grade(input.text.value), 2)
+        prompt_inputs = input.input_values()
+        prompt_inputs["fk_score"] = fk_score
+
+        parser = JsonOutputParser(pydantic_object=ConventionalityOutput)
+        prompts = _CONVENTIONALITY_CONFIG.prompts
+        template = ChatPromptTemplate.from_messages(
+            [
+                ("system", prompts["system_prompt"]),
+                ("human", prompts["human_prompt"]),
+            ]
+        ).partial(format_instructions=parser.get_format_instructions())
+        conventionality_output = self.execute_prompt_chain_step(
+            step_name="main",
+            prompt_settings=ps_main,
+            evaluation_metadata=evaluation_metadata,
+            template=template,
+            chain_inputs=prompt_inputs,
+            parser_output_type=ConventionalityOutput,
+        )
+        assert isinstance(conventionality_output, ConventionalityOutput)
+
+        answer = TextComplexityAnswer.from_score(conventionality_output.complexity_score)
+        return TextComplexityResult(
+            answer=answer,
+            explanation=EvaluationExplanation(
+                summary=conventionality_output.reasoning,
+                details={
+                    "conventionality_features": conventionality_output.conventionality_features,
+                    "grade_context": conventionality_output.grade_context,
+                    "instructional_insights": conventionality_output.instructional_insights,
+                },
+            ),
+            metadata=evaluation_metadata,
+        )
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/__init__.py b/sdks/python/src/learning_commons_evaluators/schemas/__init__.py
index 5b003b4..5070c69 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/__init__.py
@@ -9,6 +9,10 @@
     LlmProvider,
     PromptSettings,
 )
+from learning_commons_evaluators.schemas.conventionality import (
+    ConventionalityEvaluationSettings,
+    ConventionalityOutput,
+)
 from learning_commons_evaluators.schemas.errors import ValidationError
 from learning_commons_evaluators.schemas.evaluator import (
     EvaluationAnswer,
@@ -41,6 +45,8 @@
 
 __all__ = [
     "AnyInputSpec",
+    "ConventionalityEvaluationSettings",
+    "ConventionalityOutput",
     "GradeInputSpec",
     "InputSpec",
     "TextInputSpec",
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/conventionality.py b/sdks/python/src/learning_commons_evaluators/schemas/conventionality.py
new file mode 100644
index 0000000..9d5595d
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/schemas/conventionality.py
@@ -0,0 +1,32 @@
+"""Conventionality schemas."""
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+from learning_commons_evaluators.schemas.config import (
+    EvaluationSettings,
+    PromptSettings,
+)
+
+
+class ConventionalityEvaluationSettings(EvaluationSettings):
+    """Settings for a conventionality evaluation."""
+
+    prompt_settings_step_main: PromptSettings | None = None
+
+
+class ConventionalityOutput(BaseModel):
+    conventionality_features: list[str] = Field(
+        description="List of the specific language features driving the complexity (e.g., idioms, metaphors, implied meaning) with direct quotes from the text."
+    )
+    grade_context: str = Field(
+        description="How the conventionality demands compare to general expectations for the provided target grade."
+    )
+    instructional_insights: str = Field(
+        description="Actionable pedagogical suggestions for scaffolding the unconventional language features in the classroom."
+    )
+    complexity_score: Literal[
+        "slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex"
+    ] = Field(description="The conventionality complexity level of the text")
+    reasoning: str = Field(description="A synthesis of why the text fits the chosen rubric level.")
diff --git a/sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py b/sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py
new file mode 100644
index 0000000..e52b7d1
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py
@@ -0,0 +1,104 @@
+# !! AUTO-GENERATED — do not edit directly.
+# Source: sdks/settings/conventionality/settings.toml
+# Regenerate : python scripts/generate_settings.py
+# Staleness check: python scripts/generate_settings.py --check
+
+from __future__ import annotations
+
+from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
+from learning_commons_evaluators.schemas.conventionality import ConventionalityEvaluationSettings
+from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec
+from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity, EvaluatorMetadata
+from learning_commons_evaluators.settings.load_settings import EvaluatorSettingsResult
+
+# ── Evaluator metadata ────────────────────────────────────────────────────────
+
+_EVALUATOR_METADATA = EvaluatorMetadata(
+    id='conventionality',
+    version='0.1',
+    name='Conventionality',
+    description='Conventionality Evaluator',
+    maturity=EvaluatorMaturity.beta,
+    inputs={
+        'text': TextInputSpec(name='text', min_text_length=10, max_text_length=10000),
+        'grade': GradeInputSpec(
+            name='grade',
+            allowed_grades=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+        ),
+    },
+)
+
+# ── Prompt templates ──────────────────────────────────────────────────────────
+
+_PROMPTS: dict[str, str] = {
+    'system_prompt': """Role
+You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level.
+
+Objective
+Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade.
+
+Complexity Levels
+- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level.
+- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging.
+- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear.
+- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices.
+
+Essential Evaluation Rules
+1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex."
+
+2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group.
+
+3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two.
+
+4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex."
+
+5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing.
+
+6. Sustained vs. Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge.
+
+7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex.
+
+8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students.
+
+9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary.
+
+Input Format
+You will receive:
+- text: The passage to evaluate.
+- grade: The target student grade level.
+- fk_score: The Flesch-Kincaid readability score.
+
+Output Format
+Provide a JSON object containing ONLY the following keys:
+- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'.
+- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural).
+- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text.
+- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade.
+- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom.
+
+{format_instructions}
+""",
+    'human_prompt': """Analyze:
+Text: {text}
+Grade: {grade}
+FK Score: {fk_score}
+""",
+}
+
+# ── Evaluation settings ───────────────────────────────────────────────────────
+
+_EVALUATION_SETTINGS = ConventionalityEvaluationSettings(
+    prompt_settings_step_main=PromptSettings(
+        provider_type=LlmProvider.GOOGLE,
+        model='gemini-3-flash-preview',
+        temperature=0.0,
+    ),
+)
+
+# ── Public config object (imported by evaluator modules) ──────────────────────
+
+CONFIG: EvaluatorSettingsResult[ConventionalityEvaluationSettings] = EvaluatorSettingsResult(
+    evaluator_metadata=_EVALUATOR_METADATA,
+    evaluation_settings=_EVALUATION_SETTINGS,
+    prompts=_PROMPTS,
+)
diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py
index 15f6ff8..fefbd8e 100644
--- a/sdks/python/tests/conftest.py
+++ b/sdks/python/tests/conftest.py
@@ -3,10 +3,9 @@
 import pytest
 
 from learning_commons_evaluators import create_config_no_telemetry
-from learning_commons_evaluators.schemas.config import (
-    EvaluationSettings,
-    LlmProvider,
-    PromptSettings,
+from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
+from learning_commons_evaluators.schemas.conventionality import (
+    ConventionalityEvaluationSettings,
 )
 from learning_commons_evaluators.schemas.metadata import (
     EvaluationMetadata,
@@ -15,10 +14,6 @@
 )
 
 
-class _StubEvaluationSettings(EvaluationSettings):
-    """Minimal concrete EvaluationSettings for fixtures; replaced once a real evaluator lands."""
-
-
 @pytest.fixture
 def evaluator_metadata():
     """Minimal EvaluatorMetadata suitable for use in tests."""
@@ -36,7 +31,7 @@ def evaluation_metadata(evaluator_metadata):
     """EvaluationMetadata with sensible defaults for unit tests."""
     return EvaluationMetadata(
         evaluator_metadata=evaluator_metadata,
-        evaluation_settings=_StubEvaluationSettings(),
+        evaluation_settings=ConventionalityEvaluationSettings(),
         input_metadata={},
     )
 
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
new file mode 100644
index 0000000..0ce3542
--- /dev/null
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -0,0 +1,368 @@
+"""Tests for BaseEvaluator.
+
+Covers: config wiring, evaluate() telemetry branching, error handling,
+update_total_token_usage, execute_step, and execute_prompt_chain_step.
+"""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from learning_commons_evaluators import (
+    ConventionalityEvaluationInput,
+    ConventionalityEvaluator,
+    create_config,
+    create_config_no_telemetry,
+)
+from learning_commons_evaluators.errors import ConfigurationError
+from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
+from learning_commons_evaluators.schemas.conventionality import (
+    ConventionalityEvaluationSettings,
+    ConventionalityOutput,
+)
+from learning_commons_evaluators.schemas.errors import APIError, ValidationError
+from learning_commons_evaluators.schemas.metadata import (
+    PROMPT_STEP_EXTRA_PROMPT_SETTINGS,
+    PROMPT_STEP_EXTRA_TOKEN_USAGE,
+    EvaluationMetadata,
+    EvaluatorMaturity,
+    EvaluatorMetadata,
+    Status,
+    TokenUsage,
+)
+
+# ---------------------------------------------------------------------------
+# Shared test helpers
+# ---------------------------------------------------------------------------
+
+
+def _evaluator(*, send_full_input=False):
+    """Return a ConventionalityEvaluator; use send_full_input=True to enable full-input telemetry."""
+    if send_full_input:
+        config = create_config(telemetry_id="test", send_full_input_with_telemetry=True)
+    else:
+        config = create_config_no_telemetry()
+    return ConventionalityEvaluator(config)
+
+
+def _meta():
+    """Return a minimal EvaluationMetadata suitable for direct method tests."""
+    return EvaluationMetadata(
+        evaluator_metadata=EvaluatorMetadata(
+            id="test",
+            version="0.1",
+            name="Test",
+            description="Test",
+            maturity=EvaluatorMaturity.beta,
+        ),
+        evaluation_settings=ConventionalityEvaluationSettings(),
+        input_metadata={},
+    )
+
+
+# A realistic sample well above the min_text_length=10 constraint on the "text" input.
+_SAMPLE_TEXT = (
+    "Marco Polo was a Venetian merchant and explorer who traveled through Asia "
+    "in the late 13th century. He spent nearly two decades at the court of "
+    "Kublai Khan, the Mongol ruler of China, and described his experiences in "
+    "a book that introduced Europeans to the Far East."
+)
+
+
+def _inp(text=_SAMPLE_TEXT, grade=5):
+    return ConventionalityEvaluationInput(text=text, grade=grade)
+
+
+# ConventionalityOutput used as a stand-in for the LLM-parsed output.
+_MOCK_OUTPUT = ConventionalityOutput(
+    complexity_score="moderately_complex",
+    reasoning="Uses some conventional language.",
+    conventionality_features=["idioms"],
+    grade_context="Grade-appropriate.",
+    instructional_insights="Consider scaffolding.",
+)
+
+# A minimal JSON body accepted by ConventionalityOutput's parser.
+_CONV_JSON = (
+    '{"complexity_score": "slightly_complex", "reasoning": "Clear.",'
+    ' "conventionality_features": [], "grade_context": "Fine.", "instructional_insights": "None."}'
+)
+
+_CHAIN_PATCH = "learning_commons_evaluators.evaluators.base.create_provider"
+
+
+# ---------------------------------------------------------------------------
+# BaseEvaluator.__init__
+# ---------------------------------------------------------------------------
+
+
+class TestBaseEvaluatorInit:
+    def test_config_is_stored(self):
+        config = create_config_no_telemetry()
+        assert ConventionalityEvaluator(config).config is config
+
+
+# ---------------------------------------------------------------------------
+# evaluate() — telemetry branching
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateTelemetryBranching:
+    def test_uses_input_metadata_by_default(self):
+        """When send_full_input_with_telemetry=False, input.input_metadata() is used."""
+        evaluator = _evaluator()
+        with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
+            result = evaluator.evaluate(_inp())
+        # input_metadata() returns shape metadata, not the raw text value.
+        assert result.metadata.input_metadata["text"] == {"textLength": str(len(_SAMPLE_TEXT))}
+
+    def test_uses_input_values_when_full_telemetry_enabled(self):
+        """When send_full_input_with_telemetry=True, input.input_values() is used."""
+        evaluator = _evaluator(send_full_input=True)
+        with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
+            result = evaluator.evaluate(_inp())
+        # input_values() returns the raw text string, not {"textLength": ...}.
+        assert result.metadata.input_metadata["text"] == _SAMPLE_TEXT
+        assert result.metadata.input_metadata["grade"] == 5
+
+
+# ---------------------------------------------------------------------------
+# evaluate() — error handling
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateErrorHandling:
+    def test_raises_validation_error_for_invalid_input(self):
+        # "x" is below the 10-char minimum (min_text_length) set in the evaluator settings.
+        evaluator = _evaluator()
+        invalid = ConventionalityEvaluationInput(text="x", grade=5)
+        with pytest.raises(ValidationError):
+            evaluator.evaluate(invalid)
+
+    def test_propagates_evaluate_impl_exception(self):
+        evaluator = _evaluator()
+        with (
+            patch.object(evaluator, "evaluate_impl", side_effect=RuntimeError("boom")),
+            pytest.raises(RuntimeError, match="boom"),
+        ):
+            evaluator.evaluate(_inp())
+
+
+# ---------------------------------------------------------------------------
+# update_total_token_usage
+# ---------------------------------------------------------------------------
+
+
+class TestUpdateTotalTokenUsage:
+    def test_inserts_usage_for_new_provider(self, evaluation_metadata):
+        evaluator = _evaluator()
+        usage = TokenUsage(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            input_tokens=10,
+            output_tokens=5,
+        )
+        evaluator.update_total_token_usage(usage, evaluation_metadata)
+        stored = evaluation_metadata.total_token_usage[LlmProvider.GOOGLE]
+        assert stored.input_tokens == 10
+        assert stored.output_tokens == 5
+
+    def test_accumulates_usage_for_existing_provider(self, evaluation_metadata):
+        evaluator = _evaluator()
+        evaluation_metadata.total_token_usage[LlmProvider.GOOGLE] = TokenUsage(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            input_tokens=10,
+            output_tokens=5,
+        )
+        evaluator.update_total_token_usage(
+            TokenUsage(
+                provider_type=LlmProvider.GOOGLE,
+                model="gemini-2.0-flash",
+                input_tokens=20,
+                output_tokens=15,
+            ),
+            evaluation_metadata,
+        )
+        stored = evaluation_metadata.total_token_usage[LlmProvider.GOOGLE]
+        assert stored.input_tokens == 30
+        assert stored.output_tokens == 20
+
+
+# ---------------------------------------------------------------------------
+# execute_step
+# ---------------------------------------------------------------------------
+
+
+class TestExecuteStep:
+    def test_returns_implementation_result(self, evaluation_metadata):
+        result = _evaluator().execute_step("s", evaluation_metadata, lambda: "the-result")
+        assert result == "the-result"
+
+    def test_records_succeeded_status_on_success(self, evaluation_metadata):
+        _evaluator().execute_step("s", evaluation_metadata, lambda: None)
+        assert evaluation_metadata.step_details["s"].status == Status.succeeded
+
+    def test_records_failed_status_and_error_on_exception(self, evaluation_metadata):
+        failing = MagicMock(side_effect=ValueError("boom"))
+        with pytest.raises(ValueError, match="boom"):
+            _evaluator().execute_step("s", evaluation_metadata, failing)
+        step = evaluation_metadata.step_details["s"]
+        assert step.status == Status.failed
+        assert "boom" in step.error_details
+
+    def test_re_raises_exception(self, evaluation_metadata):
+        failing = MagicMock(side_effect=RuntimeError("inner"))
+        with pytest.raises(RuntimeError, match="inner"):
+            _evaluator().execute_step("s", evaluation_metadata, failing)
+
+    def test_extras_appear_in_step_metadata(self, evaluation_metadata):
+        _evaluator().execute_step("s", evaluation_metadata, lambda: None, extras={"k": "v"})
+        assert evaluation_metadata.step_details["s"].extras["k"] == "v"
+
+
+# ---------------------------------------------------------------------------
+# execute_prompt_chain_step
+# ---------------------------------------------------------------------------
+
+
+class TestExecutePromptChainStep:
+    """Mock create_provider so the chain runs in-process with a fake LLM.
+
+    The fake LLM returns a real AIMessage so JsonOutputParser and
+    token_usage_from_aimessage exercise the real code paths.
+    """
+
+    def test_returns_parsed_pydantic_output(self, evaluation_metadata):
+        from langchain_core.messages import AIMessage
+        from langchain_core.prompts import ChatPromptTemplate
+
+        def _fake_llm(prompt_value):
+            return AIMessage(content=_CONV_JSON)
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with patch(_CHAIN_PATCH, return_value=_fake_llm):
+            result = _evaluator().execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "Hello"},
+                parser_output_type=ConventionalityOutput,
+            )
+        assert result.complexity_score == "slightly_complex"
+        assert result.reasoning == "Clear."
+
+    def test_prompt_settings_recorded_in_step_extras(self, evaluation_metadata):
+        from langchain_core.messages import AIMessage
+        from langchain_core.prompts import ChatPromptTemplate
+
+        settings = PromptSettings(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            temperature=0.0,
+        )
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+
+        with patch(_CHAIN_PATCH, return_value=lambda pv: AIMessage(content=_CONV_JSON)):
+            _evaluator().execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=settings,
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "text"},
+                parser_output_type=ConventionalityOutput,
+            )
+
+        step = evaluation_metadata.step_details["main"]
+        assert step.extras[PROMPT_STEP_EXTRA_PROMPT_SETTINGS]["model"] == "gemini-2.0-flash"
+        assert PROMPT_STEP_EXTRA_TOKEN_USAGE in step.extras
+
+    def test_token_usage_recorded_when_llm_reports_usage(self, evaluation_metadata):
+        from langchain_core.messages import AIMessage
+        from langchain_core.prompts import ChatPromptTemplate
+
+        def _llm_with_usage(pv):
+            return AIMessage(
+                content=_CONV_JSON,
+                usage_metadata={
+                    "input_tokens": 42,
+                    "output_tokens": 17,
+                    "total_tokens": 59,
+                },
+            )
+
+        settings = PromptSettings(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            temperature=0.0,
+        )
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+
+        with patch(_CHAIN_PATCH, return_value=_llm_with_usage):
+            _evaluator().execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=settings,
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "text"},
+                parser_output_type=ConventionalityOutput,
+            )
+
+        step = evaluation_metadata.step_details["main"]
+        assert step.extras[PROMPT_STEP_EXTRA_TOKEN_USAGE]["input_tokens"] == 42
+        assert step.extras[PROMPT_STEP_EXTRA_TOKEN_USAGE]["output_tokens"] == 17
+        assert evaluation_metadata.total_token_usage[LlmProvider.GOOGLE].input_tokens == 42
+
+    def test_propagates_configuration_error_from_create_provider(self, evaluation_metadata):
+        from langchain_core.prompts import ChatPromptTemplate
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with (
+            patch(
+                _CHAIN_PATCH,
+                side_effect=ConfigurationError("Google provider config is not set"),
+            ),
+            pytest.raises(ConfigurationError, match="Google provider config is not set"),
+        ):
+            _evaluator().execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "text"},
+                parser_output_type=ConventionalityOutput,
+            )
+
+    def test_wraps_unexpected_chain_failure_as_api_error(self, evaluation_metadata):
+        """LangChain / provider failures are mapped via wrap_provider_error."""
+        from langchain_core.prompts import ChatPromptTemplate
+
+        def _boom(_prompt_value):
+            raise ValueError("simulated provider failure")
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with (
+            patch(_CHAIN_PATCH, return_value=_boom),
+            pytest.raises(APIError, match="simulated provider failure"),
+        ):
+            _evaluator().execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "text"},
+                parser_output_type=ConventionalityOutput,
+            )
diff --git a/sdks/python/tests/evaluators/test_conventionality.py b/sdks/python/tests/evaluators/test_conventionality.py
new file mode 100644
index 0000000..a578027
--- /dev/null
+++ b/sdks/python/tests/evaluators/test_conventionality.py
@@ -0,0 +1,143 @@
+"""Tests for ConventionalityEvaluator and conventionality helpers."""
+
+from unittest.mock import patch
+
+import pytest
+
+from learning_commons_evaluators import (
+    ConventionalityEvaluationInput,
+    ConventionalityEvaluator,
+    create_config_no_telemetry,
+)
+from learning_commons_evaluators.schemas.conventionality import ConventionalityOutput
+from learning_commons_evaluators.schemas.errors import ConfigurationError
+from learning_commons_evaluators.schemas.metadata import Status
+
+# A realistic sample well above the min_text_length=10 constraint on the "text" input.
+_SAMPLE_TEXT = (
+    "Marco Polo was a Venetian merchant and explorer who traveled through Asia "
+    "in the late 13th century. He spent nearly two decades at the court of "
+    "Kublai Khan, the Mongol ruler of China, and described his experiences in "
+    "a book that introduced Europeans to the Far East."
+)
+
+
+def _make_mock_output():
+    return ConventionalityOutput(
+        conventionality_features=["idioms"],
+        grade_context="Grade-appropriate.",
+        instructional_insights="Consider scaffolding.",
+        complexity_score="moderately_complex",
+        reasoning="The text uses some conventional language.",
+    )
+
+
+class TestConventionalityEvaluator:
+    def test_evaluate_returns_evaluation_result(self):
+        config = create_config_no_telemetry()
+        evaluator = ConventionalityEvaluator(config)
+        inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+        with patch.object(evaluator, "execute_prompt_chain_step", return_value=_make_mock_output()):
+            result = evaluator.evaluate(inp)
+        assert result.answer.score == "moderately_complex"
+        assert result.answer.label == "Moderately complex"
+        assert result.explanation.summary is not None
+        assert result.metadata.status == Status.succeeded
+        assert result.metadata.evaluator_metadata.id == "conventionality"
+
+    def test_evaluate_with_explicit_settings(self):
+        from learning_commons_evaluators.schemas.config import (
+            LlmProvider,
+            PromptSettings,
+        )
+        from learning_commons_evaluators.schemas.conventionality import (
+            ConventionalityEvaluationSettings,
+        )
+
+        config = create_config_no_telemetry()
+        evaluator = ConventionalityEvaluator(config)
+        settings = ConventionalityEvaluationSettings(
+            prompt_settings_step_main=PromptSettings(
+                provider_type=LlmProvider.GOOGLE,
+                model="gemini-2.0-flash",
+                temperature=0.0,
+            )
+        )
+        inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=3)
+        with patch.object(evaluator, "execute_prompt_chain_step", return_value=_make_mock_output()):
+            result = evaluator.evaluate(inp, evaluation_settings=settings)
+        assert result.metadata.status == Status.succeeded
+
+    def test_metadata_and_default_settings(self):
+        evaluator = ConventionalityEvaluator(create_config_no_telemetry())
+        assert evaluator.metadata.id == "conventionality"
+        assert evaluator.metadata.version == "0.1"
+        assert evaluator.default_evaluation_settings is not None
+
+
+class TestConventionalityEvaluationInputConfiguration:
+    """Tests that ConventionalityEvaluationInput fails loudly on bad configuration.
+
+    These tests patch ``ConventionalityEvaluationInput._input_settings`` directly
+    because the ClassVar is bound at class-definition time.  Patching the
+    module-level ``_INPUT_SETTINGS`` name would rebind the module variable but
+    leave the class variable pointing at the original dict.
+    """
+
+    def test_missing_text_spec_raises_configuration_error(self, monkeypatch):
+        """If 'text' is absent from _input_settings, ConfigurationError is raised immediately."""
+        monkeypatch.setattr(ConventionalityEvaluationInput, "_input_settings", {})
+        with pytest.raises(ConfigurationError, match="'text'"):
+            ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+
+    def test_missing_grade_spec_raises_configuration_error(self, monkeypatch):
+        """If 'grade' is absent from _input_settings, ConfigurationError is raised immediately."""
+        from learning_commons_evaluators.schemas.input_specs import TextInputSpec
+
+        monkeypatch.setattr(
+            ConventionalityEvaluationInput,
+            "_input_settings",
+            {"text": TextInputSpec(name="text")},
+        )
+        with pytest.raises(ConfigurationError, match="'grade'"):
+            ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+
+    def test_wrong_text_spec_type_raises_configuration_error(self, monkeypatch):
+        """If the 'text' spec has the wrong type, ConfigurationError names the type mismatch."""
+        from learning_commons_evaluators.schemas.input_specs import GradeInputSpec
+
+        monkeypatch.setattr(
+            ConventionalityEvaluationInput,
+            "_input_settings",
+            {"text": GradeInputSpec(name="text")},
+        )
+        with pytest.raises(ConfigurationError, match="TextInputSpec"):
+            ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+
+    def test_wrong_grade_spec_type_raises_configuration_error(self, monkeypatch):
+        """If the 'grade' spec has the wrong type, ConfigurationError names the type mismatch."""
+        from learning_commons_evaluators.schemas.input_specs import TextInputSpec
+
+        monkeypatch.setattr(
+            ConventionalityEvaluationInput,
+            "_input_settings",
+            {
+                "text": TextInputSpec(name="text"),
+                "grade": TextInputSpec(name="grade"),  # wrong type
+            },
+        )
+        with pytest.raises(ConfigurationError, match="GradeInputSpec"):
+            ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+
+
+class TestConventionalityOutput:
+    def test_conventionality_output_literal_score(self):
+        out = ConventionalityOutput(
+            conventionality_features=[],
+            grade_context="",
+            instructional_insights="",
+            complexity_score="slightly_complex",
+            reasoning="Test.",
+        )
+        assert out.complexity_score == "slightly_complex"
+        assert out.reasoning == "Test."
diff --git a/sdks/python/tests/schemas/test_evaluator_schemas.py b/sdks/python/tests/schemas/test_evaluator_schemas.py
new file mode 100644
index 0000000..481018f
--- /dev/null
+++ b/sdks/python/tests/schemas/test_evaluator_schemas.py
@@ -0,0 +1,256 @@
+"""Tests for EvaluationInput, EvaluationAnswer, EvaluationExplanation, and EvaluationResult."""
+
+from typing import ClassVar
+
+import pytest
+
+from learning_commons_evaluators.evaluators.conventionality import (
+    ConventionalityEvaluationInput,
+)
+from learning_commons_evaluators.schemas.common_inputs import (
+    GradeInputField,
+    TextInputField,
+)
+from learning_commons_evaluators.schemas.errors import (
+    ConfigurationError,
+    ValidationError,
+)
+from learning_commons_evaluators.schemas.evaluator import (
+    EvaluationAnswer,
+    EvaluationExplanation,
+    EvaluationInput,
+    EvaluationResult,
+)
+from learning_commons_evaluators.schemas.input_specs import (
+    GradeInputSpec,
+    TextInputSpec,
+)
+from learning_commons_evaluators.schemas.metadata import (
+    Status,
+)
+
+# A realistic sample long enough to satisfy the configured minimum text length.
+_SAMPLE_TEXT = (
+    "Marco Polo was a Venetian merchant and explorer who traveled through Asia "
+    "in the late 13th century. He spent nearly two decades at the court of "
+    "Kublai Khan, the Mongol ruler of China, and described his experiences in "
+    "a book that introduced Europeans to the Far East."
+)
+
+
+# ---------------------------------------------------------------------------
+# A minimal EvaluationInput subclass that mixes a proper InputField with a
+# plain float — used to exercise the isinstance(..., InputField) False branches.
+# ---------------------------------------------------------------------------
+
+_BARE_TEXT_SPEC = TextInputSpec(name="text")  # no constraints — for testing base-class behaviour
+
+
+class _MixedInput(EvaluationInput):
+    """EvaluationInput with one proper InputField and one plain Python value."""
+
+    text: TextInputField
+    weight: float  # no validate(), no input_metadata(), no .value attribute
+
+
+class TestEvaluationInput:
+    # --- happy-path construction, validation, and metadata ---
+
+    def test_validate_and_input_metadata(self):
+        inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+        inp.validate()
+        meta = inp.input_metadata()
+        assert meta["text"] == {"textLength": str(len(_SAMPLE_TEXT))}
+        assert meta["grade"] == {"grade": 5}
+
+    def test_input_values_returns_primitive_values(self):
+        """input_values() should unwrap .value from each InputField."""
+        inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=7)
+        values = inp.input_values()
+        assert values["text"] == _SAMPLE_TEXT
+        assert values["grade"] == 7
+
+    # --- validation error paths ---
+
+    def test_validate_raises_on_invalid_grade(self):
+        inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=99)
+        with pytest.raises(ValidationError):
+            inp.validate()
+
+    def test_validate_raises_on_invalid_text_length(self):
+        # "x" (1 char) is below the configured minimum text length in the TOML settings.
+        inp = ConventionalityEvaluationInput(text="x", grade=5)
+        with pytest.raises(ValidationError):
+            inp.validate()
+
+    def test_validate_collects_all_errors_before_raising(self):
+        """All field errors are collected; a single ValidationError is raised at the end."""
+        # "x" is below the configured minimum length; grade 99 is outside the 0-12 range.
+        inp = ConventionalityEvaluationInput(text="x", grade=99)
+        with pytest.raises(ValidationError) as exc_info:
+            inp.validate()
+        # Both errors should appear in the combined message.
+        msg = str(exc_info.value)
+        assert "below minimum" in msg
+        assert "0-12" in msg
+
+    # --- isinstance False branches: field without protocol methods ---
+
+    def test_validate_skips_non_inputfield_fields(self):
+        """Fields that don't implement the InputField protocol are silently skipped."""
+        inp = _MixedInput(text=TextInputField(spec=_BARE_TEXT_SPEC, value="hello"), weight=7.5)
+        inp.validate()  # should not raise even though weight is not an InputField
+
+    def test_input_metadata_returns_none_for_non_inputfield_fields(self):
+        """Fields that are not InputFields produce a None entry in the output dict."""
+        inp = _MixedInput(text=TextInputField(spec=_BARE_TEXT_SPEC, value="hello"), weight=7.5)
+        meta = inp.input_metadata()
+        assert meta["text"] == {"textLength": "5"}
+        assert meta["weight"] is None  # fallback for non-protocol fields
+
+    def test_input_values_returns_field_itself_for_non_inputfield(self):
+        """Fields that are not InputFields are returned as-is from input_values()."""
+        inp = _MixedInput(text=TextInputField(spec=_BARE_TEXT_SPEC, value="hello"), weight=7.5)
+        values = inp.input_values()
+        assert values["text"] == "hello"
+        assert values["weight"] == 7.5  # not an InputField; the float is returned directly
+
+    # --- InputField subclass isinstance checks ---
+
+    def test_text_input_field_is_inputfield_subclass(self):
+        """TextInputField must be an InputField subclass (checked via isinstance)."""
+        from learning_commons_evaluators.schemas.evaluator import InputField
+
+        field = TextInputField(spec=_BARE_TEXT_SPEC, value="hello")
+        assert isinstance(field, InputField)
+
+    def test_grade_input_field_is_inputfield_subclass(self):
+        """GradeInputField must be an InputField subclass (checked via isinstance)."""
+        from learning_commons_evaluators.schemas.common_inputs import GradeInputField
+        from learning_commons_evaluators.schemas.evaluator import InputField
+        from learning_commons_evaluators.schemas.input_specs import GradeInputSpec
+
+        spec = GradeInputSpec(name="grade")
+        field = GradeInputField(spec=spec, value=5)
+        assert isinstance(field, InputField)
+
+    def test_plain_value_is_not_an_inputfield(self):
+        """Plain Python values must not be treated as InputField instances."""
+        from learning_commons_evaluators.schemas.evaluator import InputField
+
+        assert not isinstance(7.5, InputField)
+        assert not isinstance("raw string", InputField)
+
+    def test_inputfield_cannot_be_instantiated_directly(self):
+        """InputField is abstract and must not be instantiable without implementing validate() and input_metadata()."""
+        from learning_commons_evaluators.schemas.evaluator import InputField
+        from learning_commons_evaluators.schemas.input_specs import TextInputSpec
+
+        with pytest.raises(TypeError, match="abstract"):
+            InputField(spec=TextInputSpec(name="text"), value="hello")  # type: ignore[abstract]
+
+
+class TestCoerceRawToInputFields:
+    """Direct unit tests for EvaluationInput._coerce_raw_to_input_fields.
+
+    These tests exercise the model_validator on a minimal concrete subclass
+    rather than through a full evaluator, so failures point directly at the
+    base-class behaviour rather than evaluator-specific configuration.
+    """
+
+    # Minimal concrete EvaluationInput subclass wired to known specs.
+    _TEXT_SPEC = TextInputSpec(name="text")
+    _GRADE_SPEC = GradeInputSpec(name="grade")
+
+    class _SimpleInput(EvaluationInput):
+        _input_settings: ClassVar[dict] = {
+            "text": TextInputSpec(name="text"),
+            "grade": GradeInputSpec(name="grade"),
+        }
+        text: TextInputField
+        grade: GradeInputField
+
+        def __init__(self, *, text: str, grade: int, **kwargs):
+            super().__init__(text=text, grade=grade, **kwargs)
+
+    def test_raw_values_are_wrapped_into_input_fields(self):
+        """Raw str/int values should be wrapped into the declared InputField types."""
+        inp = self._SimpleInput(text="hello world", grade=5)
+        assert isinstance(inp.text, TextInputField)
+        assert inp.text.value == "hello world"
+        assert isinstance(inp.grade, GradeInputField)
+        assert inp.grade.value == 5
+
+    def test_already_constructed_input_field_is_not_rewrapped(self):
+        """Passing a fully-constructed InputField instance bypasses construction."""
+        pre_built = TextInputField(spec=self._TEXT_SPEC, value="pre-built")
+        inp = self._SimpleInput(text=pre_built, grade=3)  # type: ignore[arg-type]
+        assert inp.text is pre_built  # same object, not a copy
+
+    def test_non_inputfield_field_is_left_unchanged(self):
+        """Plain (non-InputField) fields are passed through without modification."""
+
+        class _WithPlainField(EvaluationInput):
+            _input_settings: ClassVar[dict] = {"text": TextInputSpec(name="text")}
+            text: TextInputField
+            weight: float
+
+            def __init__(self, *, text: str, weight: float, **kwargs):
+                super().__init__(text=text, weight=weight, **kwargs)
+
+        inp = _WithPlainField(text="hello", weight=1.5)
+        assert inp.weight == 1.5  # plain float, untouched
+
+    def test_missing_spec_raises_configuration_error(self, monkeypatch):
+        """ConfigurationError is raised when a required spec is absent from _input_settings."""
+        monkeypatch.setattr(self._SimpleInput, "_input_settings", {})
+        with pytest.raises(ConfigurationError, match="'text'"):
+            self._SimpleInput(text="hello", grade=5)
+
+    def test_wrong_spec_type_raises_configuration_error(self, monkeypatch):
+        """ConfigurationError is raised when the spec type doesn't match the field's expectation."""
+        monkeypatch.setattr(
+            self._SimpleInput,
+            "_input_settings",
+            {"text": GradeInputSpec(name="text"), "grade": self._GRADE_SPEC},
+        )
+        with pytest.raises(ConfigurationError, match="TextInputSpec"):
+            self._SimpleInput(text="hello", grade=5)
+
+    def test_error_message_includes_class_and_field_name(self, monkeypatch):
+        """ConfigurationError messages name both the class and the missing field."""
+        monkeypatch.setattr(self._SimpleInput, "_input_settings", {})
+        with pytest.raises(ConfigurationError) as exc_info:
+            self._SimpleInput(text="hello", grade=5)
+        msg = str(exc_info.value)
+        assert "_SimpleInput" in msg
+        assert "'text'" in msg
+
+
+class TestEvaluationAnswer:
+    def test_score_and_label(self):
+        answer = EvaluationAnswer(score="moderately_complex", label="Moderately complex")
+        assert answer.score == "moderately_complex"
+        assert answer.label == "Moderately complex"
+
+
+class TestEvaluationExplanation:
+    def test_summary_and_details(self):
+        explanation = EvaluationExplanation(summary="Reasoning.", details={"key": "value"})
+        assert explanation.summary == "Reasoning."
+        assert explanation.details["key"] == "value"
+
+    def test_details_defaults_to_empty_dict(self):
+        explanation = EvaluationExplanation(summary="Short.")
+        assert explanation.details == {}
+
+
+class TestEvaluationResult:
+    def test_construction_and_status(self, evaluation_metadata):
+        result = EvaluationResult(
+            answer=EvaluationAnswer(score="slightly_complex", label="Slightly complex"),
+            explanation=EvaluationExplanation(summary="Summary"),
+            metadata=evaluation_metadata,
+        )
+        assert result.answer.score == "slightly_complex"
+        assert result.metadata.status == Status.processing
diff --git a/sdks/python/tests/settings/test_load_settings.py b/sdks/python/tests/settings/test_load_settings.py
new file mode 100644
index 0000000..d4f4e91
--- /dev/null
+++ b/sdks/python/tests/settings/test_load_settings.py
@@ -0,0 +1,332 @@
+"""Tests for load_settings.
+
+Covers: load_settings(), _require(), _parse_prompts() (including prompt whitespace
+normalization), load_evaluator_settings(),
+and shared_settings_root() — including every conditional branch in each function.
+"""
+
+import os
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from learning_commons_evaluators.errors import ConfigurationError
+from learning_commons_evaluators.schemas.conventionality import (
+    ConventionalityEvaluationSettings,
+)
+from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity
+from learning_commons_evaluators.settings.load_settings import (
+    EvaluatorSettingsResult,
+    _parse_prompts,
+    _require,
+    load_evaluator_settings,
+    load_settings,
+    shared_settings_root,
+)
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+
+def _write_toml(tmp_path: Path, content: str) -> Path:
+    """Write TOML content to evaluator.toml inside tmp_path and return the path."""
+    path = tmp_path / "evaluator.toml"
+    path.write_text(content, encoding="utf-8")
+    return path
+
+
+# Minimal [evaluator_metadata] section reused across several tests.
+_MINIMAL_META = """\
+[evaluator_metadata]
+id = "x"
+version = "0.1"
+name = "X"
+description = "X"
+maturity = "beta"
+"""
+
+# Minimal [evaluation_settings] section reused across several tests.
+_MINIMAL_EVAL_SETTINGS = """\
+[evaluation_settings.prompt_settings_step_main]
+type = "GOOGLE"
+model = "gemini-2.0-flash"
+temperature = 0
+"""
+
+
+# ---------------------------------------------------------------------------
+# load_settings (raw TOML loader)
+# ---------------------------------------------------------------------------
+
+
+class TestLoadSettings:
+    def test_parses_nested_sections(self, tmp_path: Path) -> None:
+        path = _write_toml(
+            tmp_path,
+            """
+[section]
+key = "value"
+n = 42
+[section.nested]
+foo = "bar"
+""",
+        )
+        data = load_settings(path)
+        assert data["section"]["key"] == "value"
+        assert data["section"]["n"] == 42
+        assert data["section"]["nested"]["foo"] == "bar"
+
+    def test_accepts_str_path(self, tmp_path: Path) -> None:
+        path = _write_toml(tmp_path, 'title = "hello"')
+        assert load_settings(str(path))["title"] == "hello"
+
+    def test_raises_file_not_found(self) -> None:
+        with pytest.raises(FileNotFoundError):
+            load_settings(Path("/nonexistent/path/settings.toml"))
+
+
+# ---------------------------------------------------------------------------
+# _require
+# ---------------------------------------------------------------------------
+
+
+class TestRequire:
+    def test_raises_when_value_is_none(self) -> None:
+        with pytest.raises(ConfigurationError, match="missing_field"):
+            _require("missing_field", None, "section")
+
+    def test_raises_when_value_is_blank_string(self) -> None:
+        with pytest.raises(ConfigurationError, match="blank_field"):
+            _require("blank_field", "   ", "section")
+
+    def test_passes_for_non_empty_string(self) -> None:
+        _require("key", "value", "section")  # must not raise
+
+    def test_passes_for_numeric_value(self) -> None:
+        """An integer is not None and not a str, so _require must not raise.
+
+        This matters because TOML fields like ``id = 42`` are valid integers and
+        are later coerced with str() by the caller.
+        """
+        _require("id", 42, "section")  # must not raise
+
+
+# ---------------------------------------------------------------------------
+# _parse_prompts
+# ---------------------------------------------------------------------------
+
+
+class TestParsePrompts:
+    def test_returns_dict_when_prompts_is_a_dict(self) -> None:
+        data = {"prompts": {"system_prompt": "You are helpful.", "human_prompt": "Do it."}}
+        assert _parse_prompts(data) == {
+            "system_prompt": "You are helpful.",
+            "human_prompt": "Do it.",
+        }
+
+    def test_returns_empty_dict_when_prompts_key_is_absent(self) -> None:
+        assert _parse_prompts({}) == {}
+
+    def test_returns_empty_dict_when_prompts_is_not_a_dict(self) -> None:
+        """TOML like ``prompts = "some string"`` must not raise — just return {}."""
+        assert _parse_prompts({"prompts": "some string"}) == {}
+
+    def test_coerces_non_string_prompt_values_to_str(self) -> None:
+        """Non-string values (e.g. TOML integers) inside [prompts] must be coerced."""
+        assert _parse_prompts({"prompts": {"answer": 42}}) == {"answer": "42"}
+
+    def test_collapses_whitespace_only_lines_in_prompts(self) -> None:
+        """Lines that contain only spaces/tabs become empty; intentional \\n\\n\\n is kept."""
+        data = {
+            "prompts": {
+                "system_prompt": "a\n    \nb",
+                "multi_space_lines": "x\n  \n  \ny",
+                "triple_blank": "p\n\n\nq",
+            }
+        }
+        assert _parse_prompts(data) == {
+            "system_prompt": "a\n\nb",
+            "multi_space_lines": "x\n\n\ny",
+            "triple_blank": "p\n\n\nq",
+        }
+
+
+# ---------------------------------------------------------------------------
+# load_evaluator_settings
+# ---------------------------------------------------------------------------
+
+
+class TestLoadEvaluatorSettings:
+    def test_parses_full_toml(self, tmp_path: Path) -> None:
+        path = _write_toml(
+            tmp_path,
+            """
+[evaluator_metadata]
+id = "test_evaluator"
+version = "0.1"
+name = "Test"
+description = "Test evaluator"
+maturity = "beta"
+
+[prompts]
+system_prompt = "You are a helpful assistant."
+
+[evaluation_settings.prompt_settings_step_main]
+type = "GOOGLE"
+model = "gemini-2.0-flash"
+temperature = 0
+""",
+        )
+        result = load_evaluator_settings(path, ConventionalityEvaluationSettings)
+        assert isinstance(result, EvaluatorSettingsResult)
+        assert result.evaluator_metadata.id == "test_evaluator"
+        assert result.evaluator_metadata.version == "0.1"
+        assert result.evaluator_metadata.maturity == EvaluatorMaturity.beta
+        assert result.prompts["system_prompt"] == "You are a helpful assistant."
+        ps_main = result.evaluation_settings.prompt_settings_step_main
+        assert ps_main is not None
+        assert ps_main.model == "gemini-2.0-flash"
+        assert ps_main.temperature == 0.0
+
+    def test_raises_when_evaluator_metadata_section_missing(self, tmp_path: Path) -> None:
+        path = _write_toml(tmp_path, _MINIMAL_EVAL_SETTINGS)
+        with pytest.raises(ConfigurationError, match="evaluator_metadata"):
+            load_evaluator_settings(path, ConventionalityEvaluationSettings)
+
+    def test_raises_when_evaluation_settings_section_missing(self, tmp_path: Path) -> None:
+        path = _write_toml(tmp_path, _MINIMAL_META)
+        with pytest.raises(ConfigurationError, match="evaluation_settings"):
+            load_evaluator_settings(path, ConventionalityEvaluationSettings)
+
+    def test_raises_when_required_prompt_settings_field_missing(self, tmp_path: Path) -> None:
+        """temperature is required in every prompt_settings_* step; omitting it must raise."""
+        path = _write_toml(
+            tmp_path,
+            _MINIMAL_META
+            + """
+[evaluation_settings.prompt_settings_step_main]
+type = "GOOGLE"
+model = "gemini-2.0-flash"
+""",
+        )
+        with pytest.raises(ConfigurationError, match="temperature"):
+            load_evaluator_settings(path, ConventionalityEvaluationSettings)
+
+    def test_raises_for_invalid_maturity_value(self, tmp_path: Path) -> None:
+        path = _write_toml(
+            tmp_path,
+            """
+[evaluator_metadata]
+id = "x"
+version = "0.1"
+name = "X"
+description = "X"
+maturity = "not_a_real_maturity"
+"""
+            + _MINIMAL_EVAL_SETTINGS,
+        )
+        with pytest.raises(ConfigurationError, match="maturity"):
+            load_evaluator_settings(path, ConventionalityEvaluationSettings)
+
+    def test_raises_for_invalid_provider_type(self, tmp_path: Path) -> None:
+        path = _write_toml(
+            tmp_path,
+            _MINIMAL_META
+            + """
+[evaluation_settings.prompt_settings_step_main]
+type = "INVALID_PROVIDER"
+model = "some-model"
+temperature = 0
+""",
+        )
+        with pytest.raises(ConfigurationError, match="provider"):
+            load_evaluator_settings(path, ConventionalityEvaluationSettings)
+
+    def test_parses_inputs_into_typed_specs(self, tmp_path: Path) -> None:
+        """[[evaluator_metadata.inputs]] entries become TextInputSpec / GradeInputSpec."""
+        from learning_commons_evaluators.schemas.input_specs import (
+            GradeInputSpec,
+            TextInputSpec,
+        )
+
+        path = _write_toml(
+            tmp_path,
+            """
+[evaluator_metadata]
+id = "x"
+version = "0.1"
+name = "X"
+description = "X"
+maturity = "beta"
+
+[[evaluator_metadata.inputs]]
+name = "text"
+type = "TextInputField"
+min_text_length = 50
+max_text_length = 5000
+
+[[evaluator_metadata.inputs]]
+name = "grade"
+type = "GradeInputField"
+allowed_grades = [3, 4, 5, 6]
+
+[evaluation_settings.prompt_settings_step_main]
+type = "GOOGLE"
+model = "gemini-2.0-flash"
+temperature = 0
+""",
+        )
+        result = load_evaluator_settings(path, ConventionalityEvaluationSettings)
+        inputs = result.evaluator_metadata.inputs
+
+        text_spec = inputs["text"]
+        assert isinstance(text_spec, TextInputSpec)
+        assert text_spec.min_text_length == 50
+        assert text_spec.max_text_length == 5000
+
+        grade_spec = inputs["grade"]
+        assert isinstance(grade_spec, GradeInputSpec)
+        assert grade_spec.allowed_grades == [3, 4, 5, 6]
+
+    def test_raises_for_unknown_input_type(self, tmp_path: Path) -> None:
+        path = _write_toml(
+            tmp_path,
+            """
+[evaluator_metadata]
+id = "x"
+version = "0.1"
+name = "X"
+description = "X"
+maturity = "beta"
+
+[[evaluator_metadata.inputs]]
+name = "mystery"
+type = "UnknownInputField"
+
+[evaluation_settings.prompt_settings_step_main]
+type = "GOOGLE"
+model = "gemini-2.0-flash"
+temperature = 0
+""",
+        )
+        with pytest.raises(ConfigurationError, match="Unknown input type"):
+            load_evaluator_settings(path, ConventionalityEvaluationSettings)
+
+
+# ---------------------------------------------------------------------------
+# shared_settings_root
+# ---------------------------------------------------------------------------
+
+
+class TestSharedSettingsRoot:
+    def test_env_var_overrides_bundled_path(self, tmp_path: Path) -> None:
+        with patch.dict(os.environ, {"EVALUATORS_SETTINGS_DIR": str(tmp_path)}):
+            assert shared_settings_root() == tmp_path
+
+    def test_importlib_fallback_resolves_to_existing_directory(self) -> None:
+        env = {k: v for k, v in os.environ.items() if k != "EVALUATORS_SETTINGS_DIR"}
+        with patch.dict(os.environ, env, clear=True):
+            root = shared_settings_root()
+        assert root.exists(), f"shared_settings_root() resolved to non-existent path: {root}"
diff --git a/sdks/python/tests/test_package_imports.py b/sdks/python/tests/test_package_imports.py
index df987a2..4c6b0d4 100644
--- a/sdks/python/tests/test_package_imports.py
+++ b/sdks/python/tests/test_package_imports.py
@@ -5,12 +5,14 @@ def test_main_package_imports():
     """All advertised public symbols can be imported from the root package."""
     from learning_commons_evaluators import (
         BaseEvaluator,
+        ConventionalityEvaluator,
         __version__,
         create_config_no_telemetry,
     )
 
     assert __version__ is not None
     assert create_config_no_telemetry is not None
+    assert ConventionalityEvaluator is not None
     assert BaseEvaluator is not None
 
 
diff --git a/sdks/settings/conventionality/settings.toml b/sdks/settings/conventionality/settings.toml
new file mode 100644
index 0000000..cef7ecf
--- /dev/null
+++ b/sdks/settings/conventionality/settings.toml
@@ -0,0 +1,78 @@
+[evaluator_metadata]
+id = "conventionality"
+version = 0.1
+name = "Conventionality"
+description = "Conventionality Evaluator"
+maturity = "beta"
+
+[[evaluator_metadata.inputs]]
+name = "text"
+type = "TextInputField"
+min_text_length = 10
+max_text_length = 10000
+
+[[evaluator_metadata.inputs]]
+name = "grade"
+type = "GradeInputField"
+allowed_grades = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+
+[prompts]
+system_prompt = """
+Role
+You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level.
+
+Objective
+Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade.
+
+Complexity Levels
+- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level.
+- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging.
+- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear.
+- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices.
+
+Essential Evaluation Rules
+1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex."
+
+2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group.
+
+3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two.
+
+4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex."
+
+5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing.
+
+6. Sustained vs. Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge.
+
+7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex.
+
+8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students.
+
+9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary.
+
+Input Format
+You will receive:
+- text: The passage to evaluate.
+- grade: The target student grade level.
+- fk_score: The Flesch-Kincaid readability score.
+
+Output Format
+Provide a JSON object containing ONLY the following keys:
+- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'.
+- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural).
+- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text.
+- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade.
+- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom.
+
+{format_instructions}
+"""
+human_prompt = """
+Analyze:
+Text: {text}
+Grade: {grade}
+FK Score: {fk_score}
+"""
+
+[evaluation_settings.prompt_settings_step_main]
+type = "GOOGLE"
+model = "gemini-3-flash-preview"
+temperature = 0

From e2849a702510f55813fecf14f2c90b1b2de4fb39 Mon Sep 17 00:00:00 2001
From: Fred Sisenda <121206410+czi-fsisenda@users.noreply.github.com>
Date: Tue, 12 May 2026 12:16:16 -0700
Subject: [PATCH 02/26] chore: eval version as string

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 sdks/settings/conventionality/settings.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdks/settings/conventionality/settings.toml b/sdks/settings/conventionality/settings.toml
index cef7ecf..2cba13b 100644
--- a/sdks/settings/conventionality/settings.toml
+++ b/sdks/settings/conventionality/settings.toml
@@ -1,6 +1,6 @@
 [evaluator_metadata]
 id = "conventionality"
-version = 0.1
+version = "0.1"
 name = "Conventionality"
 description = "Conventionality Evaluator"
 maturity = "beta"

From 07a755eaca7ed3ff19d00746b97644caeb274287 Mon Sep 17 00:00:00 2001
From: Fred Sisenda <121206410+czi-fsisenda@users.noreply.github.com>
Date: Tue, 12 May 2026 12:17:28 -0700
Subject: [PATCH 03/26] test: generalize min text length test description

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 sdks/python/tests/evaluators/test_base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 0ce3542..434dc27 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -133,7 +133,7 @@ def test_uses_input_values_when_full_telemetry_enabled(self):
 
 class TestEvaluateErrorHandling:
     def test_raises_validation_error_for_invalid_input(self):
-        # "x" is below the 100-char minimum defined in the TOML settings.
+        # "x" is below the configured minimum text length in the TOML settings.
         evaluator = _evaluator()
         invalid = ConventionalityEvaluationInput(text="x", grade=5)
         with pytest.raises(ValidationError):

From cd7867b7e10e07fde2eec1ce32a374a279b46c42 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 17:12:03 -0700
Subject: [PATCH 04/26] chore: PR comments - config validation todo,
 input_metadata only in logs

---
 .../evaluators/base.py                        |   6 +-
 sdks/python/tests/evaluators/test_base.py     | 154 ++++++++++++++++--
 sdks/python/tests/schemas/test_metadata.py    |   1 +
 3 files changed, 142 insertions(+), 19 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 47af736..81fd14f 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -56,6 +56,7 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
 
     def __init__(self, config: EvaluatorConfig) -> None:
         self.config = config
+        # TODO: validate config
 
     def evaluate(
         self,
@@ -94,9 +95,7 @@ def evaluate(
         evaluation_metadata = EvaluationMetadata(
             evaluator_metadata=self.metadata,
             evaluation_settings=evaluation_settings,
-            input_metadata=input.input_values()
-            if self.config.send_full_input_with_telemetry
-            else input.input_metadata(),
+            input_metadata=input.input_metadata()
         )
         self.config.logger.info(
             "evaluation start",
@@ -118,6 +117,7 @@ def evaluate(
                 "evaluation end",
                 extra={"evaluation_metadata": evaluation_metadata},
             )
+            # TODO: add full input to telemetry if enabled
             # TODO: send_telemetry(evaluation_metadata)
 
     @abstractmethod
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 434dc27..fe8af78 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -1,26 +1,35 @@
-"""Tests for BaseEvaluator.
+"""Tests for :class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`.
 
-Covers: config wiring, evaluate() telemetry branching, error handling,
-update_total_token_usage, execute_step, and execute_prompt_chain_step.
+Covers ``evaluate`` wiring (``EvaluationMetadata`` always uses ``input.input_metadata()``,
+including when ``send_full_input_with_telemetry`` is enabled), ``execute_step``,
+``execute_prompt_chain_step``, token usage, and error handling via both a minimal stub
+evaluator and conventionality-oriented helpers.
 """
 
+from __future__ import annotations
+
 from unittest.mock import MagicMock, patch
 
 import pytest
 
 from learning_commons_evaluators import (
+    BaseEvaluator,
     ConventionalityEvaluationInput,
     ConventionalityEvaluator,
+    EvaluationExplanation,
+    TextComplexityEvaluationInput,
     create_config,
     create_config_no_telemetry,
 )
 from learning_commons_evaluators.errors import ConfigurationError
-from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
+from learning_commons_evaluators.schemas.common_inputs import GradeInputField, TextInputField
+from learning_commons_evaluators.schemas.config import EvaluationSettings, LlmProvider, PromptSettings
 from learning_commons_evaluators.schemas.conventionality import (
     ConventionalityEvaluationSettings,
     ConventionalityOutput,
 )
 from learning_commons_evaluators.schemas.errors import APIError, ValidationError
+from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec
 from learning_commons_evaluators.schemas.metadata import (
     PROMPT_STEP_EXTRA_PROMPT_SETTINGS,
     PROMPT_STEP_EXTRA_TOKEN_USAGE,
@@ -30,14 +39,64 @@
     Status,
     TokenUsage,
 )
+from learning_commons_evaluators.schemas.text_complexity import (
+    TextComplexityAnswer,
+    TextComplexityResult,
+)
 
 # ---------------------------------------------------------------------------
-# Shared test helpers
+# Stub evaluator (incoming branch) — input_metadata shape on evaluate()
+# ---------------------------------------------------------------------------
+
+
+class _StubSettings(EvaluationSettings):
+    """Minimal settings model for stub evaluator."""
+
+
+def _stub_input() -> TextComplexityEvaluationInput:
+    return TextComplexityEvaluationInput(
+        text=TextInputField(spec=TextInputSpec(name="text"), value="hello world"),
+        grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3),
+    )
+
+
+class _StubEvaluator(
+    BaseEvaluator[TextComplexityEvaluationInput, TextComplexityResult, _StubSettings]
+):
+    metadata = EvaluatorMetadata(
+        id="stub-evaluator",
+        version="0",
+        name="Stub",
+        description="Unit test stub.",
+        maturity=EvaluatorMaturity.beta,
+    )
+    default_evaluation_settings = _StubSettings()
+
+    def evaluate_impl(
+        self,
+        input: TextComplexityEvaluationInput,
+        evaluation_settings: _StubSettings,
+        evaluation_metadata,
+    ) -> TextComplexityResult:
+        return TextComplexityResult(
+            answer=TextComplexityAnswer.SLIGHTLY_COMPLEX,
+            explanation=EvaluationExplanation(summary="stub", details={}),
+            metadata=evaluation_metadata,
+        )
+
+
+@pytest.fixture
+def stub_evaluator(config):
+    return _StubEvaluator(config)
+
+
+# ---------------------------------------------------------------------------
+# Conventionality-oriented helpers (HEAD branch)
 # ---------------------------------------------------------------------------
 
 
 def _evaluator(*, send_full_input=False):
-    """Return a ConventionalityEvaluator; use send_full_input=True to enable full-input telemetry."""
+    """Return a ConventionalityEvaluator; use send_full_input=True for full-input telemetry."""
     if send_full_input:
         config = create_config(telemetry_id="test", send_full_input_with_telemetry=True)
     else:
@@ -60,7 +119,7 @@ def _meta():
     )
 
 
-# A realistic sample long enough to satisfy the min_text_length=100 constraint.
+# Long sample text (well above configured ``min_text_length`` in settings TOML).
 _SAMPLE_TEXT = (
     "Marco Polo was a Venetian merchant and explorer who traveled through Asia "
     "in the late 13th century. He spent nearly two decades at the court of "
@@ -91,6 +150,30 @@ def _inp(text=_SAMPLE_TEXT, grade=5):
 _CHAIN_PATCH = "learning_commons_evaluators.evaluators.base.create_provider"
 
 
+# ---------------------------------------------------------------------------
+# evaluate() — input_metadata (stub)
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateInputMetadata:
+    def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator):
+        inp = _stub_input()
+        result = stub_evaluator.evaluate(inp)
+        assert result.metadata.input_metadata == inp.input_metadata()
+        assert result.metadata.input_metadata["text"] == {"textLength": "11"}
+        assert result.metadata.input_metadata["grade_level"] == {"grade": 3}
+
+    def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator):
+        """``send_full_input_with_telemetry`` does not replace ``input_metadata`` with raw values."""
+        cfg = create_config(telemetry_id="test", send_full_input_with_telemetry=True)
+        ev = _StubEvaluator(cfg)
+        inp = _stub_input()
+        result = ev.evaluate(inp)
+        assert result.metadata.input_metadata == inp.input_metadata()
+        assert result.metadata.input_metadata["text"] == {"textLength": "11"}
+        assert result.metadata.input_metadata["grade_level"] == {"grade": 3}
+
+
 # ---------------------------------------------------------------------------
 # BaseEvaluator.__init__
 # ---------------------------------------------------------------------------
@@ -103,7 +186,7 @@ def test_config_is_stored(self):
 
 
 # ---------------------------------------------------------------------------
-# evaluate() — telemetry branching
+# evaluate() — telemetry (conventionality)
 # ---------------------------------------------------------------------------
 
 
@@ -113,17 +196,15 @@ def test_uses_input_metadata_by_default(self):
         evaluator = _evaluator()
         with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
             result = evaluator.evaluate(_inp())
-        # input_metadata() returns shape metadata, not the raw text value.
         assert result.metadata.input_metadata["text"] == {"textLength": str(len(_SAMPLE_TEXT))}
 
-    def test_uses_input_values_when_full_telemetry_enabled(self):
-        """When send_full_input_with_telemetry=True, input.input_values() is used."""
+    def test_full_telemetry_still_records_input_metadata_not_raw_values(self):
+        """``send_full_input_with_telemetry`` does not put raw field values on metadata."""
         evaluator = _evaluator(send_full_input=True)
         with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
             result = evaluator.evaluate(_inp())
-        # input_values() returns the raw text string, not {"textLength": ...}.
-        assert result.metadata.input_metadata["text"] == _SAMPLE_TEXT
-        assert result.metadata.input_metadata["grade"] == 5
+        assert result.metadata.input_metadata["text"] == {"textLength": str(len(_SAMPLE_TEXT))}
+        assert result.metadata.input_metadata["grade"] == {"grade": 5}
 
 
 # ---------------------------------------------------------------------------
@@ -131,9 +212,8 @@ def test_uses_input_values_when_full_telemetry_enabled(self):
 # ---------------------------------------------------------------------------
 
 
-class TestEvaluateErrorHandling:
+class TestConventionalityEvaluateErrorHandling:
     def test_raises_validation_error_for_invalid_input(self):
-        # "x" is below the configured minimum text length in the TOML settings.
         evaluator = _evaluator()
         invalid = ConventionalityEvaluationInput(text="x", grade=5)
         with pytest.raises(ValidationError):
@@ -148,6 +228,26 @@ def test_propagates_evaluate_impl_exception(self):
             evaluator.evaluate(_inp())
 
 
+class TestStubEvaluateErrorHandling:
+    def test_raises_validation_error_for_invalid_input(self, stub_evaluator):
+        inp = TextComplexityEvaluationInput(
+            text=TextInputField(
+                spec=TextInputSpec(name="text", min_text_length=100),
+                value="short",
+            ),
+            grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3),
+        )
+        with pytest.raises(ValidationError):
+            stub_evaluator.evaluate(inp)
+
+    def test_propagates_evaluate_impl_exception(self, stub_evaluator):
+        with (
+            patch.object(stub_evaluator, "evaluate_impl", side_effect=RuntimeError("boom")),
+            pytest.raises(RuntimeError, match="boom"),
+        ):
+            stub_evaluator.evaluate(_stub_input())
+
+
 # ---------------------------------------------------------------------------
 # update_total_token_usage
 # ---------------------------------------------------------------------------
@@ -233,6 +333,28 @@ class TestExecutePromptChainStep:
     token_usage_from_aimessage exercise the real code paths.
     """
 
+    def test_returns_raw_string_when_parser_output_type_omitted(self, evaluation_metadata):
+        from langchain_core.messages import AIMessage
+        from langchain_core.prompts import ChatPromptTemplate
+
+        def _fake_llm(_pv):
+            return AIMessage(content="plain prose")
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with patch(_CHAIN_PATCH, return_value=_fake_llm):
+            out = _evaluator().execute_prompt_chain_step(
+                step_name="raw",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "Hello"},
+            )
+        assert out == "plain prose"
+
     def test_returns_parsed_pydantic_output(self, evaluation_metadata):
         from langchain_core.messages import AIMessage
         from langchain_core.prompts import ChatPromptTemplate
diff --git a/sdks/python/tests/schemas/test_metadata.py b/sdks/python/tests/schemas/test_metadata.py
index 408a39e..c2b7590 100644
--- a/sdks/python/tests/schemas/test_metadata.py
+++ b/sdks/python/tests/schemas/test_metadata.py
@@ -126,6 +126,7 @@ def test_defaults(self, evaluation_metadata):
         assert evaluation_metadata.step_details == {}
         assert evaluation_metadata.total_token_usage == {}
         assert evaluation_metadata.error_details is None
+        assert evaluation_metadata.input_metadata == {}
 
     def test_status_can_be_mutated(self, evaluation_metadata):
         evaluation_metadata.status = Status.succeeded

From 673857e4d00d27c717889ced709011d756644de5 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 17:24:54 -0700
Subject: [PATCH 05/26] chore: execute_step implementation without unsafe casts

---
 .../evaluators/base.py                        | 61 +++++++++++++------
 1 file changed, 42 insertions(+), 19 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 81fd14f..0a7f495 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -5,7 +5,7 @@
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Callable
-from typing import Any, Generic, TypeVar, cast
+from typing import Any, Generic, TypeVar, overload
 
 from pydantic import BaseModel
 from pydantic import ValidationError as PydanticValidationError
@@ -41,7 +41,8 @@
 InputT = TypeVar("InputT", bound=EvaluationInput)
 OutputT = TypeVar("OutputT", bound=EvaluationResult)
 SettingsT = TypeVar("SettingsT", bound=EvaluationSettings)
-ResultT = TypeVar("ResultT", bound=BaseModel)
+StepResultT = TypeVar("StepResultT")
+ParsedT = TypeVar("ParsedT", bound=BaseModel)
 
 
 class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
@@ -134,14 +135,17 @@ def execute_step(
         self,
         step_name: str,
         evaluation_metadata: EvaluationMetadata,
-        implementation_function: Callable[[], ResultT],
+        implementation_function: Callable[[], StepResultT],
         *,
         extras: dict[str, Any] | None = None,
-    ) -> ResultT:
+    ) -> StepResultT:
         """Run ``implementation_function`` and record step metadata on ``evaluation_metadata``.
 
         ``step_name`` is always the step id. Optional ``extras`` is copied into
         :attr:`StepMetadata.extras` (merged with any updates made during the step, e.g. token usage).
+
+        The step may return any type (e.g. a Pydantic model, a plain ``str``, or ``None``); the same
+        type is returned to the caller.
         """
         start = time.perf_counter()
         step_extras = dict(extras) if extras is not None else {}
@@ -160,6 +164,28 @@ def execute_step(
             self.config.logger.info("step end", extra={"step_metadata": step_metadata})
             evaluation_metadata.step_details[step_name] = step_metadata
 
+    @overload
+    def execute_prompt_chain_step(
+        self,
+        step_name: str,
+        prompt_settings: PromptSettings,
+        evaluation_metadata: EvaluationMetadata,
+        template: Any,
+        chain_inputs: dict[str, Any],
+        parser_output_type: None = None,
+    ) -> str: ...
+
+    @overload
+    def execute_prompt_chain_step(
+        self,
+        step_name: str,
+        prompt_settings: PromptSettings,
+        evaluation_metadata: EvaluationMetadata,
+        template: Any,
+        chain_inputs: dict[str, Any],
+        parser_output_type: type[ParsedT],
+    ) -> ParsedT: ...
+
     def execute_prompt_chain_step(
         self,
         step_name: str,
@@ -167,8 +193,8 @@ def execute_prompt_chain_step(
         evaluation_metadata: EvaluationMetadata,
         template: Any,
         chain_inputs: dict[str, Any],
-        parser_output_type: type[ResultT] | None = None,
-    ) -> ResultT | str:
+        parser_output_type: type[BaseModel] | None = None,
+    ) -> BaseModel | str:
         """Run a prompt chain (template | LLM), record metadata, and return the result.
 
         When ``parser_output_type`` is a Pydantic model class, the LLM response is
@@ -201,7 +227,7 @@ def execute_prompt_chain_step(
         # Populated after a successful LLM invoke so we can attach usage even if parsing fails.
         token_usage: TokenUsage | None = None
 
-        def _run_chain() -> ResultT | str:
+        def _run_chain() -> BaseModel | str:
             nonlocal token_usage
             try:
                 provider = create_provider(prompt_settings, self.config)
@@ -227,18 +253,15 @@ def _run_chain() -> ResultT | str:
                 raise wrap_provider_error(e) from e
 
         try:
-            return cast(
-                ResultT | str,
-                self.execute_step(
-                    step_name,
-                    evaluation_metadata,
-                    extras={
-                        PROMPT_STEP_EXTRA_PROMPT_SETTINGS: prompt_settings_to_extras_value(
-                            prompt_settings
-                        ),
-                    },
-                    implementation_function=cast(Callable[[], ResultT], _run_chain),
-                ),
+            return self.execute_step(
+                step_name,
+                evaluation_metadata,
+                extras={
+                    PROMPT_STEP_EXTRA_PROMPT_SETTINGS: prompt_settings_to_extras_value(
+                        prompt_settings
+                    ),
+                },
+                implementation_function=_run_chain,
             )
         finally:
             if token_usage is not None:

From bf67996b4f637dd818caf1c197e4af7cd1e2c6ba Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 17:41:02 -0700
Subject: [PATCH 06/26] test: unit tests for base evaluator. somehow didn't
 make it into this branch when I split the PR using AI

---
 sdks/python/tests/evaluators/test_base.py | 446 +++++++++++++++-------
 1 file changed, 310 insertions(+), 136 deletions(-)

diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index fe8af78..15efbb0 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -1,16 +1,22 @@
 """Tests for :class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`.
 
-Covers ``evaluate`` wiring (``EvaluationMetadata`` always uses ``input.input_metadata()``,
-including when ``send_full_input_with_telemetry`` is enabled), ``execute_step``,
-``execute_prompt_chain_step``, token usage, and error handling via both a minimal stub
-evaluator and conventionality-oriented helpers.
+Covers ``__init__``, ``evaluate`` (metadata, settings override, success/failure, telemetry),
+``update_total_token_usage``, ``execute_step``, and ``execute_prompt_chain_step``.
+``EvaluationMetadata`` always uses ``input.input_metadata()`` (including when
+``send_full_input_with_telemetry`` is enabled). Helpers use both a minimal stub evaluator
+and conventionality-oriented fixtures where useful.
 """
 
 from __future__ import annotations
 
+import logging
 from unittest.mock import MagicMock, patch
 
 import pytest
+from langchain_core.messages import AIMessage
+from langchain_core.prompts import ChatPromptTemplate
+from pydantic import BaseModel, Field
+from pydantic import ValidationError as PydanticValidationError
 
 from learning_commons_evaluators import (
     BaseEvaluator,
@@ -23,17 +29,17 @@
 )
 from learning_commons_evaluators.errors import ConfigurationError
 from learning_commons_evaluators.schemas.common_inputs import GradeInputField, TextInputField
-from learning_commons_evaluators.schemas.config import EvaluationSettings, LlmProvider, PromptSettings
-from learning_commons_evaluators.schemas.conventionality import (
-    ConventionalityEvaluationSettings,
-    ConventionalityOutput,
+from learning_commons_evaluators.schemas.config import (
+    EvaluationSettings,
+    LlmProvider,
+    PromptSettings,
 )
-from learning_commons_evaluators.schemas.errors import APIError, ValidationError
+from learning_commons_evaluators.schemas.conventionality import ConventionalityOutput
+from learning_commons_evaluators.schemas.errors import APIError, EvaluatorError, ValidationError
 from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec
 from learning_commons_evaluators.schemas.metadata import (
     PROMPT_STEP_EXTRA_PROMPT_SETTINGS,
     PROMPT_STEP_EXTRA_TOKEN_USAGE,
-    EvaluationMetadata,
     EvaluatorMaturity,
     EvaluatorMetadata,
     Status,
@@ -44,13 +50,23 @@
     TextComplexityResult,
 )
 
-# ---------------------------------------------------------------------------
-# Stub evaluator (incoming branch) — input_metadata shape on evaluate()
-# ---------------------------------------------------------------------------
+_CHAIN_PATCH = "learning_commons_evaluators.evaluators.base.create_provider"
+
+
+class _ChainOutput(BaseModel):
+    """Minimal LLM JSON payload model (stand-in for conventionality output models)."""
+
+    label: str = Field(description="short label")
+    score: int = Field(description="numeric score")
+
+
+_CHAIN_JSON = '{"label": "ok", "score": 7}'
 
 
 class _StubSettings(EvaluationSettings):
-    """Minimal settings model for stub evaluator."""
+    """Minimal settings; ``marker`` supports tests that pass explicit ``evaluation_settings``."""
+
+    marker: int = 0
 
 
 def _stub_input() -> TextComplexityEvaluationInput:
@@ -80,7 +96,10 @@ def evaluate_impl(
     ) -> TextComplexityResult:
         return TextComplexityResult(
             answer=TextComplexityAnswer.SLIGHTLY_COMPLEX,
-            explanation=EvaluationExplanation(summary="stub", details={}),
+            explanation=EvaluationExplanation(
+                summary="stub",
+                details={"marker": evaluation_settings.marker},
+            ),
             metadata=evaluation_metadata,
         )
 
@@ -90,36 +109,15 @@ def stub_evaluator(config):
     return _StubEvaluator(config)
 
 
-# ---------------------------------------------------------------------------
-# Conventionality-oriented helpers (HEAD branch)
-# ---------------------------------------------------------------------------
-
-
 def _evaluator(*, send_full_input=False):
     """Return a ConventionalityEvaluator; use send_full_input=True for full-input telemetry."""
     if send_full_input:
-        config = create_config(telemetry_id="test", send_full_input_with_telemetry=True)
+        cfg = create_config(telemetry_id="test", send_full_input_with_telemetry=True)
     else:
-        config = create_config_no_telemetry()
-    return ConventionalityEvaluator(config)
-
-
-def _meta():
-    """Return a minimal EvaluationMetadata suitable for direct method tests."""
-    return EvaluationMetadata(
-        evaluator_metadata=EvaluatorMetadata(
-            id="test",
-            version="0.1",
-            name="Test",
-            description="Test",
-            maturity=EvaluatorMaturity.beta,
-        ),
-        evaluation_settings=ConventionalityEvaluationSettings(),
-        input_metadata={},
-    )
+        cfg = create_config_no_telemetry()
+    return ConventionalityEvaluator(cfg)
 
 
-# Long sample text (well above configured ``min_text_length`` in settings TOML).
 _SAMPLE_TEXT = (
     "Marco Polo was a Venetian merchant and explorer who traveled through Asia "
     "in the late 13th century. He spent nearly two decades at the court of "
@@ -132,7 +130,6 @@ def _inp(text=_SAMPLE_TEXT, grade=5):
     return ConventionalityEvaluationInput(text=text, grade=grade)
 
 
-# ConventionalityOutput used as a stand-in for the LLM-parsed output.
 _MOCK_OUTPUT = ConventionalityOutput(
     complexity_score="moderately_complex",
     reasoning="Uses some conventional language.",
@@ -141,21 +138,37 @@ def _inp(text=_SAMPLE_TEXT, grade=5):
     instructional_insights="Consider scaffolding.",
 )
 
-# A minimal JSON body accepted by ConventionalityOutput's parser.
 _CONV_JSON = (
     '{"complexity_score": "slightly_complex", "reasoning": "Clear.",'
     ' "conventionality_features": [], "grade_context": "Fine.", "instructional_insights": "None."}'
 )
 
-_CHAIN_PATCH = "learning_commons_evaluators.evaluators.base.create_provider"
 
+class TestBaseEvaluatorInit:
+    def test_stub_config_is_stored(self, stub_evaluator, config):
+        assert stub_evaluator.config is config
+
+    def test_conventionality_config_is_stored(self):
+        config = create_config_no_telemetry()
+        assert ConventionalityEvaluator(config).config is config
+
+
+class TestEvaluateSuccess:
+    def test_sets_status_succeeded_and_processing_time(self, stub_evaluator):
+        result = stub_evaluator.evaluate(_stub_input())
+        assert result.metadata.status == Status.succeeded
+        assert result.metadata.processing_time_ms >= 0.0
 
-# ---------------------------------------------------------------------------
-# evaluate() — input_metadata (stub)
-# ---------------------------------------------------------------------------
+    def test_passes_explicit_evaluation_settings(self, stub_evaluator):
+        custom = _StubSettings(marker=42)
+        result = stub_evaluator.evaluate(_stub_input(), evaluation_settings=custom)
+        assert result.metadata.evaluation_settings.marker == 42
+        assert result.explanation.details.get("marker") == 42
 
 
 class TestEvaluateInputMetadata:
+    """``input_metadata`` on :class:`EvaluationMetadata` always comes from ``input.input_metadata()``."""
+
     def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator):
         inp = _stub_input()
         result = stub_evaluator.evaluate(inp)
@@ -174,32 +187,14 @@ def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, st
         assert result.metadata.input_metadata["grade_level"] == {"grade": 3}
 
 
-# ---------------------------------------------------------------------------
-# BaseEvaluator.__init__
-# ---------------------------------------------------------------------------
-
-
-class TestBaseEvaluatorInit:
-    def test_config_is_stored(self):
-        config = create_config_no_telemetry()
-        assert ConventionalityEvaluator(config).config is config
-
-
-# ---------------------------------------------------------------------------
-# evaluate() — telemetry (conventionality)
-# ---------------------------------------------------------------------------
-
-
 class TestEvaluateTelemetryBranching:
     def test_uses_input_metadata_by_default(self):
-        """When send_full_input_with_telemetry=False, input.input_metadata() is used."""
         evaluator = _evaluator()
         with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
             result = evaluator.evaluate(_inp())
         assert result.metadata.input_metadata["text"] == {"textLength": str(len(_SAMPLE_TEXT))}
 
     def test_full_telemetry_still_records_input_metadata_not_raw_values(self):
-        """``send_full_input_with_telemetry`` does not put raw field values on metadata."""
         evaluator = _evaluator(send_full_input=True)
         with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
             result = evaluator.evaluate(_inp())
@@ -207,11 +202,6 @@ def test_full_telemetry_still_records_input_metadata_not_raw_values(self):
         assert result.metadata.input_metadata["grade"] == {"grade": 5}
 
 
-# ---------------------------------------------------------------------------
-# evaluate() — error handling
-# ---------------------------------------------------------------------------
-
-
 class TestConventionalityEvaluateErrorHandling:
     def test_raises_validation_error_for_invalid_input(self):
         evaluator = _evaluator()
@@ -247,35 +237,57 @@ def test_propagates_evaluate_impl_exception(self, stub_evaluator):
         ):
             stub_evaluator.evaluate(_stub_input())
 
+    def test_validation_failure_emits_end_log_with_failed_status(self, stub_evaluator):
+        captured: list = []
+
+        class _Capture(logging.Handler):
+            def emit(self, record: logging.LogRecord) -> None:
+                meta = getattr(record, "evaluation_metadata", None)
+                if meta is not None and record.getMessage() == "evaluation end":
+                    captured.append(meta)
+
+        h = _Capture()
+        stub_evaluator.config.logger.addHandler(h)
+        stub_evaluator.config.logger.setLevel(logging.INFO)
+        try:
+            inp = TextComplexityEvaluationInput(
+                text=TextInputField(
+                    spec=TextInputSpec(name="text", min_text_length=100),
+                    value="short",
+                ),
+                grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3),
+            )
+            with pytest.raises(ValidationError):
+                stub_evaluator.evaluate(inp)
+        finally:
+            stub_evaluator.config.logger.removeHandler(h)
 
-# ---------------------------------------------------------------------------
-# update_total_token_usage
-# ---------------------------------------------------------------------------
+        assert captured
+        assert captured[-1].status == Status.failed
+        assert captured[-1].error_details
 
 
 class TestUpdateTotalTokenUsage:
-    def test_inserts_usage_for_new_provider(self, evaluation_metadata):
-        evaluator = _evaluator()
+    def test_inserts_usage_for_new_provider(self, stub_evaluator, evaluation_metadata):
         usage = TokenUsage(
             provider_type=LlmProvider.GOOGLE,
             model="gemini-2.0-flash",
             input_tokens=10,
             output_tokens=5,
         )
-        evaluator.update_total_token_usage(usage, evaluation_metadata)
+        stub_evaluator.update_total_token_usage(usage, evaluation_metadata)
         stored = evaluation_metadata.total_token_usage[LlmProvider.GOOGLE]
         assert stored.input_tokens == 10
         assert stored.output_tokens == 5
 
-    def test_accumulates_usage_for_existing_provider(self, evaluation_metadata):
-        evaluator = _evaluator()
+    def test_accumulates_usage_for_existing_provider(self, stub_evaluator, evaluation_metadata):
         evaluation_metadata.total_token_usage[LlmProvider.GOOGLE] = TokenUsage(
             provider_type=LlmProvider.GOOGLE,
             model="gemini-2.0-flash",
             input_tokens=10,
             output_tokens=5,
         )
-        evaluator.update_total_token_usage(
+        stub_evaluator.update_total_token_usage(
             TokenUsage(
                 provider_type=LlmProvider.GOOGLE,
                 model="gemini-2.0-flash",
@@ -289,60 +301,53 @@ def test_accumulates_usage_for_existing_provider(self, evaluation_metadata):
         assert stored.output_tokens == 20
 
 
-# ---------------------------------------------------------------------------
-# execute_step
-# ---------------------------------------------------------------------------
-
-
 class TestExecuteStep:
-    def test_returns_implementation_result(self, evaluation_metadata):
-        result = _evaluator().execute_step("s", evaluation_metadata, lambda: "the-result")
-        assert result == "the-result"
+    def test_returns_implementation_result(self, stub_evaluator, evaluation_metadata):
+        assert (
+            stub_evaluator.execute_step("s", evaluation_metadata, lambda: "the-result")
+            == "the-result"
+        )
 
-    def test_records_succeeded_status_on_success(self, evaluation_metadata):
-        _evaluator().execute_step("s", evaluation_metadata, lambda: None)
+    def test_records_succeeded_status_on_success(self, stub_evaluator, evaluation_metadata):
+        stub_evaluator.execute_step("s", evaluation_metadata, lambda: None)
         assert evaluation_metadata.step_details["s"].status == Status.succeeded
 
-    def test_records_failed_status_and_error_on_exception(self, evaluation_metadata):
+    def test_records_failed_status_and_error_on_exception(
+        self, stub_evaluator, evaluation_metadata
+    ):
         failing = MagicMock(side_effect=ValueError("boom"))
         with pytest.raises(ValueError, match="boom"):
-            _evaluator().execute_step("s", evaluation_metadata, failing)
+            stub_evaluator.execute_step("s", evaluation_metadata, failing)
         step = evaluation_metadata.step_details["s"]
         assert step.status == Status.failed
         assert "boom" in step.error_details
 
-    def test_re_raises_exception(self, evaluation_metadata):
+    def test_re_raises_exception(self, stub_evaluator, evaluation_metadata):
         failing = MagicMock(side_effect=RuntimeError("inner"))
         with pytest.raises(RuntimeError, match="inner"):
-            _evaluator().execute_step("s", evaluation_metadata, failing)
+            stub_evaluator.execute_step("s", evaluation_metadata, failing)
 
-    def test_extras_appear_in_step_metadata(self, evaluation_metadata):
-        _evaluator().execute_step("s", evaluation_metadata, lambda: None, extras={"k": "v"})
+    def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadata):
+        stub_evaluator.execute_step("s", evaluation_metadata, lambda: None, extras={"k": "v"})
         assert evaluation_metadata.step_details["s"].extras["k"] == "v"
 
 
-# ---------------------------------------------------------------------------
-# execute_prompt_chain_step
-# ---------------------------------------------------------------------------
-
-
 class TestExecutePromptChainStep:
-    """Mock create_provider so the chain runs in-process with a fake LLM.
+    """Mock ``create_provider`` so ``template | provider`` runs in-process.
 
-    The fake LLM returns a real AIMessage so JsonOutputParser and
-    token_usage_from_aimessage exercise the real code paths.
+    Fake LLMs return real ``AIMessage`` values so ``JsonOutputParser`` and
+    ``token_usage_from_aimessage`` exercise the real code paths where applicable.
     """
 
-    def test_returns_raw_string_when_parser_output_type_omitted(self, evaluation_metadata):
-        from langchain_core.messages import AIMessage
-        from langchain_core.prompts import ChatPromptTemplate
-
+    def test_returns_raw_string_when_parser_output_type_is_none(
+        self, stub_evaluator, evaluation_metadata
+    ):
         def _fake_llm(_pv):
             return AIMessage(content="plain prose")
 
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with patch(_CHAIN_PATCH, return_value=_fake_llm):
-            out = _evaluator().execute_prompt_chain_step(
+            out = stub_evaluator.execute_prompt_chain_step(
                 step_name="raw",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -352,14 +357,12 @@ def _fake_llm(_pv):
                 evaluation_metadata=evaluation_metadata,
                 template=template,
                 chain_inputs={"input": "Hello"},
+                parser_output_type=None,
             )
         assert out == "plain prose"
 
-    def test_returns_parsed_pydantic_output(self, evaluation_metadata):
-        from langchain_core.messages import AIMessage
-        from langchain_core.prompts import ChatPromptTemplate
-
-        def _fake_llm(prompt_value):
+    def test_returns_parsed_conventionality_output(self, evaluation_metadata):
+        def _fake_llm(_pv):
             return AIMessage(content=_CONV_JSON)
 
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
@@ -379,10 +382,113 @@ def _fake_llm(prompt_value):
         assert result.complexity_score == "slightly_complex"
         assert result.reasoning == "Clear."
 
-    def test_prompt_settings_recorded_in_step_extras(self, evaluation_metadata):
-        from langchain_core.messages import AIMessage
-        from langchain_core.prompts import ChatPromptTemplate
+    def test_returns_parsed_stub_chain_output(self, stub_evaluator, evaluation_metadata):
+        """JSON emitted by the fake LLM comes back as a validated ``_ChainOutput``."""
+        settings = PromptSettings(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            temperature=0.0,
+        )
+        prompt = ChatPromptTemplate.from_messages([("human", "{input}")])
+        # The patched provider factory hands back a callable that stands in for the LLM.
+        with patch(_CHAIN_PATCH, return_value=lambda _pv: AIMessage(content=_CHAIN_JSON)):
+            parsed = stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=settings,
+                evaluation_metadata=evaluation_metadata,
+                template=prompt,
+                chain_inputs={"input": "Hello"},
+                parser_output_type=_ChainOutput,
+            )
+        assert isinstance(parsed, _ChainOutput)
+        assert parsed.label == "ok"
+        assert parsed.score == 7
+
+    def test_parser_returning_model_instance_short_circuits_model_validate(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        """A parser result that is already a ``_ChainOutput`` is handed back as-is."""
+        expected = _ChainOutput(label="direct", score=99)
+        # The parser class is patched below, so the fake LLM content is only a placeholder.
+        fake_parser = MagicMock()
+        fake_parser.invoke.return_value = expected
+        settings = PromptSettings(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            temperature=0.0,
+        )
+        prompt = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with (
+            patch(_CHAIN_PATCH, return_value=lambda _pv: AIMessage(content="unused")),
+            patch(
+                "langchain_core.output_parsers.json.JsonOutputParser",
+                return_value=fake_parser,
+            ),
+        ):
+            outcome = stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=settings,
+                evaluation_metadata=evaluation_metadata,
+                template=prompt,
+                chain_inputs={"input": "Hello"},
+                parser_output_type=_ChainOutput,
+            )
+        assert outcome is expected
+
+    def test_keyboard_interrupt_from_parser_propagates(self, stub_evaluator, evaluation_metadata):
+        def _fake_llm(_pv):
+            return AIMessage(content=_CHAIN_JSON)
 
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with (
+            patch(_CHAIN_PATCH, return_value=_fake_llm),
+            patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls,
+        ):
+            mock_parser = MagicMock()
+            mock_parser.invoke.side_effect = KeyboardInterrupt
+            mock_parser_cls.return_value = mock_parser
+            with pytest.raises(KeyboardInterrupt):
+                stub_evaluator.execute_prompt_chain_step(
+                    step_name="main",
+                    prompt_settings=PromptSettings(
+                        provider_type=LlmProvider.GOOGLE,
+                        model="gemini-2.0-flash",
+                        temperature=0.0,
+                    ),
+                    evaluation_metadata=evaluation_metadata,
+                    template=template,
+                    chain_inputs={"input": "Hello"},
+                    parser_output_type=_ChainOutput,
+                )
+
+    def test_system_exit_from_parser_propagates(self, stub_evaluator, evaluation_metadata):
+        """``SystemExit`` raised by the parser escapes with its exit code intact."""
+        exiting_parser = MagicMock()
+        exiting_parser.invoke.side_effect = SystemExit(3)
+        settings = PromptSettings(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            temperature=0.0,
+        )
+        with (
+            patch(_CHAIN_PATCH, return_value=lambda _pv: AIMessage(content=_CHAIN_JSON)),
+            patch(
+                "langchain_core.output_parsers.json.JsonOutputParser",
+                return_value=exiting_parser,
+            ),
+            pytest.raises(SystemExit) as raised,
+        ):
+            stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=settings,
+                evaluation_metadata=evaluation_metadata,
+                template=ChatPromptTemplate.from_messages([("human", "{input}")]),
+                chain_inputs={"input": "Hello"},
+                parser_output_type=_ChainOutput,
+            )
+        assert raised.value.code == 3
+
+    def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluation_metadata):
         settings = PromptSettings(
             provider_type=LlmProvider.GOOGLE,
             model="gemini-2.0-flash",
@@ -390,27 +496,26 @@ def test_prompt_settings_recorded_in_step_extras(self, evaluation_metadata):
         )
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
 
-        with patch(_CHAIN_PATCH, return_value=lambda pv: AIMessage(content=_CONV_JSON)):
-            _evaluator().execute_prompt_chain_step(
+        with patch(_CHAIN_PATCH, return_value=lambda _pv: AIMessage(content=_CHAIN_JSON)):
+            stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=settings,
                 evaluation_metadata=evaluation_metadata,
                 template=template,
                 chain_inputs={"input": "text"},
-                parser_output_type=ConventionalityOutput,
+                parser_output_type=_ChainOutput,
             )
 
         step = evaluation_metadata.step_details["main"]
         assert step.extras[PROMPT_STEP_EXTRA_PROMPT_SETTINGS]["model"] == "gemini-2.0-flash"
         assert PROMPT_STEP_EXTRA_TOKEN_USAGE in step.extras
 
-    def test_token_usage_recorded_when_llm_reports_usage(self, evaluation_metadata):
-        from langchain_core.messages import AIMessage
-        from langchain_core.prompts import ChatPromptTemplate
-
-        def _llm_with_usage(pv):
+    def test_token_usage_recorded_when_llm_reports_usage(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        def _llm_with_usage(_pv):
             return AIMessage(
-                content=_CONV_JSON,
+                content=_CHAIN_JSON,
                 usage_metadata={
                     "input_tokens": 42,
                     "output_tokens": 17,
@@ -426,13 +531,13 @@ def _llm_with_usage(pv):
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
 
         with patch(_CHAIN_PATCH, return_value=_llm_with_usage):
-            _evaluator().execute_prompt_chain_step(
+            stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=settings,
                 evaluation_metadata=evaluation_metadata,
                 template=template,
                 chain_inputs={"input": "text"},
-                parser_output_type=ConventionalityOutput,
+                parser_output_type=_ChainOutput,
             )
 
         step = evaluation_metadata.step_details["main"]
@@ -440,9 +545,9 @@ def _llm_with_usage(pv):
         assert step.extras[PROMPT_STEP_EXTRA_TOKEN_USAGE]["output_tokens"] == 17
         assert evaluation_metadata.total_token_usage[LlmProvider.GOOGLE].input_tokens == 42
 
-    def test_propagates_configuration_error_from_create_provider(self, evaluation_metadata):
-        from langchain_core.prompts import ChatPromptTemplate
-
+    def test_propagates_configuration_error_from_create_provider(
+        self, stub_evaluator, evaluation_metadata
+    ):
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
             patch(
@@ -451,7 +556,7 @@ def test_propagates_configuration_error_from_create_provider(self, evaluation_me
             ),
             pytest.raises(ConfigurationError, match="Google provider config is not set"),
         ):
-            _evaluator().execute_prompt_chain_step(
+            stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -461,14 +566,33 @@ def test_propagates_configuration_error_from_create_provider(self, evaluation_me
                 evaluation_metadata=evaluation_metadata,
                 template=template,
                 chain_inputs={"input": "text"},
-                parser_output_type=ConventionalityOutput,
+                parser_output_type=_ChainOutput,
             )
 
-    def test_wraps_unexpected_chain_failure_as_api_error(self, evaluation_metadata):
-        """LangChain / provider failures are mapped via wrap_provider_error."""
-        from langchain_core.prompts import ChatPromptTemplate
+    def test_propagates_evaluator_error_without_wrapping(self, stub_evaluator, evaluation_metadata):
+        """An ``EvaluatorError`` from the patched factory still surfaces with its message."""
+        settings = PromptSettings(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            temperature=0.0,
+        )
+        prompt = ChatPromptTemplate.from_messages([("human", "{input}")])
+        # Patching with ``side_effect`` makes the patched factory itself raise.
+        failing_factory = patch(_CHAIN_PATCH, side_effect=EvaluatorError("bare evaluator error"))
+        with failing_factory, pytest.raises(EvaluatorError, match="bare evaluator error"):
+            stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=settings,
+                evaluation_metadata=evaluation_metadata,
+                template=prompt,
+                chain_inputs={"input": "text"},
+                parser_output_type=_ChainOutput,
+            )
 
-        def _boom(_prompt_value):
+    def test_wraps_unexpected_chain_failure_as_api_error(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        def _boom(_pv):
             raise ValueError("simulated provider failure")
 
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
@@ -476,7 +600,7 @@ def _boom(_prompt_value):
             patch(_CHAIN_PATCH, return_value=_boom),
             pytest.raises(APIError, match="simulated provider failure"),
         ):
-            _evaluator().execute_prompt_chain_step(
+            stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -486,5 +610,55 @@ def _boom(_prompt_value):
                 evaluation_metadata=evaluation_metadata,
                 template=template,
                 chain_inputs={"input": "text"},
-                parser_output_type=ConventionalityOutput,
+                parser_output_type=_ChainOutput,
+            )
+
+    def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluation_metadata):
+        """LLM content that is not JSON is reported as :class:`APIError`."""
+        # The match string is expected to come from the JSON parser's own error text
+        # (NOTE(review): confirm it stays stable across langchain_core upgrades).
+        settings = PromptSettings(
+            provider_type=LlmProvider.GOOGLE,
+            model="gemini-2.0-flash",
+            temperature=0.0,
+        )
+        prompt = ChatPromptTemplate.from_messages([("human", "{input}")])
+
+        with (
+            patch(_CHAIN_PATCH, return_value=lambda _pv: AIMessage(content="not-json")),
+            pytest.raises(APIError, match="Invalid json output"),
+        ):
+            stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=settings,
+                evaluation_metadata=evaluation_metadata,
+                template=prompt,
+                chain_inputs={"input": "text"},
+                parser_output_type=_ChainOutput,
+            )
+
+    def test_schema_mismatch_raises_pydantic_validation_error(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        """Valid JSON that does not satisfy the output model raises Pydantic ``ValidationError``."""
+
+        def _partial(_pv):
+            return AIMessage(content='{"label": "only"}')
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with (
+            patch(_CHAIN_PATCH, return_value=_partial),
+            pytest.raises(PydanticValidationError),
+        ):
+            stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "text"},
+                parser_output_type=_ChainOutput,
             )

From f9fb6d5fcc9a52ed110b8b3618d11c33e1ec722c Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 17:52:40 -0700
Subject: [PATCH 07/26] chore: textLength as int

---
 .../src/learning_commons_evaluators/schemas/common_inputs.py  | 2 +-
 .../src/learning_commons_evaluators/schemas/evaluator.py      | 2 +-
 sdks/python/tests/evaluators/test_base.py                     | 4 ++--
 sdks/python/tests/schemas/test_common_inputs.py               | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
index 37a6aa0..446f7a4 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
@@ -53,7 +53,7 @@ def validate(self) -> None:
 
     def input_metadata(self) -> dict[str, Any]:
         """Return the character count — never the raw text (may contain PII)."""
-        return {"textLength": str(len(self.value))}
+        return {"textLength": len(self.value)}
 
 
 class GradeInputField(InputField[int]):
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py
index 1e54a57..9b3c6d0 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py
@@ -46,7 +46,7 @@ def validate(self) -> None:
                     raise ValidationError(...)
 
             def input_metadata(self) -> dict[str, Any]:
-                return {"textLength": str(len(self.value))}
+                return {"textLength": len(self.value)}
     """
 
     spec: InputSpec
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 15efbb0..c00f03b 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -173,7 +173,7 @@ def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator):
         inp = _stub_input()
         result = stub_evaluator.evaluate(inp)
         assert result.metadata.input_metadata == inp.input_metadata()
-        assert result.metadata.input_metadata["text"] == {"textLength": "11"}
+        assert result.metadata.input_metadata["text"] == {"textLength": 11}
         assert result.metadata.input_metadata["grade_level"] == {"grade": 3}
 
     def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator):
@@ -183,7 +183,7 @@ def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, st
         inp = _stub_input()
         result = ev.evaluate(inp)
         assert result.metadata.input_metadata == inp.input_metadata()
-        assert result.metadata.input_metadata["text"] == {"textLength": "11"}
+        assert result.metadata.input_metadata["text"] == {"textLength": 11}
         assert result.metadata.input_metadata["grade_level"] == {"grade": 3}
 
 
diff --git a/sdks/python/tests/schemas/test_common_inputs.py b/sdks/python/tests/schemas/test_common_inputs.py
index 904a83a..bc6467c 100644
--- a/sdks/python/tests/schemas/test_common_inputs.py
+++ b/sdks/python/tests/schemas/test_common_inputs.py
@@ -31,7 +31,7 @@ class TestTextInputField:
     def test_value_and_metadata(self):
         field = TextInputField(spec=_text_spec(), value="Hello, world!")
         assert field.value == "Hello, world!"
-        assert field.input_metadata() == {"textLength": "13"}
+        assert field.input_metadata() == {"textLength": 13}
 
     def test_validate_passes_within_limits(self):
         TextInputField(

From 354ced255bc383eee9b41caa0429dcf8a235563e Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 18:09:37 -0700
Subject: [PATCH 08/26] feat: strip white space by default for text inputs

---
 .../schemas/common_inputs.py                  | 17 +++++++++++
 .../schemas/input_specs.py                    |  7 ++++-
 .../tests/schemas/test_common_inputs.py       | 29 +++++++++++++++++++
 sdks/python/tests/schemas/test_input_specs.py |  4 +++
 4 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
index 446f7a4..4de9964 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
@@ -13,6 +13,8 @@
 
 from typing import Any
 
+from pydantic import model_validator
+
 from learning_commons_evaluators.schemas.errors import ValidationError
 from learning_commons_evaluators.schemas.evaluator import InputField
 from learning_commons_evaluators.schemas.input_specs import (
@@ -34,12 +36,27 @@ class TextInputField(InputField[str]):
     Constraints (min/max text length) are read from ``spec`` rather than
     stored directly on the field, so the same spec object can be shared across
     many field instances.
+
+    By default ``spec.strip_whitespace`` is true, so ``value`` is trimmed when the field is built.
+    Set it to false to preserve leading and trailing whitespace.
     """
 
     # Narrows the abstract InputField.spec: InputSpec → TextInputSpec.
     # value: str is inherited from InputField[str] and does not need to be redeclared.
     spec: TextInputSpec
 
+    @model_validator(mode="before")
+    @classmethod
+    def _strip_value_if_spec_requests(cls, data: Any) -> Any:
+        """Trim ``value`` before validation when ``spec.strip_whitespace`` is true."""
+        if not isinstance(data, dict) or not isinstance(data.get("value"), str):
+            return data
+        spec = data.get("spec")
+        # Only an already-built TextInputSpec is inspected; a raw dict spec is left as-is.
+        if isinstance(spec, TextInputSpec) and spec.strip_whitespace:
+            return {**data, "value": data["value"].strip()}
+        return data
+
     def validate(self) -> None:
         """Raise :class:`~.errors.ValidationError` if the value violates the spec constraints."""
         text_length = len(self.value)
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py
index c35a6cc..bab6e06 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py
@@ -6,7 +6,7 @@
 Hierarchy::
 
     InputSpec          base: name, type, description, required
-    ├─ TextInputSpec   adds: min_text_length, max_text_length
+    ├─ TextInputSpec   adds: min_text_length, max_text_length, strip_whitespace
     └─ GradeInputSpec  adds: allowed_grades
 
 **Adding a new input spec type — checklist**
@@ -68,11 +68,16 @@ class TextInputSpec(InputSpec):
 
     Constraint fields are optional; omitting them means no length limit is
     enforced for that boundary.
+
+    When ``strip_whitespace`` is true (the default), leading and trailing whitespace is removed
+    from the value when a :class:`~.common_inputs.TextInputField` is constructed
+    (before length validation). Set it to false to keep the raw string unchanged.
     """
 
     type: Literal["TextInputField"] = "TextInputField"
     min_text_length: int | None = None
     max_text_length: int | None = None
+    strip_whitespace: bool = True
 
 
 class GradeInputSpec(InputSpec):
diff --git a/sdks/python/tests/schemas/test_common_inputs.py b/sdks/python/tests/schemas/test_common_inputs.py
index bc6467c..2cc7d3e 100644
--- a/sdks/python/tests/schemas/test_common_inputs.py
+++ b/sdks/python/tests/schemas/test_common_inputs.py
@@ -62,6 +62,35 @@ def test_spec_is_accessible(self):
         field = TextInputField(spec=spec, value="abc")
         assert field.spec.min_text_length == 50
 
+    def test_strip_whitespace_false_preserves_padding(self):
+        spec = _text_spec(strip_whitespace=False)  # opt out of trimming
+        assert TextInputField(spec=spec, value="  ab  ").value == "  ab  "
+
+    def test_strip_whitespace_true_trims_value_by_default(self):
+        """When ``_text_spec()`` gets no ``strip_whitespace`` argument, the value is trimmed."""
+        assert TextInputField(spec=_text_spec(), value="  ab  ").value == "ab"
+
+    def test_strip_whitespace_applies_before_length_validation(self):
+        """Padding is stripped first: unstripped "  xx  " (6 chars) would exceed the max."""
+        field = TextInputField(spec=_text_spec(max_text_length=2), value="  xx  ")
+        assert field.value == "xx"
+        field.validate()
+
+    def test_strip_whitespace_explicit_true_trims_value(self):
+        """Passing ``strip_whitespace=True`` explicitly trims the value too."""
+        spec = _text_spec(strip_whitespace=True)
+        padded = "  cd  "
+        field = TextInputField(spec=spec, value=padded)
+        assert field.value == "cd"
+
+    def test_validate_raises_when_strip_shortens_below_min(self):
+        """Padding does not count toward ``min_text_length`` when stripping is on."""
+        too_short_after_strip = "  ab  "
+        strict_spec = _text_spec(min_text_length=5, strip_whitespace=True)
+        with pytest.raises(ValidationError, match="below minimum"):
+            field = TextInputField(spec=strict_spec, value=too_short_after_strip)
+            field.validate()
+
 
 class TestGradeInputField:
     def test_value_and_metadata(self):
diff --git a/sdks/python/tests/schemas/test_input_specs.py b/sdks/python/tests/schemas/test_input_specs.py
index b672565..7c06d74 100644
--- a/sdks/python/tests/schemas/test_input_specs.py
+++ b/sdks/python/tests/schemas/test_input_specs.py
@@ -72,6 +72,10 @@ def test_no_length_constraints_by_default(self):
         assert spec.min_text_length is None
         assert spec.max_text_length is None
 
+    def test_strip_whitespace_defaults_true(self):
+        """A spec built without ``strip_whitespace`` opts in to stripping."""
+        assert TextInputSpec(name="text").strip_whitespace is True
+
 
 class TestGradeInputSpec:
     def test_default_type_discriminator(self):

From 7d14607abfe89a012616ff6c7d8cdf7af3a9b1ef Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 19:02:05 -0700
Subject: [PATCH 09/26] chore: PR comments TODOs, remove redundant fields from
 PromptProviderConfig

---
 .../learning_commons_evaluators/schemas/config.py    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py
index c451fb7..f3b1577 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/config.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/config.py
@@ -16,6 +16,7 @@
 # --- Prompt provider configs (for LLM calls in prompt steps) ---
 
 
+# TODO: rename to LLMProvider
 class LlmProvider(str, Enum):
     """LLM provider identifier. Subclass of str so it compares and serializes as the provider name."""
 
@@ -24,22 +25,23 @@ class LlmProvider(str, Enum):
     OPENAI = "openai"
 
 
+# TODO: rename to LLMProviderConfig and subclasses to GoogleLLMProviderConfig, OpenAILLMProviderConfig, AnthropicLLMProviderConfig.
 @dataclass(frozen=True)
 class PromptProviderConfig:
     """Base type for prompt provider configuration."""
 
     api_key: str
     type: LlmProvider
-    base_url: str | None = (
-        None  # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy). Used only when type is OPENAI.
-    )
+    # TODO: verify base_url functionality before enabling
+    # base_url: str | None = (
+    #     None  # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy). Used only when type is OPENAI.
+    # )
 
 
 @dataclass(frozen=True)
 class GooglePromptProviderConfig(PromptProviderConfig):
     """Google (Gemini) prompt provider config. Takes an API key."""
 
-    api_key: str
     type: LlmProvider = LlmProvider.GOOGLE
 
 
@@ -47,7 +49,6 @@ class GooglePromptProviderConfig(PromptProviderConfig):
 class OpenAIPromptProviderConfig(PromptProviderConfig):
     """OpenAI prompt provider config. Takes an API key. Optional base_url for custom endpoints."""
 
-    api_key: str
     type: LlmProvider = LlmProvider.OPENAI
 
 
@@ -55,7 +56,6 @@ class OpenAIPromptProviderConfig(PromptProviderConfig):
 class AnthropicPromptProviderConfig(PromptProviderConfig):
     """Anthropic (Claude) prompt provider config. Takes an API key."""
 
-    api_key: str
     type: LlmProvider = LlmProvider.ANTHROPIC
 
 

From bfcd4c8bbfb723588d39446de8b408a12f5f1f43 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 19:18:34 -0700
Subject: [PATCH 10/26] chore: introduce TelemetryConfig class.

---
 sdks/python/README.md                         |  6 ++---
 .../learning_commons_evaluators/__init__.py   |  2 ++
 .../src/learning_commons_evaluators/config.py |  2 ++
 .../schemas/config.py                         | 25 +++++++++--------
 sdks/python/tests/evaluators/test_base.py     |  8 +++---
 sdks/python/tests/schemas/test_config.py      | 27 ++++++++++---------
 sdks/python/tests/test_package_imports.py     |  2 ++
 7 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/sdks/python/README.md b/sdks/python/README.md
index 1630de4..49f54ad 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -148,7 +148,7 @@ logging.basicConfig(level=logging.INFO)
 # Create config with provider credentials
 config = create_config(
     google_prompt_provider_config=GooglePromptProviderConfig(api_key="your-google-key"),
-    telemetry_id="your-telemetry-id",
+    telemetry_partner_id="your-telemetry-id",
 )
 
 # Create evaluator and run evaluation
@@ -181,7 +181,7 @@ from learning_commons_evaluators import (
 
 config = create_config(
     google_prompt_provider_config=GooglePromptProviderConfig(api_key="..."),
-    telemetry_id="your-telemetry-id",
+    telemetry_partner_id="your-telemetry-id",
 )
 evaluator = ConventionalityEvaluator(config)
 
@@ -225,7 +225,7 @@ from learning_commons_evaluators import create_config
 config = create_config(
     google_prompt_provider_config=google_config,
     openai_prompt_provider_config=openai_config,
-    telemetry_id="your-telemetry-id",
+    telemetry_partner_id="your-telemetry-id",
     logger=my_logger,  # Optional: any standard logging.Logger (default: package logger)
 )
 ```
diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py
index 5dab77b..9bb613a 100644
--- a/sdks/python/src/learning_commons_evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/__init__.py
@@ -10,6 +10,7 @@
     OpenAIPromptProviderConfig,
     PromptProviderConfig,
     PromptSettings,
+    TelemetryConfig,
     create_config,
     create_config_no_telemetry,
     create_config_telemetry_with_full_input,
@@ -125,6 +126,7 @@
     "RateLimitError",
     "SDK_LOGGER_NAME",
     "Status",
+    "TelemetryConfig",
     "TextComplexityEvaluationInput",
     "TextInputField",
     "TokenUsage",
diff --git a/sdks/python/src/learning_commons_evaluators/config.py b/sdks/python/src/learning_commons_evaluators/config.py
index f3dd635..dc5995f 100644
--- a/sdks/python/src/learning_commons_evaluators/config.py
+++ b/sdks/python/src/learning_commons_evaluators/config.py
@@ -7,6 +7,7 @@
     OpenAIPromptProviderConfig,
     PromptProviderConfig,
     PromptSettings,
+    TelemetryConfig,
     create_config,
     create_config_no_telemetry,
     create_config_telemetry_with_full_input,
@@ -19,6 +20,7 @@
     "OpenAIPromptProviderConfig",
     "PromptProviderConfig",
     "PromptSettings",
+    "TelemetryConfig",
     "create_config",
     "create_config_no_telemetry",
     "create_config_telemetry_with_full_input",
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py
index f3b1577..c083840 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/config.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/config.py
@@ -77,6 +77,13 @@ class EvaluationSettings(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
 
+@dataclass(frozen=True)
+class TelemetryConfig:
+    """Config for telemetry."""
+
+    telemetry_partner_id: str | None = None
+    send_full_input_with_telemetry: bool = False
+
 # --- Evaluator config ---
 
 
@@ -98,8 +105,7 @@ class EvaluatorConfig:
     openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None
     anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None
     logger: Logger = field(default_factory=get_logger)
-    telemetry_id: str | None = None
-    send_full_input_with_telemetry: bool = False
+    telemetry: TelemetryConfig = field(default_factory=TelemetryConfig)
 
 
 def create_config(
@@ -108,17 +114,16 @@ def create_config(
     openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None,
     anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None,
     logger: Logger | None = None,
-    telemetry_id: str,
+    telemetry_partner_id: str,
     send_full_input_with_telemetry: bool = False,
 ) -> EvaluatorConfig:
-    """Create evaluator config with telemetry. telemetry_id is required."""
+    """Create evaluator config with telemetry. telemetry_partner_id is required."""
     return EvaluatorConfig(
         google_prompt_provider_config=google_prompt_provider_config,
         openai_prompt_provider_config=openai_prompt_provider_config,
         anthropic_prompt_provider_config=anthropic_prompt_provider_config,
         logger=get_logger() if logger is None else logger,
-        telemetry_id=telemetry_id,
-        send_full_input_with_telemetry=send_full_input_with_telemetry,
+        telemetry=TelemetryConfig(telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=send_full_input_with_telemetry),
     )
 
 
@@ -135,8 +140,7 @@ def create_config_no_telemetry(
         openai_prompt_provider_config=openai_prompt_provider_config,
         anthropic_prompt_provider_config=anthropic_prompt_provider_config,
         logger=get_logger() if logger is None else logger,
-        telemetry_id=None,
-        send_full_input_with_telemetry=False,
+        telemetry=TelemetryConfig(telemetry_partner_id=None, send_full_input_with_telemetry=False),
     )
 
 
@@ -146,7 +150,7 @@ def create_config_telemetry_with_full_input(
     openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None,
     anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None,
     logger: Logger | None = None,
-    telemetry_id: str,
+    telemetry_partner_id: str,
 ) -> EvaluatorConfig:
     """Create evaluator config with telemetry and full input sent with telemetry."""
     return EvaluatorConfig(
@@ -154,6 +158,5 @@ def create_config_telemetry_with_full_input(
         openai_prompt_provider_config=openai_prompt_provider_config,
         anthropic_prompt_provider_config=anthropic_prompt_provider_config,
         logger=get_logger() if logger is None else logger,
-        telemetry_id=telemetry_id,
-        send_full_input_with_telemetry=True,
+        telemetry=TelemetryConfig(telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=True),
     )
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index c00f03b..5650ec7 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -112,7 +112,7 @@ def stub_evaluator(config):
 def _evaluator(*, send_full_input=False):
     """Return a ConventionalityEvaluator; use send_full_input=True for full-input telemetry."""
     if send_full_input:
-        cfg = create_config(telemetry_id="test", send_full_input_with_telemetry=True)
+        cfg = create_config(telemetry_partner_id="test", send_full_input_with_telemetry=True)
     else:
         cfg = create_config_no_telemetry()
     return ConventionalityEvaluator(cfg)
@@ -178,7 +178,7 @@ def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator):
 
     def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator):
         """``send_full_input_with_telemetry`` does not replace ``input_metadata`` with raw values."""
-        cfg = create_config(telemetry_id="test", send_full_input_with_telemetry=True)
+        cfg = create_config(telemetry_partner_id="test", send_full_input_with_telemetry=True)
         ev = _StubEvaluator(cfg)
         inp = _stub_input()
         result = ev.evaluate(inp)
@@ -192,13 +192,13 @@ def test_uses_input_metadata_by_default(self):
         evaluator = _evaluator()
         with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
             result = evaluator.evaluate(_inp())
-        assert result.metadata.input_metadata["text"] == {"textLength": str(len(_SAMPLE_TEXT))}
+        assert result.metadata.input_metadata["text"] == {"textLength": len(_SAMPLE_TEXT)}
 
     def test_full_telemetry_still_records_input_metadata_not_raw_values(self):
         evaluator = _evaluator(send_full_input=True)
         with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
             result = evaluator.evaluate(_inp())
-        assert result.metadata.input_metadata["text"] == {"textLength": str(len(_SAMPLE_TEXT))}
+        assert result.metadata.input_metadata["text"] == {"textLength": len(_SAMPLE_TEXT)}
         assert result.metadata.input_metadata["grade"] == {"grade": 5}
 
 
diff --git a/sdks/python/tests/schemas/test_config.py b/sdks/python/tests/schemas/test_config.py
index 9175450..3727ee3 100644
--- a/sdks/python/tests/schemas/test_config.py
+++ b/sdks/python/tests/schemas/test_config.py
@@ -1,5 +1,7 @@
 """Tests for EvaluatorConfig, PromptProviderConfig subclasses, and factory functions."""
 
+from dataclasses import FrozenInstanceError
+
 import pytest
 
 from learning_commons_evaluators.logger import SDK_LOGGER_NAME, get_logger
@@ -8,6 +10,7 @@
     GooglePromptProviderConfig,
     LlmProvider,
     OpenAIPromptProviderConfig,
+    TelemetryConfig,
     create_config,
     create_config_no_telemetry,
     create_config_telemetry_with_full_input,
@@ -46,15 +49,15 @@ def test_openai_config_accepts_base_url(self):
         assert cfg.base_url == "https://custom.example.com"
 
     def test_provider_config_base_url_defaults_to_none(self):
-        cfg = GooglePromptProviderConfig(api_key="key")
+        cfg = OpenAIPromptProviderConfig(api_key="key")
         assert cfg.base_url is None
 
 
 class TestEvaluatorConfigFactory:
     def test_create_config_no_telemetry_defaults(self):
         config = create_config_no_telemetry()
-        assert config.telemetry_id is None
-        assert config.send_full_input_with_telemetry is False
+        assert config.telemetry.telemetry_partner_id is None
+        assert config.telemetry.send_full_input_with_telemetry is False
         assert config.logger.name == SDK_LOGGER_NAME
 
     def test_create_config_no_telemetry_accepts_providers(self):
@@ -65,15 +68,15 @@ def test_create_config_no_telemetry_accepts_providers(self):
         assert config.google_prompt_provider_config.api_key == "gk"
         assert config.openai_prompt_provider_config.api_key == "ok"
 
-    def test_create_config_sets_telemetry_id(self):
-        config = create_config(telemetry_id="tid-123")
-        assert config.telemetry_id == "tid-123"
-        assert config.send_full_input_with_telemetry is False
+    def test_create_config_sets_telemetry_partner_id(self):
+        config = create_config(telemetry_partner_id="tid-123")
+        assert config.telemetry.telemetry_partner_id == "tid-123"
+        assert config.telemetry.send_full_input_with_telemetry is False
 
     def test_create_config_telemetry_with_full_input_sets_flag(self):
-        config = create_config_telemetry_with_full_input(telemetry_id="tid")
-        assert config.telemetry_id == "tid"
-        assert config.send_full_input_with_telemetry is True
+        config = create_config_telemetry_with_full_input(telemetry_partner_id="tid")
+        assert config.telemetry.telemetry_partner_id == "tid"
+        assert config.telemetry.send_full_input_with_telemetry is True
 
     def test_explicit_logger_is_preserved(self):
         custom = get_logger("custom_test")
@@ -83,5 +86,5 @@ def test_explicit_logger_is_preserved(self):
     def test_config_is_frozen(self):
         """EvaluatorConfig is a frozen dataclass; mutation must raise."""
         config = create_config_no_telemetry()
-        with pytest.raises((AttributeError, TypeError)):
-            config.telemetry_id = "other"  # type: ignore[misc]
+        with pytest.raises(FrozenInstanceError):
+            config.telemetry = TelemetryConfig(telemetry_partner_id="x", send_full_input_with_telemetry=False)
diff --git a/sdks/python/tests/test_package_imports.py b/sdks/python/tests/test_package_imports.py
index 4c6b0d4..1eee1a9 100644
--- a/sdks/python/tests/test_package_imports.py
+++ b/sdks/python/tests/test_package_imports.py
@@ -6,6 +6,7 @@ def test_main_package_imports():
     from learning_commons_evaluators import (
         BaseEvaluator,
         ConventionalityEvaluator,
+        TelemetryConfig,
         __version__,
         create_config_no_telemetry,
     )
@@ -14,6 +15,7 @@ def test_main_package_imports():
     assert create_config_no_telemetry is not None
     assert ConventionalityEvaluator is not None
     assert BaseEvaluator is not None
+    assert TelemetryConfig is not None
 
 
 def test_errors_import():

From 231700581de63192f1fda76dcd00f88fdc20f9a8 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 19:24:53 -0700
Subject: [PATCH 11/26] chore: remove custom LLM endpoint support for now.

---
 sdks/python/README.md                          |  2 +-
 .../providers/langchain.py                     |  2 --
 .../schemas/config.py                          | 10 +++++-----
 sdks/python/tests/providers/test_langchain.py  | 18 +-----------------
 sdks/python/tests/schemas/test_config.py       |  8 --------
 5 files changed, 7 insertions(+), 33 deletions(-)

diff --git a/sdks/python/README.md b/sdks/python/README.md
index 49f54ad..4c27daa 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -211,7 +211,7 @@ from learning_commons_evaluators import (
 )
 
 google_config = GooglePromptProviderConfig(api_key="...")
-openai_config = OpenAIPromptProviderConfig(api_key="...", base_url=None)  # base_url for Azure/proxies
+openai_config = OpenAIPromptProviderConfig(api_key="...")
 anthropic_config = AnthropicPromptProviderConfig(api_key="...")
 ```
 
diff --git a/sdks/python/src/learning_commons_evaluators/providers/langchain.py b/sdks/python/src/learning_commons_evaluators/providers/langchain.py
index d4d07b7..9496831 100644
--- a/sdks/python/src/learning_commons_evaluators/providers/langchain.py
+++ b/sdks/python/src/learning_commons_evaluators/providers/langchain.py
@@ -56,8 +56,6 @@ def _create_openai_llm(
         "model": prompt_settings.model,
         "temperature": prompt_settings.temperature,
     }
-    if openai_config.base_url is not None:
-        kwargs["base_url"] = openai_config.base_url
     return ChatOpenAI(**kwargs)
 
 
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py
index c083840..5c16495 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/config.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/config.py
@@ -32,10 +32,6 @@ class PromptProviderConfig:
 
     api_key: str
     type: LlmProvider
-    # TODO: verify base_url functionality before enabling
-    # base_url: str | None = (
-    #     None  # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy). Used only when type is OPENAI.
-    # )
 
 
 @dataclass(frozen=True)
@@ -47,9 +43,13 @@ class GooglePromptProviderConfig(PromptProviderConfig):
 
 @dataclass(frozen=True)
 class OpenAIPromptProviderConfig(PromptProviderConfig):
-    """OpenAI prompt provider config. Takes an API key. Optional base_url for custom endpoints."""
+    """OpenAI prompt provider config. Takes an API key."""
 
     type: LlmProvider = LlmProvider.OPENAI
+    # TODO: verify base_url functionality before enabling
+    # base_url: str | None = (
+    #     None  # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy). Used only when type is OPENAI.
+    # )
 
 
 @dataclass(frozen=True)
diff --git a/sdks/python/tests/providers/test_langchain.py b/sdks/python/tests/providers/test_langchain.py
index 65f3f41..b9f9c31 100644
--- a/sdks/python/tests/providers/test_langchain.py
+++ b/sdks/python/tests/providers/test_langchain.py
@@ -1,6 +1,6 @@
 """Tests for LangChain provider factory and token usage."""
 
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock
 
 import pytest
 
@@ -53,22 +53,6 @@ def test_openai_provider_returns_model(self):
         )
         assert create_provider(settings, config) is not None
 
-    def test_openai_provider_with_base_url_forwards_it_to_chat_model(self):
-        """When OpenAIPromptProviderConfig.base_url is set, it must be passed to ChatOpenAI."""
-        config = _config(
-            openai_prompt_provider_config=OpenAIPromptProviderConfig(
-                api_key="test-key",
-                base_url="https://custom.example.com/v1",
-            ),
-        )
-        settings = PromptSettings(
-            provider_type=LlmProvider.OPENAI, model="gpt-4o-mini", temperature=0.0
-        )
-        with patch("langchain_openai.ChatOpenAI") as mock_chat:
-            create_provider(settings, config)
-        _, kwargs = mock_chat.call_args
-        assert kwargs.get("base_url") == "https://custom.example.com/v1"
-
     def test_raises_when_google_config_missing(self):
         settings = PromptSettings(
             provider_type=LlmProvider.GOOGLE, model="gemini-2.0-flash", temperature=0.0
diff --git a/sdks/python/tests/schemas/test_config.py b/sdks/python/tests/schemas/test_config.py
index 3727ee3..264304f 100644
--- a/sdks/python/tests/schemas/test_config.py
+++ b/sdks/python/tests/schemas/test_config.py
@@ -44,14 +44,6 @@ def test_provider_config_default_type(self, cls, expected_type):
         assert cfg.type == expected_type
         assert cfg.api_key == "test-key"
 
-    def test_openai_config_accepts_base_url(self):
-        cfg = OpenAIPromptProviderConfig(api_key="key", base_url="https://custom.example.com")
-        assert cfg.base_url == "https://custom.example.com"
-
-    def test_provider_config_base_url_defaults_to_none(self):
-        cfg = OpenAIPromptProviderConfig(api_key="key")
-        assert cfg.base_url is None
-
 
 class TestEvaluatorConfigFactory:
     def test_create_config_no_telemetry_defaults(self):

From 6d69909fcb65f1a45d9be284134d56eae1677264 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 11 May 2026 20:21:37 -0700
Subject: [PATCH 12/26] chore: add TODO notes to errors module

---
 .../python/src/learning_commons_evaluators/schemas/errors.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
index 470d371..a14e80b 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
@@ -2,7 +2,7 @@
 
 import re
 
-
+# TODO: rename name and message, and remove Evaluator prefix where appropriate
 class EvaluatorError(Exception):
     """Base error class for all evaluator errors."""
 
@@ -85,7 +85,7 @@ def __init__(self, message: str = "Request timed out"):
         super().__init__(message, 408, True, "TIMEOUT_ERROR")
         self.name = "EvaluatorTimeoutError"
 
-
+# TODO: OpenAI & Anthropic may return a status_code in the response.
 def _parse_provider_error(error: BaseException) -> tuple[str, int | None, str | None]:
     message = str(error)
     status_code = None
@@ -120,6 +120,7 @@ def wrap_provider_error(
     # Timeouts before generic "Connection" — many stacks use "Connection timed out"
     if "timeout" in msg.lower() or "timed out" in msg.lower():
         return EvaluatorTimeoutError(msg)
+    # TODO: confirm if these apply to Python too. Based on TypeScript SDK implementation.
     if any(
         x in msg
         for x in (

From 16e24326b7fad495c31d9f7f697dc7204b33a7af Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 00:59:56 -0700
Subject: [PATCH 13/26] chore: simplified load_settings and formatting

---
 sdks/python/pyproject.toml                    |  16 +-
 .../evaluators/base.py                        |   2 +-
 .../schemas/common_inputs.py                  |   1 +
 .../schemas/config.py                         |  10 +-
 .../schemas/errors.py                         |   2 +
 .../schemas/input_specs.py                    |   4 +-
 .../schemas/metadata.py                       |  46 +-
 .../settings/load_settings.py                 | 191 +++----
 sdks/python/tests/schemas/test_config.py      |   4 +-
 sdks/python/tests/schemas/test_metadata.py    |  35 +-
 .../tests/settings/fixtures/__init__.py       |   1 +
 .../tests/settings/fixtures/minimal.toml      |   1 +
 .../tests/settings/test_load_settings.py      | 504 +++++++-----------
 13 files changed, 356 insertions(+), 461 deletions(-)
 create mode 100644 sdks/python/tests/settings/fixtures/__init__.py
 create mode 100644 sdks/python/tests/settings/fixtures/minimal.toml

diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml
index d48f934..ec69933 100644
--- a/sdks/python/pyproject.toml
+++ b/sdks/python/pyproject.toml
@@ -76,8 +76,8 @@ testpaths = ["tests"]
 target-version = "py310"
 line-length = 100
 extend-exclude = [
-    "src/learning_commons_evaluators/settings/_generated_conventionality_settings.py",
-    "src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py",
+    # Codegen output from ``make generate-settings`` — not hand-edited.
+    "src/learning_commons_evaluators/settings/_generated_*_settings.py",
 ]
 
 [tool.ruff.lint]
@@ -111,19 +111,7 @@ disable_error_code = ["override"]
 module = "learning_commons_evaluators.settings.load_settings"
 disable_error_code = ["import-untyped"]
 
-[[tool.mypy.overrides]]
-module = "contract_tests.loader"
-disable_error_code = ["import-untyped"]
-
 [[tool.mypy.overrides]]
 module = "learning_commons_evaluators.providers.langchain"
 # LangChain stubs lag the runtime API (e.g. ChatAnthropic kwargs).
 disable_error_code = ["call-arg", "arg-type"]
-
-[[tool.mypy.overrides]]
-module = "learning_commons_evaluators.evaluators.conventionality"
-disable_error_code = ["import-untyped", "call-arg"]
-
-[[tool.mypy.overrides]]
-module = "learning_commons_evaluators.evaluators.vocabulary"
-disable_error_code = ["import-untyped", "call-arg"]
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 0a7f495..9b48d99 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -96,7 +96,7 @@ def evaluate(
         evaluation_metadata = EvaluationMetadata(
             evaluator_metadata=self.metadata,
             evaluation_settings=evaluation_settings,
-            input_metadata=input.input_metadata()
+            input_metadata=input.input_metadata(),
         )
         self.config.logger.info(
             "evaluation start",
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
index 4de9964..d916289 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
@@ -57,6 +57,7 @@ def _strip_value_if_spec_requests(cls, data: Any) -> Any:
         if isinstance(spec, TextInputSpec) and spec.strip_whitespace:
             return {**data, "value": value.strip()}
         return data
+
     def validate(self) -> None:
         """Raise :class:`~.errors.ValidationError` if the value violates the spec constraints."""
         text_length = len(self.value)
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py
index 5c16495..94e44f2 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/config.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/config.py
@@ -84,6 +84,7 @@ class TelemetryConfig:
     telemetry_partner_id: str | None = None
     send_full_input_with_telemetry: bool = False
 
+
 # --- Evaluator config ---
 
 
@@ -123,7 +124,10 @@ def create_config(
         openai_prompt_provider_config=openai_prompt_provider_config,
         anthropic_prompt_provider_config=anthropic_prompt_provider_config,
         logger=get_logger() if logger is None else logger,
-        telemetry=TelemetryConfig(telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=send_full_input_with_telemetry),
+        telemetry=TelemetryConfig(
+            telemetry_partner_id=telemetry_partner_id,
+            send_full_input_with_telemetry=send_full_input_with_telemetry,
+        ),
     )
 
 
@@ -158,5 +162,7 @@ def create_config_telemetry_with_full_input(
         openai_prompt_provider_config=openai_prompt_provider_config,
         anthropic_prompt_provider_config=anthropic_prompt_provider_config,
         logger=get_logger() if logger is None else logger,
-        telemetry=TelemetryConfig(telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=True),
+        telemetry=TelemetryConfig(
+            telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=True
+        ),
     )
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
index a14e80b..47e5a22 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
@@ -2,6 +2,7 @@
 
 import re
 
+
 # TODO: rename name and message, and remove Evaluator prefix where appropriate
 class EvaluatorError(Exception):
     """Base error class for all evaluator errors."""
@@ -85,6 +86,7 @@ def __init__(self, message: str = "Request timed out"):
         super().__init__(message, 408, True, "TIMEOUT_ERROR")
         self.name = "EvaluatorTimeoutError"
 
+
 # TODO: OpenAI & Anthropic may return a status_code in the response.
 def _parse_provider_error(error: BaseException) -> tuple[str, int | None, str | None]:
     message = str(error)
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py
index bab6e06..0b6154b 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py
@@ -40,8 +40,8 @@
 # ---------------------------------------------------------------------------
 
 # Maps the ``type`` discriminator string to the concrete InputSpec class.
-# Used by the TOML parser to instantiate the right subclass and to build
-# the AnyInputSpec discriminated union below.
+# Used when building :class:`~learning_commons_evaluators.schemas.metadata.EvaluatorMetadata`
+# from TOML (via Pydantic's discriminated union on ``AnyInputSpec``).
 INPUT_SPEC_REGISTRY: dict[str, type["InputSpec"]] = {}
 
 
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/metadata.py b/sdks/python/src/learning_commons_evaluators/schemas/metadata.py
index 3ca60ed..26de4dd 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/metadata.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/metadata.py
@@ -4,7 +4,7 @@
 from enum import Enum
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, TypeAdapter, field_validator, model_validator
 
 from learning_commons_evaluators._version import __version__ as sdk_version
 from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
@@ -31,14 +31,50 @@ class Status(Enum):
 class EvaluatorMetadata(BaseModel):
     """Evaluator metadata: id, version, name, description; maturity (alpha, beta, rc, ga); sdk_version."""
 
-    id: str
-    version: str
-    name: str
-    description: str
+    id: str = Field(..., min_length=1)
+    version: str = Field(..., min_length=1)
+    name: str = Field(..., min_length=1)
+    description: str = Field(..., min_length=1)
     maturity: EvaluatorMaturity
     sdk_version: str = f"learning-commons-evaluators-python-{sdk_version}"
     inputs: dict[str, AnyInputSpec] = Field(default_factory=dict)
 
+    @model_validator(mode="before")
+    @classmethod
+    def _coerce_toml_inputs(cls, data: Any) -> Any:
+        """Turn ``[[evaluator_metadata.inputs]]`` list rows into ``inputs`` keyed by field name."""
+        if not isinstance(data, dict):
+            return data
+        out = dict(data)
+        if "inputs" not in out:
+            return out
+        raw = out["inputs"]
+        if isinstance(raw, list):
+            adapter: TypeAdapter[Any] = TypeAdapter(AnyInputSpec)
+            parsed: dict[str, AnyInputSpec] = {}
+            for item in raw:
+                if not (isinstance(item, dict) and "name" in item):
+                    continue
+                parsed[str(item["name"])] = adapter.validate_python(item)
+            out["inputs"] = parsed
+        elif raw is None:
+            out["inputs"] = {}
+        return out
+
+    @field_validator("id", "version", "name", "description", mode="before")
+    @classmethod
+    def _strip_required_strings(cls, v: Any) -> Any:
+        if v is None:
+            return v
+        return str(v).strip()
+
+    @field_validator("maturity", mode="before")
+    @classmethod
+    def _normalize_maturity(cls, v: Any) -> Any:
+        if isinstance(v, str):
+            return v.lower()
+        return v
+
 
 class TokenUsage(BaseModel):
     """Token usage for a some step of an evaluation: provider type, model, and token counts."""
diff --git a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py
index ebaa4a9..530dd9c 100644
--- a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py
+++ b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py
@@ -5,30 +5,31 @@
 import importlib.resources
 import os
 from dataclasses import dataclass
+
+try:
+    from importlib.resources.abc import Traversable
+except ImportError:
+    from importlib.abc import Traversable  # Python < 3.11
 from pathlib import Path
-from typing import Generic, TypeVar, cast
+from typing import Generic, TypeVar
 
 try:
     import tomllib
 except ImportError:
     import tomli as tomllib  # type: ignore[import-not-found,no-redef]  # Python < 3.11
 
-from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
+from pydantic import TypeAdapter
+from pydantic import ValidationError as PydanticValidationError
+
+from learning_commons_evaluators.schemas.config import EvaluationSettings, PromptSettings
 from learning_commons_evaluators.schemas.errors import ConfigurationError
-from learning_commons_evaluators.schemas.input_specs import (
-    INPUT_SPEC_REGISTRY,
-    AnyInputSpec,
-)
-from learning_commons_evaluators.schemas.metadata import (
-    EvaluatorMaturity,
-    EvaluatorMetadata,
-)
+from learning_commons_evaluators.schemas.metadata import EvaluatorMetadata
 
-T = TypeVar("T")
+T = TypeVar("T", bound=EvaluationSettings)
 
 
-def shared_settings_root() -> Path:
-    """Return the path to the evaluator settings directory.
+def shared_settings_root() -> Path | Traversable:
+    """Return the evaluator settings directory (filesystem or package Traversable).
 
     Resolution order:
 
@@ -41,10 +42,13 @@ def shared_settings_root() -> Path:
 
        Also useful in CI jobs that check out settings separately.
 
-    2. Bundled package data — resolved via :mod:`importlib.resources` from the
-       ``learning_commons_evaluators.settings`` sub-package.  This is the path
-       taken by a normal ``pip install`` (editable or non-editable) when the
-       env var is not set.
+    2. Bundled package data — a :class:`importlib.abc.Traversable` from
+       :func:`importlib.resources.files` for the ``learning_commons_evaluators.settings``
+       sub-package (works from a wheel/zip without assuming a real directory path).
+
+       Use :func:`load_settings` / :func:`load_evaluator_settings`, which accept a
+       ``Traversable``, or :func:`importlib.resources.as_file` if an API requires a
+       concrete :class:`pathlib.Path` on disk.
 
     The bundled copy is kept in sync with ``sdks/settings/`` — see the
     *Keeping settings in sync* section of the README.
@@ -52,129 +56,55 @@ def shared_settings_root() -> Path:
     env = os.environ.get("EVALUATORS_SETTINGS_DIR")
     if env:
         return Path(env)
-    # importlib.resources.files() returns a Traversable; converting to Path
-    # works for both editable installs (a real directory) and zip/wheel installs
-    # (where Python 3.9+ extracts to a temp dir automatically).
-    pkg = importlib.resources.files("learning_commons_evaluators.settings")
-    return Path(str(pkg))
+    return importlib.resources.files("learning_commons_evaluators.settings")
 
 
-def _require(key: str, value: object, section: str) -> None:
-    """Raise ConfigurationError if value is missing or empty."""
-    if value is None or (isinstance(value, str) and not value.strip()):
-        raise ConfigurationError(f"Missing required field '{key}' in [{section}]")
-
-
-def load_settings(path: Path | str) -> dict:
+def load_settings(path: Path | str | Traversable) -> dict:
     """
     Load raw settings from a TOML file.
 
     Args:
-        path: Path to the .toml file.
+        path: Path to the .toml file, or a :class:`~importlib.abc.Traversable`
+            (e.g. from :func:`shared_settings_root` when using bundled settings).
 
     Returns:
         Parsed TOML as a dict.
     """
-    path = Path(path)
-    with path.open("rb") as f:
+    if isinstance(path, Traversable):
+        with path.open("rb") as f:
+            return tomllib.load(f)
+    with Path(path).open("rb") as f:
         return tomllib.load(f)
 
 
-def _parse_maturity(value: str) -> EvaluatorMaturity:
-    """Map TOML maturity string to EvaluatorMaturity enum."""
+_prompt_settings_adapter = TypeAdapter(PromptSettings)
+
+
+def _prepare_prompt_settings_dict(pm: dict) -> dict:
+    """Lowercase and strip ``provider_type`` strings for TOML / enum matching."""
+    d = dict(pm)
+    pt = d.get("provider_type")
+    if isinstance(pt, str):
+        d["provider_type"] = pt.lower().strip()
+    return d
+
+
+def _validated_prompt_settings(pm: dict, step_name: str) -> PromptSettings:
     try:
-        return EvaluatorMaturity(value.lower())
-    except ValueError as e:
-        raise ConfigurationError(
-            f"Invalid maturity '{value}' in [evaluator_metadata]; expected one of alpha, beta, rc, ga."
-        ) from e
+        return _prompt_settings_adapter.validate_python(_prepare_prompt_settings_dict(pm))
+    except PydanticValidationError as e:
+        raise ConfigurationError(f"Invalid [{step_name}]: {e}") from e
 
 
 def _parse_evaluator_metadata(data: dict) -> EvaluatorMetadata:
-    """Build EvaluatorMetadata from TOML evaluator_metadata section. Raises ConfigurationError if a required field is missing."""
+    """Build EvaluatorMetadata from TOML ``[evaluator_metadata]`` via Pydantic validation."""
     em = data.get("evaluator_metadata")
     if not em or not isinstance(em, dict):
         raise ConfigurationError("Missing required section [evaluator_metadata].")
-    section = "evaluator_metadata"
-    id_val = em.get("id")
-    _require("id", id_val, section)
-    version = em.get("version")
-    _require("version", version, section)
-    if not isinstance(version, str):
-        version = str(version)
-    name = em.get("name")
-    _require("name", name, section)
-    description = em.get("description")
-    _require("description", description, section)
-    maturity_val = em.get("maturity")
-    _require("maturity", maturity_val, section)
-    if not isinstance(maturity_val, str):
-        raise ConfigurationError(f"Field 'maturity' in [{section}] must be a string.")
-    maturity = _parse_maturity(maturity_val)
-    # Parse [[evaluator_metadata.inputs]] into a dict keyed by field name.
-    # Dispatch on ``type`` to create the correct InputSpec subclass so that
-    # type-specific constraint fields (e.g. min_text_length) are preserved.
-    inputs: dict[str, AnyInputSpec] = {}
-    for spec_dict in em.get("inputs", []):
-        if not (isinstance(spec_dict, dict) and "name" in spec_dict):
-            continue
-        field_name = spec_dict["name"]
-        type_key = spec_dict.get("type", "")
-        spec_cls = INPUT_SPEC_REGISTRY.get(type_key)
-        if spec_cls is None:
-            raise ConfigurationError(
-                f"Unknown input type '{type_key}' in [[evaluator_metadata.inputs]] "
-                f"for field '{field_name}'. Expected one of: {sorted(INPUT_SPEC_REGISTRY)}."
-            )
-        inputs[field_name] = cast(AnyInputSpec, spec_cls(**spec_dict))
-
-    return EvaluatorMetadata(
-        id=str(id_val).strip(),
-        version=version.strip(),
-        name=str(name).strip(),
-        description=str(description).strip(),
-        maturity=maturity,
-        inputs=inputs,
-    )
-
-
-def _parse_provider_type(value: str, step_name: str) -> LlmProvider:
-    """Map TOML provider type string to LlmProvider enum."""
-    normalized = value.upper().strip()
-    if normalized == "GOOGLE":
-        return LlmProvider.GOOGLE
-    if normalized == "OPENAI":
-        return LlmProvider.OPENAI
-    if normalized == "ANTHROPIC":
-        return LlmProvider.ANTHROPIC
-    raise ConfigurationError(
-        f"Invalid provider type '{value}' in [{step_name}]; expected one of: google, openai, anthropic."
-    )
-
-
-def _parse_prompt_settings_step(pm: dict, step_name: str) -> PromptSettings:
-    """Build PromptSettings from a TOML prompt_settings_* subsection. Raises ConfigurationError if a required field is missing."""
-    pt = pm.get("type") or pm.get("provider_type")
-    _require("type", pt, step_name)
-    if not isinstance(pt, str):
-        raise ConfigurationError(f"Field 'type' in [{step_name}] must be a string.")
-    provider_type = _parse_provider_type(pt, step_name)
-    model = pm.get("model")
-    _require("model", model, step_name)
-    if not isinstance(model, str):
-        raise ConfigurationError(f"Field 'model' in [{step_name}] must be a string.")
-    temp = pm.get("temperature")
-    if temp is None:
-        raise ConfigurationError(f"Missing required field 'temperature' in [{step_name}].")
     try:
-        temperature = float(temp)
-    except (TypeError, ValueError) as e:
-        raise ConfigurationError(f"Field 'temperature' in [{step_name}] must be a number.") from e
-    return PromptSettings(
-        provider_type=provider_type,
-        model=model.strip(),
-        temperature=temperature,
-    )
+        return EvaluatorMetadata.model_validate(em)
+    except PydanticValidationError as e:
+        raise ConfigurationError(f"Invalid [evaluator_metadata]: {e}") from e
 
 
 def _normalize_prompt_whitespace(prompt: str) -> str:
@@ -209,19 +139,22 @@ class EvaluatorSettingsResult(Generic[T]):
     prompts: dict[str, str]
 
 
-def load_evaluator_settings(path: Path | str, settings_cls: type[T]) -> EvaluatorSettingsResult[T]:
+def load_evaluator_settings(
+    path: Path | str | Traversable, settings_cls: type[T]
+) -> EvaluatorSettingsResult[T]:
     """
     Load evaluator settings from a TOML file.
 
     Parses evaluator_metadata into EvaluatorMetadata, evaluation_settings into an instance
-    of settings_cls (with prompt_settings_* subsections as PromptSettings), and extracts
+    of settings_cls (with ``prompt_settings_*`` subsections as :class:`~learning_commons_evaluators.schemas.config.PromptSettings`
+    using ``provider_type``, ``model``, and ``temperature``), and extracts
     prompt text (e.g. system_prompt) into a prompts dict. Raises ConfigurationError if
     any required field or section is missing.
 
     Args:
-        path: Path to the .toml file.
+        path: Path to the .toml file, or a :class:`~importlib.abc.Traversable` to it.
         settings_cls: Class for evaluation settings (e.g. ConventionalityEvaluationSettings).
-            Must accept keyword arguments matching the TOML evaluation_settings keys.
+            Must subclass ``EvaluationSettings`` (a Pydantic model); validated with ``model_validate``.
 
     Returns:
         EvaluatorSettingsResult with evaluator_metadata, evaluation_settings (typed), and prompts.
@@ -239,16 +172,12 @@ def load_evaluator_settings(path: Path | str, settings_cls: type[T]) -> Evaluato
     # Convert prompt_settings_* subsections to PromptSettings.
     for key in list(raw):
         if key.startswith("prompt_settings_") and isinstance(raw[key], dict):
-            raw[key] = _parse_prompt_settings_step(raw[key], f"evaluation_settings.{key}")
+            raw[key] = _validated_prompt_settings(raw[key], f"evaluation_settings.{key}")
 
     try:
-        evaluation_settings = settings_cls(**raw)
-    except Exception as e:
-        if isinstance(e, ConfigurationError):
-            raise
-        raise ConfigurationError(
-            f"Invalid [evaluation_settings]: {e!s}",
-        ) from e
+        evaluation_settings = settings_cls.model_validate(raw)
+    except PydanticValidationError as e:
+        raise ConfigurationError(f"Invalid [evaluation_settings]: {e}") from e
 
     return EvaluatorSettingsResult(
         evaluator_metadata=evaluator_metadata,
diff --git a/sdks/python/tests/schemas/test_config.py b/sdks/python/tests/schemas/test_config.py
index 264304f..6a50b09 100644
--- a/sdks/python/tests/schemas/test_config.py
+++ b/sdks/python/tests/schemas/test_config.py
@@ -79,4 +79,6 @@ def test_config_is_frozen(self):
         """EvaluatorConfig is a frozen dataclass; mutation must raise."""
         config = create_config_no_telemetry()
         with pytest.raises(FrozenInstanceError):
-            config.telemetry = TelemetryConfig(telemetry_partner_id="x", send_full_input_with_telemetry=False)
+            config.telemetry = TelemetryConfig(
+                telemetry_partner_id="x", send_full_input_with_telemetry=False
+            )
diff --git a/sdks/python/tests/schemas/test_metadata.py b/sdks/python/tests/schemas/test_metadata.py
index c2b7590..6ddbd81 100644
--- a/sdks/python/tests/schemas/test_metadata.py
+++ b/sdks/python/tests/schemas/test_metadata.py
@@ -7,8 +7,10 @@
 from datetime import timezone
 
 import pytest
+from pydantic import ValidationError
 
 from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
+from learning_commons_evaluators.schemas.input_specs import TextInputSpec
 from learning_commons_evaluators.schemas.metadata import (
     PROMPT_STEP_EXTRA_PROMPT_SETTINGS,
     PROMPT_STEP_EXTRA_TOKEN_USAGE,
@@ -61,8 +63,39 @@ def test_fields_and_sdk_version(self):
         # sdk_version is auto-populated from the installed package version.
         assert "learning-commons-evaluators-python" in meta.sdk_version
 
+    def test_model_validate_toml_like_inputs_and_maturity(self):
+        meta = EvaluatorMetadata.model_validate(
+            {
+                "id": " e ",
+                "version": 1.0,
+                "name": "N",
+                "description": "D",
+                "maturity": "GA",
+                "inputs": [
+                    {"name": "text", "type": "TextInputField"},
+                ],
+            }
+        )
+        assert meta.id == "e"
+        assert meta.version == "1.0"
+        assert meta.maturity == EvaluatorMaturity.ga
+        assert isinstance(meta.inputs["text"], TextInputSpec)
+
+    def test_model_validate_unknown_input_type_raises(self):
+        with pytest.raises(
+            ValidationError, match="TextInputField|GradeInputField|Unable to extract"
+        ):
+            EvaluatorMetadata.model_validate(
+                {
+                    "id": "e",
+                    "version": "1",
+                    "name": "N",
+                    "description": "D",
+                    "maturity": "ga",
+                    "inputs": [{"name": "x", "type": "UnknownType"}],
+                }
+            )
 
-class TestTokenUsage:
     def test_fields(self):
         usage = TokenUsage(
             provider_type=LlmProvider.GOOGLE,
diff --git a/sdks/python/tests/settings/fixtures/__init__.py b/sdks/python/tests/settings/fixtures/__init__.py
new file mode 100644
index 0000000..0296240
--- /dev/null
+++ b/sdks/python/tests/settings/fixtures/__init__.py
@@ -0,0 +1 @@
+"""Static TOML fixtures for settings loader tests."""
diff --git a/sdks/python/tests/settings/fixtures/minimal.toml b/sdks/python/tests/settings/fixtures/minimal.toml
new file mode 100644
index 0000000..662527e
--- /dev/null
+++ b/sdks/python/tests/settings/fixtures/minimal.toml
@@ -0,0 +1 @@
+answer = 42
diff --git a/sdks/python/tests/settings/test_load_settings.py b/sdks/python/tests/settings/test_load_settings.py
index d4f4e91..a572a5d 100644
--- a/sdks/python/tests/settings/test_load_settings.py
+++ b/sdks/python/tests/settings/test_load_settings.py
@@ -1,332 +1,228 @@
-"""Tests for load_settings.
+"""Tests for :mod:`learning_commons_evaluators.settings.load_settings`."""
 
-Covers: load_settings(), _require(), _parse_prompts() (including prompt whitespace
-normalization), load_evaluator_settings(),
-and shared_settings_root() — including every conditional branch in each function.
-"""
+from __future__ import annotations
 
-import os
+import importlib.resources
+import textwrap
 from pathlib import Path
-from unittest.mock import patch
 
 import pytest
 
-from learning_commons_evaluators.errors import ConfigurationError
-from learning_commons_evaluators.schemas.conventionality import (
-    ConventionalityEvaluationSettings,
+from learning_commons_evaluators.schemas.config import (
+    EvaluationSettings,
+    LlmProvider,
+    PromptSettings,
 )
+from learning_commons_evaluators.schemas.errors import ConfigurationError
+from learning_commons_evaluators.schemas.input_specs import TextInputSpec
 from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity
 from learning_commons_evaluators.settings.load_settings import (
-    EvaluatorSettingsResult,
-    _parse_prompts,
-    _require,
     load_evaluator_settings,
     load_settings,
     shared_settings_root,
 )
 
-# ---------------------------------------------------------------------------
-# Shared helpers
-# ---------------------------------------------------------------------------
 
+class _MiniSettings(EvaluationSettings):
+    """Minimal evaluation_settings block for loader tests."""
 
-def _write_toml(tmp_path: Path, content: str) -> Path:
-    """Write TOML content to evaluator.toml inside tmp_path and return the path."""
-    path = tmp_path / "evaluator.toml"
-    path.write_text(content, encoding="utf-8")
-    return path
+    marker: int = 0
 
 
-# Minimal [evaluator_metadata] section reused across several tests.
-_MINIMAL_META = """\
-[evaluator_metadata]
-id = "x"
-version = "0.1"
-name = "X"
-description = "X"
-maturity = "beta"
-"""
+class _WithPromptSettings(EvaluationSettings):
+    marker: int = 0
+    prompt_settings_main: PromptSettings
 
-# Minimal [evaluation_settings] section reused across several tests.
-_MINIMAL_EVAL_SETTINGS = """\
-[evaluation_settings.prompt_settings_step_main]
-type = "GOOGLE"
-model = "gemini-2.0-flash"
-temperature = 0
-"""
+
+def test_load_evaluator_settings_end_to_end(tmp_path: Path) -> None:
+    path = tmp_path / "eval.toml"
+    path.write_text(
+        textwrap.dedent(
+            """
+            [evaluator_metadata]
+            id = "e"
+            version = "1.0"
+            name = "N"
+            description = "D"
+            maturity = "GA"
+
+            [[evaluator_metadata.inputs]]
+            name = "text"
+            type = "TextInputField"
+
+            [evaluation_settings]
+            marker = 7
+            """
+        ).strip()
+    )
+    result = load_evaluator_settings(path, _MiniSettings)
+    assert result.evaluator_metadata.maturity == EvaluatorMaturity.ga
+    assert isinstance(result.evaluator_metadata.inputs["text"], TextInputSpec)
+    assert result.evaluation_settings.marker == 7
+
+
+def test_load_evaluator_settings_wraps_metadata_validation(tmp_path: Path) -> None:
+    path = tmp_path / "bad.toml"
+    path.write_text(
+        textwrap.dedent(
+            """
+            [evaluator_metadata]
+            id = ""
+            version = "1"
+            name = "N"
+            description = "D"
+            maturity = "ga"
+
+            [evaluation_settings]
+            marker = 0
+            """
+        ).strip()
+    )
+    with pytest.raises(ConfigurationError, match="Invalid \\[evaluator_metadata\\]"):
+        load_evaluator_settings(path, _MiniSettings)
+
+
+def test_load_settings_path_and_str_same_result(tmp_path: Path) -> None:
+    path = tmp_path / "raw.toml"
+    path.write_text('mode = "test"\n')
+    by_path = load_settings(path)
+    by_str = load_settings(str(path))
+    assert by_path == by_str == {"mode": "test"}
 
 
-# ---------------------------------------------------------------------------
-# load_settings (raw TOML loader)
-# ---------------------------------------------------------------------------
+def test_load_settings_traversable_package_file() -> None:
+    root = importlib.resources.files("tests.settings.fixtures")
+    tom = root.joinpath("minimal.toml")
+    assert load_settings(tom) == {"answer": 42}
 
 
-class TestLoadSettings:
-    def test_parses_nested_sections(self, tmp_path: Path) -> None:
-        path = _write_toml(
-            tmp_path,
+def test_shared_settings_root_env_overrides(
+    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
+) -> None:
+    custom = tmp_path / "custom_settings"
+    custom.mkdir()
+    monkeypatch.setenv("EVALUATORS_SETTINGS_DIR", str(custom))
+    assert shared_settings_root() == custom
+
+
+def test_shared_settings_root_bundled_points_at_package(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.delenv("EVALUATORS_SETTINGS_DIR", raising=False)
+    root = shared_settings_root()
+    assert root.joinpath("load_settings.py").is_file()
+
+
+def test_load_evaluator_settings_missing_evaluator_metadata(tmp_path: Path) -> None:
+    path = tmp_path / "no_meta.toml"
+    path.write_text(
+        textwrap.dedent(
+            """
+            [evaluation_settings]
+            marker = 0
+            """
+        ).strip()
+    )
+    with pytest.raises(
+        ConfigurationError, match="Missing required section \\[evaluator_metadata\\]"
+    ):
+        load_evaluator_settings(path, _MiniSettings)
+
+
+def test_load_evaluator_settings_missing_evaluation_settings(tmp_path: Path) -> None:
+    path = tmp_path / "no_eval.toml"
+    path.write_text(
+        textwrap.dedent(
+            """
+            [evaluator_metadata]
+            id = "e"
+            version = "1.0"
+            name = "N"
+            description = "D"
+            maturity = "ga"
             """
-[section]
-key = "value"
-n = 42
-[section.nested]
-foo = "bar"
-""",
-        )
-        data = load_settings(path)
-        assert data["section"]["key"] == "value"
-        assert data["section"]["n"] == 42
-        assert data["section"]["nested"]["foo"] == "bar"
-
-    def test_accepts_str_path(self, tmp_path: Path) -> None:
-        path = _write_toml(tmp_path, 'title = "hello"')
-        assert load_settings(str(path))["title"] == "hello"
-
-    def test_raises_file_not_found(self) -> None:
-        with pytest.raises(FileNotFoundError):
-            load_settings(Path("/nonexistent/path/settings.toml"))
-
-
-# ---------------------------------------------------------------------------
-# _require
-# ---------------------------------------------------------------------------
-
-
-class TestRequire:
-    def test_raises_when_value_is_none(self) -> None:
-        with pytest.raises(ConfigurationError, match="missing_field"):
-            _require("missing_field", None, "section")
-
-    def test_raises_when_value_is_blank_string(self) -> None:
-        with pytest.raises(ConfigurationError, match="blank_field"):
-            _require("blank_field", "   ", "section")
-
-    def test_passes_for_non_empty_string(self) -> None:
-        _require("key", "value", "section")  # must not raise
-
-    def test_passes_for_numeric_value(self) -> None:
-        """An integer is not None and not a str, so _require must not raise.
-
-        This matters because TOML fields like ``id = 42`` are valid integers and
-        are later coerced with str() by the caller.
-        """
-        _require("id", 42, "section")  # must not raise
-
-
-# ---------------------------------------------------------------------------
-# _parse_prompts
-# ---------------------------------------------------------------------------
-
-
-class TestParsePrompts:
-    def test_returns_dict_when_prompts_is_a_dict(self) -> None:
-        data = {"prompts": {"system_prompt": "You are helpful.", "human_prompt": "Do it."}}
-        assert _parse_prompts(data) == {
-            "system_prompt": "You are helpful.",
-            "human_prompt": "Do it.",
-        }
-
-    def test_returns_empty_dict_when_prompts_key_is_absent(self) -> None:
-        assert _parse_prompts({}) == {}
-
-    def test_returns_empty_dict_when_prompts_is_not_a_dict(self) -> None:
-        """TOML like ``prompts = "some string"`` must not raise — just return {}."""
-        assert _parse_prompts({"prompts": "some string"}) == {}
-
-    def test_coerces_non_string_prompt_values_to_str(self) -> None:
-        """Non-string values (e.g. TOML integers) inside [prompts] must be coerced."""
-        assert _parse_prompts({"prompts": {"answer": 42}}) == {"answer": "42"}
-
-    def test_collapses_whitespace_only_lines_in_prompts(self) -> None:
-        """Lines that contain only spaces/tabs become empty; intentional \\n\\n\\n is kept."""
-        data = {
-            "prompts": {
-                "system_prompt": "a\n    \nb",
-                "multi_space_lines": "x\n  \n  \ny",
-                "triple_blank": "p\n\n\nq",
-            }
-        }
-        assert _parse_prompts(data) == {
-            "system_prompt": "a\n\nb",
-            "multi_space_lines": "x\n\n\ny",
-            "triple_blank": "p\n\n\nq",
-        }
-
-
-# ---------------------------------------------------------------------------
-# load_evaluator_settings
-# ---------------------------------------------------------------------------
-
-
-class TestLoadEvaluatorSettings:
-    def test_parses_full_toml(self, tmp_path: Path) -> None:
-        path = _write_toml(
-            tmp_path,
+        ).strip()
+    )
+    with pytest.raises(
+        ConfigurationError, match="Missing required section \\[evaluation_settings\\]"
+    ):
+        load_evaluator_settings(path, _MiniSettings)
+
+
+def test_load_evaluator_settings_invalid_evaluation_settings(tmp_path: Path) -> None:
+    path = tmp_path / "bad_eval.toml"
+    path.write_text(
+        textwrap.dedent(
             """
-[evaluator_metadata]
-id = "test_evaluator"
-version = "0.1"
-name = "Test"
-description = "Test evaluator"
-maturity = "beta"
-
-[prompts]
-system_prompt = "You are a helpful assistant."
-
-[evaluation_settings.prompt_settings_step_main]
-type = "GOOGLE"
-model = "gemini-2.0-flash"
-temperature = 0
-""",
-        )
-        result = load_evaluator_settings(path, ConventionalityEvaluationSettings)
-        assert isinstance(result, EvaluatorSettingsResult)
-        assert result.evaluator_metadata.id == "test_evaluator"
-        assert result.evaluator_metadata.version == "0.1"
-        assert result.evaluator_metadata.maturity == EvaluatorMaturity.beta
-        assert result.prompts["system_prompt"] == "You are a helpful assistant."
-        ps_main = result.evaluation_settings.prompt_settings_step_main
-        assert ps_main is not None
-        assert ps_main.model == "gemini-2.0-flash"
-        assert ps_main.temperature == 0.0
-
-    def test_raises_when_evaluator_metadata_section_missing(self, tmp_path: Path) -> None:
-        path = _write_toml(tmp_path, _MINIMAL_EVAL_SETTINGS)
-        with pytest.raises(ConfigurationError, match="evaluator_metadata"):
-            load_evaluator_settings(path, ConventionalityEvaluationSettings)
-
-    def test_raises_when_evaluation_settings_section_missing(self, tmp_path: Path) -> None:
-        path = _write_toml(tmp_path, _MINIMAL_META)
-        with pytest.raises(ConfigurationError, match="evaluation_settings"):
-            load_evaluator_settings(path, ConventionalityEvaluationSettings)
-
-    def test_raises_when_required_prompt_settings_field_missing(self, tmp_path: Path) -> None:
-        """temperature is required in every prompt_settings_* step; omitting it must raise."""
-        path = _write_toml(
-            tmp_path,
-            _MINIMAL_META
-            + """
-[evaluation_settings.prompt_settings_step_main]
-type = "GOOGLE"
-model = "gemini-2.0-flash"
-""",
-        )
-        with pytest.raises(ConfigurationError, match="temperature"):
-            load_evaluator_settings(path, ConventionalityEvaluationSettings)
-
-    def test_raises_for_invalid_maturity_value(self, tmp_path: Path) -> None:
-        path = _write_toml(
-            tmp_path,
+            [evaluator_metadata]
+            id = "e"
+            version = "1.0"
+            name = "N"
+            description = "D"
+            maturity = "ga"
+
+            [evaluation_settings]
+            marker = "not-int"
             """
-[evaluator_metadata]
-id = "x"
-version = "0.1"
-name = "X"
-description = "X"
-maturity = "not_a_real_maturity"
-"""
-            + _MINIMAL_EVAL_SETTINGS,
-        )
-        with pytest.raises(ConfigurationError, match="maturity"):
-            load_evaluator_settings(path, ConventionalityEvaluationSettings)
-
-    def test_raises_for_invalid_provider_type(self, tmp_path: Path) -> None:
-        path = _write_toml(
-            tmp_path,
-            _MINIMAL_META
-            + """
-[evaluation_settings.prompt_settings_step_main]
-type = "INVALID_PROVIDER"
-model = "some-model"
-temperature = 0
-""",
-        )
-        with pytest.raises(ConfigurationError, match="provider"):
-            load_evaluator_settings(path, ConventionalityEvaluationSettings)
-
-    def test_parses_inputs_into_typed_specs(self, tmp_path: Path) -> None:
-        """[[evaluator_metadata.inputs]] entries become TextInputSpec / GradeInputSpec."""
-        from learning_commons_evaluators.schemas.input_specs import (
-            GradeInputSpec,
-            TextInputSpec,
-        )
-
-        path = _write_toml(
-            tmp_path,
+        ).strip()
+    )
+    with pytest.raises(ConfigurationError, match="Invalid \\[evaluation_settings\\]"):
+        load_evaluator_settings(path, _MiniSettings)
+
+
+def test_load_evaluator_settings_prompt_settings_and_prompts(tmp_path: Path) -> None:
+    path = tmp_path / "prompts.toml"
+    path.write_text(
+        textwrap.dedent(
+            r'''
+            [evaluator_metadata]
+            id = "e"
+            version = "1.0"
+            name = "N"
+            description = "D"
+            maturity = "ga"
+
+            [evaluation_settings]
+            marker = 1
+
+            [evaluation_settings.prompt_settings_main]
+            provider_type = "GOOGLE"
+            model = "gemini-2.0-flash"
+            temperature = 0.25
+
+            [prompts]
+            system_prompt = """hello
+               \nworld"""
+            '''
+        ).strip()
+    )
+    result = load_evaluator_settings(path, _WithPromptSettings)
+    assert result.evaluation_settings.prompt_settings_main.provider_type == LlmProvider.GOOGLE
+    assert result.evaluation_settings.prompt_settings_main.model == "gemini-2.0-flash"
+    assert result.evaluation_settings.prompt_settings_main.temperature == 0.25
+    assert result.prompts["system_prompt"] == "hello\n\nworld"
+
+
+def test_load_evaluator_settings_invalid_prompt_settings_block(tmp_path: Path) -> None:
+    path = tmp_path / "bad_prompt.toml"
+    path.write_text(
+        textwrap.dedent(
             """
-[evaluator_metadata]
-id = "x"
-version = "0.1"
-name = "X"
-description = "X"
-maturity = "beta"
-
-[[evaluator_metadata.inputs]]
-name = "text"
-type = "TextInputField"
-min_text_length = 50
-max_text_length = 5000
-
-[[evaluator_metadata.inputs]]
-name = "grade"
-type = "GradeInputField"
-allowed_grades = [3, 4, 5, 6]
-
-[evaluation_settings.prompt_settings_step_main]
-type = "GOOGLE"
-model = "gemini-2.0-flash"
-temperature = 0
-""",
-        )
-        result = load_evaluator_settings(path, ConventionalityEvaluationSettings)
-        inputs = result.evaluator_metadata.inputs
-
-        text_spec = inputs["text"]
-        assert isinstance(text_spec, TextInputSpec)
-        assert text_spec.min_text_length == 50
-        assert text_spec.max_text_length == 5000
-
-        grade_spec = inputs["grade"]
-        assert isinstance(grade_spec, GradeInputSpec)
-        assert grade_spec.allowed_grades == [3, 4, 5, 6]
-
-    def test_raises_for_unknown_input_type(self, tmp_path: Path) -> None:
-        path = _write_toml(
-            tmp_path,
+            [evaluator_metadata]
+            id = "e"
+            version = "1.0"
+            name = "N"
+            description = "D"
+            maturity = "ga"
+
+            [evaluation_settings]
+            marker = 0
+
+            [evaluation_settings.prompt_settings_main]
+            provider_type = "google"
+            model = "m"
             """
-[evaluator_metadata]
-id = "x"
-version = "0.1"
-name = "X"
-description = "X"
-maturity = "beta"
-
-[[evaluator_metadata.inputs]]
-name = "mystery"
-type = "UnknownInputField"
-
-[evaluation_settings.prompt_settings_step_main]
-type = "GOOGLE"
-model = "gemini-2.0-flash"
-temperature = 0
-""",
-        )
-        with pytest.raises(ConfigurationError, match="Unknown input type"):
-            load_evaluator_settings(path, ConventionalityEvaluationSettings)
-
-
-# ---------------------------------------------------------------------------
-# shared_settings_root
-# ---------------------------------------------------------------------------
-
-
-class TestSharedSettingsRoot:
-    def test_env_var_overrides_bundled_path(self, tmp_path: Path) -> None:
-        with patch.dict(os.environ, {"EVALUATORS_SETTINGS_DIR": str(tmp_path)}):
-            assert shared_settings_root() == tmp_path
-
-    def test_importlib_fallback_resolves_to_existing_directory(self) -> None:
-        env = {k: v for k, v in os.environ.items() if k != "EVALUATORS_SETTINGS_DIR"}
-        with patch.dict(os.environ, env, clear=True):
-            root = shared_settings_root()
-        assert root.exists(), f"shared_settings_root() resolved to non-existent path: {root}"
+        ).strip()
+    )
+    with pytest.raises(
+        ConfigurationError, match="Invalid \\[evaluation_settings.prompt_settings_main\\]"
+    ):
+        load_evaluator_settings(path, _WithPromptSettings)

From 1052031fd297b5fea9665ea5ca60538c5c621ea2 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 01:10:14 -0700
Subject: [PATCH 14/26] chore: update make

---
 sdks/python/Makefile  | 9 ++++-----
 sdks/python/README.md | 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/sdks/python/Makefile b/sdks/python/Makefile
index 742abd1..fc5c126 100644
--- a/sdks/python/Makefile
+++ b/sdks/python/Makefile
@@ -1,8 +1,7 @@
-VENV    := ../../.venv
-PYTHON  := $(VENV)/bin/python
-PYTEST  := $(VENV)/bin/pytest
-RUFF    := $(VENV)/bin/ruff
-MYPY    := $(VENV)/bin/mypy
+PYTHON  ?= python3
+PYTEST  := $(PYTHON) -m pytest
+RUFF    := $(PYTHON) -m ruff
+MYPY    := $(PYTHON) -m mypy
 
 # Bundled settings directory inside the package.
 SETTINGS_DST := src/learning_commons_evaluators/settings
diff --git a/sdks/python/README.md b/sdks/python/README.md
index 4c27daa..15dc866 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -32,10 +32,10 @@ source .venv/bin/activate  # macOS/Linux
 # Install in editable mode with dev dependencies
 pip install -e ".[dev]"
 
-# Static checks + unit tests (same gate as CI for the Python SDK)
+# Static checks + full test suite (unit + contract; same gate as CI for the Python SDK)
 make verify
 
-# Run the full test suite (unit + contract)
+# Tests only (same pytest invocation as the verify step)
 make test
 ```
 

From 9b7429a2f15222d7fdef560f7dd44478b31829c0 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 01:30:19 -0700
Subject: [PATCH 15/26] ci: fix CI?

---
 sdks/python/pyproject.toml                    |  4 +++
 .../settings/load_settings.py                 | 28 +++++++++++--------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml
index ec69933..c739073 100644
--- a/sdks/python/pyproject.toml
+++ b/sdks/python/pyproject.toml
@@ -40,11 +40,15 @@ dev = [
     "pytest-cov>=5.0.0",
     "ruff>=0.9.0",
     "mypy>=1.14.0",
+    # Mypy is configured for Python 3.10 even when dev tooling runs on newer Python,
+    # so it needs tomli (the backport of 3.11's stdlib tomllib) available for type checking.
+    "tomli>=2.0.0",
 ]
 
 [project.urls]
 Homepage = "https://github.com/learning-commons-org/evaluators"
 Repository = "https://github.com/learning-commons-org/evaluators/tree/main/sdks/python"
+Documentation = "https://docs.learningcommons.org/evaluators"
 "Bug Tracker" = "https://github.com/learning-commons-org/evaluators/issues"
 
 [tool.setuptools.packages.find]
diff --git a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py
index 530dd9c..ad0d3cf 100644
--- a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py
+++ b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py
@@ -2,21 +2,27 @@
 
 from __future__ import annotations
 
-import importlib.resources
 import os
+import sys
 from dataclasses import dataclass
-
-try:
-    from importlib.resources.abc import Traversable
-except ImportError:
-    from importlib.abc import Traversable  # Python < 3.11
+from importlib import resources
 from pathlib import Path
-from typing import Generic, TypeVar
+from typing import TYPE_CHECKING, Generic, TypeVar
+
+if TYPE_CHECKING:
+    from importlib.abc import Traversable
+else:
+    # Runtime location moved to importlib.resources.abc in newer Python versions,
+    # while mypy's Python 3.10 types expose Traversable from importlib.abc.
+    try:
+        from importlib.resources.abc import Traversable
+    except ImportError:
+        from importlib.abc import Traversable
 
-try:
+if sys.version_info >= (3, 11):
     import tomllib
-except ImportError:
-    import tomli as tomllib  # type: ignore[import-not-found,no-redef]  # Python < 3.11
+else:
+    import tomli as tomllib
 
 from pydantic import TypeAdapter
 from pydantic import ValidationError as PydanticValidationError
@@ -56,7 +62,7 @@ def shared_settings_root() -> Path | Traversable:
     env = os.environ.get("EVALUATORS_SETTINGS_DIR")
     if env:
         return Path(env)
-    return importlib.resources.files("learning_commons_evaluators.settings")
+    return resources.files("learning_commons_evaluators.settings")
 
 
 def load_settings(path: Path | str | Traversable) -> dict:

From 1c6ee38b81ac10d68ba65a88c2429745b3bb1491 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 14:56:55 -0700
Subject: [PATCH 16/26] chore: first pass addressing PR comments

---
 .../evaluators/base.py                        | 12 +++++-----
 .../evaluators/conventionality.py             | 10 ++++-----
 .../schemas/conventionality.py                |  2 +-
 .../_generated_conventionality_settings.py    |  7 ++----
 sdks/python/tests/conftest.py                 | 22 ++++++++++---------
 sdks/python/tests/evaluators/test_base.py     |  1 -
 .../tests/evaluators/test_conventionality.py  |  4 ++--
 .../tests/schemas/test_evaluator_schemas.py   |  6 ++---
 sdks/settings/conventionality/settings.toml   |  4 ++--
 9 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 9b48d99..397b026 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -198,10 +198,10 @@ def execute_prompt_chain_step(
         """Run a prompt chain (template | LLM), record metadata, and return the result.
 
         When ``parser_output_type`` is a Pydantic model class, the LLM response is
-        parsed as JSON and returned as an instance of that class.  When
-        ``parser_output_type`` is ``None``, the raw response content is returned as
-        a plain ``str`` — use this for steps that produce unstructured prose (e.g. a
-        background-knowledge assumption).
+        parsed as JSON and returned as an instance of that class.  When it is
+        ``None`` (the default), the raw response content is returned as a plain
+        ``str`` (no JSON parser) — use that for steps that produce unstructured prose
+        (e.g. a background-knowledge assumption).
 
         Provider config (e.g. API key) is resolved from ``self.config`` by
         ``prompt_settings.provider_type``.
@@ -217,8 +217,8 @@ def execute_prompt_chain_step(
                 to return the raw text response.
 
         Returns:
-            Parsed instance of ``parser_output_type`` when a type is given; plain
-            ``str`` when ``None``.
+            Parsed instance of ``parser_output_type`` when it is a model class; plain
+            ``str`` when ``parser_output_type`` is omitted or ``None``.
 
         Raises:
             ConfigurationError: No provider config for prompt_settings.provider_type.
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
index c7fbcce..1ae7032 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
@@ -4,7 +4,7 @@
 
 from typing import ClassVar
 
-import textstat
+import textstat  # type: ignore[import-untyped]
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from pydantic import Field
@@ -79,8 +79,7 @@ def evaluate_impl(
         evaluation_metadata: EvaluationMetadata,
     ) -> TextComplexityResult:
         """Run conventionality evaluation. Returns TextComplexityResult with answer, explanation, metadata."""
-        ps_main = evaluation_settings.prompt_settings_step_main
-        assert ps_main is not None
+        step_prompt_settings = evaluation_settings.prompt_settings_step_conventionality_evaluation
 
         fk_score = round(textstat.flesch_kincaid_grade(input.text.value), 2)
         prompt_inputs = input.input_values()
@@ -95,14 +94,13 @@ def evaluate_impl(
             ]
         ).partial(format_instructions=parser.get_format_instructions())
         conventionality_output = self.execute_prompt_chain_step(
-            step_name="main",
-            prompt_settings=ps_main,
+            step_name="conventionality_evaluation",
+            prompt_settings=step_prompt_settings,
             evaluation_metadata=evaluation_metadata,
             template=template,
             chain_inputs=prompt_inputs,
             parser_output_type=ConventionalityOutput,
         )
-        assert isinstance(conventionality_output, ConventionalityOutput)
 
         answer = TextComplexityAnswer.from_score(conventionality_output.complexity_score)
         return TextComplexityResult(
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/conventionality.py b/sdks/python/src/learning_commons_evaluators/schemas/conventionality.py
index 9d5595d..17ae47a 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/conventionality.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/conventionality.py
@@ -13,7 +13,7 @@
 class ConventionalityEvaluationSettings(EvaluationSettings):
     """Settings for a conventionality evaluation."""
 
-    prompt_settings_step_main: PromptSettings | None = None
+    prompt_settings_step_conventionality_evaluation: PromptSettings
 
 
 class ConventionalityOutput(BaseModel):
diff --git a/sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py b/sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py
index e52b7d1..a3c62b7 100644
--- a/sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py
+++ b/sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py
@@ -21,10 +21,7 @@
     maturity=EvaluatorMaturity.beta,
     inputs={
         'text': TextInputSpec(name='text', min_text_length=10, max_text_length=10000),
-        'grade': GradeInputSpec(
-            name='grade',
-            allowed_grades=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
-        ),
+        'grade': GradeInputSpec(name='grade', allowed_grades=[3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
     },
 )
 
@@ -88,7 +85,7 @@
 # ── Evaluation settings ───────────────────────────────────────────────────────
 
 _EVALUATION_SETTINGS = ConventionalityEvaluationSettings(
-    prompt_settings_step_main=PromptSettings(
+    prompt_settings_step_conventionality_evaluation=PromptSettings(
         provider_type=LlmProvider.GOOGLE,
         model='gemini-3-flash-preview',
         temperature=0.0,
diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py
index fefbd8e..128cffb 100644
--- a/sdks/python/tests/conftest.py
+++ b/sdks/python/tests/conftest.py
@@ -26,16 +26,6 @@ def evaluator_metadata():
     )
 
 
-@pytest.fixture
-def evaluation_metadata(evaluator_metadata):
-    """EvaluationMetadata with sensible defaults for unit tests."""
-    return EvaluationMetadata(
-        evaluator_metadata=evaluator_metadata,
-        evaluation_settings=ConventionalityEvaluationSettings(),
-        input_metadata={},
-    )
-
-
 @pytest.fixture
 def prompt_settings_google():
     """PromptSettings configured for Google, usable in multiple test modules."""
@@ -46,6 +36,18 @@ def prompt_settings_google():
     )
 
 
+@pytest.fixture
+def evaluation_metadata(evaluator_metadata, prompt_settings_google):
+    """EvaluationMetadata with sensible defaults for unit tests."""
+    return EvaluationMetadata(
+        evaluator_metadata=evaluator_metadata,
+        evaluation_settings=ConventionalityEvaluationSettings(
+            prompt_settings_step_conventionality_evaluation=prompt_settings_google,
+        ),
+        input_metadata={},
+    )
+
+
 @pytest.fixture
 def config():
     """EvaluatorConfig with no telemetry, suitable for unit tests."""
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 5650ec7..a8c12f4 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -357,7 +357,6 @@ def _fake_llm(_pv):
                 evaluation_metadata=evaluation_metadata,
                 template=template,
                 chain_inputs={"input": "Hello"},
-                parser_output_type=None,
             )
         assert out == "plain prose"
 
diff --git a/sdks/python/tests/evaluators/test_conventionality.py b/sdks/python/tests/evaluators/test_conventionality.py
index a578027..e339da0 100644
--- a/sdks/python/tests/evaluators/test_conventionality.py
+++ b/sdks/python/tests/evaluators/test_conventionality.py
@@ -13,7 +13,7 @@
 from learning_commons_evaluators.schemas.errors import ConfigurationError
 from learning_commons_evaluators.schemas.metadata import Status
 
-# A realistic sample long enough to satisfy the min_text_length=100 constraint.
+# Long sample text (well above ``min_text_length`` from conventionality settings TOML).
 _SAMPLE_TEXT = (
     "Marco Polo was a Venetian merchant and explorer who traveled through Asia "
     "in the late 13th century. He spent nearly two decades at the court of "
@@ -57,7 +57,7 @@ def test_evaluate_with_explicit_settings(self):
         config = create_config_no_telemetry()
         evaluator = ConventionalityEvaluator(config)
         settings = ConventionalityEvaluationSettings(
-            prompt_settings_step_main=PromptSettings(
+            prompt_settings_step_conventionality_evaluation=PromptSettings(
                 provider_type=LlmProvider.GOOGLE,
                 model="gemini-2.0-flash",
                 temperature=0.0,
diff --git a/sdks/python/tests/schemas/test_evaluator_schemas.py b/sdks/python/tests/schemas/test_evaluator_schemas.py
index 481018f..cb2f082 100644
--- a/sdks/python/tests/schemas/test_evaluator_schemas.py
+++ b/sdks/python/tests/schemas/test_evaluator_schemas.py
@@ -29,7 +29,7 @@
     Status,
 )
 
-# A realistic sample long enough to satisfy the min_text_length=100 constraint.
+# Long sample text (well above ``min_text_length`` from conventionality settings TOML).
 _SAMPLE_TEXT = (
     "Marco Polo was a Venetian merchant and explorer who traveled through Asia "
     "in the late 13th century. He spent nearly two decades at the court of "
@@ -78,14 +78,14 @@ def test_validate_raises_on_invalid_grade(self):
             inp.validate()
 
     def test_validate_raises_on_invalid_text_length(self):
-        # min_text_length=100 comes from the TOML settings; "x" is 1 char.
+        # "x" is shorter than ``min_text_length`` from the conventionality settings TOML.
         inp = ConventionalityEvaluationInput(text="x", grade=5)
         with pytest.raises(ValidationError):
             inp.validate()
 
     def test_validate_collects_all_errors_before_raising(self):
         """All field errors are collected; a single ValidationError is raised at the end."""
-        # "x" is below the 100-char minimum; grade 99 is outside the 0-12 range.
+        # "x" is below the configured minimum text length; grade 99 is not an allowed grade.
         inp = ConventionalityEvaluationInput(text="x", grade=99)
         with pytest.raises(ValidationError) as exc_info:
             inp.validate()
diff --git a/sdks/settings/conventionality/settings.toml b/sdks/settings/conventionality/settings.toml
index 2cba13b..b4e0323 100644
--- a/sdks/settings/conventionality/settings.toml
+++ b/sdks/settings/conventionality/settings.toml
@@ -14,7 +14,7 @@ max_text_length = 10000
 [[evaluator_metadata.inputs]]
 name = "grade"
 type = "GradeInputField"
-allowed_grades = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+allowed_grades = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
 
 [prompts]
 system_prompt = """
@@ -72,7 +72,7 @@ Grade: {grade}
 FK Score: {fk_score}
 """
 
-[evaluation_settings.prompt_settings_step_main]
+[evaluation_settings.prompt_settings_step_conventionality_evaluation]
 type = "GOOGLE"
 model = "gemini-3-flash-preview"
 temperature = 0

From b264db56e24a8b2766b606782552443f33af7c75 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 15:03:07 -0700
Subject: [PATCH 17/26] test: expect integer textLength in input_metadata
 assertions

Aligns with TextInputField.input_metadata() returning len() as int.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 sdks/python/tests/schemas/test_evaluator_schemas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sdks/python/tests/schemas/test_evaluator_schemas.py b/sdks/python/tests/schemas/test_evaluator_schemas.py
index cb2f082..855ee94 100644
--- a/sdks/python/tests/schemas/test_evaluator_schemas.py
+++ b/sdks/python/tests/schemas/test_evaluator_schemas.py
@@ -60,7 +60,7 @@ def test_validate_and_input_metadata(self):
         inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
         inp.validate()
         meta = inp.input_metadata()
-        assert meta["text"] == {"textLength": str(len(_SAMPLE_TEXT))}
+        assert meta["text"] == {"textLength": len(_SAMPLE_TEXT)}
         assert meta["grade"] == {"grade": 5}
 
     def test_input_values_returns_primitive_values(self):
@@ -105,7 +105,7 @@ def test_input_metadata_returns_none_for_non_inputfield_fields(self):
         """Fields that are not InputFields produce a None entry in the output dict."""
         inp = _MixedInput(text=TextInputField(spec=_BARE_TEXT_SPEC, value="hello"), weight=7.5)
         meta = inp.input_metadata()
-        assert meta["text"] == {"textLength": "5"}
+        assert meta["text"] == {"textLength": 5}
         assert meta["weight"] is None  # fallback for non-protocol fields
 
     def test_input_values_returns_field_itself_for_non_inputfield(self):

From 23f3def722d73c82716fd87fd17352127232f792 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 15:52:54 -0700
Subject: [PATCH 18/26] chore: updating generate_settings to be more general +
 misc PR updates

---
 scripts/generate_settings.py                | 213 +++++++++++---------
 sdks/python/tests/evaluators/test_base.py   |  39 +---
 sdks/settings/conventionality/settings.toml |   2 +-
 3 files changed, 121 insertions(+), 133 deletions(-)

diff --git a/scripts/generate_settings.py b/scripts/generate_settings.py
index 1a5aa5a..9a0d5c2 100644
--- a/scripts/generate_settings.py
+++ b/scripts/generate_settings.py
@@ -5,9 +5,17 @@
 as a plain Python module.  The evaluator modules then import that generated
 module at import time — no file I/O, no TOML parsing on every process start.
 
-Each evaluator directory under sdks/settings/ contains two files::
+Each evaluator directory under sdks/settings/ that contains ``settings.toml`` is picked up
+automatically.  The settings model is resolved as::
+
+    learning_commons_evaluators.schemas.<evaluator>.<PascalCase>EvaluationSettings
+
+where ``<PascalCase>`` is the snake_case evaluator folder name converted to PascalCase
+(e.g. ``conventionality`` → ``ConventionalityEvaluationSettings`` in
+``schemas.conventionality``).
+
+Each such directory typically also has::
 
-    sdks/settings/<evaluator>/settings.toml   — prompts, model, temperature
     sdks/settings/<evaluator>/contracts.toml  — captured LLM interactions for contract tests
 
 Usage::
@@ -41,6 +49,7 @@
 
 import argparse
 import difflib
+import importlib
 import os
 import sys
 import types
@@ -85,60 +94,12 @@
 from learning_commons_evaluators.schemas.config import (  # noqa: E402
     EvaluationSettings,
 )
-from learning_commons_evaluators.schemas.conventionality import (  # noqa: E402
-    ConventionalityEvaluationSettings,
-)
-from learning_commons_evaluators.schemas.vocabulary import (  # noqa: E402
-    VocabularyEvaluationSettings,
-)
 from learning_commons_evaluators.settings.load_settings import (  # noqa: E402
     EvaluatorSettingsResult,
     load_evaluator_settings,
 )
 
-# ---------------------------------------------------------------------------
-# Import map: class name → (module path, symbol name)
-# Used to build the import block of generated files.
-# ---------------------------------------------------------------------------
-
-_IMPORT_MAP: dict[str, tuple[str, str]] = {
-    "ConventionalityEvaluationSettings": (
-        "learning_commons_evaluators.schemas.conventionality",
-        "ConventionalityEvaluationSettings",
-    ),
-    "EvaluatorMaturity": (
-        "learning_commons_evaluators.schemas.metadata",
-        "EvaluatorMaturity",
-    ),
-    "EvaluatorMetadata": (
-        "learning_commons_evaluators.schemas.metadata",
-        "EvaluatorMetadata",
-    ),
-    "EvaluatorSettingsResult": (
-        "learning_commons_evaluators.settings.load_settings",
-        "EvaluatorSettingsResult",
-    ),
-    "GradeInputSpec": (
-        "learning_commons_evaluators.schemas.input_specs",
-        "GradeInputSpec",
-    ),
-    "LlmProvider": (
-        "learning_commons_evaluators.schemas.config",
-        "LlmProvider",
-    ),
-    "PromptSettings": (
-        "learning_commons_evaluators.schemas.config",
-        "PromptSettings",
-    ),
-    "TextInputSpec": (
-        "learning_commons_evaluators.schemas.input_specs",
-        "TextInputSpec",
-    ),
-    "VocabularyEvaluationSettings": (
-        "learning_commons_evaluators.schemas.vocabulary",
-        "VocabularyEvaluationSettings",
-    ),
-}
+_LCE_PACKAGE = "learning_commons_evaluators"
 
 # ---------------------------------------------------------------------------
 # Value emitter
@@ -261,42 +222,49 @@ def _emit_dataclass(obj: Any, indent: int = 0) -> str:
 # ---------------------------------------------------------------------------
 
 
-def _collect_class_names(obj: Any, names: set[str]) -> None:
-    """Walk obj recursively and collect names of all classes that need importing."""
+def _collect_lce_types(obj: Any, found: set[type]) -> None:
+    """Walk *obj* and collect types defined under learning_commons_evaluators for imports."""
     if isinstance(obj, Enum):
-        names.add(type(obj).__name__)
+        found.add(type(obj))
     elif is_dataclass(obj) and not isinstance(obj, type):
-        names.add(type(obj).__name__)
+        found.add(type(obj))
         for f in fields(obj):
-            _collect_class_names(getattr(obj, f.name), names)
+            _collect_lce_types(getattr(obj, f.name), found)
     elif isinstance(obj, BaseModel):
-        names.add(type(obj).__name__)
+        found.add(type(obj))
         for field_name in type(obj).model_fields:
-            _collect_class_names(getattr(obj, field_name), names)
+            _collect_lce_types(getattr(obj, field_name), found)
     elif isinstance(obj, dict):
         for v in obj.values():
-            _collect_class_names(v, names)
+            _collect_lce_types(v, found)
     elif isinstance(obj, list):
         for v in obj:
-            _collect_class_names(v, names)
-
+            _collect_lce_types(v, found)
 
-def _build_import_block(config: EvaluatorSettingsResult, settings_cls_name: str) -> str:
-    used: set[str] = set()
-    _collect_class_names(config.evaluator_metadata, used)
-    _collect_class_names(config.evaluation_settings, used)
-    used.add(settings_cls_name)
-    used.add("EvaluatorSettingsResult")
 
+def _build_import_block(
+    config: EvaluatorSettingsResult,
+    settings_cls: type[EvaluationSettings],
+) -> str:
+    """Build ``from … import …`` lines for everything referenced in the generated module."""
+    found: set[type] = set()
+    _collect_lce_types(config.evaluator_metadata, found)
+    _collect_lce_types(config.evaluation_settings, found)
+    _collect_lce_types(config.prompts, found)
+    found.add(settings_cls)
+    found.add(EvaluatorSettingsResult)
+
+    prefix = f"{_LCE_PACKAGE}."
     by_module: dict[str, list[str]] = {}
-    for name in sorted(used):
-        if name in _IMPORT_MAP:
-            mod, sym = _IMPORT_MAP[name]
-            by_module.setdefault(mod, []).append(sym)
+    for cls in found:
+        mod = cls.__module__
+        if not mod.startswith(prefix):
+            continue
+        by_module.setdefault(mod, []).append(cls.__name__)
 
     lines: list[str] = []
     for mod in sorted(by_module):
-        syms = sorted(by_module[mod])
+        syms = sorted(set(by_module[mod]))
         if len(syms) == 1:
             lines.append(f"from {mod} import {syms[0]}")
         else:
@@ -318,7 +286,7 @@ def generate_module(
     config = load_evaluator_settings(toml_path, settings_cls)
     settings_cls_name = settings_cls.__name__
 
-    imports = _build_import_block(config, settings_cls_name)
+    imports = _build_import_block(config, settings_cls)
     metadata_code = _emit_model(config.evaluator_metadata)
     prompts_code = _emit_value(config.prompts)
     settings_code = _emit_model(config.evaluation_settings)
@@ -361,26 +329,65 @@ def generate_module(
 
 
 # ---------------------------------------------------------------------------
-# Evaluator registry
+# Evaluator discovery
 # ---------------------------------------------------------------------------
 
-_EVALUATORS: list[dict] = [
-    {
-        "name": "conventionality",
-        "settings_cls": ConventionalityEvaluationSettings,
-        "output": _GENERATED_DIR / "_generated_conventionality_settings.py",
-    },
-    {
-        "name": "vocabulary",
-        "settings_cls": VocabularyEvaluationSettings,
-        "output": _GENERATED_DIR / "_generated_vocabulary_settings.py",
-    },
-]
+
+def _snake_to_pascal(name: str) -> str:
+    return "".join(part.capitalize() for part in name.split("_"))
 
 
-def _settings_toml(evaluator_name: str) -> Path:
-    """Return ``sdks/settings/<evaluator>/settings.toml``."""
-    return _SETTINGS_DIR / evaluator_name / "settings.toml"
+def _resolve_settings_class(evaluator_name: str) -> type[EvaluationSettings]:
+    """Import ``<Pascal>EvaluationSettings`` from ``learning_commons_evaluators.schemas.<name>``."""
+    if not evaluator_name.isidentifier():
+        raise SystemExit(
+            f"Evaluator folder name {evaluator_name!r} is not a valid Python identifier; "
+            "rename the directory under sdks/settings/."
+        )
+    class_name = f"{_snake_to_pascal(evaluator_name)}EvaluationSettings"
+    module_name = f"{_LCE_PACKAGE}.schemas.{evaluator_name}"
+    try:
+        mod = importlib.import_module(module_name)
+    except ModuleNotFoundError as e:
+        raise SystemExit(
+            f"No Python module {module_name!r} for evaluator {evaluator_name!r} "
+            f"(expected class {class_name}). Add schemas/{evaluator_name}.py or align the folder name."
+        ) from e
+    try:
+        cls = getattr(mod, class_name)
+    except AttributeError as e:
+        raise SystemExit(
+            f"Module {module_name!r} has no attribute {class_name!r}. "
+            f"Define {class_name} there (subclass of EvaluationSettings), or align names."
+        ) from e
+    if not isinstance(cls, type) or not issubclass(cls, EvaluationSettings):
+        raise SystemExit(f"{module_name}.{class_name} must be a subclass of EvaluationSettings.")
+    return cls
+
+
+def _discover_evaluators() -> list[dict[str, Any]]:
+    """Return one entry per ``sdks/settings/<evaluator>/settings.toml`` found on disk."""
+    specs: list[dict[str, Any]] = []
+    if not _SETTINGS_DIR.is_dir():
+        return specs
+    for child in sorted(_SETTINGS_DIR.iterdir()):
+        if not child.is_dir():
+            continue
+        toml_path = child / "settings.toml"
+        if not toml_path.is_file():
+            continue
+        name = child.name
+        settings_cls = _resolve_settings_class(name)
+        output = _GENERATED_DIR / f"_generated_{name}_settings.py"
+        specs.append(
+            {
+                "name": name,
+                "settings_cls": settings_cls,
+                "toml": toml_path,
+                "output": output,
+            }
+        )
+    return specs
 
 
 def _contracts_toml(evaluator_name: str) -> Path:
@@ -394,8 +401,12 @@ def _contracts_toml(evaluator_name: str) -> Path:
 
 
 def cmd_generate() -> None:
-    for ev in _EVALUATORS:
-        content = generate_module(ev["name"], _settings_toml(ev["name"]), ev["settings_cls"])
+    evaluators = _discover_evaluators()
+    if not evaluators:
+        print(f"No evaluators found under {_SETTINGS_DIR} (add */settings.toml).")
+        return
+    for ev in evaluators:
+        content = generate_module(ev["name"], ev["toml"], ev["settings_cls"])
         ev["output"].write_text(content, encoding="utf-8")
         rel = ev["output"].relative_to(_REPO_ROOT)
         print(f"  generated  {rel}")
@@ -403,9 +414,13 @@ def cmd_generate() -> None:
 
 
 def cmd_check() -> int:
+    evaluators = _discover_evaluators()
+    if not evaluators:
+        print(f"No evaluators found under {_SETTINGS_DIR} (nothing to check).")
+        return 0
     stale: list[str] = []
-    for ev in _EVALUATORS:
-        expected = generate_module(ev["name"], _settings_toml(ev["name"]), ev["settings_cls"])
+    for ev in evaluators:
+        expected = generate_module(ev["name"], ev["toml"], ev["settings_cls"])
         actual = ev["output"].read_text(encoding="utf-8") if ev["output"].exists() else ""
         if expected != actual:
             diff = "".join(
@@ -436,7 +451,11 @@ def cmd_sync() -> None:
     tests work correctly when installed via ``pip install`` (i.e. without
     access to the monorepo ``sdks/settings/`` directory).
     """
-    for ev in _EVALUATORS:
+    evaluators = _discover_evaluators()
+    if not evaluators:
+        print(f"No evaluators found under {_SETTINGS_DIR} (nothing to sync).")
+        return
+    for ev in evaluators:
         src = _contracts_toml(ev["name"])
         if not src.exists():
             print(f"  WARNING: canonical {src.relative_to(_REPO_ROOT)} not found — skipping")
@@ -455,8 +474,12 @@ def cmd_check_sync() -> int:
     Exits with a non-zero status if any bundled file is missing or differs from
     the canonical source.
     """
+    evaluators = _discover_evaluators()
+    if not evaluators:
+        print(f"No evaluators found under {_SETTINGS_DIR} (nothing to verify).")
+        return 0
     errors: list[str] = []
-    for ev in _EVALUATORS:
+    for ev in evaluators:
         canonical = _contracts_toml(ev["name"])
         if not canonical.exists():
             errors.append(f"MISSING canonical: {canonical.relative_to(_REPO_ROOT)}")
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index b4b3b3e..b453853 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -155,37 +155,6 @@ def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, st
         assert result.metadata.input_metadata["grade_level"] == {"grade": 3}
 
 
-class TestEvaluateTelemetryBranching:
-    def test_uses_input_metadata_by_default(self):
-        evaluator = _evaluator()
-        with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
-            result = evaluator.evaluate(_inp())
-        assert result.metadata.input_metadata["text"] == {"textLength": len(_SAMPLE_TEXT)}
-
-    def test_full_telemetry_still_records_input_metadata_not_raw_values(self):
-        evaluator = _evaluator(send_full_input=True)
-        with patch.object(evaluator, "execute_prompt_chain_step", return_value=_MOCK_OUTPUT):
-            result = evaluator.evaluate(_inp())
-        assert result.metadata.input_metadata["text"] == {"textLength": len(_SAMPLE_TEXT)}
-        assert result.metadata.input_metadata["grade"] == {"grade": 5}
-
-
-class TestConventionalityEvaluateErrorHandling:
-    def test_raises_validation_error_for_invalid_input(self):
-        evaluator = _evaluator()
-        invalid = ConventionalityEvaluationInput(text="x", grade=5)
-        with pytest.raises(ValidationError):
-            evaluator.evaluate(invalid)
-
-    def test_propagates_evaluate_impl_exception(self):
-        evaluator = _evaluator()
-        with (
-            patch.object(evaluator, "evaluate_impl", side_effect=RuntimeError("boom")),
-            pytest.raises(RuntimeError, match="boom"),
-        ):
-            evaluator.evaluate(_inp())
-
-
 class TestStubEvaluateErrorHandling:
     def test_raises_validation_error_for_invalid_input(self, stub_evaluator):
         inp = TextComplexityEvaluationInput(
@@ -473,9 +442,7 @@ def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluatio
         assert step.extras[PROMPT_STEP_EXTRA_PROMPT_SETTINGS]["model"] == "gemini-2.0-flash"
         assert PROMPT_STEP_EXTRA_TOKEN_USAGE in step.extras
 
-    def test_token_usage_recorded_when_llm_reports_usage(
-        self, stub_evaluator, evaluation_metadata
-    ):
+    def test_token_usage_recorded_when_llm_reports_usage(self, stub_evaluator, evaluation_metadata):
         def _llm_with_usage(_pv):
             return AIMessage(
                 content=_CHAIN_JSON,
@@ -552,9 +519,7 @@ def test_propagates_evaluator_error_without_wrapping(self, stub_evaluator, evalu
                 parser_output_type=_ChainOutput,
             )
 
-    def test_wraps_unexpected_chain_failure_as_api_error(
-        self, stub_evaluator, evaluation_metadata
-    ):
+    def test_wraps_unexpected_chain_failure_as_api_error(self, stub_evaluator, evaluation_metadata):
         def _boom(_pv):
             raise ValueError("simulated provider failure")
 
diff --git a/sdks/settings/conventionality/settings.toml b/sdks/settings/conventionality/settings.toml
index b4e0323..5d29e96 100644
--- a/sdks/settings/conventionality/settings.toml
+++ b/sdks/settings/conventionality/settings.toml
@@ -73,6 +73,6 @@ FK Score: {fk_score}
 """
 
 [evaluation_settings.prompt_settings_step_conventionality_evaluation]
-type = "GOOGLE"
+provider_type = "GOOGLE"
 model = "gemini-3-flash-preview"
 temperature = 0

From 93ecf53edffe41b3332ebf62b58b11bf3269aa29 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 16:01:25 -0700
Subject: [PATCH 19/26] chore: simplified and generalized generate_settings

---
 scripts/generate_settings.py | 238 ++++++++++++++++-------------------
 1 file changed, 108 insertions(+), 130 deletions(-)

diff --git a/scripts/generate_settings.py b/scripts/generate_settings.py
index 9a0d5c2..2bcef05 100644
--- a/scripts/generate_settings.py
+++ b/scripts/generate_settings.py
@@ -53,9 +53,10 @@
 import os
 import sys
 import types
-from dataclasses import MISSING, fields, is_dataclass
+from dataclasses import MISSING, dataclass, fields, is_dataclass
 from enum import Enum
 from pathlib import Path
+from string import Template
 from typing import Any
 
 # ---------------------------------------------------------------------------
@@ -69,6 +70,8 @@
 _SETTINGS_DIR = _REPO_ROOT / "sdks" / "settings"
 _GENERATED_DIR = _SDK_SRC / "learning_commons_evaluators" / "settings"
 
+_LINE_WRAP = 88
+
 sys.path.insert(0, str(_SDK_SRC))
 
 # Point load_settings at the canonical settings directory so the generator
@@ -102,14 +105,14 @@
 _LCE_PACKAGE = "learning_commons_evaluators"
 
 # ---------------------------------------------------------------------------
-# Value emitter
+# Emit Python source literals (no third-party codegen: must handle Enum,
+# Pydantic, stdlib dataclass, and readable wrapping).
 # ---------------------------------------------------------------------------
 
 
 def _emit_string(s: str) -> str:
     """Emit a string literal, using triple-quotes for multiline / long strings."""
-    if "\n" in s or len(s) > 88:
-        # Escape any literal `"""` sequences inside the content.
+    if "\n" in s or len(s) > _LINE_WRAP:
         content = s.replace('"""', '""\\"')
         return f'"""{content}"""'
     return repr(s)
@@ -124,8 +127,6 @@ def _emit_value(obj: Any, indent: int = 0) -> str:
         return "None"
     if isinstance(obj, bool):
         return "True" if obj else "False"
-    # str- and int-backed enums (e.g. LlmProvider(str, Enum)) must be handled before
-    # str/int or we emit repr() and get invalid syntax like <LlmProvider.GOOGLE: 'google'>.
     if isinstance(obj, Enum):
         return f"{type(obj).__name__}.{obj.name}"
     if isinstance(obj, int):
@@ -139,7 +140,7 @@ def _emit_value(obj: Any, indent: int = 0) -> str:
             return "[]"
         items = [_emit_value(v, indent + 1) for v in obj]
         single = f"[{', '.join(items)}]"
-        if len(single) <= 88 - len(pad) and "\n" not in single:
+        if len(single) <= _LINE_WRAP - len(pad) and "\n" not in single:
             return single
         body = "\n".join(f"{inner}{item}," for item in items)
         return f"[\n{body}\n{pad}]"
@@ -148,7 +149,7 @@ def _emit_value(obj: Any, indent: int = 0) -> str:
             return "{}"
         pairs = [(repr(k), _emit_value(v, indent + 1)) for k, v in obj.items()]
         single = "{" + ", ".join(f"{k}: {v}" for k, v in pairs) + "}"
-        if len(single) <= 88 - len(pad) and "\n" not in single:
+        if len(single) <= _LINE_WRAP - len(pad) and "\n" not in single:
             return single
         body = "\n".join(f"{inner}{k}: {v}," for k, v in pairs)
         return f"{{\n{body}\n{pad}}}"
@@ -159,71 +160,57 @@ def _emit_value(obj: Any, indent: int = 0) -> str:
     raise TypeError(f"Cannot emit {type(obj).__name__}: {obj!r}")
 
 
-def _emit_model(obj: BaseModel, indent: int = 0) -> str:
-    """Emit a Pydantic model as a constructor call."""
-    cls = type(obj)
-    cls_name = cls.__name__
+def _format_constructor(cls_name: str, kw_args: list[tuple[str, str]], indent: int) -> str:
+    """Format ``ClsName(a=..., b=...)`` with optional line wrapping."""
     pad = "    " * indent
     inner = "    " * (indent + 1)
+    if not kw_args:
+        return f"{cls_name}()"
+    single = f"{cls_name}({', '.join(f'{n}={v}' for n, v in kw_args)})"
+    if len(single) <= _LINE_WRAP - len(pad) and "\n" not in single:
+        return single
+    body = "\n".join(f"{inner}{n}={v}," for n, v in kw_args)
+    return f"{cls_name}(\n{body}\n{pad})"
 
-    args: list[tuple[str, str]] = []
+
+def _pydantic_kw_args(obj: BaseModel, indent: int) -> list[tuple[str, str]]:
+    cls = type(obj)
+    out: list[tuple[str, str]] = []
     for field_name, field_info in cls.model_fields.items():
         val = getattr(obj, field_name)
-
-        # Skip Literal discriminators (e.g. type="TextInputField").
         if field_name == "type" and not field_info.is_required():
             continue
-
-        # Skip fields that equal their default — keeps generated code clean.
         default = field_info.default
         if default is not PydanticUndefined and val == default:
             continue
+        out.append((field_name, _emit_value(val, indent + 1)))
+    return out
 
-        args.append((field_name, _emit_value(val, indent + 1)))
-
-    if not args:
-        return f"{cls_name}()"
-
-    single = f"{cls_name}({', '.join(f'{n}={v}' for n, v in args)})"
-    if len(single) <= 88 - len(pad) and "\n" not in single:
-        return single
-
-    body = "\n".join(f"{inner}{n}={v}," for n, v in args)
-    return f"{cls_name}(\n{body}\n{pad})"
-
-
-def _emit_dataclass(obj: Any, indent: int = 0) -> str:
-    """Emit a stdlib dataclass instance as a constructor call (e.g. PromptSettings)."""
-    cls = type(obj)
-    cls_name = cls.__name__
-    pad = "    " * indent
-    inner = "    " * (indent + 1)
 
-    args: list[tuple[str, str]] = []
+def _dataclass_kw_args(obj: Any, indent: int) -> list[tuple[str, str]]:
+    out: list[tuple[str, str]] = []
     for f in fields(obj):
         val = getattr(obj, f.name)
         if f.default is not MISSING and val == f.default:
             continue
-        args.append((f.name, _emit_value(val, indent + 1)))
+        out.append((f.name, _emit_value(val, indent + 1)))
+    return out
 
-    if not args:
-        return f"{cls_name}()"
 
-    single = f"{cls_name}({', '.join(f'{n}={v}' for n, v in args)})"
-    if len(single) <= 88 - len(pad) and "\n" not in single:
-        return single
+def _emit_model(obj: BaseModel, indent: int = 0) -> str:
+    return _format_constructor(type(obj).__name__, _pydantic_kw_args(obj, indent), indent)
 
-    body = "\n".join(f"{inner}{n}={v}," for n, v in args)
-    return f"{cls_name}(\n{body}\n{pad})"
+
+def _emit_dataclass(obj: Any, indent: int = 0) -> str:
+    return _format_constructor(type(obj).__name__, _dataclass_kw_args(obj, indent), indent)
 
 
 # ---------------------------------------------------------------------------
-# Import-block builder
+# Import block: walk values and import learning_commons_evaluators types used.
 # ---------------------------------------------------------------------------
 
 
 def _collect_lce_types(obj: Any, found: set[type]) -> None:
-    """Walk *obj* and collect types defined under learning_commons_evaluators for imports."""
     if isinstance(obj, Enum):
         found.add(type(obj))
     elif is_dataclass(obj) and not isinstance(obj, type):
@@ -246,7 +233,6 @@ def _build_import_block(
     config: EvaluatorSettingsResult,
     settings_cls: type[EvaluationSettings],
 ) -> str:
-    """Build ``from … import …`` lines for everything referenced in the generated module."""
     found: set[type] = set()
     _collect_lce_types(config.evaluator_metadata, found)
     _collect_lce_types(config.evaluation_settings, found)
@@ -272,60 +258,56 @@ def _build_import_block(
     return "\n".join(lines)
 
 
-# ---------------------------------------------------------------------------
-# File generator
-# ---------------------------------------------------------------------------
-
-
-def generate_module(
-    evaluator_name: str,
-    toml_path: Path,
-    settings_cls: type[EvaluationSettings],
-) -> str:
-    """Parse *toml_path* and return the content of the generated Python module."""
-    config = load_evaluator_settings(toml_path, settings_cls)
-    settings_cls_name = settings_cls.__name__
-
-    imports = _build_import_block(config, settings_cls)
-    metadata_code = _emit_model(config.evaluator_metadata)
-    prompts_code = _emit_value(config.prompts)
-    settings_code = _emit_model(config.evaluation_settings)
-
-    rel_toml = toml_path.relative_to(_REPO_ROOT)
-
-    # Emit flush-left Python only. Do not wrap this in textwrap.dedent() while
-    # interpolating multi-line fragments (imports, *_code): continuation lines
-    # from those values start at column 0, which would make dedent's common
-    # margin zero and leave the header indented — IndentationError at import.
-    return f"""# !! AUTO-GENERATED — do not edit directly.
-# Source: {rel_toml}
+_MODULE_TEMPLATE = Template(
+    """# !! AUTO-GENERATED — do not edit directly.
+# Source: $rel_toml
 # Regenerate : python scripts/generate_settings.py
 # Staleness check: python scripts/generate_settings.py --check
 
 from __future__ import annotations
 
-{imports}
+$imports
 
 # ── Evaluator metadata ────────────────────────────────────────────────────────
 
-_EVALUATOR_METADATA = {metadata_code}
+_EVALUATOR_METADATA = $metadata_code
 
 # ── Prompt templates ──────────────────────────────────────────────────────────
 
-_PROMPTS: dict[str, str] = {prompts_code}
+_PROMPTS: dict[str, str] = $prompts_code
 
 # ── Evaluation settings ───────────────────────────────────────────────────────
 
-_EVALUATION_SETTINGS = {settings_code}
+_EVALUATION_SETTINGS = $settings_code
 
 # ── Public config object (imported by evaluator modules) ──────────────────────
 
-CONFIG: EvaluatorSettingsResult[{settings_cls_name}] = EvaluatorSettingsResult(
+CONFIG: EvaluatorSettingsResult[$settings_cls_name] = EvaluatorSettingsResult(
     evaluator_metadata=_EVALUATOR_METADATA,
     evaluation_settings=_EVALUATION_SETTINGS,
     prompts=_PROMPTS,
 )
 """
+)
+
+
+def generate_module(
+    _evaluator_name: str,
+    toml_path: Path,
+    settings_cls: type[EvaluationSettings],
+) -> str:
+    """Parse *toml_path* and return the content of the generated Python module."""
+    config = load_evaluator_settings(toml_path, settings_cls)
+    settings_cls_name = settings_cls.__name__
+
+    return _MODULE_TEMPLATE.substitute(
+        rel_toml=str(toml_path.relative_to(_REPO_ROOT)),
+        imports=_build_import_block(config, settings_cls),
+        metadata_code=_emit_model(config.evaluator_metadata),
+        prompts_code=_emit_value(config.prompts),
+        settings_code=_emit_model(config.evaluation_settings),
+        settings_cls_name=settings_cls_name,
+    )
 
 
 # ---------------------------------------------------------------------------
@@ -365,11 +347,20 @@ def _resolve_settings_class(evaluator_name: str) -> type[EvaluationSettings]:
     return cls
 
 
-def _discover_evaluators() -> list[dict[str, Any]]:
-    """Return one entry per ``sdks/settings/<evaluator>/settings.toml`` found on disk."""
-    specs: list[dict[str, Any]] = []
+@dataclass(frozen=True)
+class _EvaluatorTarget:
+    """One evaluator with a canonical TOML and its generated module path."""
+
+    name: str
+    settings_cls: type[EvaluationSettings]
+    toml_path: Path
+    output_path: Path
+
+
+def _discover_evaluators() -> list[_EvaluatorTarget]:
     if not _SETTINGS_DIR.is_dir():
-        return specs
+        return []
+    out: list[_EvaluatorTarget] = []
     for child in sorted(_SETTINGS_DIR.iterdir()):
         if not child.is_dir():
             continue
@@ -379,19 +370,18 @@ def _discover_evaluators() -> list[dict[str, Any]]:
         name = child.name
         settings_cls = _resolve_settings_class(name)
         output = _GENERATED_DIR / f"_generated_{name}_settings.py"
-        specs.append(
-            {
-                "name": name,
-                "settings_cls": settings_cls,
-                "toml": toml_path,
-                "output": output,
-            }
+        out.append(
+            _EvaluatorTarget(
+                name=name,
+                settings_cls=settings_cls,
+                toml_path=toml_path,
+                output_path=output,
+            )
         )
-    return specs
+    return out
 
 
 def _contracts_toml(evaluator_name: str) -> Path:
-    """Return the canonical ``sdks/settings/<evaluator>/contracts.toml``."""
     return _SETTINGS_DIR / evaluator_name / "contracts.toml"
 
 
@@ -401,39 +391,38 @@ def _contracts_toml(evaluator_name: str) -> Path:
 
 
 def cmd_generate() -> None:
-    evaluators = _discover_evaluators()
-    if not evaluators:
+    targets = _discover_evaluators()
+    if not targets:
         print(f"No evaluators found under {_SETTINGS_DIR} (add */settings.toml).")
         return
-    for ev in evaluators:
-        content = generate_module(ev["name"], ev["toml"], ev["settings_cls"])
-        ev["output"].write_text(content, encoding="utf-8")
-        rel = ev["output"].relative_to(_REPO_ROOT)
-        print(f"  generated  {rel}")
+    for t in targets:
+        content = generate_module(t.name, t.toml_path, t.settings_cls)
+        t.output_path.write_text(content, encoding="utf-8")
+        print(f"  generated  {t.output_path.relative_to(_REPO_ROOT)}")
     print("Done.")
 
 
 def cmd_check() -> int:
-    evaluators = _discover_evaluators()
-    if not evaluators:
+    targets = _discover_evaluators()
+    if not targets:
         print(f"No evaluators found under {_SETTINGS_DIR} (nothing to check).")
         return 0
     stale: list[str] = []
-    for ev in evaluators:
-        expected = generate_module(ev["name"], ev["toml"], ev["settings_cls"])
-        actual = ev["output"].read_text(encoding="utf-8") if ev["output"].exists() else ""
+    for t in targets:
+        expected = generate_module(t.name, t.toml_path, t.settings_cls)
+        actual = t.output_path.read_text(encoding="utf-8") if t.output_path.exists() else ""
         if expected != actual:
             diff = "".join(
                 difflib.unified_diff(
                     actual.splitlines(keepends=True),
                     expected.splitlines(keepends=True),
-                    fromfile=str(ev["output"].relative_to(_REPO_ROOT)),
+                    fromfile=str(t.output_path.relative_to(_REPO_ROOT)),
                     tofile="(regenerated)",
                     n=3,
                 )
             )
-            print(f"STALE: {ev['output'].relative_to(_REPO_ROOT)}\n{diff}")
-            stale.append(ev["name"])
+            print(f"STALE: {t.output_path.relative_to(_REPO_ROOT)}\n{diff}")
+            stale.append(t.name)
 
     if stale:
         print(f"\nStale evaluators: {stale}")
@@ -445,22 +434,16 @@ def cmd_check() -> int:
 
 
 def cmd_sync() -> None:
-    """Copy ``contracts.toml`` from canonical (sdks/settings/) → bundled package.
-
-    The bundled package ships a copy of the contracts TOML so that contract
-    tests work correctly when installed via ``pip install`` (i.e. without
-    access to the monorepo ``sdks/settings/`` directory).
-    """
-    evaluators = _discover_evaluators()
-    if not evaluators:
+    targets = _discover_evaluators()
+    if not targets:
         print(f"No evaluators found under {_SETTINGS_DIR} (nothing to sync).")
         return
-    for ev in evaluators:
-        src = _contracts_toml(ev["name"])
+    for t in targets:
+        src = _contracts_toml(t.name)
         if not src.exists():
             print(f"  WARNING: canonical {src.relative_to(_REPO_ROOT)} not found — skipping")
             continue
-        dst_dir = _GENERATED_DIR / ev["name"]
+        dst_dir = _GENERATED_DIR / t.name
         dst_dir.mkdir(parents=True, exist_ok=True)
         dst = dst_dir / "contracts.toml"
         dst.write_bytes(src.read_bytes())
@@ -469,22 +452,17 @@ def cmd_sync() -> None:
 
 
 def cmd_check_sync() -> int:
-    """Verify bundled ``contracts.toml`` files match the canonical sdks/settings/ copies.
-
-    Exits with a non-zero status if any bundled file is missing or differs from
-    the canonical source.
-    """
-    evaluators = _discover_evaluators()
-    if not evaluators:
+    targets = _discover_evaluators()
+    if not targets:
         print(f"No evaluators found under {_SETTINGS_DIR} (nothing to verify).")
         return 0
     errors: list[str] = []
-    for ev in evaluators:
-        canonical = _contracts_toml(ev["name"])
+    for t in targets:
+        canonical = _contracts_toml(t.name)
         if not canonical.exists():
             errors.append(f"MISSING canonical: {canonical.relative_to(_REPO_ROOT)}")
             continue
-        bundled = _GENERATED_DIR / ev["name"] / "contracts.toml"
+        bundled = _GENERATED_DIR / t.name / "contracts.toml"
         if not bundled.exists():
             errors.append(
                 f"MISSING bundled: {bundled.relative_to(_REPO_ROOT)} "
@@ -492,7 +470,7 @@ def cmd_check_sync() -> int:
             )
         elif canonical.read_bytes() != bundled.read_bytes():
             errors.append(
-                f"OUT OF SYNC: {ev['name']}/contracts.toml "
+                f"OUT OF SYNC: {t.name}/contracts.toml "
                 f"(run: python scripts/generate_settings.py --sync)"
             )
 

From f8a592e15d59112efbc18571fec82a78ac7b8b3e Mon Sep 17 00:00:00 2001
From: Fred Sisenda <121206410+czi-fsisenda@users.noreply.github.com>
Date: Tue, 12 May 2026 16:18:41 -0700
Subject: [PATCH 20/26] chore: treat a missing canonical contracts.toml as a warning for now

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 scripts/generate_settings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/generate_settings.py b/scripts/generate_settings.py
index 2bcef05..4f64547 100644
--- a/scripts/generate_settings.py
+++ b/scripts/generate_settings.py
@@ -460,7 +460,7 @@ def cmd_check_sync() -> int:
     for t in targets:
         canonical = _contracts_toml(t.name)
         if not canonical.exists():
-            errors.append(f"MISSING canonical: {canonical.relative_to(_REPO_ROOT)}")
+            print(f"  WARNING: canonical {canonical.relative_to(_REPO_ROOT)} not found — skipping")
             continue
         bundled = _GENERATED_DIR / t.name / "contracts.toml"
         if not bundled.exists():

From 5a8a8d29f61b435e325e78cc7c15c6acb0be95c5 Mon Sep 17 00:00:00 2001
From: Fred Sisenda <121206410+czi-fsisenda@users.noreply.github.com>
Date: Tue, 12 May 2026 16:19:45 -0700
Subject: [PATCH 21/26] chore: remove unused SETTINGS_DST variable from Makefile.
 Might be re-introduced in a later PR.

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 sdks/python/Makefile | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/sdks/python/Makefile b/sdks/python/Makefile
index cb24f69..3660852 100644
--- a/sdks/python/Makefile
+++ b/sdks/python/Makefile
@@ -3,9 +3,6 @@ PYTEST  := $(PYTHON) -m pytest
 RUFF    := $(PYTHON) -m ruff
 MYPY    := $(PYTHON) -m mypy
 
-# Bundled settings directory inside the package.
-SETTINGS_DST := src/learning_commons_evaluators/settings
-
 .PHONY: help test unit-test generate-settings check-generated \
         lint format format-check typecheck pip-check verify coverage
 

From 1007197db1033c88daec169de4ade82e8c53f10b Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 17:55:01 -0700
Subject: [PATCH 22/26] chore: generalized test_package_imports

---
 sdks/python/tests/test_package_imports.py | 47 ++++++++---------------
 1 file changed, 15 insertions(+), 32 deletions(-)

diff --git a/sdks/python/tests/test_package_imports.py b/sdks/python/tests/test_package_imports.py
index 1eee1a9..cece087 100644
--- a/sdks/python/tests/test_package_imports.py
+++ b/sdks/python/tests/test_package_imports.py
@@ -1,38 +1,21 @@
 """Smoke tests for package public API imports."""
 
+import importlib
 
-def test_main_package_imports():
-    """All advertised public symbols can be imported from the root package."""
-    from learning_commons_evaluators import (
-        BaseEvaluator,
-        ConventionalityEvaluator,
-        TelemetryConfig,
-        __version__,
-        create_config_no_telemetry,
-    )
 
-    assert __version__ is not None
-    assert create_config_no_telemetry is not None
-    assert ConventionalityEvaluator is not None
-    assert BaseEvaluator is not None
-    assert TelemetryConfig is not None
+def test_root_package_public_api():
+    """Every name in ``__all__`` is defined on the root package (single source of truth)."""
+    pkg = importlib.import_module("learning_commons_evaluators")
+    missing = [name for name in pkg.__all__ if not hasattr(pkg, name)]
+    assert not missing, f"__all__ lists undefined names: {missing}"
+    for name in pkg.__all__:
+        assert getattr(pkg, name) is not None, name
 
 
-def test_errors_import():
-    from learning_commons_evaluators import (
-        APIError,
-        wrap_provider_error,
-    )
-
-    assert APIError is not None
-    assert wrap_provider_error is not None
-
-
-def test_providers_import():
-    from learning_commons_evaluators.providers import (
-        create_provider,
-        token_usage_from_aimessage,
-    )
-
-    assert create_provider is not None
-    assert token_usage_from_aimessage is not None
+def test_providers_public_api():
+    """Every name in ``providers.__all__`` is defined on the submodule."""
+    providers = importlib.import_module("learning_commons_evaluators.providers")
+    missing = [name for name in providers.__all__ if not hasattr(providers, name)]
+    assert not missing, f"providers.__all__ lists undefined names: {missing}"
+    for name in providers.__all__:
+        assert getattr(providers, name) is not None, name

From ee9133e070ef37928e204e9fa73e70d6a3c0025b Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 17:59:50 -0700
Subject: [PATCH 23/26] chore: generalized conftest

---
 sdks/python/tests/conftest.py | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py
index 128cffb..34951ec 100644
--- a/sdks/python/tests/conftest.py
+++ b/sdks/python/tests/conftest.py
@@ -3,10 +3,7 @@
 import pytest
 
 from learning_commons_evaluators import create_config_no_telemetry
-from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
-from learning_commons_evaluators.schemas.conventionality import (
-    ConventionalityEvaluationSettings,
-)
+from learning_commons_evaluators.schemas.config import EvaluationSettings
 from learning_commons_evaluators.schemas.metadata import (
     EvaluationMetadata,
     EvaluatorMaturity,
@@ -27,23 +24,16 @@ def evaluator_metadata():
 
 
 @pytest.fixture
-def prompt_settings_google():
-    """PromptSettings configured for Google, usable in multiple test modules."""
-    return PromptSettings(
-        provider_type=LlmProvider.GOOGLE,
-        model="gemini-2.0-flash",
-        temperature=0.0,
-    )
+def evaluation_metadata(evaluator_metadata):
+    """EvaluationMetadata with sensible defaults for unit tests.
 
-
-@pytest.fixture
-def evaluation_metadata(evaluator_metadata, prompt_settings_google):
-    """EvaluationMetadata with sensible defaults for unit tests."""
+    Uses base :class:`EvaluationSettings` so this stays evaluator-agnostic; tests for a
+    concrete evaluator should build that evaluator's settings (or a dedicated fixture)
+    when they need specific fields.
+    """
     return EvaluationMetadata(
         evaluator_metadata=evaluator_metadata,
-        evaluation_settings=ConventionalityEvaluationSettings(
-            prompt_settings_step_conventionality_evaluation=prompt_settings_google,
-        ),
+        evaluation_settings=EvaluationSettings(),
         input_metadata={},
     )
 

From 15f22f329bded80302ba90da1ab2bf94454497a2 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 18:11:11 -0700
Subject: [PATCH 24/26] chore: generalized test_evaluator_schemas

---
 .../tests/schemas/test_evaluator_schemas.py   | 113 ++++++++----------
 1 file changed, 51 insertions(+), 62 deletions(-)

diff --git a/sdks/python/tests/schemas/test_evaluator_schemas.py b/sdks/python/tests/schemas/test_evaluator_schemas.py
index 855ee94..5ba6a4a 100644
--- a/sdks/python/tests/schemas/test_evaluator_schemas.py
+++ b/sdks/python/tests/schemas/test_evaluator_schemas.py
@@ -1,12 +1,15 @@
-"""Tests for EvaluationInput, EvaluationAnswer, EvaluationExplanation, and EvaluationResult."""
+"""Tests for EvaluationInput, EvaluationAnswer, EvaluationExplanation, and EvaluationResult.
+
+Uses a minimal :class:`_ExampleEvaluationInput` (text + grade) wired to explicit
+:class:`~learning_commons_evaluators.schemas.input_specs.TextInputSpec` /
+:class:`~learning_commons_evaluators.schemas.input_specs.GradeInputSpec`
+instances so nothing depends on a real evaluator's TOML or class names.
+"""
 
 from typing import ClassVar
 
 import pytest
 
-from learning_commons_evaluators.evaluators.conventionality import (
-    ConventionalityEvaluationInput,
-)
 from learning_commons_evaluators.schemas.common_inputs import (
     GradeInputField,
     TextInputField,
@@ -20,6 +23,7 @@
     EvaluationExplanation,
     EvaluationInput,
     EvaluationResult,
+    InputField,
 )
 from learning_commons_evaluators.schemas.input_specs import (
     GradeInputSpec,
@@ -29,22 +33,39 @@
     Status,
 )
 
-# Long sample text (well above ``min_text_length`` from conventionality settings TOML).
-_SAMPLE_TEXT = (
+# Long sample text (well above ``min_text_length`` on :attr:`_EXAMPLE_TEXT_SPEC`).
+_LONG_TEXT = (
     "Marco Polo was a Venetian merchant and explorer who traveled through Asia "
     "in the late 13th century. He spent nearly two decades at the court of "
     "Kublai Khan, the Mongol ruler of China, and described his experiences in "
     "a book that introduced Europeans to the Far East."
 )
 
+_EXAMPLE_TEXT_SPEC = TextInputSpec(name="text", min_text_length=10)
+_EXAMPLE_GRADE_SPEC = GradeInputSpec(name="grade")
+# Unconstrained text spec for tests that only need an :class:`InputField` instance.
+_BARE_TEXT_SPEC = TextInputSpec(name="text")
+
+
+class _ExampleEvaluationInput(EvaluationInput):
+    """Minimal concrete :class:`EvaluationInput` for schema unit tests."""
+
+    _input_settings: ClassVar[dict] = {
+        "text": _EXAMPLE_TEXT_SPEC,
+        "grade": _EXAMPLE_GRADE_SPEC,
+    }
+    text: TextInputField
+    grade: GradeInputField
+
+    def __init__(self, *, text: str, grade: int, **kwargs):
+        super().__init__(text=text, grade=grade, **kwargs)
+
 
 # ---------------------------------------------------------------------------
-# A minimal EvaluationInput subclass that mixes a proper InputField with a
-# plain float — used to exercise the isinstance(..., InputField) False branches.
+# Mixes a proper InputField with a plain float — exercises the
+# isinstance(..., InputField) false branches on :class:`EvaluationInput`.
 # ---------------------------------------------------------------------------
 
-_BARE_TEXT_SPEC = TextInputSpec(name="text")  # no constraints — for testing base-class behaviour
-
 
 class _MixedInput(EvaluationInput):
     """EvaluationInput with one proper InputField and one plain Python value."""
@@ -57,39 +78,36 @@ class TestEvaluationInput:
     # --- happy-path construction, validation, and metadata ---
 
     def test_validate_and_input_metadata(self):
-        inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+        inp = _ExampleEvaluationInput(text=_LONG_TEXT, grade=5)
         inp.validate()
         meta = inp.input_metadata()
-        assert meta["text"] == {"textLength": len(_SAMPLE_TEXT)}
+        assert meta["text"] == {"textLength": len(_LONG_TEXT)}
         assert meta["grade"] == {"grade": 5}
 
     def test_input_values_returns_primitive_values(self):
         """input_values() should unwrap .value from each InputField."""
-        inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=7)
+        inp = _ExampleEvaluationInput(text=_LONG_TEXT, grade=7)
         values = inp.input_values()
-        assert values["text"] == _SAMPLE_TEXT
+        assert values["text"] == _LONG_TEXT
         assert values["grade"] == 7
 
     # --- validation error paths ---
 
     def test_validate_raises_on_invalid_grade(self):
-        inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=99)
+        inp = _ExampleEvaluationInput(text=_LONG_TEXT, grade=99)
         with pytest.raises(ValidationError):
             inp.validate()
 
     def test_validate_raises_on_invalid_text_length(self):
-        # "x" is shorter than ``min_text_length`` from the conventionality settings TOML.
-        inp = ConventionalityEvaluationInput(text="x", grade=5)
+        inp = _ExampleEvaluationInput(text="x", grade=5)
         with pytest.raises(ValidationError):
             inp.validate()
 
     def test_validate_collects_all_errors_before_raising(self):
         """All field errors are collected; a single ValidationError is raised at the end."""
-        # "x" is below the configured minimum text length; grade 99 is outside the 0-12 range.
-        inp = ConventionalityEvaluationInput(text="x", grade=99)
+        inp = _ExampleEvaluationInput(text="x", grade=99)
         with pytest.raises(ValidationError) as exc_info:
             inp.validate()
-        # Both errors should appear in the combined message.
         msg = str(exc_info.value)
         assert "below minimum" in msg
         assert "0-12" in msg
@@ -119,63 +137,34 @@ def test_input_values_returns_field_itself_for_non_inputfield(self):
 
     def test_text_input_field_is_inputfield_subclass(self):
         """TextInputField must be an InputField subclass (checked via isinstance)."""
-        from learning_commons_evaluators.schemas.evaluator import InputField
-
         field = TextInputField(spec=_BARE_TEXT_SPEC, value="hello")
         assert isinstance(field, InputField)
 
     def test_grade_input_field_is_inputfield_subclass(self):
         """GradeInputField must be an InputField subclass (checked via isinstance)."""
-        from learning_commons_evaluators.schemas.common_inputs import GradeInputField
-        from learning_commons_evaluators.schemas.evaluator import InputField
-        from learning_commons_evaluators.schemas.input_specs import GradeInputSpec
-
         spec = GradeInputSpec(name="grade")
         field = GradeInputField(spec=spec, value=5)
         assert isinstance(field, InputField)
 
     def test_plain_value_is_not_an_inputfield(self):
         """Plain Python values must not be treated as InputField instances."""
-        from learning_commons_evaluators.schemas.evaluator import InputField
-
         assert not isinstance(7.5, InputField)
         assert not isinstance("raw string", InputField)
 
     def test_inputfield_cannot_be_instantiated_directly(self):
         """InputField is abstract and must not be instantiable without implementing validate() and input_metadata()."""
-        from learning_commons_evaluators.schemas.evaluator import InputField
-        from learning_commons_evaluators.schemas.input_specs import TextInputSpec
-
         with pytest.raises(TypeError, match="abstract"):
             InputField(spec=TextInputSpec(name="text"), value="hello")  # type: ignore[abstract]
 
 
 class TestCoerceRawToInputFields:
-    """Direct unit tests for EvaluationInput._coerce_raw_to_input_fields.
-
-    These tests exercise the model_validator on a minimal concrete subclass
-    rather than through a full evaluator, so failures point directly at the
-    base-class behaviour rather than evaluator-specific configuration.
-    """
-
-    # Minimal concrete EvaluationInput subclass wired to known specs.
-    _TEXT_SPEC = TextInputSpec(name="text")
-    _GRADE_SPEC = GradeInputSpec(name="grade")
-
-    class _SimpleInput(EvaluationInput):
-        _input_settings: ClassVar[dict] = {
-            "text": TextInputSpec(name="text"),
-            "grade": GradeInputSpec(name="grade"),
-        }
-        text: TextInputField
-        grade: GradeInputField
+    """Direct unit tests for :meth:`EvaluationInput._coerce_raw_to_input_fields`."""
 
-        def __init__(self, *, text: str, grade: int, **kwargs):
-            super().__init__(text=text, grade=grade, **kwargs)
+    _COERCE_TEXT_SPEC = TextInputSpec(name="text")
 
     def test_raw_values_are_wrapped_into_input_fields(self):
         """Raw str/int values should be wrapped into the declared InputField types."""
-        inp = self._SimpleInput(text="hello world", grade=5)
+        inp = _ExampleEvaluationInput(text="hello world", grade=5)
         assert isinstance(inp.text, TextInputField)
         assert inp.text.value == "hello world"
         assert isinstance(inp.grade, GradeInputField)
@@ -183,8 +172,8 @@ def test_raw_values_are_wrapped_into_input_fields(self):
 
     def test_already_constructed_input_field_is_not_rewrapped(self):
         """Passing a fully-constructed InputField instance bypasses construction."""
-        pre_built = TextInputField(spec=self._TEXT_SPEC, value="pre-built")
-        inp = self._SimpleInput(text=pre_built, grade=3)  # type: ignore[arg-type]
+        pre_built = TextInputField(spec=self._COERCE_TEXT_SPEC, value="pre-built")
+        inp = _ExampleEvaluationInput(text=pre_built, grade=3)  # type: ignore[arg-type]
         assert inp.text is pre_built  # same object, not a copy
 
     def test_non_inputfield_field_is_left_unchanged(self):
@@ -203,27 +192,27 @@ def __init__(self, *, text: str, weight: float, **kwargs):
 
     def test_missing_spec_raises_configuration_error(self, monkeypatch):
         """ConfigurationError is raised when a required spec is absent from _input_settings."""
-        monkeypatch.setattr(self._SimpleInput, "_input_settings", {})
+        monkeypatch.setattr(_ExampleEvaluationInput, "_input_settings", {})
         with pytest.raises(ConfigurationError, match="'text'"):
-            self._SimpleInput(text="hello", grade=5)
+            _ExampleEvaluationInput(text="hello", grade=5)
 
     def test_wrong_spec_type_raises_configuration_error(self, monkeypatch):
         """ConfigurationError is raised when the spec type doesn't match the field's expectation."""
         monkeypatch.setattr(
-            self._SimpleInput,
+            _ExampleEvaluationInput,
             "_input_settings",
-            {"text": GradeInputSpec(name="text"), "grade": self._GRADE_SPEC},
+            {"text": GradeInputSpec(name="text"), "grade": _EXAMPLE_GRADE_SPEC},
         )
         with pytest.raises(ConfigurationError, match="TextInputSpec"):
-            self._SimpleInput(text="hello", grade=5)
+            _ExampleEvaluationInput(text="hello", grade=5)
 
     def test_error_message_includes_class_and_field_name(self, monkeypatch):
         """ConfigurationError messages name both the class and the missing field."""
-        monkeypatch.setattr(self._SimpleInput, "_input_settings", {})
+        monkeypatch.setattr(_ExampleEvaluationInput, "_input_settings", {})
         with pytest.raises(ConfigurationError) as exc_info:
-            self._SimpleInput(text="hello", grade=5)
+            _ExampleEvaluationInput(text="hello", grade=5)
         msg = str(exc_info.value)
-        assert "_SimpleInput" in msg
+        assert "_ExampleEvaluationInput" in msg
         assert "'text'" in msg
 
 

From 3ae2f5ba89267f379096e64ae41bee4aad219005 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Tue, 12 May 2026 18:36:56 -0700
Subject: [PATCH 25/26] chore: move generate_settings script into sdks/python
 directory

---
 .github/workflows/test-sdk-python.yml          |  2 --
 sdks/python/Makefile                           | 10 +++++-----
 sdks/python/pyproject.toml                     |  2 +-
 .../python/scripts}/generate_settings.py       | 18 ++++++++++++------
 4 files changed, 18 insertions(+), 14 deletions(-)
 rename {scripts => sdks/python/scripts}/generate_settings.py (97%)

diff --git a/.github/workflows/test-sdk-python.yml b/.github/workflows/test-sdk-python.yml
index af35a78..be4dfc9 100644
--- a/.github/workflows/test-sdk-python.yml
+++ b/.github/workflows/test-sdk-python.yml
@@ -6,13 +6,11 @@ on:
       - main
     paths:
       - "sdks/python/**"
-      - "scripts/generate_settings.py"
       - "sdks/settings/**"
       - ".github/workflows/test-sdk-python.yml"
   pull_request:
     paths:
       - "sdks/python/**"
-      - "scripts/generate_settings.py"
       - "sdks/settings/**"
       - ".github/workflows/test-sdk-python.yml"
 
diff --git a/sdks/python/Makefile b/sdks/python/Makefile
index 3660852..28982ce 100644
--- a/sdks/python/Makefile
+++ b/sdks/python/Makefile
@@ -30,13 +30,13 @@ help:
 # ── Static checks (Ruff + Mypy) ───────────────────────────────────────────────
 
 lint:
-	$(RUFF) check src tests ../../scripts/generate_settings.py
+	$(RUFF) check src tests scripts/generate_settings.py
 
 format:
-	$(RUFF) format src tests ../../scripts/generate_settings.py
+	$(RUFF) format src tests scripts/generate_settings.py
 
 format-check:
-	$(RUFF) format --check src tests ../../scripts/generate_settings.py
+	$(RUFF) format --check src tests scripts/generate_settings.py
 
 typecheck:
 	$(MYPY) src/learning_commons_evaluators tests
@@ -61,7 +61,7 @@ unit-test:
 # Re-run when an evaluator settings TOML under sdks/settings/ changes.
 
 generate-settings:
-	$(PYTHON) ../../scripts/generate_settings.py
+	$(PYTHON) scripts/generate_settings.py
 
 check-generated:
-	$(PYTHON) ../../scripts/generate_settings.py --check
+	$(PYTHON) scripts/generate_settings.py --check
diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml
index c739073..0e1d03e 100644
--- a/sdks/python/pyproject.toml
+++ b/sdks/python/pyproject.toml
@@ -97,7 +97,7 @@ select = [
 ignore = ["E501"]  # line length — formatter wraps where it matters
 
 [tool.ruff.lint.per-file-ignores]
-"../../scripts/generate_settings.py" = ["E402"]
+"scripts/generate_settings.py" = ["E402"]
 
 # ── Mypy ──────────────────────────────────────────────────────────────────────
 
diff --git a/scripts/generate_settings.py b/sdks/python/scripts/generate_settings.py
similarity index 97%
rename from scripts/generate_settings.py
rename to sdks/python/scripts/generate_settings.py
index 4f64547..7d290c0 100644
--- a/scripts/generate_settings.py
+++ b/sdks/python/scripts/generate_settings.py
@@ -18,11 +18,16 @@
 
     sdks/settings/<evaluator>/contracts.toml  — captured LLM interactions for contract tests
 
-Usage::
+Usage (from ``sdks/python/``, e.g. after ``cd sdks/python`` or ``make generate-settings``)::
 
-    # Regenerate all evaluators:
     python scripts/generate_settings.py
 
+From the repository root::
+
+    python sdks/python/scripts/generate_settings.py
+
+Other flags::
+
     # Check whether generated files are stale (exits 1 if any differ):
     python scripts/generate_settings.py --check
 
@@ -32,7 +37,7 @@
     # Verify bundled contracts.toml matches canonical sdks/settings/:
     python scripts/generate_settings.py --check-sync
 
-Typical CI configuration::
+Typical CI configuration (``working-directory: sdks/python``)::
 
     - name: Check settings are up to date
       run: python scripts/generate_settings.py --check
@@ -61,12 +66,13 @@
 
 # ---------------------------------------------------------------------------
 # Path setup — resolve repo root and add SDK src to sys.path so we can import
-# the SDK without a full install.
+# the SDK without a full install.  This file lives under sdks/python/scripts/.
 # ---------------------------------------------------------------------------
 
 _SCRIPT_DIR = Path(__file__).resolve().parent
-_REPO_ROOT = _SCRIPT_DIR.parent
-_SDK_SRC = _REPO_ROOT / "sdks" / "python" / "src"
+_PYTHON_SDK_ROOT = _SCRIPT_DIR.parent
+_REPO_ROOT = _PYTHON_SDK_ROOT.parent.parent
+_SDK_SRC = _PYTHON_SDK_ROOT / "src"
 _SETTINGS_DIR = _REPO_ROOT / "sdks" / "settings"
 _GENERATED_DIR = _SDK_SRC / "learning_commons_evaluators" / "settings"
 

From f34050e229c2ef39dfb2095c53c2b0317e98b57a Mon Sep 17 00:00:00 2001
From: Fred Sisenda <121206410+czi-fsisenda@users.noreply.github.com>
Date: Thu, 14 May 2026 02:19:49 -0700
Subject: [PATCH 26/26] feat(python-sdk): contract test scaffold and
 conventionality contract test (#39)

* feat: contract test scaffold and conventionality contract test

* chore: fix build issues

* ci: fixing build

* chore: moved capture script to scripts folder within python sdk

* Align conventionality_evaluator notebook with main

Co-authored-by: Cursor <cursoragent@cursor.com>

* chore: addressing PR comments

* feat(python-sdk): vocabulary evaluator (#36)

* feat: vocabulary evaluator

* chore: update vocabulary settings to use  instead of  for prompt settings

* chore: fix capture and contract tests

* chore: vocabulary settings are required

* feat: eval instance settings overrides

* chore: addressing PR comments

* chore: restore vocabulary notebook

* feat: base eval support for json normalizers

* chore: cleaner implementation of vocab

* chore: same step name as typescript sdk + edge case unit test

---------

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .github/workflows/test-sdk-python.yml         |   2 +-
 sdks/python/Makefile                          |  59 +-
 sdks/python/README.md                         |  43 +-
 sdks/python/scripts/capture.py                | 329 +++++++++
 .../learning_commons_evaluators/__init__.py   |  12 +
 .../evaluators/__init__.py                    |   8 +
 .../evaluators/base.py                        |  39 +-
 .../evaluators/vocabulary.py                  | 195 ++++++
 .../schemas/vocabulary.py                     |  68 ++
 .../_generated_vocabulary_settings.py         | 404 +++++++++++
 .../settings/conventionality/__init__.py      |   0
 .../settings/conventionality/contracts.toml   | 130 ++++
 .../settings/vocabulary/__init__.py           |   0
 .../settings/vocabulary/contracts.toml        | 641 ++++++++++++++++++
 sdks/python/tests/contract_tests/__init__.py  |  14 +
 .../tests/contract_tests/conventionality.py   |  87 +++
 sdks/python/tests/contract_tests/harness.py   | 229 +++++++
 sdks/python/tests/contract_tests/loader.py    | 169 +++++
 .../contract_tests/test_conventionality.py    |  84 +++
 .../tests/contract_tests/test_vocabulary.py   | 136 ++++
 .../python/tests/contract_tests/vocabulary.py | 146 ++++
 sdks/python/tests/evaluators/test_base.py     |  78 +++
 .../tests/evaluators/test_vocabulary.py       | 353 ++++++++++
 .../tests/schemas/test_evaluator_schemas.py   |  12 +-
 .../tests/settings/test_load_settings.py      | 127 +++-
 sdks/settings/conventionality/contracts.toml  | 130 ++++
 sdks/settings/vocabulary/contracts.toml       | 641 ++++++++++++++++++
 sdks/settings/vocabulary/settings.toml        | 410 +++++++++++
 28 files changed, 4515 insertions(+), 31 deletions(-)
 create mode 100644 sdks/python/scripts/capture.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/conventionality/__init__.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py
 create mode 100644 sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
 create mode 100644 sdks/python/tests/contract_tests/__init__.py
 create mode 100644 sdks/python/tests/contract_tests/conventionality.py
 create mode 100644 sdks/python/tests/contract_tests/harness.py
 create mode 100644 sdks/python/tests/contract_tests/loader.py
 create mode 100644 sdks/python/tests/contract_tests/test_conventionality.py
 create mode 100644 sdks/python/tests/contract_tests/test_vocabulary.py
 create mode 100644 sdks/python/tests/contract_tests/vocabulary.py
 create mode 100644 sdks/python/tests/evaluators/test_vocabulary.py
 create mode 100644 sdks/settings/conventionality/contracts.toml
 create mode 100644 sdks/settings/vocabulary/contracts.toml
 create mode 100644 sdks/settings/vocabulary/settings.toml

diff --git a/.github/workflows/test-sdk-python.yml b/.github/workflows/test-sdk-python.yml
index be4dfc9..1d44f1a 100644
--- a/.github/workflows/test-sdk-python.yml
+++ b/.github/workflows/test-sdk-python.yml
@@ -30,7 +30,7 @@ jobs:
         working-directory: sdks/python
     strategy:
       matrix:
-        python-version: ["3.10", "3.12", "3.13"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
     steps:
       - name: Checkout
         uses: actions/checkout@v6
diff --git a/sdks/python/Makefile b/sdks/python/Makefile
index 28982ce..72d8b4b 100644
--- a/sdks/python/Makefile
+++ b/sdks/python/Makefile
@@ -3,40 +3,66 @@ PYTEST  := $(PYTHON) -m pytest
 RUFF    := $(PYTHON) -m ruff
 MYPY    := $(PYTHON) -m mypy
 
-.PHONY: help test unit-test generate-settings check-generated \
+# Bundled settings directory inside the package.  Two things are maintained here:
+#   _generated_*_settings.py  — generated from settings.toml by `make generate-settings`
+#   contracts.toml             — copied from sdks/settings/ by `make sync-settings`
+#
+# Tests always read contracts.toml from the bundled copy (via importlib.resources
+# fallback in shared_settings_root()).  EVALUATORS_SETTINGS_DIR is intentionally NOT
+# exported here so that dev and installed-package runs behave identically.
+SETTINGS_DST := src/learning_commons_evaluators/settings
+
+.PHONY: help build check-build test unit-test contract-test \
+        generate-settings check-generated sync-settings check-sync \
         lint format format-check typecheck pip-check verify coverage
 
 help:
 	@echo "Usage: make <target>"
 	@echo ""
-	@echo "  lint               Ruff linter (src, tests, settings generator script)"
+	@echo "  build              Prepare the package: generate settings + sync contracts TOML"
+	@echo "  check-build        Verify build artifacts are up to date (use in CI)"
+	@echo ""
+	@echo "  lint               Ruff linter (src/, tests/, scripts/)"
 	@echo "  format             Apply Ruff formatter"
 	@echo "  format-check       Fail if Ruff would reformat any file"
 	@echo "  typecheck          Mypy on src package + tests"
 	@echo "  pip-check          pip dependency consistency check"
-	@echo "  verify             check-generated + lint + format-check + typecheck + pip-check + test (unit + contract)"
+	@echo "  verify             check-build + lint + format-check + typecheck + pip-check + test (unit + contract)"
 	@echo "  coverage           Unit tests with coverage report (requires dev install)"
 	@echo ""
-	@echo "  test               Run the full test suite"
-	@echo "  unit-test          Run unit tests only"
+	@echo "  test               Run the full test suite (unit + contract)"
+	@echo "  unit-test          Run unit tests only (skip contract tests)"
+	@echo "  contract-test      Run contract tests only"
 	@echo ""
 	@echo "  generate-settings  Re-generate _generated_*_settings.py from sdks/settings/"
 	@echo "  check-generated    Verify generated .py files match sdks/settings/ (exits 1 if stale)"
+	@echo "  sync-settings      Copy contracts.toml from sdks/settings/ into the package"
+	@echo "  check-sync         Verify bundled contracts.toml matches sdks/settings/"
 	@echo ""
 	@echo "Typical workflows:"
 	@echo "  After editing evaluator settings TOML:  make generate-settings && make test"
+	@echo "  After editing contracts TOML:           make sync-settings && make contract-test"
+	@echo "  Fresh checkout / before release:        make build && make verify"
 	@echo "  CI:                                     make verify"
 
+# ── Build ─────────────────────────────────────────────────────────────────────
+# Prepares all package build artifacts in one step.  Run after editing any
+# file under sdks/settings/, or on a fresh checkout before running tests.
+
+build: generate-settings sync-settings
+
+check-build: check-generated check-sync
+
 # ── Static checks (Ruff + Mypy) ───────────────────────────────────────────────
 
 lint:
-	$(RUFF) check src tests scripts/generate_settings.py
+	$(RUFF) check src tests scripts
 
 format:
-	$(RUFF) format src tests scripts/generate_settings.py
+	$(RUFF) format src tests scripts
 
 format-check:
-	$(RUFF) format --check src tests scripts/generate_settings.py
+	$(RUFF) format --check src tests scripts
 
 typecheck:
 	$(MYPY) src/learning_commons_evaluators tests
@@ -44,7 +70,7 @@ typecheck:
 pip-check:
 	$(PYTHON) -m pip check
 
-verify: check-generated lint format-check typecheck pip-check test
+verify: check-build lint format-check typecheck pip-check test
 
 coverage:
 	$(PYTEST) tests/ -q --ignore=tests/contract_tests --cov=learning_commons_evaluators --cov-report=term-missing
@@ -57,11 +83,26 @@ test:
 unit-test:
 	$(PYTEST) tests/ -v --ignore=tests/contract_tests
 
+contract-test:
+	$(PYTEST) tests/contract_tests/ -v
+
 # ── Code generation ───────────────────────────────────────────────────────────
 # Re-run when an evaluator settings TOML under sdks/settings/ changes.
+# The generator sets EVALUATORS_SETTINGS_DIR itself to point at sdks/settings/.
 
 generate-settings:
 	$(PYTHON) scripts/generate_settings.py
 
 check-generated:
 	$(PYTHON) scripts/generate_settings.py --check
+
+# ── Contract test bundling ────────────────────────────────────────────────────
+# contracts.toml is bundled so contract tests run consistently whether using
+# a development checkout or a pip-installed package.  The sync command is
+# handled by the Python generator script.
+
+sync-settings:
+	$(PYTHON) scripts/generate_settings.py --sync
+
+check-sync:
+	$(PYTHON) scripts/generate_settings.py --check-sync
diff --git a/sdks/python/README.md b/sdks/python/README.md
index 15dc866..9a1e80c 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -230,6 +230,42 @@ config = create_config(
 )
 ```
 
+### Per-instance default evaluation settings
+
+Every `BaseEvaluator` subclass defines **class-level** `default_evaluation_settings`
+(the bundled evaluators load these from generated settings). You can override that
+default for a single evaluator instance by passing the same keyword to the
+constructor:
+
+```python
+from learning_commons_evaluators import ConventionalityEvaluator, create_config
+
+config = create_config(...)
+# Start from the bundled defaults, then change what your deployment needs (models,
+# temperatures, etc. live on nested PromptSettings).
+settings = ConventionalityEvaluator.default_evaluation_settings.model_copy(deep=True)
+settings.prompt_settings_step_conventionality_evaluation = (
+    settings.prompt_settings_step_conventionality_evaluation.model_copy(
+        update={"temperature": 0.2}
+    )
+)
+evaluator = ConventionalityEvaluator(
+    config,
+    default_evaluation_settings=settings,
+)
+
+# Uses the instance default (a deep copy is taken inside evaluate)
+result = evaluator.evaluate(input)
+
+# Per-call override still wins
+result = evaluator.evaluate(input, evaluation_settings=other_settings)
+```
+
+If you omit `default_evaluation_settings` at construction, attribute lookup uses the
+subclass class attribute, same as before. Whenever you call `evaluate()` without
+`evaluation_settings`, the SDK uses `model_copy(deep=True)` of the resolved default,
+so the object you keep on the instance is not mutated by a run.
+
 ### Logging
 
 The SDK uses Python's standard `logging` module. By default, `EvaluatorConfig` uses the
@@ -294,7 +330,10 @@ On evaluation failure, `metadata.status` and `error_details` are set on the in-m
 
 ## Creating custom evaluators
 
-Extend `BaseEvaluator` to create custom evaluators:
+Extend `BaseEvaluator` to create custom evaluators. Set **class-level**
+`default_evaluation_settings` for the usual defaults; callers may still construct
+`MyEvaluator(config, default_evaluation_settings=...)` to pin different defaults on a
+specific instance (see [Per-instance default evaluation settings](#per-instance-default-evaluation-settings)).
 
 ```python
 from learning_commons_evaluators import BaseEvaluator, EvaluatorConfig
@@ -329,6 +368,8 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]):
         return EvaluationResult(answer=..., explanation=..., metadata=evaluation_metadata)
 ```
 
+If you override `__init__` on the subclass, accept the same keyword-only argument and forward it: `super().__init__(config, default_evaluation_settings=default_evaluation_settings)`.
+
 ## License
 
 MIT
diff --git a/sdks/python/scripts/capture.py b/sdks/python/scripts/capture.py
new file mode 100644
index 0000000..b2c158c
--- /dev/null
+++ b/sdks/python/scripts/capture.py
@@ -0,0 +1,329 @@
+"""Contract test capture utilities for evaluator notebooks.
+
+Three-step workflow for notebook authors
+-----------------------------------------
+1. Wrap every model in your chain with ``capture_llm()``:
+
+       chain = prompt | capture_llm("step_name", my_model) | JsonOutputParser()
+
+   The prefix (``"step_name"``) becomes the step key in ``prompt_steps`` in
+   the TOML output.  Use a short, stable name per step (e.g. ``"main"``,
+   ``"bk"``, ``"vocab"``).
+
+2. Immediately after each test-case evaluation, call ``capture_case()`` to save a
+   point-in-time copy of what was captured.  Pass the evaluator's input dict
+   and output dict directly — no manual field extraction needed::
+
+       case_input  = {"text": my_text, "grade": 4}
+       case_output = run_evaluator(**case_input)
+
+       _cap = capture_case(
+           name="my_case",
+           input=case_input,
+           llm_call_captures=["step_name"],  # prefixes, in call order
+           expected_result=case_output,
+           description="…",                 # optional human-readable label
+       )
+
+   String values in ``input`` (e.g. ``text``) are ``.strip()``'d before writing
+   TOML so contract ``[input]`` matches common evaluator behavior. Run your
+   chains with the same stripped strings so captured ``user_prompt`` matches.
+
+3. Print the TOML block and paste it into ``contracts.toml`` (for example,
+   ``sdks/settings/<evaluator>/contracts.toml``):
+
+       print(build_contract_toml(_cap_one, _cap_two))
+
+Resetting between runs
+-----------------------
+Call ``reset_captures()`` at the start of each evaluation to avoid stale data from a
+previous run leaking into the next capture_case::
+
+    reset_captures()
+    output = run_evaluator(text, grade)
+    _cap = capture_case(
+        name="my_case",
+        input={"text": text, "grade": grade},
+        llm_call_captures=["main"],
+        expected_result=output,
+    )
+
+Async chains
+-------------
+``capture_llm()`` works in both sync (``invoke``) and async (``ainvoke``)
+chains with no extra configuration.
+"""
+
+from __future__ import annotations
+
+import json as _json
+from typing import Any
+
+from langchain_core.runnables import RunnableLambda
+
+# ---------------------------------------------------------------------------
+# Internal state
+# ---------------------------------------------------------------------------
+
+# Flat dict populated by capture_llm() on every chain invocation.
+# Keys follow the pattern "{prefix}_{field}" (e.g. "bk_user_prompt").
+_captures: dict[str, Any] = {}
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def reset_captures() -> None:
+    """Clear the module-level ``_captures`` dict in place. Call at the start of each evaluation run."""
+    _captures.clear()
+
+
+def capture_llm(prefix: str, model: Any) -> RunnableLambda:
+    """Return a wrapped model that records the formatted prompt and raw response.
+
+    Drop it into a LangChain chain wherever you have a bare model::
+
+        # single-step evaluator
+        chain = prompt | capture_llm("main", llm) | JsonOutputParser()
+
+        # multi-step evaluator
+        bk_chain    = bk_prompt    | capture_llm("bk",    bk_model)
+        vocab_chain = vocab_prompt | capture_llm("vocab", vocab_model) | JsonOutputParser()
+
+    After the chain runs, ``_captures`` contains:
+
+    * ``"{prefix}_system_prompt"`` — system message content (``""`` if none)
+    * ``"{prefix}_user_prompt"``   — human message content
+    * ``"{prefix}_raw_response"``  — text content of the LLM response.
+                                     Most providers return a plain string.
+                                     Some (e.g. Google Gemini via
+                                     langchain_google_genai) return a list of
+                                     content blocks; the first ``"text"`` block
+                                     is extracted so the stored value is always
+                                     a plain string suitable for JsonOutputParser.
+    * ``"{prefix}_model"``         — model identifier (``""`` if the model exposes none)
+    * ``"{prefix}_temperature"``   — temperature used (``0.0`` if missing or ``None``)
+
+    Supports both ``invoke`` (sync) and ``ainvoke`` (async) chains.
+    """
+    model_name = getattr(model, "model", None) or getattr(model, "model_name", None) or ""
+    temperature = float(getattr(model, "temperature", None) or 0)  # unset/None -> 0.0
+
+    def _record(prompt_value: Any, ai_message: Any) -> None:
+        if isinstance(prompt_value, str):
+            # Plain string passed directly to the model — treat it as a lone
+            # human message with no system prompt.  This happens when the
+            # caller formats a prompt template themselves and passes the result
+            # as a string rather than going through a ChatPromptTemplate chain.
+            system = ""
+            human = prompt_value
+        else:
+            messages = (
+                prompt_value.to_messages()
+                if hasattr(prompt_value, "to_messages")
+                else list(prompt_value)
+            )
+            system = next(
+                (str(m.content) for m in messages if getattr(m, "type", None) == "system"), ""
+            )
+            human = next(
+                (str(m.content) for m in messages if getattr(m, "type", None) == "human"), ""
+            )
+        _captures[f"{prefix}_system_prompt"] = system
+        _captures[f"{prefix}_user_prompt"] = human
+        _captures[f"{prefix}_model"] = model_name
+        _captures[f"{prefix}_temperature"] = temperature
+        _captures[f"{prefix}_raw_response"] = _extract_text_content(ai_message.content)
+
+    def _invoke(prompt_value: Any) -> Any:
+        ai_message = model.invoke(prompt_value)
+        _record(prompt_value, ai_message)
+        return ai_message
+
+    async def _ainvoke(prompt_value: Any) -> Any:
+        ai_message = await model.ainvoke(prompt_value)
+        _record(prompt_value, ai_message)
+        return ai_message
+
+    return RunnableLambda(_invoke, afunc=_ainvoke)
+
+
+def capture_case(
+    *,
+    name: str,
+    input: dict[str, Any],
+    llm_call_captures: list[str],
+    expected_result: Any = None,
+    description: str | None = None,
+) -> dict[str, Any]:
+    """Return a point-in-time copy of ``_captures`` merged with structured context.
+
+    Args:
+        name:              Case identifier used as the TOML key (e.g. ``"marco_polo_grade3"``).
+        input:             The evaluator's input dict (e.g. ``{"text": ..., "grade": 4}``).
+                           String values are ``.strip()``'d before writing TOML.
+                           Keys are otherwise unchanged.
+        llm_call_captures: Ordered list of capture prefixes to include as
+                           ``prompt_steps`` in the TOML.  Must match the prefixes
+                           passed to ``capture_llm()`` during this run, in call order.
+        expected_result:   The evaluator's output.  Pass the whole output — no
+                           need to extract individual fields.  Plain dicts, Pydantic
+                           v1/v2 models, and anything dict-like are all accepted;
+                           ``capture_case()`` normalises to a plain dict.  Written to
+                           the ``[expected_result]`` TOML section.
+        description:       Optional human-readable label for this test case.
+
+    Example::
+
+        case_input  = {"text": text, "grade": 3}
+        case_output = run_evaluator(**case_input)
+
+        _cap = capture_case(
+            name="marco_polo_grade3",
+            input=case_input,
+            llm_call_captures=["bk", "vocab"],
+            expected_result=case_output,
+            description="Marco Polo passage, grade 3",
+        )
+    """
+    data: dict[str, Any] = dict(_captures)
+    data["name"] = name
+    data["input"] = _strip_string_values(dict(input))
+    data["llm_call_captures"] = list(llm_call_captures)  # copy: snapshot must not alias caller's list
+    if expected_result is not None:
+        # Normalise to a plain dict so capture_case() is always fully serializable.
+        # Handles Pydantic v2 models (.model_dump()), v1 models (.dict()), and
+        # anything else that is already a dict or dict-like.
+        if hasattr(expected_result, "model_dump"):
+            expected_result = expected_result.model_dump()
+        elif hasattr(expected_result, "dict"):
+            expected_result = expected_result.dict()
+        data["expected_result"] = dict(expected_result)
+    if description is not None:
+        data["description"] = description
+    return data
+
+
+def build_contract_toml(*cases: dict[str, Any]) -> str:
+    """Render captured test cases as one TOML document fragment.
+
+    Args:
+        *cases: Case dicts produced by :func:`capture_case`, in output order.
+
+    Returns:
+        The per-case TOML blocks joined with a newline, suitable for pasting
+        into ``sdks/settings/<evaluator>/contracts.toml``.
+
+    Example::
+
+        print(build_contract_toml(_cap_grade3, _cap_grade7))
+    """
+    return "\n".join(map(_build_case, cases))
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _strip_string_values(inp: dict[str, Any]) -> dict[str, Any]:
+    """Copy *inp* (shallowly), trimming surrounding whitespace on ``str`` values only."""
+    return {key: (val.strip() if isinstance(val, str) else val) for key, val in inp.items()}
+
+
+def _extract_text_content(content: Any) -> str:
+    """Extract a plain-text string from an AIMessage content value.
+
+    Most LangChain providers return a plain ``str``; some (e.g. Google Gemini)
+    return a list of blocks like ``[{"type": "text", "text": "..."}]``.  Text
+    blocks (and bare ``str`` items) are joined in order so a multi-block reply
+    is not truncated; any other shape falls back to ``str(content)``.
+    """
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        parts: list[str] = []
+        for block in content:
+            if isinstance(block, str):
+                parts.append(block)
+            elif isinstance(block, dict) and block.get("type") == "text":
+                parts.append(str(block["text"]))
+        if parts:
+            return "".join(parts)
+    return str(content)
+
+
+def _toml_value(v: Any) -> str:
+    """Serialize *v* to a TOML literal.
+
+    Values other than str/bool/int/float/list are stringified first.
+    """
+
+    def _basic(item: Any) -> str:
+        # JSON escapes are TOML-safe except two cases fixed here: ensure_ascii
+        # would emit surrogate pairs (\ud83d\ude00) that TOML rejects, and DEL
+        # is left raw although TOML forbids it unescaped.
+        return _json.dumps(item, ensure_ascii=False).replace("\x7f", "\\u007f")
+
+    if isinstance(v, str):
+        if v == "":
+            return '""'
+        # Multiline LITERAL strings are verbatim but cannot hold ''' or control
+        # characters other than tab/newline.  No "\n" goes before the closing '''
+        # — TOML trims only the first newline after the opening delimiter.
+        if "'''" not in v and not any(c == "\x7f" or (c < " " and c not in "\t\n") for c in v):
+            return "'''\n" + v + "'''"
+        return _basic(v)
+    if isinstance(v, bool):
+        return "true" if v else "false"
+    if isinstance(v, (int, float)):
+        return str(v)
+    if isinstance(v, list):
+        return "[" + ", ".join(_basic(item) for item in v) + "]"
+    return _basic(str(v))
+
+
+def _build_case(c: dict[str, Any]) -> str:
+    """Render one captured case dict as its ``[cases.<name>.*]`` TOML tables.
+
+    Emits, in order: an optional ``description`` table, the ``input`` table, one
+    ``prompt_steps.<prefix>`` table per entry in ``llm_call_captures``, and an
+    ``expected_result`` table when that value is non-empty.
+    """
+    case_name = c["name"]
+    root = f"[cases.{case_name}"
+    out: list[str] = []
+
+    # Optional top-level description (skipped when missing or empty).
+    if label := c.get("description"):
+        out.extend([f"{root}]", f"description = {_json.dumps(label)}", ""])
+
+    # Input table; string values were already stripped by ``capture_case``.
+    out.append(f"{root}.input]")
+    out.extend(f"{key} = {_toml_value(item)}" for key, item in c.get("input", {}).items())
+    out.append("")
+
+    # One prompt_steps table per captured LLM call, in call order.
+    for step in c["llm_call_captures"]:
+        out.extend(
+            [
+                f"{root}.prompt_steps.{step}]",
+                f"model = {_json.dumps(c.get(step + '_model', ''))}",
+                f"temperature = {c.get(step + '_temperature', 0)}",
+                f"system_prompt = {_toml_value(c.get(step + '_system_prompt', ''))}",
+                f"user_prompt = {_toml_value(c.get(step + '_user_prompt', ''))}",
+                f"llm_response = {_toml_value(c.get(step + '_raw_response', ''))}",
+                "",
+            ]
+        )
+
+    # Expected-result table, omitted when absent or empty.
+    result_fields = c.get("expected_result")
+    if result_fields:
+        out.append(f"{root}.expected_result]")
+        for key, item in result_fields.items():
+            out.append(f"{key} = {_toml_value(item)}")
+        out.append("")
+    return "\n".join(out)
diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py
index 9bb613a..b3208dc 100644
--- a/sdks/python/src/learning_commons_evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/__init__.py
@@ -36,6 +36,8 @@
     ConventionalityEvaluator,
     InputT,
     OutputT,
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
 )
 from learning_commons_evaluators.evaluators.conventionality import (
     ConventionalityEvaluationInput,
@@ -83,6 +85,11 @@
 from learning_commons_evaluators.schemas.text_complexity import (
     TextComplexityEvaluationInput,
 )
+from learning_commons_evaluators.schemas.vocabulary import (
+    VocabularyComplexityOutput,
+    VocabularyEvaluationSettings,
+    normalize_complexity_output,
+)
 
 __all__ = [
     "__description__",
@@ -131,6 +138,11 @@
     "TextInputField",
     "TokenUsage",
     "ValidationError",
+    "VocabularyEvaluationInput",
+    "VocabularyEvaluationSettings",
+    "VocabularyEvaluator",
+    "VocabularyComplexityOutput",
+    "normalize_complexity_output",
     "create_config",
     "create_config_no_telemetry",
     "create_config_telemetry_with_full_input",
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
index 50821c6..5bb9fc1 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
@@ -11,6 +11,11 @@
     ConventionalityEvaluationInput,
     ConventionalityEvaluator,
 )
+from learning_commons_evaluators.evaluators.vocabulary import (
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
+)
+from learning_commons_evaluators.schemas.vocabulary import normalize_complexity_output
 
 __all__ = [
     "BaseEvaluator",
@@ -18,4 +23,7 @@
     "ConventionalityEvaluator",
     "InputT",
     "OutputT",
+    "VocabularyEvaluationInput",
+    "VocabularyEvaluator",
+    "normalize_complexity_output",
 ]
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 5bedf05..509b2e1 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -49,14 +49,25 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
     """
     Abstract base class for all evaluators.
     Subclasses must set metadata, default_evaluation_settings, and implement evaluate_impl().
+
+    Pass ``default_evaluation_settings`` at construction to override the class-level
+    defaults for that instance (used when :meth:`evaluate` is called without
+    ``evaluation_settings``).
     """
 
     config: EvaluatorConfig
     metadata: EvaluatorMetadata
     default_evaluation_settings: SettingsT
 
-    def __init__(self, config: EvaluatorConfig) -> None:
+    def __init__(
+        self,
+        config: EvaluatorConfig,
+        *,
+        default_evaluation_settings: SettingsT | None = None,
+    ) -> None:
         self.config = config
+        if default_evaluation_settings is not None:
+            self.default_evaluation_settings = default_evaluation_settings
         # TODO: validate config
 
     def evaluate(
@@ -68,13 +79,14 @@ def evaluate(
 
         Validates the input, delegates to :meth:`evaluate_impl`, records timing
         and status on the returned metadata, and logs start/end events via the
-        configured logger.  If ``evaluation_settings`` is ``None``, the
-        evaluator's :attr:`default_evaluation_settings` is used.
+        configured logger.  If ``evaluation_settings`` is ``None``, a deep copy of
+        the instance's :attr:`default_evaluation_settings` is used (from the
+        constructor keyword when given, otherwise the subclass class attribute).
 
         Args:
             input: Typed input for this evaluator.
             evaluation_settings: Optional override for evaluation settings.
-                Defaults to :attr:`default_evaluation_settings`.
+                Defaults to :attr:`default_evaluation_settings` (constructor or class).
 
         Returns:
             A typed result whose ``metadata.status`` is
@@ -184,6 +196,7 @@ def execute_prompt_chain_step(
         template: Any,
         chain_inputs: dict[str, Any],
         parser_output_type: type[ParsedT],
+        json_dict_normalizer: Callable[[dict], dict] | None = None,
     ) -> ParsedT: ...
 
     def execute_prompt_chain_step(
@@ -194,6 +207,7 @@ def execute_prompt_chain_step(
         template: Any,
         chain_inputs: dict[str, Any],
         parser_output_type: type[BaseModel] | None = None,
+        json_dict_normalizer: Callable[[dict], dict] | None = None,
     ) -> BaseModel | str:
         """Run a prompt chain (template | LLM), record metadata, and return the result.
 
@@ -215,6 +229,12 @@ def execute_prompt_chain_step(
             chain_inputs: Variables to format the template and invoke the chain.
             parser_output_type: Pydantic model class for JSON parsing, or ``None``
                 to return the raw text response.
+            json_dict_normalizer: When set with ``parser_output_type``, parse the
+                model response as JSON into a plain dict (no Pydantic parse),
+                apply this function (e.g. notebook-style ``normalize_complexity_output``),
+                then validate with ``parser_output_type``. Format instructions for the
+                prompt should still be built from the same ``parser_output_type`` via
+                :class:`~langchain_core.output_parsers.JsonOutputParser`.
 
         Returns:
             Parsed instance of ``parser_output_type`` when it is a model class; plain
@@ -223,7 +243,10 @@ def execute_prompt_chain_step(
         Raises:
             ConfigurationError: No provider config for prompt_settings.provider_type.
             EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged.
+            ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted.
         """
+        if json_dict_normalizer is not None and parser_output_type is None:
+            raise ValueError("json_dict_normalizer requires parser_output_type to be set")
         # Populated after a successful LLM invoke so we can attach usage even if parsing fails.
         token_usage: TokenUsage | None = None
 
@@ -238,6 +261,14 @@ def _run_chain() -> BaseModel | str:
                     return str(ai_message.content)
                 from langchain_core.output_parsers.json import JsonOutputParser
 
+                if json_dict_normalizer is not None:
+                    loose = JsonOutputParser()
+                    parsed_dict = loose.invoke(ai_message)
+                    if not isinstance(parsed_dict, dict):
+                        parsed_dict = dict(parsed_dict)
+                    normalized = json_dict_normalizer(parsed_dict)
+                    return parser_output_type.model_validate(normalized)
+
                 parser = JsonOutputParser(pydantic_object=parser_output_type)
                 raw = parser.invoke(ai_message)
                 if isinstance(raw, parser_output_type):
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
new file mode 100644
index 0000000..cde17df
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -0,0 +1,195 @@
+"""Vocabulary evaluator: evaluates text for vocabulary complexity relative to grade level."""
+
+from __future__ import annotations
+
+from typing import Any, ClassVar
+
+import textstat  # type: ignore[import-untyped]
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from pydantic import Field
+
+from learning_commons_evaluators.evaluators.base import BaseEvaluator
+from learning_commons_evaluators.schemas.common_inputs import (
+    GradeInputField,
+    TextInputField,
+)
+from learning_commons_evaluators.schemas.config import PromptSettings
+from learning_commons_evaluators.schemas.evaluator import (
+    EvaluationExplanation,
+    EvaluationInput,
+)
+from learning_commons_evaluators.schemas.metadata import (
+    EvaluationMetadata,
+    EvaluatorMetadata,
+)
+from learning_commons_evaluators.schemas.text_complexity import (
+    TextComplexityAnswer,
+    TextComplexityResult,
+)
+from learning_commons_evaluators.schemas.vocabulary import (
+    VocabularyComplexityOutput,
+    VocabularyEvaluationSettings,
+    normalize_complexity_output,
+)
+from learning_commons_evaluators.settings._generated_vocabulary_settings import (
+    CONFIG as _VOCABULARY_CONFIG,
+)
+
+_INPUT_SETTINGS = _VOCABULARY_CONFIG.evaluator_metadata.inputs
+
+# Grades 3–4 use the Gemini-based rubric prompt; grades 5–12 use the GPT-based annotation guide prompt.
+_GRADES_3_4 = frozenset({3, 4})
+
+
+class VocabularyEvaluationInput(EvaluationInput):
+    """Input for a vocabulary evaluation: the text plus its target grade.
+
+    Per-field constraints (e.g. allowed grade levels) come from
+    ``[[evaluator_metadata.inputs]]`` in the evaluator settings; callers pass
+    raw keyword-only ``text`` / ``grade`` values, not field objects.
+
+    Example::
+
+        inp = VocabularyEvaluationInput(text="The quick brown fox...", grade=5)
+    """
+
+    _input_settings: ClassVar[dict] = _INPUT_SETTINGS  # evaluator_metadata.inputs
+
+    text: TextInputField = Field(description="The text to evaluate.")
+    grade: GradeInputField = Field(description="The grade level of the text (3–12).")
+
+    def __init__(self, *, text: str, grade: int, **kwargs: Any) -> None:
+        super().__init__(text=text, grade=grade, **kwargs)
+
+
+class VocabularyEvaluator(
+    BaseEvaluator[VocabularyEvaluationInput, TextComplexityResult, VocabularyEvaluationSettings]
+):
+    """Evaluates text for vocabulary complexity relative to the target grade level.
+
+    The evaluation runs in two steps:
+    1. **Background knowledge** – the model generates a short assumption about what
+       students at the target grade already know about the text's topic.  This
+       provides context that keeps the complexity rating from penalising familiar
+       domain words.
+    2. **Vocabulary complexity** – a grade-specific prompt + model produces JSON
+       matching the notebook ``Output`` shape. Raw JSON is passed through
+       :func:`~learning_commons_evaluators.schemas.vocabulary.normalize_complexity_output`
+       (same behaviour as the notebook), then validated. Grades 3–4 use Gemini;
+       grades 5–12 use GPT. Scores are normalised to :class:`TextComplexityAnswer`.
+
+    Supported grades: 3–12.
+    """
+
+    metadata: EvaluatorMetadata = _VOCABULARY_CONFIG.evaluator_metadata
+    default_evaluation_settings: VocabularyEvaluationSettings = (
+        _VOCABULARY_CONFIG.evaluation_settings
+    )
+
+    def evaluate_impl(
+        self,
+        input: VocabularyEvaluationInput,
+        evaluation_settings: VocabularyEvaluationSettings,
+        evaluation_metadata: EvaluationMetadata,
+    ) -> TextComplexityResult:
+        """Run the two-step vocabulary evaluation and return a TextComplexityResult.
+
+        Grade validation is handled by the framework before this method is called:
+        ``VocabularyEvaluationInput`` automatically constrains ``grade`` to the
+        evaluator's ``allowed_grades`` from settings (3–12), so
+        ``BaseEvaluator.evaluate`` raises before reaching here for unsupported grades.
+        """
+        grade = input.grade.value
+        text = input.text.value
+        prompts = _VOCABULARY_CONFIG.prompts
+        is_grade_3_4 = grade in _GRADES_3_4
+        # Only the grades 3–4 prompt receives ``fk_level``, so other grades skip the
+        # readability pass; it runs before any LLM call so a failure costs no tokens.
+        fk_score = round(textstat.flesch_kincaid_grade(text), 2) if is_grade_3_4 else None
+
+        # ── Step 1: background knowledge ──────────────────────────────────────
+        bk_template = ChatPromptTemplate.from_messages(
+            [("human", prompts["background_knowledge_prompt"])]
+        )
+        background_knowledge: str = self.execute_prompt_chain_step(
+            step_name="background_knowledge",
+            prompt_settings=evaluation_settings.prompt_settings_step_background_knowledge,
+            evaluation_metadata=evaluation_metadata,
+            template=bk_template,
+            chain_inputs={"text": text, "grade": grade},
+            parser_output_type=None,
+        )
+
+        # ── Step 2: vocabulary complexity (grade-specific prompts, shared Output shape)
+        chain_inputs: dict[str, Any] = {
+            "text": text,
+            "student_grade_level": grade,
+            "student_background_knowledge": background_knowledge,
+        }
+        if is_grade_3_4:
+            chain_inputs["fk_level"] = fk_score
+            prompt_settings_vocab = evaluation_settings.prompt_settings_step_vocab_grades_3_4
+            prompt_prefix = "vocab_grades_3_4"
+        else:
+            prompt_settings_vocab = evaluation_settings.prompt_settings_step_vocab_other_grades
+            prompt_prefix = "vocab_other_grades"
+        answer, explanation = self._run_vocab_complexity_chain(
+            chain_inputs=chain_inputs,
+            evaluation_metadata=evaluation_metadata,
+            prompt_settings_vocab=prompt_settings_vocab,
+            system_prompt=prompts[f"{prompt_prefix}_system_prompt"],
+            user_prompt_template=prompts[f"{prompt_prefix}_user_prompt"],
+        )
+
+        return TextComplexityResult(
+            answer=answer,
+            explanation=explanation,
+            metadata=evaluation_metadata,
+        )
+
+    def _run_vocab_complexity_chain(
+        self,
+        *,
+        chain_inputs: dict[str, Any],
+        evaluation_metadata: EvaluationMetadata,
+        prompt_settings_vocab: PromptSettings,
+        system_prompt: str,
+        user_prompt_template: str,
+    ) -> tuple[TextComplexityAnswer, EvaluationExplanation]:
+        """Run one vocabulary-complexity prompt step and interpret its output.
+
+        Returns the ``TextComplexityAnswer`` for the score and an explanation.
+        """
+        parser = JsonOutputParser(pydantic_object=VocabularyComplexityOutput)
+        template = ChatPromptTemplate.from_messages(
+            [
+                ("system", system_prompt),
+                ("human", user_prompt_template),
+            ]
+        ).partial(format_instructions=parser.get_format_instructions())
+
+        output = self.execute_prompt_chain_step(
+            step_name="complexity_evaluation",
+            prompt_settings=prompt_settings_vocab,
+            evaluation_metadata=evaluation_metadata,
+            template=template,
+            chain_inputs=chain_inputs,
+            parser_output_type=VocabularyComplexityOutput,
+            json_dict_normalizer=normalize_complexity_output,
+        )
+
+        # Normalise the score string: the prompt may return padded, spaced labels
+        # ("Very Complex ") but TextComplexityAnswer expects "very_complex".
+        score = output.complexity_score.strip().lower().replace(" ", "_")
+        answer = TextComplexityAnswer.from_score(score)
+        explanation = EvaluationExplanation(
+            summary=output.reasoning,
+            details={
+                "tier_2_words": output.tier_2_words,
+                "tier_3_words": output.tier_3_words,
+                "archaic_words": output.archaic_words,
+                "other_complex_words": output.other_complex_words,
+            },
+        )
+        return answer, explanation
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
new file mode 100644
index 0000000..64cde9f
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py
@@ -0,0 +1,68 @@
+"""Vocabulary evaluator schemas."""
+
+from pydantic import BaseModel, Field
+
+from learning_commons_evaluators.schemas.config import (
+    EvaluationSettings,
+    PromptSettings,
+)
+
+
+class VocabularyEvaluationSettings(EvaluationSettings):
+    """Settings for a vocabulary complexity evaluation.
+
+    All three prompt-step configurations are declared without defaults:
+    - ``prompt_settings_step_background_knowledge``: generates a student background knowledge
+      assumption (used as context for the vocabulary complexity step).
+    - ``prompt_settings_step_vocab_grades_3_4``: vocabulary complexity for grades 3–4.
+    - ``prompt_settings_step_vocab_other_grades``: vocabulary complexity for grades 5–12.
+    """
+
+    prompt_settings_step_background_knowledge: PromptSettings
+    prompt_settings_step_vocab_grades_3_4: PromptSettings
+    prompt_settings_step_vocab_other_grades: PromptSettings
+
+
+# Parsed LLM output for the vocabulary complexity step.  Do NOT add a class
+# docstring: Pydantic emits it as a root ``description`` key in
+# ``model_json_schema()``, which changes the ``JsonOutputParser`` format
+# instructions and breaks contract tests that snapshot prompts (``contracts.toml``).
+
+
+class VocabularyComplexityOutput(BaseModel):
+    tier_2_words: str = Field(description="List of Tier 2 words")  # word "lists" are plain strings
+    tier_3_words: str = Field(description="List of Tier 3 words")
+    archaic_words: str = Field(description="List of Archaic words")
+    other_complex_words: str = Field(description="List of Other Complex words")
+    complexity_score: str = Field(  # free-form label; not validated against the four options here
+        description="the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex"
+    )
+    reasoning: str = Field(description="your reasoning for your answer")
+
+
+def normalize_complexity_output(output: dict) -> dict:
+    """Mirror ``evals/vocabulary_evaluator.ipynb`` ``normalize_complexity_output``.
+
+    Maps ``answer`` (1–4, int or digit string) to its ``complexity_score`` label
+    and stringifies any other ``answer``; without ``answer`` the score is left
+    unchanged.  Missing/``None`` word-list keys become ``""``.
+    """
+    result = dict(output)
+    for key in ("tier_2_words", "tier_3_words", "archaic_words", "other_complex_words"):
+        if result.get(key) is None:
+            result[key] = ""
+    mapping = {
+        1: "Slightly Complex",
+        2: "Moderately Complex",
+        3: "Very Complex",
+        4: "Exceedingly Complex",
+    }
+    if "answer" in result:
+        value = result["answer"]
+        # isdecimal() (unlike isdigit()) only accepts strings int() can parse.
+        if isinstance(value, str) and value.strip().isdecimal():
+            value = int(value)
+        # Look up real numbers only: bool would alias 1/0, lists are unhashable.
+        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
+        result["complexity_score"] = (mapping.get(value) if is_number else None) or str(value)
+    return result
diff --git a/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py b/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
new file mode 100644
index 0000000..82562f8
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py
@@ -0,0 +1,404 @@
+# !! AUTO-GENERATED — do not edit directly.
+# Source: sdks/settings/vocabulary/settings.toml
+# Regenerate : python scripts/generate_settings.py
+# Staleness check: python scripts/generate_settings.py --check
+
+from __future__ import annotations
+
+from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings
+from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec
+from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity, EvaluatorMetadata
+from learning_commons_evaluators.schemas.vocabulary import VocabularyEvaluationSettings
+from learning_commons_evaluators.settings.load_settings import EvaluatorSettingsResult
+
+# ── Evaluator metadata ────────────────────────────────────────────────────────
+
+_EVALUATOR_METADATA = EvaluatorMetadata(
+    id='vocabulary',
+    version='0.1',
+    name='Vocabulary',
+    description='Vocabulary Complexity Evaluator',
+    maturity=EvaluatorMaturity.alpha,
+    inputs={
+        'text': TextInputSpec(
+            name='text',
+            description='The text to evaluate for vocabulary complexity.',
+        ),
+        'grade': GradeInputSpec(
+            name='grade',
+            description='The grade level of the text (3–12).',
+            allowed_grades=[3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
+        ),
+    },
+)
+
+# ── Prompt templates ──────────────────────────────────────────────────────────
+
+_PROMPTS: dict[str, str] = {
+    'background_knowledge_prompt': """Review the following text, which is an educational text written for students in the following grade band: {grade}.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+{text}
+[END TEXT]
+""",
+    'vocab_grades_3_4_system_prompt': """You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.
+
+You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+
+**Resource 1: Qualitative Text Complexity rubric (SAP)**
+1.  **Level 1: Slightly complex**
+    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!
+2.  **Level 2: Moderately complex**
+    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK
+    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.
+3.  **Level 3: Very complex**
+    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.
+4.  **Level 4: Exceedingly complex**
+    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.
+    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.
+
+**Resource 2: Flesch-Kincaid Grade Level**
+Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.
+*   grade 2-3: 1.98-5.34
+*   grade 4-5: 4.51-7.73
+*   grade 6-8: 6.51-10.34
+*   grade 9-10: 8.32-12.12
+*   grade 11-College: 10.34-14.2
+
+**Guidelines for Interpretation and Reasoning**
+
+Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:
+
+1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.
+2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.
+    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")?
+    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?
+    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.
+3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).
+4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.
+5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.
+    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.
+    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.
+6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).
+
+**Final Analysis Format**
+
+Provide these information as your final analysis:
+1.  **Complex vocabulary:**
+    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.
+    *   Tier 3 words: Overly academic or domain-specific words.
+    *   Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict." The use of “retire” to mean withdrawing to a private place is an archaic use.
+    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).
+2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex
+3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.
+""",
+    'vocab_grades_3_4_user_prompt': """Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.
+
+- It is intended for grade {student_grade_level}.
+
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
+
+- Text Flesch-Kincaid grade level: {fk_level}
+
+- Text to evaluate: [BEGIN TEXT]
+{text}
+[END TEXT]
+
+{format_instructions}
+""",
+    'vocab_other_grades_system_prompt': """You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.
+
+You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.
+
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)
+IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.
+
+Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).
+""",
+    'vocab_other_grades_user_prompt': """Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.
+
+[BEGIN ANNOTATION GUIDE AND RUBRIC]
+Instructions
+For the following task, please assume that:
+    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
+    - The student is moving through a common progression of topics (detailed here).
+    - The student is fluent in speaking English.
+    - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
+    - The student will use this material for independent reading/work, without direct instruction.
+    - The text is reasonable for the given grade level.
+
+Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
+
+Please do be sure to consider:
+- all of the different types of vocabulary (listed below)
+- the overall proportion of complex words in the text - including repeated complex words.
+- the resulting holistic complexity of the vocabulary (described in the Summary section below).
+
+Level 1:
+Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+
+Level 2:
+Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+
+Level 3:
+Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+
+Level 4:
+Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
+
+And here are some relevant definitions:
+    - Conversational: Everyday language.
+    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
+    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
+        - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
+        - Note:
+            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
+            - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
+                - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.
+        - Unfamiliar proper nouns:
+            - A person's name, even if unfamiliar, generally does not add to complexity.
+            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
+
+- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
+- Overly-academic: Words that are excessively formal, complex, or specialized.
+    - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
+- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
+    - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
+        - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
+
+
+Examples
+The student is on-grade-level:
+- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
+
+Figurative Language
+- Kicked the bucket.
+- The pen is mightier than the sword.
+- The classroom was a zoo.
+- He ran faster than the speed of light.
+[END ANNOTATION GUIDE AND RUBRIC]
+
+Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
+[BEGIN EXAMPLES]
+
+*** EXAMPLE 1 ***
+The following text was intended for grade level 11 and received a complexity level of 1.
+
+Here is the background knowledge assumption for that text: N/A
+
+Here is the text:
+// START TEXT //
+"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
+
+He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
+
+I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
+
+I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
+
+Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
+
+It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
+// END REASONING //
+*** EXAMPLE 2 ***
+The following text was intended for grade level 5 and received a complexity level of 2.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
+
+Here is the text:
+// START TEXT //
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
+// END REASONING //
+
+*** EXAMPLE 3 ***
+The following text was intended for grade level 6 and received a complexity level of 3.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
+
+Here is the text:
+// START TEXT //
+Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
+
+Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
+
+Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING //
+[END EXAMPLES]
+
+Below is the text you need to evaluate. It is intended for grade {student_grade_level}.
+
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
+
+[BEGIN TEXT]
+{text}
+[END TEXT]
+
+In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
+
+{format_instructions}
+""",
+}
+
+# ── Evaluation settings ───────────────────────────────────────────────────────
+
+_EVALUATION_SETTINGS = VocabularyEvaluationSettings(  # one PromptSettings (provider, model, temperature) per LLM prompt step
+    prompt_settings_step_background_knowledge=PromptSettings(  # background-knowledge step
+        provider_type=LlmProvider.OPENAI,
+        model='gpt-4o-2024-11-20',
+        temperature=0.0,
+    ),
+    prompt_settings_step_vocab_grades_3_4=PromptSettings(  # vocabulary-complexity step, grades 3-4 variant
+        provider_type=LlmProvider.GOOGLE,
+        model='gemini-2.5-pro',
+        temperature=0.0,
+    ),
+    prompt_settings_step_vocab_other_grades=PromptSettings(provider_type=LlmProvider.OPENAI, model='gpt-4.1', temperature=0.0),  # vocabulary-complexity step, other grades
+)
+
+# ── Public config object (imported by evaluator modules) ──────────────────────
+
+CONFIG: EvaluatorSettingsResult[VocabularyEvaluationSettings] = EvaluatorSettingsResult(  # bundles the module-level objects defined above
+    evaluator_metadata=_EVALUATOR_METADATA,
+    evaluation_settings=_EVALUATION_SETTINGS,  # per-step provider / model / temperature
+    prompts=_PROMPTS,  # prompt template strings with {placeholder} fields such as {text}
+)
diff --git a/sdks/python/src/learning_commons_evaluators/settings/conventionality/__init__.py b/sdks/python/src/learning_commons_evaluators/settings/conventionality/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml b/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml
new file mode 100644
index 0000000..47bac4c
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml
@@ -0,0 +1,130 @@
+# Contract tests for the Conventionality evaluator.
+#
+# This file captures the exact LLM interaction for known inputs so that
+# tests can verify the SDK sends the same request as the notebook and
+# produces the same result from the same response.
+#
+# HOW TO REGENERATE
+# -----------------
+# Run the notebook "evals/Final ship - Conventionality Experimental Evaluator.ipynb"
+# from the repo root with GOOGLE_API_KEY set. It loads prompts from the evaluator
+# settings TOML (same as the SDK) and prints a TOML block — paste into this file
+# and run `make sync-settings` from sdks/python/ to update the bundled copy.
+#
+# WHAT BELONGS HERE
+# -----------------
+# Only data that originates from external sources:
+#   - input:           what the caller provides to the evaluator
+#   - prompt_steps:    what the SDK sends to the LLM (fully formatted messages,
+#                      model, temperature) and the raw response the LLM returned
+#   - expected_result: the structured output the LLM returned (notebook format)
+#
+# Deterministic values computed by library code (e.g. FK score from textstat,
+# format_instructions from LangChain's JsonOutputParser) are NOT stored as
+# standalone fields — they appear inside the fully formatted prompts, where they
+# are implicitly verified by the prompt-request assertion in the contract test.
+
+[cases.turnip]
+description = "Grade 4 classroom narrative (Henry and the turnip)"
+
+[cases.turnip.input]
+text = '''
+
+"Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip."
+'''
+grade = 4
+
+[cases.turnip.prompt_steps.main]
+model = "gemini-3-flash-preview"
+temperature = 0.0
+system_prompt = '''Role
+You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level.
+
+Objective
+Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade.
+
+Complexity Levels
+- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level.
+- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging.
+- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear.
+- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices.
+
+Essential Evaluation Rules
+1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex."
+
+2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group.
+
+3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two.
+
+4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex."
+
+5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing.
+
+6. Sustained vs. Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge.
+
+7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex.
+
+8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students.
+
+9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary.
+
+Input Format
+You will receive:
+- text: The passage to evaluate.
+- grade: The target student grade level.
+- fk_score: The Flesch-Kincaid readability score.
+
+Output Format
+Provide a JSON object containing ONLY the following keys:
+- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'.
+- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural).
+- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text.
+- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade.
+- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom.
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"conventionality_features": {"description": "List of the specific language features driving the complexity (e.g., idioms, metaphors, implied meaning) with direct quotes from the text.", "items": {"type": "string"}, "title": "Conventionality Features", "type": "array"}, "grade_context": {"description": "How the conventionality demands compare to general expectations for the provided target grade.", "title": "Grade Context", "type": "string"}, "instructional_insights": {"description": "Actionable pedagogical suggestions for scaffolding the unconventional language features in the classroom.", "title": "Instructional Insights", "type": "string"}, "complexity_score": {"description": "The conventionality complexity level of the text", "enum": ["slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex"], "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "A synthesis of why the text fits the chosen rubric level.", "title": "Reasoning", "type": "string"}}, "required": ["conventionality_features", "grade_context", "instructional_insights", "complexity_score", "reasoning"]}
+```
+'''
+user_prompt = '''Analyze:
+Text: "Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip."
+Grade: 4
+FK Score: 3.75
+'''
+llm_response = '''
+```json
+{
+  "complexity_score": "slightly_complex",
+  "reasoning": "The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.",
+  "conventionality_features": [
+    "literal narrative: 'Henry took his slate and went out.'",
+    "concrete actions: 'he wrote the word turnip on his slate.'",
+    "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'",
+    "straightforward meaning: 'The teacher was surprised and pleased.'"
+  ],
+  "grade_context": "For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.",
+  "instructional_insights": "Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology."
+}
+```'''
+
+[cases.turnip.expected_result]
+complexity_score = '''
+slightly_complex'''
+reasoning = '''
+The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.'''
+conventionality_features = ["literal narrative: 'Henry took his slate and went out.'", "concrete actions: 'he wrote the word turnip on his slate.'", "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", "straightforward meaning: 'The teacher was surprised and pleased.'"]
+grade_context = '''
+For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.'''
+instructional_insights = '''
+Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology.'''
diff --git a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
new file mode 100644
index 0000000..8420e01
--- /dev/null
+++ b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml
@@ -0,0 +1,641 @@
+# Contract tests for the Vocabulary evaluator.
+#
+# This file captures the exact LLM interactions for known inputs so that tests
+# can verify the SDK sends the same requests as the notebook and produces the
+# same results from the same responses.
+#
+# HOW TO REGENERATE
+# -----------------
+# Run the notebook "evals/vocabulary_evaluator.ipynb" with valid API keys.
+# The final cells output a TOML block; paste it in here to replace the
+# existing case data below.
+#
+# WHAT BELONGS HERE
+# -----------------
+# Only data that originates from external sources:
+#   - input:            what the caller provides to the evaluator
+#   - prompt_steps:     what the SDK sends to the LLM (fully formatted messages,
+#                       model, temperature) and the raw response the LLM returned.
+#                       Steps must be listed in call order:
+#                         1. background_knowledge  (plain text response)
+#                         2. vocab_complexity       (JSON response)
+#   - expected_result:  the structured output parsed from the vocab_complexity
+#                       llm_response (notebook format), before SDK result mapping.
+#
+# Deterministic values computed by library code (FK score, format_instructions)
+# are NOT stored as standalone fields — they appear inside the fully formatted
+# prompts, where they are implicitly verified by the prompt-request assertion.
+#
+# TWO CASES ARE INCLUDED
+# -----------------------
+# marco_polo_grade3  — grade 3, uses the grades 3–4 path (Gemini + rubric labels
+#                      + word-level breakdown in expected_result).
+# hurricanes_grade7  — grade 7, uses the grades 5–12 path (GPT). expected_result
+#                      holds ``complexity_score`` as a string plus word-list fields,
+#                      matching the notebook ``Output`` shape. The SDK also accepts a
+#                      legacy integer ``answer`` in raw JSON, like ``normalize_complexity_output``.
+
+# ===========================================================================
+# Case 1: marco_polo_grade3  (grades 3–4 path)
+# ===========================================================================
+
+[cases.marco_polo_grade3]
+description = "Marco Polo passage, grade 3 (grades 3-4 Gemini path)"
+
+[cases.marco_polo_grade3.input]
+text = '''
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.'''
+grade = 3
+
+[cases.marco_polo_grade3.prompt_steps.background_knowledge]
+model = "gpt-4o-2024-11-20"
+temperature = 0.0
+system_prompt = ""
+user_prompt = '''
+
+Review the following text, which is an educational text written for students in the following grade band: 3.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+[END TEXT]
+'''
+llm_response = '''
+Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.'''
+
+[cases.marco_polo_grade3.prompt_steps.vocab_complexity]
+model = "gemini-2.5-pro"
+temperature = 0.0
+system_prompt = '''
+
+You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.
+
+You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+
+**Resource 1: Qualitative Text Complexity rubric (SAP)**
+1.  **Level 1: Slightly complex**
+    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!
+2.  **Level 2: Moderately complex**
+    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK
+    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.
+3.  **Level 3: Very complex**
+    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.
+4.  **Level 4: Exceedingly complex**
+    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.
+    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.
+
+**Resource 2: Flesch-Kincaid Grade Level**
+Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.
+*   grade 2-3: 1.98-5.34
+*   grade 4-5: 4.51-7.73
+*   grade 6-8: 6.51-10.34
+*   grade 9-10: 8.32-12.12
+*   grade 11-College: 10.34-14.2
+
+**Guidelines for Interpretation and Reasoning**
+
+Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:
+
+1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.
+2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.
+    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")?
+    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?
+    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.
+3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).
+4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.
+5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.
+    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.
+    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.
+6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).
+
+**Final Analysis Format**
+
+Provide these information as your final analysis:
+1.  **Complex vocabulary:**
+    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.
+    *   Tier 3 words: Overly academic or domain-specific words.
+    *   Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict." The use of “retire” to mean withdrawing to a private place is an archaic use.
+    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).
+2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex
+3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.
+'''
+user_prompt = '''
+
+Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.
+
+- It is intended for grade 3.
+
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.
+
+- Text Flesch-Kincaid grade level: 6.59
+
+- Text to evaluate: [BEGIN TEXT]
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+[END TEXT]
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]}
+```
+'''
+llm_response = '''
+```json
+{
+  "tier_2_words": "served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers",
+  "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain",
+  "archaic_words": "None",
+  "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East",
+  "complexity_score": "very complex",
+  "reasoning": "The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level."
+}
+```'''
+
+[cases.marco_polo_grade3.expected_result]
+tier_2_words = '''
+served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers'''
+tier_3_words = '''
+Mongol Dynasty, Kublai Khan, ivory, jade, porcelain'''
+archaic_words = '''
+None'''
+other_complex_words = '''
+Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East'''
+complexity_score = '''
+very complex'''
+reasoning = '''
+The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.'''
+
+
+# ===========================================================================
+# Case 2: hurricanes_grade7  (grades 5–12 path)
+# ===========================================================================
+
+[cases.hurricanes_grade7]
+description = "Hurricane formation passage, grade 7 (grades 5-12 GPT path)"
+
+[cases.hurricanes_grade7.input]
+text = '''
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.'''
+grade = 7
+
+[cases.hurricanes_grade7.prompt_steps.background_knowledge]
+model = "gpt-4o-2024-11-20"
+temperature = 0.0
+system_prompt = ""
+user_prompt = '''
+
+Review the following text, which is an educational text written for students in the following grade band: 7.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school student who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+[END TEXT]
+'''
+llm_response = '''
+Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.'''
+
+[cases.hurricanes_grade7.prompt_steps.vocab_complexity]
+model = "gpt-4.1"
+temperature = 0.0
+system_prompt = '''
+
+You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.
+
+You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.
+
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+IMPORTANT: Rely on the supplied rubric and annotation guidelines alone. Do not introduce any new criteria for evaluating the complexity of a text's vocabulary.
+
+Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).
+'''
+user_prompt = '''
+
+Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.
+
+[BEGIN ANNOTATION GUIDE AND RUBRIC]
+Instructions
+For the following task, please assume that:
+    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
+    - The student is moving through a common progression of topics (detailed here).
+    - The student is fluent in speaking English.
+    - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
+    - The student will use this material for independent reading/work, without direct instruction.
+    - The text is reasonable for the given grade level.
+
+Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
+
+Please do be sure to consider:
+- all of the different types of vocabulary (listed below)
+- the overall proportion of complex words in the text - including repeated complex words.
+- the resulting holistic complexity of the vocabulary (described in the Summary section below).
+
+Level 1:
+Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+
+Level 2:
+Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+
+Level 3:
+Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+
+Level 4:
+Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
+
+And here are some relevant definitions:
+    - Conversational: Everyday language.
+    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
+    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
+        - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
+        - Note:
+            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
+            - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
+                - The word "pharaoh" might be unfamiliar or subject-specific, but since it is defined within the text, you can consider it a more familiar word.
+        - Unfamiliar proper nouns:
+            - A person's name, even if unfamiliar, generally does not add to complexity.
+            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
+
+- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
+- Overly-academic: Words that are excessively formal, complex, or specialized.
+    - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
+- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
+    - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
+        - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
+
+
+Examples
+The student is on-grade-level:
+- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
+
+Figurative Language
+- Kicked the bucket.
+- The pen is mightier than the sword.
+- The classroom was a zoo.
+- He ran faster than the speed of light.
+[END ANNOTATION GUIDE AND RUBRIC]
+
+Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
+[BEGIN EXAMPLES]
+
+*** EXAMPLE 1 ***
+The following text was intended for grade level 11 and received a complexity level of 1.
+
+Here is the background knowledge assumption for that text: N/A
+
+Here is the text:
+// START TEXT //
+"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
+
+He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
+
+I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
+
+I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
+
+Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
+
+It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
+// END REASONING //
+*** EXAMPLE 2 ***
+The following text was intended for grade level 5 and received a complexity level of 2.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
+
+Here is the text:
+// START TEXT //
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
+// END REASONING //
+
+*** EXAMPLE 3 ***
+The following text was intended for grade level 6 and received a complexity level of 3.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
+
+Here is the text:
+// START TEXT //
+Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
+
+Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
+
+Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING //
+[END EXAMPLES]
+
+Below is the text you need to evaluate. It is intended for grade 7.
+
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.
+
+[BEGIN TEXT]
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+[END TEXT]
+
+In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]}
+```
+'''
+llm_response = '''
+{"tier_2_words": "roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist", "archaic_words": "", "other_complex_words": "ashore", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2."}'''
+
+[cases.hurricanes_grade7.expected_result]
+tier_2_words = '''
+roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing'''
+tier_3_words = '''
+hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist'''
+archaic_words = ""
+other_complex_words = '''
+ashore'''
+complexity_score = '''
+slightly complex'''
+reasoning = '''
+The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.'''
diff --git a/sdks/python/tests/contract_tests/__init__.py b/sdks/python/tests/contract_tests/__init__.py
new file mode 100644
index 0000000..569ad77
--- /dev/null
+++ b/sdks/python/tests/contract_tests/__init__.py
@@ -0,0 +1,14 @@
+"""Contract tests: verify SDK fidelity to the evaluation notebooks.
+
+Each evaluator has a ``contracts.toml`` in its settings folder that records:
+  - ``input``:           the evaluator inputs used for the test case
+  - ``prompt_steps``:    the exact LLM request (formatted messages, model, temperature)
+                         and raw response captured from a real notebook run
+  - ``expected_result``: the structured LLM output (notebook format)
+
+Tests load these artifacts, run the evaluator with the LLM mocked to return the
+captured response, and assert that:
+  1. The SDK sends the same prompt as the notebook (same formatted messages,
+     model, and temperature).
+  2. The SDK produces the same result from that response as the notebook.
+"""
diff --git a/sdks/python/tests/contract_tests/conventionality.py b/sdks/python/tests/contract_tests/conventionality.py
new file mode 100644
index 0000000..86283d6
--- /dev/null
+++ b/sdks/python/tests/contract_tests/conventionality.py
@@ -0,0 +1,87 @@
+"""Conventionality-specific helpers for contract tests.
+
+Provides:
+  - Named case loaders (one function per test case in the TOML).
+  - ``conventionality_notebook_to_sdk_result``: converts the notebook-format
+    expected result (raw ``JsonOutputParser`` dict) to the expected
+    ``TextComplexityResult`` that the SDK should produce.
+"""
+
+from __future__ import annotations
+
+from learning_commons_evaluators.schemas.evaluator import EvaluationExplanation
+from learning_commons_evaluators.schemas.metadata import (
+    EvaluationMetadata,
+    EvaluatorMaturity,
+    EvaluatorMetadata,
+    Status,
+)
+from learning_commons_evaluators.schemas.text_complexity import (
+    TextComplexityAnswer,
+    TextComplexityResult,
+)
+
+from .loader import ContractCase, load_contract_case
+
+# ---------------------------------------------------------------------------
+# Case loaders
+# ---------------------------------------------------------------------------
+
+
+def load_conventionality_turnip_case() -> ContractCase:
+    """Return the ``turnip`` case from the conventionality evaluator's ``contracts.toml``."""
+    return load_contract_case(evaluator_name="conventionality", case_name="turnip")
+
+
+# ---------------------------------------------------------------------------
+# Result mapper
+# ---------------------------------------------------------------------------
+
+
+def conventionality_notebook_to_sdk_result(case: ContractCase) -> TextComplexityResult:
+    """Build the ``TextComplexityResult`` the SDK should return for *case*.
+
+    ``case.expected_result`` holds the notebook's raw ``JsonOutputParser`` dict.
+    This helper applies the structural mapping the SDK is expected to perform,
+    so a test can compare the two results field by field.
+
+    Compare only ``answer`` and ``explanation``: ``metadata`` carries
+    non-deterministic values (timing, evaluation ID, etc.) and is filled with a
+    fixed placeholder here purely so the result model validates.
+
+    Args:
+        case: A loaded :class:`~loader.ContractCase` whose ``expected_result``
+            is populated.
+
+    Returns:
+        A ``TextComplexityResult`` derived from the contract's expected output.
+
+    Raises:
+        KeyError: If ``expected_result`` lacks one of the keys read below.
+    """
+    notebook_output = case.expected_result
+    # Keys copied unchanged from the notebook output into ``explanation.details``.
+    detail_keys = ("conventionality_features", "grade_context", "instructional_insights")
+    sdk_answer = TextComplexityAnswer.from_score(notebook_output["complexity_score"])
+    sdk_explanation = EvaluationExplanation(
+        summary=notebook_output["reasoning"],
+        details={key: notebook_output[key] for key in detail_keys},
+    )
+    # Placeholder evaluator identity; it is not compared by the contract assertions.
+    evaluator_info = EvaluatorMetadata(
+        id="conventionality",
+        version="0.1",
+        name="Conventionality",
+        description="Contract-test placeholder metadata (not compared).",
+        maturity=EvaluatorMaturity.beta,
+    )
+    return TextComplexityResult(
+        answer=sdk_answer,
+        explanation=sdk_explanation,
+        metadata=EvaluationMetadata(
+            evaluator_metadata=evaluator_info,
+            evaluation_settings=None,
+            input_metadata={},
+            status=Status.succeeded,
+        ),
+    )
diff --git a/sdks/python/tests/contract_tests/harness.py b/sdks/python/tests/contract_tests/harness.py
new file mode 100644
index 0000000..93ff7e1
--- /dev/null
+++ b/sdks/python/tests/contract_tests/harness.py
@@ -0,0 +1,229 @@
+"""Contract test harness: LLM mocking and prompt-request assertions.
+
+Usage
+-----
+::
+
+    case = load_conventionality_turnip_case()
+    config = create_config_no_telemetry()
+    evaluator = ConventionalityEvaluator(config)
+
+    inp = ConventionalityEvaluationInput(
+        text=case.input["text"],
+        grade=case.input["grade"],
+    )
+
+    with ContractTestHarness(case) as harness:
+        result = evaluator.evaluate(inp)
+
+    harness.assert_prompt_step("main")
+
+For evaluators with multiple LLM steps the harness automatically queues
+responses in ``prompt_steps`` order and captures each call's request, so the
+assertions still use the step name::
+
+    harness.assert_prompt_step("step_1")
+    harness.assert_prompt_step("step_2")
+
+The captured data is accessible after the context exits.
+
+Prompt strings (system and user) are compared after normalizing line endings
+(``\\r\\n`` → ``\\n``) and stripping leading/trailing whitespace, so minor TOML
+multiline formatting differences do not fail tests.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import patch
+
+from .loader import ContractCase, PromptStepContract
+
+# The module-level name that ``base.py`` imports ``create_provider`` under.
+_CREATE_PROVIDER_TARGET = "learning_commons_evaluators.evaluators.base.create_provider"
+
+
+class ContractTestHarness:
+    """Sets up a mocked LLM for a contract test and captures what was sent to it.
+
+    The harness patches ``create_provider`` so that no real API calls are made.
+    Each call to ``create_provider`` pops the next response from the case's
+    ``prompt_steps`` (in order) and returns a lightweight callable that:
+
+    - Records the formatted messages it receives.
+    - Returns an ``AIMessage`` with the stored ``llm_response`` content.
+
+    After the ``with`` block, use :meth:`assert_prompt_step` to verify each
+    captured request matches the stored contract.
+    """
+
+    def __init__(self, case: ContractCase) -> None:
+        self.case = case
+        # Populated during the evaluation run; indexed by call order.
+        self._captured: list[_CapturedCall] = []
+        self._patch: Any = None
+
+    def __enter__(self) -> ContractTestHarness:
+        if not self.case.is_populated():
+            raise ValueError(
+                "Contract case still contains PLACEHOLDER_* values in prompt_steps; "
+                "capture a real notebook run and refresh contracts.toml before running "
+                "contract tests."
+            )
+        # Build a FIFO list of (step_name, response) pairs in definition order.
+        self._response_queue: list[tuple[str, str]] = [
+            (name, step.llm_response) for name, step in self.case.prompt_steps.items()
+        ]
+        self._captured = []
+
+        self._patch = patch(_CREATE_PROVIDER_TARGET, side_effect=self._make_fake_provider)
+        self._patch.start()
+        return self
+
+    def __exit__(self, *args: Any) -> None:
+        if self._patch is not None:
+            self._patch.stop()
+
+    # ------------------------------------------------------------------
+    # Assertions
+    # ------------------------------------------------------------------
+
+    def assert_prompt_step(self, step_name: str) -> None:
+        """Assert that the request captured for *step_name* matches the contract.
+
+        Raises:
+            AssertionError: If no call was captured for the step, or its
+                messages, model, or temperature differ from the contract.
+            ValueError: If *step_name* is not a step in the contract case.
+        """
+        step_names = list(self.case.prompt_steps.keys())
+        if step_name not in step_names:
+            raise ValueError(f"Step '{step_name}' not in contract. Available steps: {step_names}")
+        step_index = step_names.index(step_name)
+
+        if step_index >= len(self._captured):
+            raise AssertionError(
+                f"No LLM call captured for step '{step_name}' "
+                f"(only {len(self._captured)} call(s) were made)."
+            )
+
+        captured = self._captured[step_index]
+        contract = self.case.prompt_steps[step_name]
+        _assert_prompt_matches(captured, contract, step_name)
+
+    # ------------------------------------------------------------------
+    # Internal
+    # ------------------------------------------------------------------
+
+    def _make_fake_provider(self, prompt_settings: Any, evaluator_config: Any) -> Any:
+        """Return a callable that records input messages and returns the next mock response."""
+        from langchain_core.messages import AIMessage
+
+        if not self._response_queue:
+            raise RuntimeError(
+                "ContractTestHarness: more LLM calls were made than there are "
+                "prompt steps in the contract case."
+            )
+        _step_name, response_content = self._response_queue.pop(0)  # oldest queued step first
+        captured_list = self._captured  # closed over by _fake_llm below
+
+        def _fake_llm(prompt_value: Any) -> AIMessage:
+            # ``prompt_value`` is a ChatPromptValue produced by the template.
+            messages = (
+                prompt_value.to_messages()
+                if hasattr(prompt_value, "to_messages")
+                else list(prompt_value)
+            )
+            system_content = _message_content(messages, "system")
+            human_content = _message_content(messages, "human")
+            captured_list.append(
+                _CapturedCall(
+                    system_prompt=system_content,
+                    user_prompt=human_content,
+                    model=prompt_settings.model,
+                    temperature=prompt_settings.temperature,
+                )
+            )
+            return AIMessage(content=response_content)
+
+        return _fake_llm
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+class _CapturedCall:
+    """Record of a single request handed to the fake LLM (prompts plus model settings)."""
+
+    def __init__(
+        self,
+        *,
+        system_prompt: str,
+        user_prompt: str,
+        model: str,
+        temperature: float,
+    ) -> None:
+        # The two formatted prompt strings.
+        self.system_prompt, self.user_prompt = system_prompt, user_prompt
+        # The model settings the provider was created with.
+        self.model, self.temperature = model, temperature
+
+
+def _message_content(messages: list[Any], message_type: str) -> str:
+    """Return the first *message_type* message's content as ``str`` (empty string if none)."""
+    matching = (
+        str(m.content) for m in messages if getattr(m, "type", None) == message_type
+    )
+    return next(matching, "")
+
+
+def _normalize_prompt_snapshot(s: str) -> str:
+    """Canonicalise a prompt string before comparing it with a contract snapshot.
+
+    CRLF pairs become LF and surrounding whitespace is dropped, so snapshots
+    that differ from the runtime prompt only by line-ending style or by stray
+    leading/trailing blank lines (e.g. from a TOML ``'''`` block) compare equal.
+    Interior text is otherwise left untouched.
+    """
+    return "\n".join(s.split("\r\n")).strip()
+
+
+def _assert_prompt_matches(
+    captured: _CapturedCall,
+    contract: PromptStepContract,
+    step_name: str,
+) -> None:
+    """Assert that *captured* has the contract's model, temperature and prompts.
+
+    Prompts are normalized first; a mismatch reports the first differing offset
+    and a 200-char window from there, so long prompts stay readable.
+    """
+    assert captured.model == contract.model, (
+        f"Step '{step_name}': model mismatch.\n"
+        f"  SDK sent:  {captured.model!r}\n"
+        f"  Contract:  {contract.model!r}"
+    )
+    assert captured.temperature == contract.temperature, (
+        f"Step '{step_name}': temperature mismatch.\n"
+        f"  SDK sent:  {captured.temperature}\n"
+        f"  Contract:  {contract.temperature}"
+    )
+    for field in ("system_prompt", "user_prompt"):
+        sdk = _normalize_prompt_snapshot(getattr(captured, field))
+        expected = _normalize_prompt_snapshot(getattr(contract, field))
+        at = _first_diff_index(sdk, expected)
+        assert sdk == expected, (
+            f"Step '{step_name}': {field} mismatch.\n"
+            f"  First diff at char {at}.\n"
+            f"  SDK sent (200 from diff):  {sdk[at : at + 200]!r}\n"
+            f"  Contract (200 from diff):  {expected[at : at + 200]!r}"
+        )
+
+
+def _first_diff_index(a: str, b: str) -> int:
+    """Index of the first differing character (the shorter length if one is a prefix)."""
+    shared = min(len(a), len(b))
+    mismatches = (i for i in range(shared) if a[i] != b[i])
+    return next(mismatches, shared)
diff --git a/sdks/python/tests/contract_tests/loader.py b/sdks/python/tests/contract_tests/loader.py
new file mode 100644
index 0000000..85cabcd
--- /dev/null
+++ b/sdks/python/tests/contract_tests/loader.py
@@ -0,0 +1,169 @@
+"""Load contract test cases from evaluator settings folders.
+
+Each evaluator's ``contracts.toml`` lives next to its ``settings.toml`` in
+``sdks/settings/<evaluator>/`` (e.g.
+``sdks/settings/conventionality/contracts.toml``).  This module provides the data
+models and a loader that reads those files into structured objects usable from
+tests.
+"""
+
+from __future__ import annotations
+
+import sys
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from importlib.abc import Traversable
+else:
+    try:
+        from importlib.resources.abc import Traversable
+    except ImportError:
+        from importlib.abc import Traversable
+
+if sys.version_info >= (3, 11):
+    import tomllib  # type: ignore[import-untyped]
+else:
+    try:
+        import tomllib  # type: ignore[import-untyped, import-not-found]
+    except ImportError:
+        import tomli as tomllib  # type: ignore[import-not-found,no-redef]
+
+
+# Sentinel used to detect placeholder values left by the TOML template.
+_PLACEHOLDER_PREFIX = "PLACEHOLDER"
+
+
+@dataclass
+class PromptStepContract:
+    """Contract for one LLM prompt step: the exact request sent and response received.
+
+    ``system_prompt`` and ``user_prompt`` are the *fully formatted* messages —
+    all template placeholders have been substituted (including library-computed
+    values like ``{format_instructions}`` and ``{fk_score}``).
+
+    ``llm_response`` is the raw string content returned by the LLM, as captured
+    from the notebook run.  It is used to mock the LLM in the contract test.
+    """
+
+    system_prompt: str
+    user_prompt: str
+    model: str
+    temperature: float
+    llm_response: str
+
+    def is_populated(self) -> bool:
+        """Return False if a prompt or the response still starts with the placeholder prefix."""
+        return not any(
+            str(v).lstrip().startswith(_PLACEHOLDER_PREFIX)  # ignore leading whitespace
+            for v in (self.system_prompt, self.user_prompt, self.llm_response)
+        )
+
+
+@dataclass
+class ContractCase:
+    """A single named case read from an evaluator's ``contracts.toml``.
+
+    Attributes:
+        name:           The ``<name>`` part of the ``[cases.<name>]`` TOML key.
+        description:    Optional human-readable label.
+        input:          Raw evaluator input values (e.g. ``{"text": ..., "grade": 4}``).
+        prompt_steps:   Step name → :class:`PromptStepContract`, kept in order.
+                        That order is the order of LLM calls made during evaluation.
+        expected_result: The notebook-format parsed LLM output (the dict that
+                        ``JsonOutputParser`` produces, before SDK result mapping).
+                        Tests use it to check the SDK builds the same result.
+    """
+
+    name: str
+    description: str
+    input: dict[str, Any]
+    prompt_steps: dict[str, PromptStepContract]
+    expected_result: dict[str, Any]
+
+    def is_populated(self) -> bool:
+        """Return True only when no prompt step reports a leftover placeholder."""
+        return not any(not s.is_populated() for s in self.prompt_steps.values())
+
+    def llm_responses_in_order(self) -> list[str]:
+        """List each step's ``llm_response`` following ``prompt_steps`` order."""
+        return [self.prompt_steps[name].llm_response for name in self.prompt_steps]
+
+
+def load_contract_case(evaluator_name: str, case_name: str) -> ContractCase:
+    """Load a named test case from the evaluator's ``contracts.toml``.
+
+    Resolved via :func:`~learning_commons_evaluators.settings.load_settings.shared_settings_root`
+    (bundled ``settings/<evaluator_name>/contracts.toml`` in the package; kept in sync
+    with the canonical ``sdks/settings/`` copy by ``make sync-settings``).
+
+    Args:
+        evaluator_name: Name of the evaluator (e.g. ``"conventionality"``).
+        case_name:      Name of the case within the TOML (e.g. ``"turnip"``).
+
+    Returns:
+        A :class:`ContractCase` loaded from the TOML.
+
+    Raises:
+        FileNotFoundError: If ``contracts.toml`` does not exist.
+        KeyError: If ``case_name`` is not found in the TOML.
+        tomllib.TOMLDecodeError: If the file is not valid TOML.
+    """
+    toml_path = _settings_path(evaluator_name) / "contracts.toml"
+    if not toml_path.is_file():
+        raise FileNotFoundError(
+            f"contracts.toml not found for evaluator '{evaluator_name}' (expected at {toml_path})"
+        )
+
+    # ``pathlib.Path`` and ``Traversable`` both implement ``open("rb")``, so one
+    # code path serves either type that ``_settings_path`` may return.
+    with toml_path.open("rb") as fh:
+        data = tomllib.load(fh)
+
+    # A file without a ``cases`` table is treated as having no cases.
+    cases = data.get("cases", {})
+    if case_name not in cases:
+        available = ", ".join(cases.keys()) or "(none)"
+        raise KeyError(f"Case '{case_name}' not found in {toml_path}. Available cases: {available}")
+
+    return _parse_case(case_name, cases[case_name])
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _settings_path(evaluator_name: str) -> Path | Traversable:
+    """Return ``<shared settings root>/<evaluator_name>`` (import deferred to call time)."""
+    from learning_commons_evaluators.settings.load_settings import (
+        shared_settings_root,  # noqa: PLC0415
+    )
+    root = shared_settings_root()
+    return root / evaluator_name
+
+
+def _parse_case(name: str, raw: dict[str, Any]) -> ContractCase:
+    """Build a :class:`ContractCase` from one raw ``[cases.<name>]`` table."""
+    # dicts keep insertion order, so step order follows the parsed table.
+    steps: dict[str, PromptStepContract] = {}
+    for step_name, step_table in raw.get("prompt_steps", {}).items():
+        steps[step_name] = _parse_prompt_step(step_table)
+    return ContractCase(
+        name=name,
+        description=raw.get("description", ""),
+        input=raw.get("input", {}),
+        prompt_steps=steps,
+        expected_result=raw.get("expected_result", {}),
+    )
+
+
+def _parse_prompt_step(raw: dict[str, Any]) -> PromptStepContract:
+    """Coerce one raw prompt-step table into a :class:`PromptStepContract`."""
+    # Every key is required; a missing one raises ``KeyError``.
+    system_prompt, user_prompt = str(raw["system_prompt"]), str(raw["user_prompt"])
+    model, temperature = str(raw["model"]), float(raw["temperature"])
+    return PromptStepContract(
+        system_prompt, user_prompt, model, temperature, llm_response=str(raw["llm_response"])
+    )
diff --git a/sdks/python/tests/contract_tests/test_conventionality.py b/sdks/python/tests/contract_tests/test_conventionality.py
new file mode 100644
index 0000000..44db637
--- /dev/null
+++ b/sdks/python/tests/contract_tests/test_conventionality.py
@@ -0,0 +1,84 @@
+"""Contract test: ConventionalityEvaluator matches the notebook.
+
+This test verifies two things for each contract case:
+  1. The SDK sends the same LLM request as the notebook (same fully-formatted
+     system prompt, user prompt, model, and temperature).
+  2. Given the same LLM response, the SDK produces the same result as the
+     notebook.
+
+HOW TO ADD A NEW CASE
+---------------------
+1. Add a ``[cases.<name>]`` entry to
+   ``sdks/settings/conventionality/contracts.toml``.
+2. Add a loader function to ``contract_tests/conventionality.py``.
+3. Add a test function here following the pattern below.
+
+HOW TO REFRESH CONTRACT DATA
+-----------------------------
+Run the notebook ``evals/Final ship - Conventionality Experimental Evaluator.ipynb``
+from the **repository root** with a valid ``GOOGLE_API_KEY``. The notebook loads
+prompts from the evaluator settings TOML (same as the SDK) and prints a TOML block.
+Paste it into ``sdks/settings/conventionality/contracts.toml`` (the canonical
+copy), then run ``make sync-settings`` from ``sdks/python/`` to update the bundled
+copy.
+"""
+
+from learning_commons_evaluators import (
+    ConventionalityEvaluationInput,
+    ConventionalityEvaluator,
+    create_config_no_telemetry,
+)
+from learning_commons_evaluators.schemas.metadata import Status
+
+from .conventionality import (
+    conventionality_notebook_to_sdk_result,
+    load_conventionality_turnip_case,
+)
+from .harness import ContractTestHarness
+
+
+class TestConventionalityContract:
+    def test_turnip_grade4(self) -> None:
+        """Grade 4 "turnip" classroom narrative replayed against the notebook capture.
+
+        Checks that:
+        - the fully-formatted system and user prompts equal the notebook's;
+        - the model and temperature equal the notebook's;
+        - replaying the notebook's LLM response yields the same answer and
+          explanation as the notebook.
+        """
+        contract = load_conventionality_turnip_case()
+
+        evaluator = ConventionalityEvaluator(create_config_no_telemetry())
+        evaluation_input = ConventionalityEvaluationInput(
+            text=contract.input["text"],
+            grade=contract.input["grade"],
+        )
+
+        # The harness replaces the LLM provider while the evaluation runs.
+        with ContractTestHarness(contract) as harness:
+            actual = evaluator.evaluate(evaluation_input)
+
+        # Prompt fidelity: the request captured for the single "main" step must
+        # equal the contract (model, temperature, and both message contents).
+        harness.assert_prompt_step("main")
+
+        # Result fidelity: answer and explanation must equal the notebook's.
+        # Metadata (timing, evaluation_id, etc.) is intentionally excluded
+        # from the field-by-field comparison.
+        wanted = conventionality_notebook_to_sdk_result(contract)
+
+        # The evaluation itself must have completed successfully.
+        assert actual.metadata.status == Status.succeeded
+        assert actual.answer.score == wanted.answer.score, (
+            f"answer.score: SDK={actual.answer.score!r}, notebook={wanted.answer.score!r}"
+        )
+        assert actual.answer.label == wanted.answer.label, (
+            f"answer.label: SDK={actual.answer.label!r}, notebook={wanted.answer.label!r}"
+        )
+        assert actual.explanation.summary == wanted.explanation.summary, (
+            "explanation.summary (reasoning) differs between SDK and notebook"
+        )
+        assert actual.explanation.details == wanted.explanation.details, (
+            "explanation.details differs between SDK and notebook"
+        )
diff --git a/sdks/python/tests/contract_tests/test_vocabulary.py b/sdks/python/tests/contract_tests/test_vocabulary.py
new file mode 100644
index 0000000..57364c7
--- /dev/null
+++ b/sdks/python/tests/contract_tests/test_vocabulary.py
@@ -0,0 +1,136 @@
+"""Contract tests: VocabularyEvaluator matches the notebook.
+
+Each test verifies two things for a known input:
+  1. The SDK sends the same LLM requests as the notebook (fully-formatted
+     system prompt, user prompt, model, and temperature — for both steps).
+  2. Given the same LLM responses, the SDK produces the same structured result
+     as the notebook (score, reasoning, and word-list fields where applicable).
+
+The vocabulary evaluator makes two LLM calls per evaluation:
+  - Step "background_knowledge": plain-text response (no system prompt).
+  - Step "vocab_complexity":      JSON response (system + human prompt).
+
+HOW TO ADD A NEW CASE
+---------------------
+1. Add a ``[cases.<name>]`` entry to
+   ``sdks/settings/vocabulary/contracts.toml`` with both prompt steps.
+2. Add a loader function to ``contract_tests/vocabulary.py``.
+3. Add a test function here following the pattern below.
+
+HOW TO REFRESH CONTRACT DATA
+-----------------------------
+Run the notebook ``evals/vocabulary_evaluator.ipynb`` with valid API keys.
+The final cells output a TOML block; paste it into ``contracts.toml``
+replacing the placeholder values (canonical copy under ``sdks/settings/vocabulary/``).
+
+Then run ``make sync-settings`` from ``sdks/python`` so the bundled
+``src/learning_commons_evaluators/settings/vocabulary/contracts.toml`` matches;
+contract tests load contracts from that package copy, not from ``sdks/settings/`` alone.
+
+IMPORTANT: ``system_prompt = ""`` for the background_knowledge step is the
+correct value (the prompt has no system message), not a placeholder. Only
+the ``user_prompt`` and ``llm_response`` fields need to be populated.
+"""
+
+from learning_commons_evaluators import (
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
+    create_config_no_telemetry,
+)
+from learning_commons_evaluators.schemas.metadata import Status
+
+from .harness import ContractTestHarness
+from .vocabulary import (
+    load_vocabulary_grade34_case,
+    load_vocabulary_other_grades_case,
+    vocabulary_grade34_notebook_to_sdk_result,
+    vocabulary_other_grades_notebook_to_sdk_result,
+)
+
+
+class TestVocabularyContractGrades34:
+    def test_marco_polo_grade3(self) -> None:
+        """Grade 3 Marco Polo passage, exercising the grades 3–4 Gemini path.
+
+        Checks that:
+        - the ``background_knowledge`` and ``vocab_complexity`` requests both equal
+          the fully-formatted prompts captured from the notebook;
+        - replaying the notebook's LLM responses yields the same answer,
+          reasoning, and word-breakdown details as the notebook.
+        """
+        contract = load_vocabulary_grade34_case()
+
+        evaluator = VocabularyEvaluator(create_config_no_telemetry())
+        evaluation_input = VocabularyEvaluationInput(
+            text=contract.input["text"],
+            grade=contract.input["grade"],
+        )
+
+        # The harness replaces the LLM provider while the evaluation runs.
+        with ContractTestHarness(contract) as harness:
+            actual = evaluator.evaluate(evaluation_input)
+
+        # Prompt fidelity: for each of the two steps, the model, temperature,
+        # and formatted messages must equal what the notebook sent to the
+        # LLM.
+        harness.assert_prompt_step("background_knowledge")
+        harness.assert_prompt_step("vocab_complexity")
+
+        # Result fidelity: answer and explanation must equal the notebook's.
+        wanted = vocabulary_grade34_notebook_to_sdk_result(contract)
+        assert actual.metadata.status == Status.succeeded
+        assert actual.answer.score == wanted.answer.score, (
+            f"answer.score: SDK={actual.answer.score!r}, notebook={wanted.answer.score!r}"
+        )
+        assert actual.answer.label == wanted.answer.label, (
+            f"answer.label: SDK={actual.answer.label!r}, notebook={wanted.answer.label!r}"
+        )
+        assert actual.explanation.summary == wanted.explanation.summary, (
+            "explanation.summary (reasoning) differs between SDK and notebook"
+        )
+        assert actual.explanation.details == wanted.explanation.details, (
+            "explanation.details (word breakdown) differs between SDK and notebook"
+        )
+
+
+class TestVocabularyContractOtherGrades:
+    def test_hurricanes_grade7(self) -> None:
+        """Grade 7 hurricane-formation passage, exercising the grades 5–12 GPT path.
+
+        Checks that:
+        - both LLM requests equal the ones captured from the notebook;
+        - replaying the notebook's LLM responses maps ``complexity_score`` to the
+          expected ``TextComplexityAnswer`` and yields the same reasoning and
+          word-breakdown details as the notebook.
+        """
+        contract = load_vocabulary_other_grades_case()
+
+        evaluator = VocabularyEvaluator(create_config_no_telemetry())
+        evaluation_input = VocabularyEvaluationInput(
+            text=contract.input["text"],
+            grade=contract.input["grade"],
+        )
+
+        # The harness replaces the LLM provider while the evaluation runs.
+        with ContractTestHarness(contract) as harness:
+            actual = evaluator.evaluate(evaluation_input)
+
+        # Prompt fidelity: both captured requests must equal the contract.
+        harness.assert_prompt_step("background_knowledge")
+        harness.assert_prompt_step("vocab_complexity")
+
+        # Result fidelity: answer and explanation must equal the notebook's.
+        wanted = vocabulary_other_grades_notebook_to_sdk_result(contract)
+        assert actual.metadata.status == Status.succeeded
+        assert actual.answer.score == wanted.answer.score, (
+            f"answer.score: SDK={actual.answer.score!r}, notebook={wanted.answer.score!r}"
+        )
+        assert actual.answer.label == wanted.answer.label, (
+            f"answer.label: SDK={actual.answer.label!r}, notebook={wanted.answer.label!r}"
+        )
+        assert actual.explanation.summary == wanted.explanation.summary, (
+            "explanation.summary (reasoning) differs between SDK and notebook"
+        )
+        assert actual.explanation.details == wanted.explanation.details, (
+            "explanation.details (word breakdown) differs between SDK and notebook"
+        )
diff --git a/sdks/python/tests/contract_tests/vocabulary.py b/sdks/python/tests/contract_tests/vocabulary.py
new file mode 100644
index 0000000..26e56b1
--- /dev/null
+++ b/sdks/python/tests/contract_tests/vocabulary.py
@@ -0,0 +1,146 @@
+"""Vocabulary-specific helpers for contract tests.
+
+Provides:
+  - Named case loaders (one function per test case in the TOML).
+  - Result mappers: convert the notebook-format expected result (raw
+    ``JsonOutputParser`` dict) to the ``TextComplexityResult`` that the SDK
+    should produce.
+
+Both grade paths use the same ``Output``-style schema (word-level breakdown +
+string ``complexity_score``), so both mappers apply the same normalisation:
+  ``complexity_score.lower().replace(" ", "_")`` → ``TextComplexityAnswer``.
+"""
+
+from __future__ import annotations
+
+from learning_commons_evaluators.schemas.evaluator import EvaluationExplanation
+from learning_commons_evaluators.schemas.metadata import (
+    EvaluationMetadata,
+    EvaluatorMaturity,
+    EvaluatorMetadata,
+    Status,
+)
+from learning_commons_evaluators.schemas.text_complexity import (
+    TextComplexityAnswer,
+    TextComplexityResult,
+)
+
+from .loader import ContractCase, load_contract_case
+
+# ---------------------------------------------------------------------------
+# Case loaders
+# ---------------------------------------------------------------------------
+
+
+def load_vocabulary_grade34_case() -> ContractCase:
+    """Load the 'marco_polo_grade3' contract test case (grades 3–4 path)."""
+    return load_contract_case("vocabulary", "marco_polo_grade3")
+
+
+def load_vocabulary_other_grades_case() -> ContractCase:
+    """Load the 'hurricanes_grade7' contract test case (grades 5–12 path)."""
+    return load_contract_case("vocabulary", "hurricanes_grade7")
+
+
+# ---------------------------------------------------------------------------
+# Result mappers
+# ---------------------------------------------------------------------------
+
+
+def _placeholder_metadata() -> EvaluationMetadata:
+    """Build throwaway ``EvaluationMetadata`` for expected-result objects.
+
+    Contract assertions compare ``answer`` and ``explanation`` only; real
+    metadata is non-deterministic, so fixed stand-in values are used here.
+    """
+    stub_evaluator = EvaluatorMetadata(
+        id="vocabulary",
+        version="0.1",
+        name="Vocabulary",
+        description="Contract test placeholder.",
+        maturity=EvaluatorMaturity.alpha,
+    )
+    return EvaluationMetadata(
+        evaluator_metadata=stub_evaluator,
+        evaluation_settings=None,  # type: ignore[arg-type]
+        input_metadata={},
+        status=Status.succeeded,
+    )
+
+
+def vocabulary_grade34_notebook_to_sdk_result(
+    case: ContractCase,
+) -> TextComplexityResult:
+    """Build the ``TextComplexityResult`` the SDK should return for grades 3–4.
+
+    ``case.expected_result`` holds the plain dict the notebook obtains from
+    ``JsonOutputParser``.  The SDK wraps that dict in a ``TextComplexityResult``;
+    the same wrapping is applied here so tests can assert equality.
+
+    ``metadata`` is only a placeholder (it holds non-deterministic fields);
+    compare ``answer`` and ``explanation`` and nothing else.
+
+    Args:
+        case: A loaded :class:`~loader.ContractCase` with a populated
+            ``expected_result`` (grades 3–4 format).
+
+    Returns:
+        A ``TextComplexityResult`` built from the contract's expected output.
+    """
+    notebook_output = case.expected_result
+    # The notebook may spell the score with spaces ("very complex"); lower-case and underscore it.
+    normalised_score = notebook_output["complexity_score"].lower().replace(" ", "_")
+    answer = TextComplexityAnswer.from_score(normalised_score)
+    word_list_keys = (
+        "tier_2_words",
+        "tier_3_words",
+        "archaic_words",
+        "other_complex_words",
+    )
+    return TextComplexityResult(
+        answer=answer,
+        explanation=EvaluationExplanation(
+            summary=notebook_output["reasoning"],
+            details={key: notebook_output[key] for key in word_list_keys},
+        ),
+        metadata=_placeholder_metadata(),
+    )
+
+
+def vocabulary_other_grades_notebook_to_sdk_result(
+    case: ContractCase,
+) -> TextComplexityResult:
+    """Build the ``TextComplexityResult`` the SDK should return for grades 5–12.
+
+    The OTHER_GRADES path shares the Output-style schema of grades 3–4:
+    ``expected_result`` carries a string ``complexity_score`` (e.g. "slightly
+    complex"), which is lower-cased and underscored before mapping, together
+    with the same word-list fields as the notebook dict.
+
+    ``metadata`` is only a placeholder (it holds non-deterministic fields);
+    compare ``answer`` and ``explanation`` and nothing else.
+
+    Args:
+        case: A loaded :class:`~loader.ContractCase` with a populated
+            ``expected_result`` (grades 5–12 format).
+
+    Returns:
+        A ``TextComplexityResult`` built from the contract's expected output.
+    """
+    expected = case.expected_result
+    underscored = expected["complexity_score"].lower().replace(" ", "_")
+    answer = TextComplexityAnswer.from_score(underscored)
+    breakdown_fields = (
+        "tier_2_words",
+        "tier_3_words",
+        "archaic_words",
+        "other_complex_words",
+    )
+    return TextComplexityResult(
+        answer=answer,
+        explanation=EvaluationExplanation(
+            summary=expected["reasoning"],
+            details={field: expected[field] for field in breakdown_fields},
+        ),
+        metadata=_placeholder_metadata(),
+    )
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index b453853..333e5c1 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -115,6 +115,15 @@ class TestBaseEvaluatorInit:
     def test_config_is_stored(self, config):
         assert _StubEvaluator(config).config is config
 
+    def test_constructor_default_evaluation_settings_overrides_class_default(self, config):
+        instance_default = _StubSettings(marker=99)
+        ev = _StubEvaluator(config, default_evaluation_settings=instance_default)
+        assert ev.default_evaluation_settings is instance_default
+
+    def test_omitted_constructor_default_falls_back_to_class_attribute(self, config):
+        ev = _StubEvaluator(config)
+        assert ev.default_evaluation_settings is _StubEvaluator.default_evaluation_settings
+
 
 # ---------------------------------------------------------------------------
 # evaluate()
@@ -133,6 +142,20 @@ def test_passes_explicit_evaluation_settings(self, stub_evaluator):
         assert result.metadata.evaluation_settings.marker == 42
         assert result.explanation.details.get("marker") == 42
 
+    def test_constructor_default_used_when_evaluate_settings_omitted(self, config):
+        ev = _StubEvaluator(config, default_evaluation_settings=_StubSettings(marker=77))
+        result = ev.evaluate(_stub_input())
+        assert result.metadata.evaluation_settings.marker == 77
+        assert result.explanation.details.get("marker") == 77
+
+    def test_evaluate_explicit_settings_override_constructor_default(self, config):
+        ev = _StubEvaluator(
+            config,
+            default_evaluation_settings=_StubSettings(marker=1),
+        )
+        result = ev.evaluate(_stub_input(), evaluation_settings=_StubSettings(marker=2))
+        assert result.explanation.details.get("marker") == 2
+
 
 class TestEvaluateInputMetadata:
     """``input_metadata`` on :class:`EvaluationMetadata` always comes from ``input.input_metadata()``."""
@@ -314,6 +337,25 @@ def _fake_llm(_pv):
             )
         assert out == "plain prose"
 
+    def test_json_dict_normalizer_without_parser_type_raises(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with pytest.raises(ValueError, match="json_dict_normalizer requires"):
+            stub_evaluator.execute_prompt_chain_step(
+                step_name="raw",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "Hello"},
+                parser_output_type=None,
+                json_dict_normalizer=lambda d: d,
+            )
+
     def test_returns_parsed_pydantic_output(self, stub_evaluator, evaluation_metadata):
         def _fake_llm(_pv):
             return AIMessage(content=_CHAIN_JSON)
@@ -336,6 +378,42 @@ def _fake_llm(_pv):
         assert result.label == "ok"
         assert result.score == 7
 
+    def test_json_dict_normalizer_parses_dict_then_normalizes_then_validates(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        """Optional ``json_dict_normalizer``: loose JSON → dict → user fn → ``model_validate``."""
+
+        def _fake_llm(_pv):
+            return AIMessage(content='{"n": 1}')
+
+        class _Out(BaseModel):
+            n: int = Field(description="n")
+            doubled: int = Field(description="doubled")
+
+        def _double(d: dict) -> dict:
+            d = dict(d)
+            d["doubled"] = int(d["n"]) * 2
+            return d
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with patch(_CHAIN_PATCH, return_value=_fake_llm):
+            result = stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=PromptSettings(
+                    provider_type=LlmProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "Hello"},
+                parser_output_type=_Out,
+                json_dict_normalizer=_double,
+            )
+        assert isinstance(result, _Out)
+        assert result.n == 1
+        assert result.doubled == 2
+
     def test_parser_returning_model_instance_short_circuits_model_validate(
         self, stub_evaluator, evaluation_metadata
     ):
diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py
new file mode 100644
index 0000000..a6eacf0
--- /dev/null
+++ b/sdks/python/tests/evaluators/test_vocabulary.py
@@ -0,0 +1,353 @@
+"""Tests for VocabularyEvaluator and vocabulary helpers."""
+
+from unittest.mock import patch
+
+import pytest
+
+from learning_commons_evaluators import (
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
+    create_config_no_telemetry,
+)
+from learning_commons_evaluators.schemas.errors import ConfigurationError, ValidationError
+from learning_commons_evaluators.schemas.metadata import Status
+from learning_commons_evaluators.schemas.vocabulary import (
+    VocabularyComplexityOutput,
+    normalize_complexity_output,
+)
+
+_SAMPLE_TEXT = "The cat sat on the mat."
+
+# ── Mock helpers ──────────────────────────────────────────────────────────────
+
+_MOCK_BACKGROUND_KNOWLEDGE = "Students are familiar with household pets and basic domestic scenes."
+
+
+def _make_grades34_output(
+    score: str = "moderately_complex",
+) -> VocabularyComplexityOutput:
+    return VocabularyComplexityOutput(
+        tier_2_words="sat",
+        tier_3_words="none",
+        archaic_words="none",
+        other_complex_words="none",
+        complexity_score=score,
+        reasoning="Most words are simple and familiar.",
+    )
+
+
+_OTHER_GRADES_SCORE_MAP: dict[int, str] = {
+    1: "slightly complex",
+    2: "moderately complex",
+    3: "very complex",
+    4: "exceedingly complex",
+}
+
+
+def _make_other_grades_output(answer: int = 2) -> VocabularyComplexityOutput:
+    """Build mock complexity output from a convenience integer rubric level (1–4)."""
+
+    return VocabularyComplexityOutput(
+        tier_2_words="sat",
+        tier_3_words="none",
+        archaic_words="none",
+        other_complex_words="none",
+        complexity_score=_OTHER_GRADES_SCORE_MAP[answer],
+        reasoning="Most words are simple and familiar.",
+    )
+
+
+def _patch_steps(evaluator, bk_return, vocab_return):
+    """Patch ``execute_prompt_chain_step`` to replay two canned step results.
+
+    Both steps share that method, so results come back in call order: bk, then vocab.
+    """
+    canned_results = [bk_return, vocab_return]
+    return patch.object(
+        evaluator,
+        "execute_prompt_chain_step",
+        side_effect=canned_results,
+    )
+
+
+# ── Grade 3–4 path ────────────────────────────────────────────────────────────
+
+
+class TestVocabularyEvaluatorGrades34:
+    def test_evaluate_grade_3_returns_result(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()):
+            result = evaluator.evaluate(inp)
+
+        assert result.answer.score == "moderately_complex"
+        assert result.answer.label == "Moderately complex"
+        assert result.metadata.status == Status.succeeded
+        assert "tier_2_words" in result.explanation.details
+
+    def test_evaluate_grade_4_returns_result(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=4)
+        with _patch_steps(
+            evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output("very_complex")
+        ):
+            result = evaluator.evaluate(inp)
+
+        assert result.answer.score == "very_complex"
+
+    def test_grades34_score_with_spaces_is_normalised(self):
+        """The grades 3–4 prompt may return "slightly complex" (spaces); normalise to underscores."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
+        # The evaluator calls .lower().replace(" ", "_") before from_score(),
+        # so we feed a space-separated label and assert it survives the path.
+        output = _make_grades34_output("slightly complex")
+
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, output):
+            result = evaluator.evaluate(inp)
+
+        assert result.answer.score == "slightly_complex"
+
+    def test_evaluate_grades34_explanation_has_word_breakdown(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()):
+            result = evaluator.evaluate(inp)
+
+        details = result.explanation.details
+        assert "tier_2_words" in details
+        assert "tier_3_words" in details
+        assert "archaic_words" in details
+        assert "other_complex_words" in details
+
+
+# ── Grades 5–12 path ──────────────────────────────────────────────────────────
+
+
+class TestVocabularyEvaluatorOtherGrades:
+    @pytest.mark.parametrize(
+        "score_label, expected_score",
+        [
+            (1, "slightly_complex"),
+            (2, "moderately_complex"),
+            (3, "very_complex"),
+            (4, "exceedingly_complex"),
+        ],
+    )
+    def test_all_complexity_scores_map_correctly(self, score_label, expected_score):
+        """Each complexity label (passed as convenience int 1–4) maps to the right score."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=7)
+        with _patch_steps(
+            evaluator,
+            _MOCK_BACKGROUND_KNOWLEDGE,
+            _make_other_grades_output(score_label),
+        ):
+            result = evaluator.evaluate(inp)
+
+        assert result.answer.score == expected_score
+
+    def test_evaluate_grade_12_returns_result(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=12)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(1)):
+            result = evaluator.evaluate(inp)
+
+        assert result.metadata.status == Status.succeeded
+        assert result.answer.score == "slightly_complex"
+
+    def test_other_grades_explanation_includes_word_breakdown(self):
+        """Grades 5–12 mirror the notebook: word lists live in ``explanation.details``."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=8)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
+            result = evaluator.evaluate(inp)
+
+        details = result.explanation.details
+        assert details["tier_2_words"] == "sat"
+        assert details["tier_3_words"] == "none"
+        assert details["archaic_words"] == "none"
+        assert details["other_complex_words"] == "none"
+
+    def test_other_grades_legacy_integer_answer_normalizes_like_notebook(self):
+        """``normalize_complexity_output`` then validate (same order as the notebook)."""
+        parsed = VocabularyComplexityOutput.model_validate(
+            normalize_complexity_output(
+                {"answer": 3, "reasoning": "Dense technical terms throughout."}
+            )
+        )
+        assert parsed.complexity_score == "Very Complex"
+        assert parsed.tier_2_words == ""
+        assert parsed.tier_3_words == ""
+
+    def test_other_grades_legacy_string_digit_answer(self):
+        parsed = VocabularyComplexityOutput.model_validate(
+            normalize_complexity_output({"answer": "2", "reasoning": "Accessible vocabulary."})
+        )
+        assert parsed.complexity_score == "Moderately Complex"
+
+    def test_other_grades_unexpected_digit_answer_raises(self):
+        """Out-of-range rubric digit normalizes to a bare string; ``from_score`` rejects it."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=7)
+        # Same ``complexity_score`` as ``normalize_complexity_output({"answer": 9, ...})``.
+        unexpected = VocabularyComplexityOutput(
+            tier_2_words="sat",
+            tier_3_words="none",
+            archaic_words="none",
+            other_complex_words="none",
+            complexity_score="9",
+            reasoning="Model returned an invalid rubric code.",
+        )
+        with (
+            _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, unexpected),
+            pytest.raises(ValueError, match=r"Unknown text complexity score: '9'"),
+        ):
+            evaluator.evaluate(inp)
+
+
+class TestNormalizeComplexityOutput:
+    def test_preserves_complexity_score_when_answer_absent(self):
+        row = normalize_complexity_output(
+            {
+                "tier_2_words": "a",
+                "tier_3_words": "b",
+                "archaic_words": "c",
+                "other_complex_words": "d",
+                "complexity_score": "slightly complex",
+                "reasoning": "r",
+            }
+        )
+        assert row["complexity_score"] == "slightly complex"
+
+    def test_answer_overwrites_or_sets_complexity_score(self):
+        row = normalize_complexity_output({"answer": 1, "reasoning": "x"})
+        assert row["complexity_score"] == "Slightly Complex"
+
+    def test_unexpected_digit_answer_falls_back_to_string_value(self):
+        """Rubric is 1–4; other digits (int or string) become ``complexity_score == str(value)``."""
+        assert (
+            normalize_complexity_output({"answer": 5, "reasoning": "x"})["complexity_score"] == "5"
+        )
+        assert (
+            normalize_complexity_output({"answer": "9", "reasoning": "x"})["complexity_score"]
+            == "9"
+        )
+
+
+# ── Grade validation via framework ────────────────────────────────────────────
+
+
+class TestVocabularyEvaluationInputValidation:
+    def test_allowed_grades_set_from_toml(self):
+        """VocabularyEvaluationInput picks up allowed_grades from the TOML spec."""
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+        assert set(inp.grade.spec.allowed_grades) == frozenset(range(3, 13))
+
+    @pytest.mark.parametrize("unsupported_grade", [0, 1, 2])
+    def test_unsupported_grade_raises_via_framework(self, unsupported_grade):
+        """BaseEvaluator.evaluate() calls input.validate(), which catches the bad grade."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=unsupported_grade)
+        # The base evaluator catches the ValidationError, sets status=failed, then re-raises.
+        with pytest.raises(ValidationError):
+            evaluator.evaluate(inp)
+
+    def test_unsupported_grade_sets_status_failed(self):
+        """Grade 2 raises ``ValidationError``; NOTE(review): status=failed is not asserted."""
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=2)
+        with pytest.raises(ValidationError):
+            evaluator.evaluate(inp)
+
+
+# ── Metadata and settings ─────────────────────────────────────────────────────
+
+
+class TestVocabularyEvaluatorMetadata:
+    def test_evaluator_metadata(self):
+        evaluator = VocabularyEvaluator(create_config_no_telemetry())
+        assert evaluator.metadata.id == "vocabulary"
+        assert evaluator.metadata.version == "0.1"
+
+    def test_default_settings_has_all_prompt_steps(self):
+        evaluator = VocabularyEvaluator(create_config_no_telemetry())
+        settings = evaluator.default_evaluation_settings
+        assert settings.prompt_settings_step_background_knowledge is not None
+        assert settings.prompt_settings_step_vocab_grades_3_4 is not None
+        assert settings.prompt_settings_step_vocab_other_grades is not None
+
+    def test_evaluate_succeeds_and_records_metadata(self):
+        config = create_config_no_telemetry()
+        evaluator = VocabularyEvaluator(config)
+        inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+        with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
+            result = evaluator.evaluate(inp)
+
+        assert result.metadata.status == Status.succeeded
+        assert result.metadata.evaluator_metadata.id == "vocabulary"
+
+
+class TestVocabularyEvaluationInputConfiguration:
+    """Tests that VocabularyEvaluationInput fails loudly on bad configuration.
+
+    These tests patch ``VocabularyEvaluationInput._input_settings`` directly
+    because the ClassVar is bound at class-definition time.  Patching the
+    module-level ``_INPUT_SETTINGS`` name would rebind the module variable but
+    leave the class variable pointing at the original dict.
+    """
+
+    def test_missing_text_spec_raises_configuration_error(self, monkeypatch):
+        """If 'text' is absent from _input_settings, ConfigurationError is raised immediately."""
+        monkeypatch.setattr(VocabularyEvaluationInput, "_input_settings", {})
+        with pytest.raises(ConfigurationError, match="'text'"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+
+    def test_missing_grade_spec_raises_configuration_error(self, monkeypatch):
+        """If 'grade' is absent from _input_settings, ConfigurationError is raised."""
+        from learning_commons_evaluators.schemas.input_specs import TextInputSpec
+
+        monkeypatch.setattr(
+            VocabularyEvaluationInput,
+            "_input_settings",
+            {"text": TextInputSpec(name="text")},
+        )
+        with pytest.raises(ConfigurationError, match="'grade'"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+
+    def test_wrong_text_spec_type_raises_configuration_error(self, monkeypatch):
+        """If the 'text' spec has the wrong type, ConfigurationError names the type mismatch."""
+        from learning_commons_evaluators.schemas.input_specs import GradeInputSpec
+
+        monkeypatch.setattr(
+            VocabularyEvaluationInput,
+            "_input_settings",
+            {"text": GradeInputSpec(name="text")},
+        )
+        with pytest.raises(ConfigurationError, match="TextInputSpec"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
+
+    def test_wrong_grade_spec_type_raises_configuration_error(self, monkeypatch):
+        """If the 'grade' spec has the wrong type, ConfigurationError names the mismatch."""
+        from learning_commons_evaluators.schemas.input_specs import TextInputSpec
+
+        monkeypatch.setattr(
+            VocabularyEvaluationInput,
+            "_input_settings",
+            {
+                "text": TextInputSpec(name="text"),
+                "grade": TextInputSpec(name="grade"),  # wrong type
+            },
+        )
+        with pytest.raises(ConfigurationError, match="GradeInputSpec"):
+            VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
diff --git a/sdks/python/tests/schemas/test_evaluator_schemas.py b/sdks/python/tests/schemas/test_evaluator_schemas.py
index 5ba6a4a..433fa66 100644
--- a/sdks/python/tests/schemas/test_evaluator_schemas.py
+++ b/sdks/python/tests/schemas/test_evaluator_schemas.py
@@ -6,6 +6,8 @@
 instances so nothing depends on a real evaluator's TOML or class names.
 """
 
+from __future__ import annotations
+
 from typing import ClassVar
 
 import pytest
@@ -57,7 +59,13 @@ class _ExampleEvaluationInput(EvaluationInput):
     text: TextInputField
     grade: GradeInputField
 
-    def __init__(self, *, text: str, grade: int, **kwargs):
+    def __init__(
+        self,
+        *,
+        text: str | TextInputField,
+        grade: int | GradeInputField,
+        **kwargs,
+    ):
         super().__init__(text=text, grade=grade, **kwargs)
 
 
@@ -173,7 +181,7 @@ def test_raw_values_are_wrapped_into_input_fields(self):
     def test_already_constructed_input_field_is_not_rewrapped(self):
         """Passing a fully-constructed InputField instance bypasses construction."""
         pre_built = TextInputField(spec=self._COERCE_TEXT_SPEC, value="pre-built")
-        inp = _ExampleEvaluationInput(text=pre_built, grade=3)  # type: ignore[arg-type]
+        inp = _ExampleEvaluationInput(text=pre_built, grade=3)
         assert inp.text is pre_built  # same object, not a copy
 
     def test_non_inputfield_field_is_left_unchanged(self):
diff --git a/sdks/python/tests/settings/test_load_settings.py b/sdks/python/tests/settings/test_load_settings.py
index dae9dff..25dc88b 100644
--- a/sdks/python/tests/settings/test_load_settings.py
+++ b/sdks/python/tests/settings/test_load_settings.py
@@ -3,8 +3,10 @@
 from __future__ import annotations
 
 import importlib.resources
+import os
 import textwrap
 from pathlib import Path
+from unittest.mock import patch
 
 import pytest
 
@@ -13,6 +15,9 @@
     LlmProvider,
     PromptSettings,
 )
+from learning_commons_evaluators.schemas.conventionality import (
+    ConventionalityEvaluationSettings,
+)
 from learning_commons_evaluators.schemas.errors import ConfigurationError
 from learning_commons_evaluators.schemas.input_specs import TextInputSpec
 from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity
@@ -226,27 +231,121 @@ def test_load_evaluator_settings_rejects_non_string_prompt_value(tmp_path: Path)
 
 
 def test_load_evaluator_settings_invalid_prompt_settings_block(tmp_path: Path) -> None:
+    """Unknown ``[[evaluator_metadata.inputs]].type`` must raise a clear configuration error."""
     path = tmp_path / "bad_prompt.toml"
     path.write_text(
         textwrap.dedent(
             """
             [evaluator_metadata]
-            id = "e"
-            version = "1.0"
-            name = "N"
-            description = "D"
-            maturity = "ga"
+            id = "x"
+            version = "0.1"
+            name = "X"
+            description = "X"
+            maturity = "beta"
 
-            [evaluation_settings]
-            marker = 0
+            [[evaluator_metadata.inputs]]
+            name = "mystery"
+            type = "UnknownInputField"
 
-            [evaluation_settings.prompt_settings_main]
-            provider_type = "google"
-            model = "m"
+            [evaluation_settings.prompt_settings_step_conventionality_evaluation]
+            provider_type = "GOOGLE"
+            model = "gemini-2.0-flash"
+            temperature = 0
             """
         ).strip()
     )
-    with pytest.raises(
-        ConfigurationError, match="Invalid \\[evaluation_settings.prompt_settings_main\\]"
-    ):
-        load_evaluator_settings(path, _WithPromptSettings)
+    with pytest.raises(ConfigurationError, match="UnknownInputField"):
+        load_evaluator_settings(path, ConventionalityEvaluationSettings)
+
+
+# ---------------------------------------------------------------------------
+# shared_settings_root
+# ---------------------------------------------------------------------------
+
+
+class TestSharedSettingsRoot:
+    def test_env_var_overrides_bundled_path(self, tmp_path: Path) -> None:
+        with patch.dict(os.environ, {"EVALUATORS_SETTINGS_DIR": str(tmp_path)}):
+            assert shared_settings_root() == tmp_path
+
+    def test_importlib_fallback_resolves_to_existing_directory(self) -> None:
+        env = {k: v for k, v in os.environ.items() if k != "EVALUATORS_SETTINGS_DIR"}
+        with patch.dict(os.environ, env, clear=True):
+            root = shared_settings_root()
+        assert root.is_dir(), f"shared_settings_root() resolved to non-directory: {root}"
+
+    def test_bundled_contract_tests_are_present(self) -> None:
+        """The bundled package must contain contracts.toml for each evaluator.
+
+        The evaluator settings TOML is intentionally NOT bundled (evaluators use _generated_*_settings.py).
+        contracts.toml IS bundled so contract tests work against an installed package.
+        """
+        env = {k: v for k, v in os.environ.items() if k != "EVALUATORS_SETTINGS_DIR"}
+        with patch.dict(os.environ, env, clear=True):
+            root = shared_settings_root()
+        for evaluator in ("conventionality", "vocabulary"):
+            assert (root / evaluator / "contracts.toml").is_file(), (
+                f"Bundled {evaluator}/contracts.toml not found — "
+                f"run 'python scripts/generate_settings.py --sync'"
+            )
+
+
+# ---------------------------------------------------------------------------
+# Settings bundle sync guard
+# ---------------------------------------------------------------------------
+#
+# What is and isn't bundled in the package:
+#
+#   BUNDLED:   contracts.toml — needed so contract tests run against an
+#              installed package without access to sdks/settings/.
+#
+#   NOT BUNDLED: settings.toml — not needed at runtime; evaluators import
+#              from _generated_*_settings.py (pre-built at generation time).
+#              The canonical copy lives in sdks/settings/ and is the input to
+#              `make generate-settings`.
+#
+# If a sync test fails, run from the repo root:
+#   python scripts/generate_settings.py --sync    — copies contracts.toml canonical → bundled
+#   python scripts/generate_settings.py --check   — verifies generated .py files are up to date
+
+
+def _bundled_settings_root() -> Path:
+    """Return the importlib.resources path for bundled settings, bypassing EVALUATORS_SETTINGS_DIR."""
+    pkg = importlib.resources.files("learning_commons_evaluators.settings")
+    return Path(str(pkg))
+
+
+def _canonical_settings_root() -> Path | None:
+    """Return sdks/settings/ relative to this file, or None if not in the monorepo."""
+    # This file lives at: sdks/python/tests/settings/test_load_settings.py
+    # parents[3] = sdks/
+    candidate = Path(__file__).parents[3] / "settings"
+    return candidate if candidate.is_dir() else None
+
+
+@pytest.mark.parametrize("evaluator", ["conventionality", "vocabulary"])
+def test_bundled_contract_tests_match_canonical(evaluator: str) -> None:
+    """Bundled contracts.toml must be byte-for-byte identical to sdks/settings/.
+
+    Skipped when running outside the monorepo (e.g., from an installed package).
+    If this fails, run ``python scripts/generate_settings.py --sync`` from the repo root.
+    """
+    canonical_root = _canonical_settings_root()
+    if canonical_root is None:
+        pytest.skip("sdks/settings/ not found — running outside the monorepo")
+
+    canonical = canonical_root / evaluator / "contracts.toml"
+    if not canonical.exists():
+        pytest.skip(f"Canonical file not found: {canonical}")
+
+    bundled = _bundled_settings_root() / evaluator / "contracts.toml"
+    assert bundled.exists(), (
+        f"Bundled {evaluator}/contracts.toml not found.\n"
+        f"Run: python scripts/generate_settings.py --sync"
+    )
+    assert canonical.read_bytes() == bundled.read_bytes(), (
+        f"{evaluator}/contracts.toml is out of sync.\n"
+        f"  canonical: {canonical}\n"
+        f"  bundled:   {bundled}\n"
+        f"Fix: python scripts/generate_settings.py --sync"
+    )
diff --git a/sdks/settings/conventionality/contracts.toml b/sdks/settings/conventionality/contracts.toml
new file mode 100644
index 0000000..47bac4c
--- /dev/null
+++ b/sdks/settings/conventionality/contracts.toml
@@ -0,0 +1,130 @@
+# Contract tests for the Conventionality evaluator.
+#
+# This file captures the exact LLM interaction for known inputs so that
+# tests can verify the SDK sends the same request as the notebook and
+# produces the same result from the same response.
+#
+# HOW TO REGENERATE
+# -----------------
+# Run the notebook "evals/Final ship - Conventionality Experimental Evaluator.ipynb"
+# from the repo root with GOOGLE_API_KEY set. It loads prompts from the evaluator
+# settings TOML (same as the SDK) and prints a TOML block — paste into this file
+# and run `make sync-settings` from sdks/python/ to update the bundled copy.
+#
+# WHAT BELONGS HERE
+# -----------------
+# Only data that originates from external sources:
+#   - input:           what the caller provides to the evaluator
+#   - prompt_steps:    what the SDK sends to the LLM (fully formatted messages,
+#                      model, temperature) and the raw response the LLM returned
+#   - expected_result: the structured output the LLM returned (notebook format)
+#
+# Deterministic values computed by library code (e.g. FK score from textstat,
+# format_instructions from LangChain's JsonOutputParser) are NOT stored as
+# standalone fields — they appear inside the fully formatted prompts, where they
+# are implicitly verified by the prompt-request assertion in the contract test.
+
+[cases.turnip]
+description = "Grade 4 classroom narrative (Henry and the turnip)"
+
+[cases.turnip.input]
+text = '''
+
+"Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip."
+'''
+grade = 4
+
+[cases.turnip.prompt_steps.main]
+model = "gemini-3-flash-preview"
+temperature = 0.0
+system_prompt = '''Role
+You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level.
+
+Objective
+Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade.
+
+Complexity Levels
+- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level.
+- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging.
+- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear.
+- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices.
+
+Essential Evaluation Rules
+1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex."
+
+2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group.
+
+3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two.
+
+4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex."
+
+5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing.
+
+6. Sustained vs. Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge.
+
+7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex.
+
+8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students.
+
+9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary.
+
+Input Format
+You will receive:
+- text: The passage to evaluate.
+- grade: The target student grade level.
+- fk_score: The Flesch-Kincaid readability score.
+
+Output Format
+Provide a JSON object containing ONLY the following keys:
+- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'.
+- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural).
+- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text.
+- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade.
+- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom.
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"conventionality_features": {"description": "List of the specific language features driving the complexity (e.g., idioms, metaphors, implied meaning) with direct quotes from the text.", "items": {"type": "string"}, "title": "Conventionality Features", "type": "array"}, "grade_context": {"description": "How the conventionality demands compare to general expectations for the provided target grade.", "title": "Grade Context", "type": "string"}, "instructional_insights": {"description": "Actionable pedagogical suggestions for scaffolding the unconventional language features in the classroom.", "title": "Instructional Insights", "type": "string"}, "complexity_score": {"description": "The conventionality complexity level of the text", "enum": ["slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex"], "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "A synthesis of why the text fits the chosen rubric level.", "title": "Reasoning", "type": "string"}}, "required": ["conventionality_features", "grade_context", "instructional_insights", "complexity_score", "reasoning"]}
+```
+'''
+user_prompt = '''Analyze:
+Text: "Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip."
+Grade: 4
+FK Score: 3.75
+'''
+llm_response = '''
+```json
+{
+  "complexity_score": "slightly_complex",
+  "reasoning": "The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.",
+  "conventionality_features": [
+    "literal narrative: 'Henry took his slate and went out.'",
+    "concrete actions: 'he wrote the word turnip on his slate.'",
+    "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'",
+    "straightforward meaning: 'The teacher was surprised and pleased.'"
+  ],
+  "grade_context": "For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.",
+  "instructional_insights": "Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology."
+}
+```'''
+
+[cases.turnip.expected_result]
+complexity_score = '''
+slightly_complex'''
+reasoning = '''
+The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.'''
+conventionality_features = ["literal narrative: 'Henry took his slate and went out.'", "concrete actions: 'he wrote the word turnip on his slate.'", "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", "straightforward meaning: 'The teacher was surprised and pleased.'"]
+grade_context = '''
+For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.'''
+instructional_insights = '''
+Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology.'''
diff --git a/sdks/settings/vocabulary/contracts.toml b/sdks/settings/vocabulary/contracts.toml
new file mode 100644
index 0000000..8420e01
--- /dev/null
+++ b/sdks/settings/vocabulary/contracts.toml
@@ -0,0 +1,641 @@
+# Contract tests for the Vocabulary evaluator.
+#
+# This file captures the exact LLM interactions for known inputs so that tests
+# can verify the SDK sends the same requests as the notebook and produces the
+# same results from the same responses.
+#
+# HOW TO REGENERATE
+# -----------------
+# Run the notebook "evals/vocabulary_evaluator.ipynb" with valid API keys.
+# The final cells output a TOML block that you paste in here to replace the
+# placeholder values below.
+#
+# WHAT BELONGS HERE
+# -----------------
+# Only data that originates from external sources:
+#   - input:            what the caller provides to the evaluator
+#   - prompt_steps:     what the SDK sends to the LLM (fully formatted messages,
+#                       model, temperature) and the raw response the LLM returned.
+#                       Steps must be listed in call order:
+#                         1. background_knowledge  (plain text response)
+#                         2. vocab_complexity       (JSON response)
+#   - expected_result:  the structured output parsed from the vocab_complexity
+#                       llm_response (notebook format), before SDK result mapping.
+#
+# Deterministic values computed by library code (FK score, format_instructions)
+# are NOT stored as standalone fields — they appear inside the fully formatted
+# prompts, where they are implicitly verified by the prompt-request assertion.
+#
+# TWO CASES ARE INCLUDED
+# -----------------------
+# marco_polo_grade3  — grade 3, uses the grades 3–4 path (Gemini + rubric labels
+#                      + word-level breakdown in expected_result).
+# hurricanes_grade7  — grade 7, uses the grades 5–12 path (GPT; expected_result
+#                      holds ``complexity_score`` as a string plus word-list fields,
+#                      matching the notebook ``Output`` shape. The SDK also accepts
+#                      legacy integer ``answer`` in raw JSON like ``normalize_complexity_output``.)
+
+# ===========================================================================
+# Case 1: marco_polo_grade3  (grades 3–4 path)
+# ===========================================================================
+
+[cases.marco_polo_grade3]
+description = "Marco Polo passage, grade 3 (grades 3-4 Gemini path)"
+
+[cases.marco_polo_grade3.input]
+text = '''
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.'''
+grade = 3
+
+[cases.marco_polo_grade3.prompt_steps.background_knowledge]
+model = "gpt-4o-2024-11-20"
+temperature = 0.0
+system_prompt = ""
+user_prompt = '''
+
+Review the following text, which is an educational text written for students in the following grade band: 3.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+[END TEXT]
+'''
+llm_response = '''
+Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.'''
+
+[cases.marco_polo_grade3.prompt_steps.vocab_complexity]
+model = "gemini-2.5-pro"
+temperature = 0.0
+system_prompt = '''
+
+You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.
+
+You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+
+**Resource 1: Qualitative Text Complexity rubric (SAP)**
+1.  **Level 1: Slightly complex**
+    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!
+2.  **Level 2: Moderately complex**
+    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK
+    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.
+3.  **Level 3: Very complex**
+    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.
+4.  **Level 4: Exceedingly complex**
+    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.
+    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.
+
+**Resource 2: Flesch-Kincaid Grade Level**
+Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.
+*   grade 2-3: 1.98-5.34
+*   grade 4-5: 4.51-7.73
+*   grade 6-8: 6.51-10.34
+*   grade 9-10: 8.32-12.12
+*   grade 11-College: 10.34-14.2
+
+**Guidelines for Interpretation and Reasoning**
+
+Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:
+
+1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.
+2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.
+    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")?
+    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?
+    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.
+3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).
+4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.
+5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.
+    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.
+    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.
+6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).
+
+**Final Analysis Format**
+
+Provide these information as your final analysis:
+1.  **Complex vocabulary:**
+    *   Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings.
+    *   Tier 3 words: Overly academic or domain-specific words.
+    *   Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict." The use of “retire” to mean withdrawing to a private place is an archaic use.
+    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).
+2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex
+3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.
+'''
+user_prompt = '''
+
+Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.
+
+- It is intended for grade 3.
+
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.
+
+- Text Flesch-Kincaid grade level: 6.59
+
+- Text to evaluate: [BEGIN TEXT]
+Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all.
+Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk.
+His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.
+[END TEXT]
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]}
+```
+'''
+llm_response = '''
+```json
+{
+  "tier_2_words": "served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers",
+  "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain",
+  "archaic_words": "None",
+  "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East",
+  "complexity_score": "very complex",
+  "reasoning": "The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level."
+}
+```'''
+
+[cases.marco_polo_grade3.expected_result]
+tier_2_words = '''
+served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers'''
+tier_3_words = '''
+Mongol Dynasty, Kublai Khan, ivory, jade, porcelain'''
+archaic_words = '''
+None'''
+other_complex_words = '''
+Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East'''
+complexity_score = '''
+very complex'''
+reasoning = '''
+The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.'''
+
+
+# ===========================================================================
+# Case 2: hurricanes_grade7  (grades 5–12 path)
+# ===========================================================================
+
+[cases.hurricanes_grade7]
+description = "Hurricane formation passage, grade 7 (grades 5-12 GPT path)"
+
+[cases.hurricanes_grade7.input]
+text = '''
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.'''
+grade = 7
+
+[cases.hurricanes_grade7.prompt_steps.background_knowledge]
+model = "gpt-4o-2024-11-20"
+temperature = 0.0
+system_prompt = ""
+user_prompt = '''
+
+Review the following text, which is an educational text written for students in the following grade band: 7.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+[END TEXT]
+'''
+llm_response = '''
+Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.'''
+
+[cases.hurricanes_grade7.prompt_steps.vocab_complexity]
+model = "gpt-4.1"
+temperature = 0.0
+system_prompt = '''
+
+You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.
+
+You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.
+
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.)
+IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary.
+
+Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).
+'''
+user_prompt = '''
+
+Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.
+
+[BEGIN ANNOTATION GUIDE AND RUBRIC]
+Instructions
+For the following task, please assume that:
+    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
+    - The student is moving through a common progression of topics (detailed here).
+    - The student is fluent in speaking English.
+    - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
+    - The student will use this material for independent reading/work, without direct instruction.
+    - The text is reasonable for the given grade level.
+
+Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
+
+Please do be sure to consider:
+- all of the different types of vocabulary (listed below)
+- the overall proportion of complex words in the text - including repeated complex words.
+- the resulting holistic complexity of the vocabulary (described in the Summary section below).
+
+Level 1:
+Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+
+Level 2:
+Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+
+Level 3:
+Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+
+Level 4:
+Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
+
+And here are some relevant definitions:
+    - Conversational: Everyday language.
+    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
+    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
+        - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
+        - Note:
+            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
+            - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
+                - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word.
+        - Unfamiliar proper nouns:
+            - A person's name, even if unfamiliar, generally does not add to complexity.
+            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
+
+- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
+- Overly-academic: Words that are excessively formal, complex, or specialized.
+    - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
+- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
+    - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
+        - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
+
+
+Examples
+The student is on-grade-level:
+- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
+
+Figurative Language
+- Kicked the bucket.
+- The pen is mightier than the sword.
+- The classroom was a zoo.
+- He ran faster than the speed of light.
+[END ANNOTATION GUIDE AND RUBRIC]
+
+Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
+[BEGIN EXAMPLES]
+
+*** EXAMPLE 1 ***
+The following text was intended for grade level 11 and received a complexity level of 1.
+
+Here is the background knowledge assumption for that text: N/A
+
+Here is the text:
+// START TEXT //
+"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
+
+He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
+
+I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
+
+I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
+
+Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
+
+It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
+// END REASONING //
+*** EXAMPLE 2 ***
+The following text was intended for grade level 5 and received a complexity level of 2.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
+
+Here is the text:
+// START TEXT //
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
+// END REASONING //
+
+*** EXAMPLE 3 ***
+The following text was intended for grade level 6 and received a complexity level of 3.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
+
+Here is the text:
+// START TEXT //
+Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
+
+Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
+
+Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING //
+[END EXAMPLES]
+
+Below is the text you need to evaluate. It is intended for grade 7.
+
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.
+
+[BEGIN TEXT]
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+[END TEXT]
+
+In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
+
+STRICT OUTPUT FORMAT:
+- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators.
+- Do not wrap the JSON in Markdown or code fences (no ``` or ```json).
+- Do not prepend or append any text (e.g., do not write "Here is the JSON:").
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments.
+
+The output should be formatted as a JSON instance that conforms to the JSON schema below.
+
+As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
+
+Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output):
+```
+{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]}
+```
+'''
+llm_response = '''
+{"tier_2_words": "roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist", "archaic_words": "", "other_complex_words": "ashore", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2."}'''
+
+[cases.hurricanes_grade7.expected_result]
+tier_2_words = '''
+roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing'''
+tier_3_words = '''
+hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist'''
+archaic_words = ""
+other_complex_words = '''
+ashore'''
+complexity_score = '''
+slightly complex'''
+reasoning = '''
+The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.'''
diff --git a/sdks/settings/vocabulary/settings.toml b/sdks/settings/vocabulary/settings.toml
new file mode 100644
index 0000000..295444f
--- /dev/null
+++ b/sdks/settings/vocabulary/settings.toml
@@ -0,0 +1,410 @@
+[evaluator_metadata]
+id = "vocabulary"
+version = "0.1"
+name = "Vocabulary"
+description = "Vocabulary Complexity Evaluator"
+maturity = "alpha"
+
+[[evaluator_metadata.inputs]]
+name = "text"
+type = "TextInputField"
+description = "The text to evaluate for vocabulary complexity."
+
+[[evaluator_metadata.inputs]]
+name = "grade"
+type = "GradeInputField"
+description = "The grade level of the text (3–12)."
+allowed_grades = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+
+# ── Prompt templates ───────────────────────────────────────────────────────────
+#
+# Three groups of prompt keys are used by the evaluator:
+#   background_knowledge_prompt  – step 1 (all grades): produces a plain-text
+#                                  student background knowledge assumption.
+#   vocab_grades_3_4_system_prompt / vocab_grades_3_4_user_prompt  – step 2 for
+#                                  grades 3–4: rubric-based, returns a string label
+#                                  and word-level breakdown.
+#   vocab_other_grades_system_prompt / vocab_other_grades_user_prompt  – step 2
+#                                  for grades 5–12: annotation-guide-based, returns
+#                                  a string complexity_score and reasoning.
+#
+# These prompts are the source of truth for what the evaluator sends to the LLM.
+# They must stay in sync with the corresponding prompts in evals/prompts/vocab_prompts.py:
+# if you update a prompt here, update that file too (or vice versa).
+
+[prompts]
+background_knowledge_prompt = """
+Review the following text, which is an educational text written for students in the following grade band: {grade}.
+
+Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with.
+
+Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.).
+
+Here's an example:
+[START EXAMPLE]
+Grade Band: 11th
+Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not
+learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was
+not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to
+know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me,
+are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it
+is the chief end of man here to “glorify God and enjoy him forever.”
+
+Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy.
+[END EXAMPLE]
+
+You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else.
+
+You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have:
+[BEGIN TOPICS]
+[
+    K: [
+        "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees",
+        "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?",
+        "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?",
+        "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together",
+        "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?",
+        "How Has Our World Changed?", "Why Do People Have Jobs?"
+    ],
+    1: [
+        "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds",
+        "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?",
+        "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?",
+        "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World",
+        "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?",
+        "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?"
+    ],
+    2: [
+        "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators",
+        "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?",
+        "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?",
+        "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are",
+        "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?",
+        "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?"
+    ],
+    "3": [
+        "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics",
+        "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist",
+        "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate",
+        "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?",
+        "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?",
+        "How Do People in a Community Meet Their Wants and Needs?"
+    ],
+    4: [
+        "Poetry", "Animal Defense Mechanisms", "The American Revolution",
+        "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)",
+        "A Great Heart: What does it mean to have a great heart, literally and figuratively?",
+        "Extreme Settings: How does a challenging setting or physical environment change a person?",
+        "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light",
+        "Earth's Features", "Waves, Energy, and Information", "Regions of the United States",
+        "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?",
+        "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?",
+        "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?"
+    ],
+    5: [
+        "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change",
+        "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?",
+        "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?",
+        "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky",
+        "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation",
+        "How Were the Lives of Native Peoples Influenced by Where They Lived?",
+        "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?",
+        "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?",
+        "How Does the Constitution Help Us Understand What It Means to Be an American?",
+        "What Do the Early Years of the United States Reveal About the Character of the Nation?",
+        "What Was the Effect of the Civil War on U.S. Society?"
+    ],
+    6: [
+        "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools",
+        "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?",
+        "A Hero’s Journey: What is the significance and power of the hero’s journey?",
+        "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?",
+        "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?",
+        "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy",
+        "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate",
+        "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)",
+        "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution",
+        "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)",
+        "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)",
+        "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II",
+        "The Cold War", "Civil Rights and American Society", "America Since the 1970s"
+    ],
+    7: [
+        "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution",
+        "Identity in the Middle Ages: How does society both support and limit the development of identity?",
+        "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?",
+        "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering",
+        "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources",
+        "Matter and Energy in Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent",
+        "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan",
+        "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires",
+        "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking",
+        "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World"
+    ],
+    8: [
+        "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment",
+        "The Poetics and Power of Storytelling: What is the power of storytelling?",
+        "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?",
+        "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy",
+        "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun",
+        "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms",
+        "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography",
+        "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses",
+        "Government and the Economy", "The Global Economy"
+    ]
+]
+[END TOPICS]
+
+Here is the text:
+[BEGIN TEXT]
+{text}
+[END TEXT]
+"""
+
+vocab_grades_3_4_system_prompt = """
+You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level.
+
+You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric.
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+
+**Resource 1: Qualitative Text Complexity rubric (SAP)**
+1.  **Level 1: Slightly complex**
+    *   Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+    *   Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here!
+2.  **Level 2: Moderately complex**
+    *   Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK
+    *   Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur.
+3.  **Level 3: Very complex**
+    *   Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+    *   Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text.
+4.  **Level 4: Exceedingly complex**
+    *   Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading.
+    *   Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret.
+
+**Resource 2: Flesch-Kincaid Grade Level**
+Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text.
+*   grade 2-3: 1.98-5.34
+*   grade 4-5: 4.51-7.73
+*   grade 6-8: 6.51-10.34
+*   grade 9-10: 8.32-12.12
+*   grade 11-College: 10.34-14.2
+
+**Guidelines for Interpretation and Reasoning**
+
+Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment:
+
+1.  **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`.
+2.  **Contextual Scaffolding:** Assess how the text supports new vocabulary.
+    *   Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")?
+    *   Is the surrounding language simple and conversational, making the meaning of new words easier to infer?
+    *   Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`.
+3.  **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`).
+4.  **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once.
+5.  **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`.
+    *   **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort.
+    *   **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology.
+6.  **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body).
+
+**Final Analysis Format**
+
+Provide the following information as your final analysis:
+1.  **Complex vocabulary:**
+    *   Tier 2 words: Words that are commonly used in academic settings, are more complex than colloquial or everyday language, and often have multiple meanings.
+    *   Tier 3 words: Overly academic or domain-specific words.
+    *   Archaic words: Words, or uses of words, that are not commonly used in modern conversational language. E.g., "The jury retired to deliberate on their verdict." The use of "retire" to mean withdrawing to a private place is an archaic use.
+    *   Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary).
+2.  **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex
+3.  **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above.
+"""
+
+vocab_grades_3_4_user_prompt = """
+Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task.
+
+- It is intended for grade {student_grade_level}.
+
+- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
+
+- Text Flesch-Kincaid grade level: {fk_level}
+
+- Text to evaluate: [BEGIN TEXT]
+{text}
+[END TEXT]
+
+{format_instructions}
+"""
+
+vocab_other_grades_system_prompt = """
+You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text.
+
+You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric.
+
+IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.)
+IMPORTANT: Rely on the supplied rubric and annotation guidelines alone. Do not introduce any new criteria for evaluating the complexity of a text's vocabulary.
+
+Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float).
+"""
+
+vocab_other_grades_user_prompt = """
+Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text.
+
+[BEGIN ANNOTATION GUIDE AND RUBRIC]
+Instructions
+For the following task, please assume that:
+    - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example).
+    - The student is moving through a common progression of topics (detailed here).
+    - The student is fluent in speaking English.
+    - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum.
+    - The student will use this material for independent reading/work, without direct instruction.
+    - The text is reasonable for the given grade level.
+
+Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket").
+
+Please do be sure to consider:
+- all of the different types of vocabulary (listed below)
+- the overall proportion of complex words in the text - including repeated complex words.
+- the resulting holistic complexity of the vocabulary (described in the Summary section below).
+
+Level 1:
+Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0.
+
+Level 2:
+Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+
+Level 3:
+Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic
+
+Level 4:
+Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading
+
+And here are some relevant definitions:
+    - Conversational: Everyday language.
+    - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge.
+    - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way.
+        - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data").
+        - Note:
+            - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar.
+            - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb."
+                - The word "pharaoh" might be unfamiliar or subject-specific, but since it is defined within the text, you can consider it a more familiar word.
+        - Unfamiliar proper nouns:
+            - A person's name, even if unfamiliar, generally does not add to complexity.
+            - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity.
+
+- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content.
+- Overly-academic: Words that are excessively formal, complex, or specialized.
+    - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture"
+- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way.
+    - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict."
+        - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use.
+
+
+Examples
+The student is on-grade-level:
+- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level.  If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar.
+
+Figurative Language
+- Kicked the bucket.
+- The pen is mightier than the sword.
+- The classroom was a zoo.
+- He ran faster than the speed of light.
+[END ANNOTATION GUIDE AND RUBRIC]
+
+Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars:
+[BEGIN EXAMPLES]
+
+*** EXAMPLE 1 ***
+The following text was intended for grade level 11 and received a complexity level of 1.
+
+Here is the background knowledge assumption for that text: N/A
+
+Here is the text:
+// START TEXT //
+"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect.
+
+He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them.
+
+I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish.
+
+I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me.
+
+Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized."
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. 
Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one.
+
+It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader.
+// END REASONING //
+*** EXAMPLE 2 ***
+The following text was intended for grade level 5 and received a complexity level of 2.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone).
+
+Here is the text:
+// START TEXT //
+Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+
+Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm  (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that the text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4.
+// END REASONING //
+
+*** EXAMPLE 3 ***
+The following text was intended for grade level 6 and received a complexity level of 3.
+
+Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology.
+
+Here is the text:
+// START TEXT //
+Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology.
+
+Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions.
+
+Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans.
+// END TEXT //
+
+Here is the reasoning for that complexity level:
+// START REASONING //
+To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating.  I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING //
+[END EXAMPLES]
+
+Below is the text you need to evaluate. It is intended for grade {student_grade_level}.
+
+As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge}
+
+[BEGIN TEXT]
+{text}
+[END TEXT]
+
+In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2").
+
+{format_instructions}
+"""
+
+# ── Evaluation settings ────────────────────────────────────────────────────────
+#
+# Three LLM steps, each with its own provider and model:
+#   background_knowledge  – GPT-4o for the background knowledge generation step
+#   vocab_grades_3_4      – Gemini for the grades 3–4 complexity step
+#   vocab_other_grades    – GPT-4.1 for the grades 5–12 complexity step
+
+[evaluation_settings]
+
+[evaluation_settings.prompt_settings_step_background_knowledge]
+provider_type = "OPENAI"
+model = "gpt-4o-2024-11-20"
+temperature = 0
+
+[evaluation_settings.prompt_settings_step_vocab_grades_3_4]
+provider_type = "GOOGLE"
+model = "gemini-2.5-pro"
+temperature = 0
+
+[evaluation_settings.prompt_settings_step_vocab_other_grades]
+provider_type = "OPENAI"
+model = "gpt-4.1"
+temperature = 0