From 8702d932a48ee5025aa6a7c3f60a952dbb8f690c Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Thu, 30 Apr 2026 05:59:10 -0700 Subject: [PATCH 01/15] feat: python SDK scaffold --- .gitattributes | 7 + .github/workflows/test-sdk-python.yml | 51 +++ sdks/python/.gitignore | 28 ++ sdks/python/Makefile | 51 +++ sdks/python/README.md | 334 ++++++++++++++++++ sdks/python/pyproject.toml | 129 +++++++ .../learning_commons_evaluators/__init__.py | 127 +++++++ .../learning_commons_evaluators/_version.py | 10 + .../src/learning_commons_evaluators/config.py | 25 ++ .../src/learning_commons_evaluators/errors.py | 27 ++ .../evaluators/__init__.py | 15 + .../evaluators/base.py | 263 ++++++++++++++ .../src/learning_commons_evaluators/logger.py | 98 +++++ .../providers/__init__.py | 15 + .../providers/langchain.py | 110 ++++++ .../src/learning_commons_evaluators/py.typed | 0 .../schemas/__init__.py | 69 ++++ .../schemas/common_inputs.py | 83 +++++ .../schemas/config.py | 159 +++++++++ .../schemas/errors.py | 140 ++++++++ .../schemas/evaluator.py | 219 ++++++++++++ .../schemas/input_specs.py | 126 +++++++ .../schemas/metadata.py | 94 +++++ .../schemas/text_complexity.py | 48 +++ .../settings/__init__.py | 9 + .../settings/load_settings.py | 257 ++++++++++++++ sdks/python/tests/__init__.py | 1 + sdks/python/tests/conftest.py | 57 +++ sdks/python/tests/evaluators/__init__.py | 1 + sdks/python/tests/providers/__init__.py | 1 + sdks/python/tests/providers/test_langchain.py | 134 +++++++ sdks/python/tests/schemas/__init__.py | 1 + .../tests/schemas/test_common_inputs.py | 100 ++++++ sdks/python/tests/schemas/test_config.py | 87 +++++ sdks/python/tests/schemas/test_errors.py | 134 +++++++ sdks/python/tests/schemas/test_input_specs.py | 87 +++++ sdks/python/tests/schemas/test_metadata.py | 137 +++++++ .../tests/schemas/test_text_complexity.py | 84 +++++ sdks/python/tests/settings/__init__.py | 1 + sdks/python/tests/test_logger.py | 75 ++++ sdks/python/tests/test_package_imports.py | 
34 ++ 41 files changed, 3428 insertions(+) create mode 100644 .gitattributes create mode 100644 .github/workflows/test-sdk-python.yml create mode 100644 sdks/python/.gitignore create mode 100644 sdks/python/Makefile create mode 100644 sdks/python/README.md create mode 100644 sdks/python/pyproject.toml create mode 100644 sdks/python/src/learning_commons_evaluators/__init__.py create mode 100644 sdks/python/src/learning_commons_evaluators/_version.py create mode 100644 sdks/python/src/learning_commons_evaluators/config.py create mode 100644 sdks/python/src/learning_commons_evaluators/errors.py create mode 100644 sdks/python/src/learning_commons_evaluators/evaluators/__init__.py create mode 100644 sdks/python/src/learning_commons_evaluators/evaluators/base.py create mode 100644 sdks/python/src/learning_commons_evaluators/logger.py create mode 100644 sdks/python/src/learning_commons_evaluators/providers/__init__.py create mode 100644 sdks/python/src/learning_commons_evaluators/providers/langchain.py create mode 100644 sdks/python/src/learning_commons_evaluators/py.typed create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/__init__.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/config.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/errors.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/evaluator.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/input_specs.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/metadata.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/text_complexity.py create mode 100644 sdks/python/src/learning_commons_evaluators/settings/__init__.py create mode 100644 sdks/python/src/learning_commons_evaluators/settings/load_settings.py create mode 100644 sdks/python/tests/__init__.py create mode 100644 
sdks/python/tests/conftest.py create mode 100644 sdks/python/tests/evaluators/__init__.py create mode 100644 sdks/python/tests/providers/__init__.py create mode 100644 sdks/python/tests/providers/test_langchain.py create mode 100644 sdks/python/tests/schemas/__init__.py create mode 100644 sdks/python/tests/schemas/test_common_inputs.py create mode 100644 sdks/python/tests/schemas/test_config.py create mode 100644 sdks/python/tests/schemas/test_errors.py create mode 100644 sdks/python/tests/schemas/test_input_specs.py create mode 100644 sdks/python/tests/schemas/test_metadata.py create mode 100644 sdks/python/tests/schemas/test_text_complexity.py create mode 100644 sdks/python/tests/settings/__init__.py create mode 100644 sdks/python/tests/test_logger.py create mode 100644 sdks/python/tests/test_package_imports.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7609101 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,7 @@ +# Machine-generated / capture-synced paths: GitHub shows "Generated" in PRs +# (diff collapsed by default, expandable; omitted from language stats). 
+# _generated_*.py → make generate-settings +# contracts.toml → notebook/contract capture + make sync-settings (bundled copy) +sdks/python/src/learning_commons_evaluators/settings/_generated_*.py linguist-generated=true +sdks/python/src/learning_commons_evaluators/settings/**/contracts.toml linguist-generated=true +sdks/settings/**/contracts.toml linguist-generated=true diff --git a/.github/workflows/test-sdk-python.yml b/.github/workflows/test-sdk-python.yml new file mode 100644 index 0000000..af35a78 --- /dev/null +++ b/.github/workflows/test-sdk-python.yml @@ -0,0 +1,51 @@ +name: 🐍 Test Python SDK + +on: + push: + branches: + - main + paths: + - "sdks/python/**" + - "scripts/generate_settings.py" + - "sdks/settings/**" + - ".github/workflows/test-sdk-python.yml" + pull_request: + paths: + - "sdks/python/**" + - "scripts/generate_settings.py" + - "sdks/settings/**" + - ".github/workflows/test-sdk-python.yml" + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + verify: + name: Verify (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + defaults: + run: + working-directory: sdks/python + strategy: + matrix: + python-version: ["3.10", "3.12", "3.13"] + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install package and dev tools + run: | + python -m pip install --upgrade pip + pip install -e ".[dev]" + + - name: make verify + run: make verify diff --git a/sdks/python/.gitignore b/sdks/python/.gitignore new file mode 100644 index 0000000..cd4ac98 --- /dev/null +++ b/sdks/python/.gitignore @@ -0,0 +1,28 @@ +# Virtual environments +.venv/ +venv/ +env/ + +# Build / install +build/ +dist/ +*.egg-info/ +*.egg + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# Environment +.env +.env.local + +# IDE +.idea/ 
+.vscode/ +*.swp +*~ + +# OS +.DS_Store diff --git a/sdks/python/Makefile b/sdks/python/Makefile new file mode 100644 index 0000000..bf97dfe --- /dev/null +++ b/sdks/python/Makefile @@ -0,0 +1,51 @@ +VENV := ../../.venv +PYTHON := $(VENV)/bin/python +PYTEST := $(VENV)/bin/pytest +RUFF := $(VENV)/bin/ruff +MYPY := $(VENV)/bin/mypy + +.PHONY: help test unit-test lint format format-check typecheck pip-check verify coverage + +help: + @echo "Usage: make " + @echo "" + @echo " lint Ruff linter (src, tests)" + @echo " format Apply Ruff formatter" + @echo " format-check Fail if Ruff would reformat any file" + @echo " typecheck Mypy on src package + tests" + @echo " pip-check pip dependency consistency check" + @echo " verify lint + format-check + typecheck + pip-check + unit-test" + @echo " coverage Unit tests with coverage report (requires dev install)" + @echo "" + @echo " test Run the full test suite" + @echo " unit-test Run unit tests only" + +# ── Static checks (Ruff + Mypy) ─────────────────────────────────────────────── + +lint: + $(RUFF) check src tests + +format: + $(RUFF) format src tests + +format-check: + $(RUFF) format --check src tests + +typecheck: + $(MYPY) src/learning_commons_evaluators tests + +pip-check: + $(PYTHON) -m pip check + +verify: lint format-check typecheck pip-check unit-test + +coverage: + $(PYTEST) tests/ -q --ignore=tests/contract_tests --cov=learning_commons_evaluators --cov-report=term-missing + +# ── Testing ─────────────────────────────────────────────────────────────────── + +test: + $(PYTEST) tests/ -v + +unit-test: + $(PYTEST) tests/ -v --ignore=tests/contract_tests diff --git a/sdks/python/README.md b/sdks/python/README.md new file mode 100644 index 0000000..1630de4 --- /dev/null +++ b/sdks/python/README.md @@ -0,0 +1,334 @@ +# learning-commons-evaluators (Python) + +Python SDK for Learning Commons educational text evaluators. 
+ +## Package layout + +- **`evaluators/`** – Evaluator classes: `BaseEvaluator`, `ConventionalityEvaluator`. +- **`schemas/`** – Pydantic schemas for inputs, outputs, config, metadata, and errors. +- **`providers/`** – LangChain-based LLM provider factory (OpenAI, Google, Anthropic). +- **`settings/`** – TOML settings loader for evaluator configuration. +- **`config.py`** – Re-exports config types (`EvaluatorConfig`, `PromptProviderConfig`, factory functions). +- **`errors.py`** – Re-exports error types. +- **`logger.py`** – Standard Python logging utilities. + +## Installation + +```bash +pip install learning-commons-evaluators +``` + +### Development setup + +```bash +cd sdks/python + +# Create and activate a virtual environment (Python 3.10+) +python3 -m venv .venv +source .venv/bin/activate # macOS/Linux +# .venv\Scripts\activate.bat # Windows CMD +# .venv\Scripts\Activate.ps1 # Windows PowerShell + +# Install in editable mode with dev dependencies +pip install -e ".[dev]" + +# Static checks + unit tests (same gate as CI for the Python SDK) +make verify + +# Run the full test suite (unit + contract) +make test +``` + +Linting and typing use **Ruff** and **Mypy** (see `pyproject.toml`). From `sdks/python/`: + +- `make lint` — Ruff lint on `src/`, `tests/`, and `scripts/generate_settings.py` +- `make format` / `make format-check` — Ruff formatter +- `make typecheck` — Mypy on the package and tests +- `make pip-check` — `pip check` for broken dependency metadata +- `make coverage` — unit tests with `pytest-cov` terminal report + +### Regenerating settings after TOML changes + +Evaluator settings (prompts, models, temperatures) live in `sdks/settings/` and are +baked into pre-generated Python modules at build time. 
After editing any evaluator +settings TOML under `sdks/settings/`, regenerate those modules from `sdks/python/`: + +```bash +make build # regenerates _generated_*_settings.py and syncs contracts.toml +``` + +Commit both the updated TOML and the regenerated `.py` files together. CI runs +`make check-build` to catch any drift between the TOML source and the generated files. + +The repo root **`.gitattributes`** marks `_generated_*.py` and `contracts.toml` +(under `sdks/python/.../settings/` and `sdks/settings/`) with `linguist-generated=true`, +so GitHub treats them as generated in pull requests (diffs default to collapsed and +they are omitted from language statistics; you can still expand a file to review it). + +### Running tests + +```bash +# All tests (unit + contract, skipping unpopulated contract cases automatically) +pytest + +# Unit tests only +pytest tests/ --ignore=tests/contract_tests + +# Contract tests only +pytest tests/contract_tests/ +``` + +### Contract tests + +Contract tests verify that the Python SDK sends the same LLM request (fully formatted +prompts, model, temperature) as the Jupyter notebook, and produces the same structured +result from the same LLM response. + +Each evaluator has a `contracts.toml` file that captures a real LLM interaction +from a notebook run. Until that file is populated, contract tests are **automatically +skipped** — they do not fail. + +**Populating the contract data** (once per evaluator, or after any prompt change): + +1. Open `evals/Final ship - Conventionality Experimental Evaluator.ipynb` (or the + relevant evaluator notebook) with a valid `GOOGLE_API_KEY` (or other provider key). +2. Run all cells. The final "Contract test capture — TOML output" cell prints a TOML + block. +3. Paste the printed block into + `sdks/settings/conventionality/contracts.toml`, replacing the + placeholder values in `[cases.turnip.prompt_steps.main]` and + `[cases.turnip.expected_result]`. +4. 
Run `make sync-settings` to copy the updated file into the bundled package copy. +5. Run `pytest tests/contract_tests/` — the tests should now execute and pass. + +### Keeping settings in sync + +The canonical settings live in `sdks/settings/` (source of truth for all SDKs). +The Python package needs two things derived from them: + +- **`_generated_*_settings.py`** — Python modules baked from the evaluator settings + TOML at build time; evaluators import these at runtime (zero file I/O). +- **Bundled `contracts.toml`** — copied into the package so contract tests work + after a plain `pip install`. + +Run from `sdks/python/` after any change to `sdks/settings/`: + +```bash +make build # regenerate _generated_*_settings.py + sync contracts.toml +make check-build # verify generated files match canonical TOML (used in CI) +``` + +Commit the updated canonical TOML together with the regenerated/synced files. + +### Using the SDK before publishing + +To use the SDK as source code in another project before publishing to PyPI: + +```bash +pip install -e /path/to/evaluators/sdks/python +``` + +This installs the package in editable mode — changes to the SDK source are reflected +immediately without reinstalling. 
+ +## Quick start + +```python +import logging +from learning_commons_evaluators import ( + ConventionalityEvaluator, + ConventionalityEvaluationInput, + GooglePromptProviderConfig, + create_config, +) + +# Optional: app logging so SDK messages propagate (default logger is learning_commons_evaluators) +logging.basicConfig(level=logging.INFO) + +# Create config with provider credentials +config = create_config( + google_prompt_provider_config=GooglePromptProviderConfig(api_key="your-google-key"), + telemetry_id="your-telemetry-id", +) + +# Create evaluator and run evaluation +evaluator = ConventionalityEvaluator(config) +result = evaluator.evaluate( + ConventionalityEvaluationInput(text="The cat's out of the bag now.", grade=5) +) + +print(result.answer.score) # e.g. "moderately_complex" +print(result.answer.label) # e.g. "Moderately complex" +print(result.explanation.summary) # Reasoning for the score +``` + +## Evaluators + +### Conventionality Evaluator + +Evaluates text for conventionality of language (idioms, metaphors, implied meaning) relative to a grade level. 
+ +- **Input:** Text + grade level (0-12) +- **Output:** Complexity score, conventionality features, grade context, instructional insights + +```python +from learning_commons_evaluators import ( + ConventionalityEvaluator, + ConventionalityEvaluationInput, + GooglePromptProviderConfig, + create_config, +) + +config = create_config( + google_prompt_provider_config=GooglePromptProviderConfig(api_key="..."), + telemetry_id="your-telemetry-id", +) +evaluator = ConventionalityEvaluator(config) + +result = evaluator.evaluate( + ConventionalityEvaluationInput(text="Your text here.", grade=5) +) + +# Result structure +result.answer.score # "slightly_complex" | "moderately_complex" | "very_complex" | "exceedingly_complex" +result.answer.label # Human-readable label +result.explanation.summary # Reasoning +result.explanation.details # {"conventionality_features": [...], "grade_context": "...", "instructional_insights": "..."} +result.metadata # Timing, token usage, evaluator info +``` + +## Configuration + +### Provider configs + +Each LLM provider requires its own config with an API key: + +```python +from learning_commons_evaluators import ( + GooglePromptProviderConfig, + OpenAIPromptProviderConfig, + AnthropicPromptProviderConfig, +) + +google_config = GooglePromptProviderConfig(api_key="...") +openai_config = OpenAIPromptProviderConfig(api_key="...", base_url=None) # base_url for Azure/proxies +anthropic_config = AnthropicPromptProviderConfig(api_key="...") +``` + +### EvaluatorConfig + +Use `create_config()` to create an `EvaluatorConfig`: + +```python +from learning_commons_evaluators import create_config + +config = create_config( + google_prompt_provider_config=google_config, + openai_prompt_provider_config=openai_config, + telemetry_id="your-telemetry-id", + logger=my_logger, # Optional: any standard logging.Logger (default: package logger) +) +``` + +### Logging + +The SDK uses Python's standard `logging` module. 
By default, `EvaluatorConfig` uses the +package logger `learning_commons_evaluators`, so log lines propagate to the root like +other libraries once your app configures handlers (for example `basicConfig` or +`dictConfig`). + +```python +import logging + +logging.basicConfig(level=logging.DEBUG) +# Optional: cap this library while the rest of the app stays DEBUG +logging.getLogger("learning_commons_evaluators").setLevel(logging.WARNING) + +# Or route SDK logs through your own logger +my_logger = logging.getLogger("my_app.evaluators") +# create_config(..., logger=my_logger) + +# Or use SDK helpers +from learning_commons_evaluators import ( + create_config_no_telemetry, + create_logger, + create_silent_logger, + get_logger, +) + +logger = create_logger(level=logging.DEBUG) # stream handler on SDK subtree +sdk_logger = get_logger() # same name as default config logger + +# Discard evaluator log lines entirely +config = create_config_no_telemetry(logger=create_silent_logger()) +``` + +## Error handling + +```python +from learning_commons_evaluators import ( + ConfigurationError, # Missing/invalid config + ValidationError, # Invalid input + AuthenticationError, # Invalid API keys (401/403) + RateLimitError, # Rate limit exceeded (429) - has retry_after + NetworkError, # Network failures + EvaluatorTimeoutError, # Request timeout + APIError, # Other API errors +) + +try: + result = evaluator.evaluate(input) +except ConfigurationError as e: + print(f"Config issue: {e}") +except ValidationError as e: + print(f"Invalid input: {e}") +except RateLimitError as e: + print(f"Rate limited, retry after {e.retry_after}ms") +except APIError as e: + print(f"API error (retryable={e.retryable}): {e}") +``` + +Failures inside LLM prompt steps are passed through `wrap_provider_error()` (see `learning_commons_evaluators.schemas.errors`) so you typically see `APIError` subclasses rather than raw LangChain or HTTP client exceptions. 
Use `EvaluatorTimeoutError` for timeouts (the package does not export a `TimeoutError` alias, to avoid shadowing the Python builtin). + +On evaluation failure, `metadata.status` and `error_details` are set on the in-memory metadata object for the run and appear on the evaluation end log line; `BaseEvaluator.evaluate` still re-raises and does not return a result object. + +## Creating custom evaluators + +Extend `BaseEvaluator` to create custom evaluators: + +```python +from learning_commons_evaluators import BaseEvaluator, EvaluatorConfig +from learning_commons_evaluators.schemas.evaluator import EvaluationInput, EvaluationResult +from learning_commons_evaluators.schemas.metadata import EvaluatorMetadata, EvaluatorMaturity, EvaluationMetadata + +class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]): + metadata = EvaluatorMetadata( + id="my-evaluator", + version="0.1.0", + name="My Evaluator", + description="Evaluates something custom", + maturity=EvaluatorMaturity.alpha, + ) + default_evaluation_settings = MySettings(...) 
+ + def evaluate_impl( + self, + input: MyInput, + evaluation_settings: MySettings, + evaluation_metadata: EvaluationMetadata, + ) -> EvaluationResult: + # Use self.execute_prompt_chain_step() for LLM calls + output = self.execute_prompt_chain_step( + step_name="main", + prompt_settings=evaluation_settings.prompt_settings, + evaluation_metadata=evaluation_metadata, + template=my_prompt_template, + chain_inputs={"text": input.text.value}, + parser_output_type=MyOutputSchema, + ) + return EvaluationResult(answer=..., explanation=..., metadata=evaluation_metadata) +``` + +## License + +MIT diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml new file mode 100644 index 0000000..d48f934 --- /dev/null +++ b/sdks/python/pyproject.toml @@ -0,0 +1,129 @@ +[build-system] +requires = ["setuptools>=61", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "learning-commons-evaluators" +version = "0.1.0" +description = "Python SDK for Learning Commons educational evaluators" +readme = "README.md" +license = { text = "MIT" } +requires-python = ">=3.10" +authors = [{ name = "Learning Commons" }] +keywords = ["education", "evaluators", "literacy", "text-complexity"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Education", +] +dependencies = [ + # LLM access is via LangChain; the langchain-* packages pull in provider SDKs as needed. 
+ "pydantic>=2.0.0", + "textstat>=0.7.0", + "langchain-anthropic>=0.2.0", + "langchain-core>=0.3.0", + "langchain-google-genai>=2.0.0", + "langchain-openai>=0.2.0", + "tomli>=2.0.0; python_version < \"3.11\"", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-asyncio>=0.21.0", + "pytest-cov>=5.0.0", + "ruff>=0.9.0", + "mypy>=1.14.0", +] + +[project.urls] +Homepage = "https://github.com/learning-commons-org/evaluators" +Repository = "https://github.com/learning-commons-org/evaluators/tree/main/sdks/python" +"Bug Tracker" = "https://github.com/learning-commons-org/evaluators/issues" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +learning_commons_evaluators = [ + "py.typed", + "settings/**/*.toml", +] + +# The settings TOML files inside the package are the bundled copy used when the +# SDK is installed via pip. The authoritative shared copy lives at sdks/settings/ +# (shared with TypeScript and other SDKs); keep the two in sync after any prompt +# or settings change — see the README for instructions. 
+# +# When developing inside the monorepo, set EVALUATORS_SETTINGS_DIR to +# sdks/settings/ so the SDK reads the shared copy instead of the bundled one: +# +# export EVALUATORS_SETTINGS_DIR=/path/to/evaluators/sdks/settings + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] + +# ── Ruff (lint + format) ────────────────────────────────────────────────────── + +[tool.ruff] +target-version = "py310" +line-length = 100 +extend-exclude = [ + "src/learning_commons_evaluators/settings/_generated_conventionality_settings.py", + "src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py", +] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # Pyflakes + "I", # isort + "UP", # pyupgrade + "B", # flake8-bugbear + "SIM", # flake8-simplify +] +ignore = ["E501"] # line length — formatter wraps where it matters + +[tool.ruff.lint.per-file-ignores] +"../../scripts/generate_settings.py" = ["E402"] + +# ── Mypy ────────────────────────────────────────────────────────────────────── + +[tool.mypy] +python_version = "3.10" +mypy_path = ["src", "tests"] +explicit_package_bases = true +plugins = ["pydantic.mypy"] +warn_unused_configs = true +show_error_codes = true +# Intentional `validate(self) -> None` on models shadows Pydantic v2's classmethod `validate`. +disable_error_code = ["override"] + +[[tool.mypy.overrides]] +module = "learning_commons_evaluators.settings.load_settings" +disable_error_code = ["import-untyped"] + +[[tool.mypy.overrides]] +module = "contract_tests.loader" +disable_error_code = ["import-untyped"] + +[[tool.mypy.overrides]] +module = "learning_commons_evaluators.providers.langchain" +# LangChain stubs lag the runtime API (e.g. ChatAnthropic kwargs). 
+disable_error_code = ["call-arg", "arg-type"] + +[[tool.mypy.overrides]] +module = "learning_commons_evaluators.evaluators.conventionality" +disable_error_code = ["import-untyped", "call-arg"] + +[[tool.mypy.overrides]] +module = "learning_commons_evaluators.evaluators.vocabulary" +disable_error_code = ["import-untyped", "call-arg"] diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py new file mode 100644 index 0000000..0b96e44 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/__init__.py @@ -0,0 +1,127 @@ +"""Learning Commons Evaluators – Python SDK for educational text evaluators.""" + +from learning_commons_evaluators._version import __description__, __version__ + +# Config +from learning_commons_evaluators.config import ( + AnthropicPromptProviderConfig, + EvaluatorConfig, + GooglePromptProviderConfig, + OpenAIPromptProviderConfig, + PromptProviderConfig, + PromptSettings, + create_config, + create_config_no_telemetry, + create_config_telemetry_with_full_input, +) + +# Errors +from learning_commons_evaluators.errors import ( + APIError, + AuthenticationError, + ConfigurationError, + EvaluatorError, + EvaluatorRetryableError, + EvaluatorTimeoutError, + NetworkError, + RateLimitError, + ValidationError, + wrap_provider_error, +) + +# Evaluators +from learning_commons_evaluators.evaluators import ( + BaseEvaluator, + InputT, + OutputT, +) + +# Logger (uses Python standard logging) +from learning_commons_evaluators.logger import ( + SDK_LOGGER_NAME, + Logger, + create_logger, + create_silent_logger, + get_logger, +) +from learning_commons_evaluators.schemas.common_inputs import ( + GradeInputField, + TextInputField, +) +from learning_commons_evaluators.schemas.config import EvaluationSettings, LlmProvider + +# Schemas (core) +from learning_commons_evaluators.schemas.evaluator import ( + EvaluationAnswer, + EvaluationExplanation, + EvaluationInput, + EvaluationResult, + 
InputField, +) +from learning_commons_evaluators.schemas.input_specs import ( + AnyInputSpec, + GradeInputSpec, + InputSpec, + TextInputSpec, +) +from learning_commons_evaluators.schemas.metadata import ( + EvaluationMetadata, + EvaluatorMaturity, + EvaluatorMetadata, + Status, + TokenUsage, +) +from learning_commons_evaluators.schemas.text_complexity import ( + TextComplexityEvaluationInput, +) + +__all__ = [ + "__description__", + "__version__", + "APIError", + "AnthropicPromptProviderConfig", + "AuthenticationError", + "BaseEvaluator", + "ConfigurationError", + "EvaluationAnswer", + "EvaluationExplanation", + "EvaluationInput", + "EvaluationMetadata", + "EvaluationResult", + "EvaluationSettings", + "EvaluatorConfig", + "EvaluatorError", + "EvaluatorMaturity", + "EvaluatorMetadata", + "EvaluatorRetryableError", + "EvaluatorTimeoutError", + "GooglePromptProviderConfig", + "AnyInputSpec", + "GradeInputField", + "GradeInputSpec", + "InputField", + "InputSpec", + "InputT", + "TextInputSpec", + "LlmProvider", + "Logger", + "NetworkError", + "OpenAIPromptProviderConfig", + "OutputT", + "PromptProviderConfig", + "PromptSettings", + "RateLimitError", + "SDK_LOGGER_NAME", + "Status", + "TextComplexityEvaluationInput", + "TextInputField", + "TokenUsage", + "ValidationError", + "create_config", + "create_config_no_telemetry", + "create_config_telemetry_with_full_input", + "create_logger", + "create_silent_logger", + "get_logger", + "wrap_provider_error", +] diff --git a/sdks/python/src/learning_commons_evaluators/_version.py b/sdks/python/src/learning_commons_evaluators/_version.py new file mode 100644 index 0000000..f3abdb6 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/_version.py @@ -0,0 +1,10 @@ +"""Package version and description. 
Isolated to avoid circular imports.""" + +try: + from importlib.metadata import version + + __version__ = version("learning-commons-evaluators") +except Exception: + __version__ = "0.1.0" + +__description__ = "Python SDK for Learning Commons educational evaluators" diff --git a/sdks/python/src/learning_commons_evaluators/config.py b/sdks/python/src/learning_commons_evaluators/config.py new file mode 100644 index 0000000..f3dd635 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/config.py @@ -0,0 +1,25 @@ +"""Re-export evaluator config from schemas.config for package-level imports.""" + +from learning_commons_evaluators.schemas.config import ( + AnthropicPromptProviderConfig, + EvaluatorConfig, + GooglePromptProviderConfig, + OpenAIPromptProviderConfig, + PromptProviderConfig, + PromptSettings, + create_config, + create_config_no_telemetry, + create_config_telemetry_with_full_input, +) + +__all__ = [ + "AnthropicPromptProviderConfig", + "EvaluatorConfig", + "GooglePromptProviderConfig", + "OpenAIPromptProviderConfig", + "PromptProviderConfig", + "PromptSettings", + "create_config", + "create_config_no_telemetry", + "create_config_telemetry_with_full_input", +] diff --git a/sdks/python/src/learning_commons_evaluators/errors.py b/sdks/python/src/learning_commons_evaluators/errors.py new file mode 100644 index 0000000..648fe06 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/errors.py @@ -0,0 +1,27 @@ +"""Re-export error types from schemas.errors for package-level imports.""" + +from learning_commons_evaluators.schemas.errors import ( + APIError, + AuthenticationError, + ConfigurationError, + EvaluatorError, + EvaluatorRetryableError, + EvaluatorTimeoutError, + NetworkError, + RateLimitError, + ValidationError, + wrap_provider_error, +) + +__all__ = [ + "APIError", + "AuthenticationError", + "ConfigurationError", + "EvaluatorError", + "EvaluatorRetryableError", + "EvaluatorTimeoutError", + "NetworkError", + "RateLimitError", + 
"ValidationError", + "wrap_provider_error", +] diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py new file mode 100644 index 0000000..a22ec2b --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py @@ -0,0 +1,15 @@ +"""Evaluator implementations. + +Config types (``EvaluatorConfig``, ``GooglePromptProviderConfig``, etc.) are +exported from the top-level :mod:`learning_commons_evaluators` package, not +from here. Import evaluator classes directly from this sub-package only when +you want to be explicit about the source. +""" + +from learning_commons_evaluators.evaluators.base import BaseEvaluator, InputT, OutputT + +__all__ = [ + "BaseEvaluator", + "InputT", + "OutputT", +] diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py new file mode 100644 index 0000000..47af736 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -0,0 +1,263 @@ +"""Base evaluator class. 
InputT = TypeVar("InputT", bound=EvaluationInput)
OutputT = TypeVar("OutputT", bound=EvaluationResult)
SettingsT = TypeVar("SettingsT", bound=EvaluationSettings)
ResultT = TypeVar("ResultT", bound=BaseModel)


class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
    """Abstract base class for all evaluators.

    Subclasses must set ``metadata`` and ``default_evaluation_settings`` and
    implement :meth:`evaluate_impl`. The base class owns the bookkeeping
    around a run: input validation, timing, status/error recording, logging,
    and per-step / per-provider token accounting.
    """

    config: EvaluatorConfig
    metadata: EvaluatorMetadata
    default_evaluation_settings: SettingsT

    def __init__(self, config: EvaluatorConfig) -> None:
        self.config = config

    def evaluate(
        self,
        input: InputT,
        evaluation_settings: SettingsT | None = None,
    ) -> OutputT:
        """Evaluate the given input and return a structured result.

        Validates the input, delegates to :meth:`evaluate_impl`, records timing
        and status on the returned metadata, and logs start/end events via the
        configured logger. If ``evaluation_settings`` is ``None``, the
        evaluator's :attr:`default_evaluation_settings` is used.

        Args:
            input: Typed input for this evaluator.
            evaluation_settings: Optional override for evaluation settings.
                Defaults to :attr:`default_evaluation_settings`.

        Returns:
            A typed result whose ``metadata.status`` is
            :attr:`~learning_commons_evaluators.schemas.metadata.Status.succeeded`
            on success.

        Raises:
            ValidationError: Input fails validation.
            ConfigurationError: No provider config for the required LLM provider.
            APIError (or subclasses): The LLM API call failed.

        On failure, ``status`` and ``error_details`` are set on the in-memory
        metadata object and included in the "evaluation end" log record; no
        result object is returned because this method re-raises.
        """
        if evaluation_settings is None:
            evaluation_settings = self.default_evaluation_settings
        start = time.perf_counter()
        # Raw input values are attached only when the config opts in; otherwise
        # only the per-field metadata summary is recorded.
        evaluation_metadata = EvaluationMetadata(
            evaluator_metadata=self.metadata,
            evaluation_settings=evaluation_settings,
            input_metadata=input.input_values()
            if self.config.send_full_input_with_telemetry
            else input.input_metadata(),
        )
        self.config.logger.info(
            "evaluation start",
            extra={"evaluation_metadata": evaluation_metadata},
        )
        try:
            input.validate()
            result = self.evaluate_impl(input, evaluation_settings, evaluation_metadata)
            evaluation_metadata.status = Status.succeeded
            result.metadata = evaluation_metadata
            return result
        except Exception as e:
            evaluation_metadata.status = Status.failed
            evaluation_metadata.error_details = str(e)
            raise
        finally:
            # Runs on success and failure alike, so the end record always
            # carries the final status and the elapsed time in milliseconds.
            evaluation_metadata.processing_time_ms = (time.perf_counter() - start) * 1000
            self.config.logger.info(
                "evaluation end",
                extra={"evaluation_metadata": evaluation_metadata},
            )
            # TODO: send_telemetry(evaluation_metadata)

    @abstractmethod
    def evaluate_impl(
        self,
        input: InputT,
        evaluation_settings: SettingsT,
        evaluation_metadata: EvaluationMetadata,
    ) -> OutputT:
        """Implement the evaluation logic.

        Return a result; :meth:`evaluate` assigns ``evaluation_metadata`` onto
        it after this method returns.
        """
        ...

    def execute_step(
        self,
        step_name: str,
        evaluation_metadata: EvaluationMetadata,
        implementation_function: Callable[[], ResultT],
        *,
        extras: dict[str, Any] | None = None,
    ) -> ResultT:
        """Run ``implementation_function`` and record step metadata.

        ``step_name`` is always the step id. Optional ``extras`` is copied into
        :attr:`StepMetadata.extras`, so the caller's dict is never mutated.

        The step metadata is stored in ``evaluation_metadata.step_details``
        under ``step_name`` whether the step succeeds or raises; a second step
        with the same name replaces the earlier entry.

        Args:
            step_name: Identifier for the step.
            evaluation_metadata: Metadata of the enclosing evaluation, updated
                in place.
            implementation_function: Zero-argument callable doing the work.
            extras: Optional initial extras for the step metadata.

        Returns:
            Whatever ``implementation_function`` returns.

        Raises:
            Exception: Anything raised by ``implementation_function`` is
                recorded on the step metadata and re-raised unchanged.
        """
        start = time.perf_counter()
        step_extras = dict(extras) if extras is not None else {}
        step_metadata = StepMetadata(step_id=step_name, extras=step_extras)
        self.config.logger.info("step start", extra={"step_metadata": step_metadata})
        try:
            result = implementation_function()
            step_metadata.status = Status.succeeded
            return result
        except Exception as e:
            step_metadata.status = Status.failed
            step_metadata.error_details = str(e)
            raise
        finally:
            step_metadata.processing_time_ms = (time.perf_counter() - start) * 1000
            self.config.logger.info("step end", extra={"step_metadata": step_metadata})
            evaluation_metadata.step_details[step_name] = step_metadata

    def execute_prompt_chain_step(
        self,
        step_name: str,
        prompt_settings: PromptSettings,
        evaluation_metadata: EvaluationMetadata,
        template: Any,
        chain_inputs: dict[str, Any],
        parser_output_type: type[ResultT] | None = None,
    ) -> ResultT | str:
        """Run a prompt chain (template | LLM), record metadata, and return the result.

        When ``parser_output_type`` is a Pydantic model class, the LLM response
        is parsed as JSON and returned as an instance of that class. When
        ``parser_output_type`` is ``None``, the raw response content is
        returned as a plain ``str`` — use this for steps that produce
        unstructured prose (e.g. a background-knowledge assumption).

        Provider config (e.g. API key) is resolved from ``self.config`` by
        ``prompt_settings.provider_type``.

        Args:
            step_name: Identifier for this step in
                ``evaluation_metadata.step_details``.
            prompt_settings: Provider type, model, and temperature for the LLM
                call.
            evaluation_metadata: Metadata for the full evaluation; step
                metadata and token usage are updated in place.
            template: A LangChain prompt template.
            chain_inputs: Variables to format the template and invoke the chain.
            parser_output_type: Pydantic model class for JSON parsing, or
                ``None`` to return the raw text response.

        Returns:
            Parsed instance of ``parser_output_type`` when a type is given;
            plain ``str`` when ``None``.

        Raises:
            ConfigurationError: No provider config for
                ``prompt_settings.provider_type``.
            EvaluatorError: SDK errors, including the output of
                :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error`
                for LangChain or HTTP failures. Pydantic
                :exc:`pydantic.ValidationError` from output parsing is
                re-raised unchanged.
        """
        # Populated after a successful LLM invoke so we can attach usage even if parsing fails.
        token_usage: TokenUsage | None = None

        def _run_chain() -> ResultT | str:
            nonlocal token_usage
            try:
                provider = create_provider(prompt_settings, self.config)
                llm_chain: Any = template | provider
                ai_message = llm_chain.invoke(chain_inputs)
                token_usage = token_usage_from_aimessage(ai_message, prompt_settings)
                if parser_output_type is None:
                    return str(ai_message.content)
                from langchain_core.output_parsers.json import JsonOutputParser

                parser = JsonOutputParser(pydantic_object=parser_output_type)
                raw = parser.invoke(ai_message)
                if isinstance(raw, parser_output_type):
                    return raw
                return parser_output_type.model_validate(raw)
            except (EvaluatorError, PydanticValidationError):
                # Already an SDK error, or a parse-validation error callers
                # are documented to receive as-is: propagate unchanged.
                raise
            except Exception as e:
                # KeyboardInterrupt and SystemExit derive from BaseException,
                # not Exception, so they bypass this handler on their own and
                # need no dedicated pass-through clause.
                raise wrap_provider_error(e) from e

        try:
            return cast(
                ResultT | str,
                self.execute_step(
                    step_name,
                    evaluation_metadata,
                    extras={
                        PROMPT_STEP_EXTRA_PROMPT_SETTINGS: prompt_settings_to_extras_value(
                            prompt_settings
                        ),
                    },
                    implementation_function=cast(Callable[[], ResultT], _run_chain),
                ),
            )
        finally:
            # execute_step has already stored the step metadata at this point,
            # so usage can be attached to it even when the step raised.
            if token_usage is not None:
                self.update_total_token_usage(token_usage, evaluation_metadata)
                step = evaluation_metadata.step_details.get(step_name)
                if step is not None:
                    step.extras[PROMPT_STEP_EXTRA_TOKEN_USAGE] = token_usage.model_dump(mode="json")

    def update_total_token_usage(
        self, token_usage: TokenUsage, evaluation_metadata: EvaluationMetadata
    ) -> None:
        """Add ``token_usage`` to the running per-provider totals.

        Totals are keyed by ``token_usage.provider_type``. The first usage seen
        for a provider is stored as-is; later ones are summed into a new
        :class:`TokenUsage` that keeps the ``provider_type`` and ``model`` of
        the existing total.
        """
        totals = evaluation_metadata.total_token_usage
        if token_usage.provider_type not in totals:
            totals[token_usage.provider_type] = token_usage
        else:
            current_total = totals[token_usage.provider_type]
            totals[token_usage.provider_type] = TokenUsage(
                provider_type=current_total.provider_type,
                model=current_total.model,
                input_tokens=current_total.input_tokens + token_usage.input_tokens,
                output_tokens=current_total.output_tokens + token_usage.output_tokens,
            )
+""" + +from __future__ import annotations + +import logging + +Logger = logging.Logger + +SDK_LOGGER_NAME = "learning_commons_evaluators" + + +def _has_non_null_handler(logger: logging.Logger) -> bool: + return any(not isinstance(h, logging.NullHandler) for h in logger.handlers) + + +def _install_library_null_handler() -> None: + """Attach NullHandler to the SDK root logger once (stdlib library pattern).""" + root = logging.getLogger(SDK_LOGGER_NAME) + if not any(isinstance(h, logging.NullHandler) for h in root.handlers): + root.addHandler(logging.NullHandler()) + + +_install_library_null_handler() + + +def get_logger(name: str | None = None) -> logging.Logger: + """Return the SDK root logger or a child (``learning_commons_evaluators.``). + + Does not add handlers or change levels; configuration is left to the + application or to :func:`create_logger`. + """ + if name is None: + return logging.getLogger(SDK_LOGGER_NAME) + return logging.getLogger(f"{SDK_LOGGER_NAME}.{name}") + + +def create_logger( + name: str | None = None, + level: int = logging.WARNING, + handler: logging.Handler | None = None, + *, + use_logger: logging.Logger | None = None, +) -> logging.Logger: + """Return ``use_logger`` unchanged, or configure an SDK subtree logger. + + When no ``use_logger`` is given, sets *level* and attaches *handler*, or a + :class:`logging.StreamHandler` if there is no non-:class:`~logging.NullHandler` + handler yet (the import-time NullHandler alone does not count). + + Intended for applications and quick debugging, not for internal SDK calls. 
+ """ + if use_logger is not None: + return use_logger + + logger = get_logger(name) + logger.setLevel(level) + + if handler is not None: + logger.addHandler(handler) + elif not _has_non_null_handler(logger): + stream_handler = logging.StreamHandler() + stream_handler.setLevel(level) + stream_handler.setFormatter(logging.Formatter("[%(levelname)s] %(name)s: %(message)s")) + logger.addHandler(stream_handler) + + return logger + + +def create_silent_logger() -> logging.Logger: + """Logger that discards all records (no propagation). Use for ``logger=`` opt-out.""" + logger = logging.getLogger(f"{SDK_LOGGER_NAME}.silent") + if not logger.handlers: + logger.addHandler(logging.NullHandler()) + logger.propagate = False + return logger + + +def format_error(error: BaseException) -> str: + """Format an exception for logging.""" + return f"{type(error).__name__}: {error}" diff --git a/sdks/python/src/learning_commons_evaluators/providers/__init__.py b/sdks/python/src/learning_commons_evaluators/providers/__init__.py new file mode 100644 index 0000000..f30b238 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/providers/__init__.py @@ -0,0 +1,15 @@ +"""LLM providers for use in chains (e.g. LangChain). + +create_provider() returns a langchain_core BaseChatModel (OpenAI, Google, Anthropic). + +Config types (PromptProviderConfig and provider-specific configs: +AnthropicPromptProviderConfig, GooglePromptProviderConfig, OpenAIPromptProviderConfig) +live in learning_commons_evaluators.schemas.config. 
if TYPE_CHECKING:
    from langchain_core.language_models.chat_models import BaseChatModel


def token_usage_from_aimessage(message: Any, prompt_settings: PromptSettings) -> TokenUsage:
    """Build a TokenUsage for *message*, labelled with the prompt's provider and model.

    Usage is read from ``message.usage_metadata`` and, when that is missing or
    empty, from ``message.response_metadata["usage_metadata"]``. If neither
    yields anything, both token counts are reported as 0.
    """
    usage = getattr(message, "usage_metadata", None)
    if not usage:
        # Fallback location; only consulted when the primary attribute is empty.
        usage = getattr(message, "response_metadata", {}).get("usage_metadata")

    if usage:
        input_tokens = usage.get("input_tokens", 0)
        output_tokens = usage.get("output_tokens", 0)
    else:
        input_tokens = 0
        output_tokens = 0

    return TokenUsage(
        provider_type=prompt_settings.provider_type,
        model=prompt_settings.model,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
    )


def _create_openai_llm(
    prompt_settings: PromptSettings, evaluator_config: EvaluatorConfig
) -> BaseChatModel:
    """Build a ChatOpenAI model; ``base_url`` is passed only when configured."""
    from langchain_openai import ChatOpenAI

    cfg = evaluator_config.openai_prompt_provider_config
    if cfg is None:
        raise ConfigurationError("OpenAI provider config is not set on EvaluatorConfig")
    endpoint_override: dict[str, Any] = {} if cfg.base_url is None else {"base_url": cfg.base_url}
    return ChatOpenAI(
        api_key=cfg.api_key,
        model=prompt_settings.model,
        temperature=prompt_settings.temperature,
        **endpoint_override,
    )


def _create_google_llm(
    prompt_settings: PromptSettings, evaluator_config: EvaluatorConfig
) -> BaseChatModel:
    """Build a ChatGoogleGenerativeAI model from the Google provider config."""
    from langchain_google_genai import ChatGoogleGenerativeAI

    cfg = evaluator_config.google_prompt_provider_config
    if cfg is None:
        raise ConfigurationError("Google provider config is not set on EvaluatorConfig")
    return ChatGoogleGenerativeAI(
        google_api_key=cfg.api_key,
        model=prompt_settings.model,
        temperature=prompt_settings.temperature,
    )


def _create_anthropic_llm(
    prompt_settings: PromptSettings, evaluator_config: EvaluatorConfig
) -> BaseChatModel:
    """Build a ChatAnthropic model from the Anthropic provider config."""
    from langchain_anthropic import ChatAnthropic

    cfg = evaluator_config.anthropic_prompt_provider_config
    if cfg is None:
        raise ConfigurationError("Anthropic provider config is not set on EvaluatorConfig")
    return ChatAnthropic(
        api_key=cfg.api_key,
        model=prompt_settings.model,
        temperature=prompt_settings.temperature,
    )


def create_provider(
    prompt_settings: PromptSettings, evaluator_config: EvaluatorConfig
) -> BaseChatModel:
    """Create the LangChain chat model selected by ``prompt_settings.provider_type``.

    Raises:
        ConfigurationError: The provider type is not one of the supported
            providers, or the matching provider config is not set on
            ``evaluator_config``.
    """
    # Matched with ``==`` (not a dict lookup) so a plain-string provider type
    # keeps matching the str-based enum members exactly as before.
    builders = (
        (LlmProvider.OPENAI, _create_openai_llm),
        (LlmProvider.GOOGLE, _create_google_llm),
        (LlmProvider.ANTHROPIC, _create_anthropic_llm),
    )
    for provider, build in builders:
        if prompt_settings.provider_type == provider:
            return build(prompt_settings, evaluator_config)
    raise ConfigurationError(
        f"Unsupported LLM provider type: {prompt_settings.provider_type!r}. "
        "Expected openai, google, or anthropic."
    )


__all__ = ["create_provider", "token_usage_from_aimessage"]
__all__ = ["GradeInputField", "TextInputField"]


class TextInputField(InputField[str]):
    """Input field holding free-form text.

    Specialises :class:`~.evaluator.InputField` to ``value: str`` and
    ``spec: TextInputSpec``. Free-form text may contain PII, so
    :meth:`input_metadata` reports only the character count — never the raw
    value.

    The length limits are read from ``spec`` instead of being stored on the
    field, so one spec object can be shared by many field instances.
    """

    # Narrows the abstract InputField.spec: InputSpec → TextInputSpec.
    # value: str comes from InputField[str] and is not redeclared here.
    spec: TextInputSpec

    def validate(self) -> None:
        """Raise :class:`~.errors.ValidationError` if the text length violates the spec."""
        length = len(self.value)
        # A missing (or zero) minimum means no lower bound.
        lower = self.spec.min_text_length or 0
        if length < lower:
            raise ValidationError(f"Text length {length} is below minimum {lower}.")
        upper = self.spec.max_text_length
        if upper is None:
            return
        if length > upper:
            raise ValidationError(f"Text length {length} exceeds maximum {upper}.")

    def input_metadata(self) -> dict[str, Any]:
        """Return the character count (as a string) — never the raw text, which may contain PII."""
        char_count = len(self.value)
        return {"textLength": str(char_count)}


class GradeInputField(InputField[int]):
    """Input field for a K–12 grade level (0 = kindergarten, 12 = senior year).

    Specialises :class:`~.evaluator.InputField` to ``value: int`` and
    ``spec: GradeInputSpec``. The baseline 0–12 range is always enforced;
    ``spec.allowed_grades`` further restricts the valid set when defined.
    """

    # Narrows the abstract InputField.spec: InputSpec → GradeInputSpec.
    # value: int comes from InputField[int] and is not redeclared here.
    spec: GradeInputSpec

    def validate(self) -> None:
        """Raise :class:`~.errors.ValidationError` if the grade is out of range or not allowed."""
        grade = self.value
        if grade > 12 or grade < 0:
            raise ValidationError(f"Grade {grade} is not in allowed range 0-12.")
        allowed = self.spec.allowed_grades
        if allowed is None:
            return
        if grade not in allowed:
            raise ValidationError(f"Grade {grade} is not in allowed set: {sorted(allowed)}.")

    def input_metadata(self) -> dict[str, Any]:
        """Return grade metadata. A grade is not PII, so the value is included directly."""
        return {"grade": self.value}
# --- Prompt provider configs (for LLM calls in prompt steps) ---


class LlmProvider(str, Enum):
    """LLM provider identifier.

    Subclasses ``str`` so each member compares equal to, and serializes as,
    its provider name.
    """

    ANTHROPIC = "anthropic"
    GOOGLE = "google"
    OPENAI = "openai"


@dataclass(frozen=True)
class PromptProviderConfig:
    """Base type for prompt provider configuration."""

    # Kept out of the generated repr so the secret is not exposed when a
    # config object is printed, logged, or shown in a traceback.
    api_key: str = field(repr=False)
    type: LlmProvider
    # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy).
    # Used only when type is OPENAI.
    base_url: str | None = None


@dataclass(frozen=True)
class GooglePromptProviderConfig(PromptProviderConfig):
    """Google (Gemini) prompt provider config. Takes an API key."""

    # api_key is inherited rather than redeclared: redeclaring it would
    # replace the base field and put the key back into the repr.
    type: LlmProvider = LlmProvider.GOOGLE


@dataclass(frozen=True)
class OpenAIPromptProviderConfig(PromptProviderConfig):
    """OpenAI prompt provider config. Takes an API key. Optional base_url for custom endpoints."""

    # api_key is inherited (see GooglePromptProviderConfig).
    type: LlmProvider = LlmProvider.OPENAI


@dataclass(frozen=True)
class AnthropicPromptProviderConfig(PromptProviderConfig):
    """Anthropic (Claude) prompt provider config. Takes an API key."""

    # api_key is inherited (see GooglePromptProviderConfig).
    type: LlmProvider = LlmProvider.ANTHROPIC


@dataclass(frozen=True)
class PromptSettings:
    """Settings for a prompt step: provider, model, temperature."""

    provider_type: LlmProvider
    model: str
    temperature: float


class EvaluationSettings(BaseModel):
    """Base model for evaluation run settings (e.g. model, temperature).

    Concrete evaluators subclass this and declare evaluation settings fields as needed.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)


# --- Evaluator config ---


@dataclass(frozen=True)
class EvaluatorConfig:
    """Config for creating an evaluator: prompt provider configs, logger, telemetry.

    Logger defaults to the SDK package logger (``learning_commons_evaluators``), so
    log records propagate like typical library loggers. Pass ``logger=`` to use a
    different logger (for example ``create_silent_logger()`` from
    ``learning_commons_evaluators.logger`` to discard SDK messages). Accepts any
    standard ``logging.Logger`` instance.

    Create via create_config, create_config_no_telemetry, or
    create_config_telemetry_with_full_input, then pass the config to the
    evaluator constructor.
    """

    google_prompt_provider_config: GooglePromptProviderConfig | None = None
    openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None
    anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None
    logger: Logger = field(default_factory=get_logger)
    telemetry_id: str | None = None
    send_full_input_with_telemetry: bool = False


def create_config(
    *,
    google_prompt_provider_config: GooglePromptProviderConfig | None = None,
    openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None,
    anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None,
    logger: Logger | None = None,
    telemetry_id: str,
    send_full_input_with_telemetry: bool = False,
) -> EvaluatorConfig:
    """Create evaluator config with telemetry. ``telemetry_id`` is required.

    ``logger=None`` selects the SDK package logger.
    """
    return EvaluatorConfig(
        google_prompt_provider_config=google_prompt_provider_config,
        openai_prompt_provider_config=openai_prompt_provider_config,
        anthropic_prompt_provider_config=anthropic_prompt_provider_config,
        logger=get_logger() if logger is None else logger,
        telemetry_id=telemetry_id,
        send_full_input_with_telemetry=send_full_input_with_telemetry,
    )


def create_config_no_telemetry(
    *,
    google_prompt_provider_config: GooglePromptProviderConfig | None = None,
    openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None,
    anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None,
    logger: Logger | None = None,
) -> EvaluatorConfig:
    """Create evaluator config with telemetry disabled.

    ``telemetry_id`` is ``None`` and full input is never attached.
    ``logger=None`` selects the SDK package logger.
    """
    return EvaluatorConfig(
        google_prompt_provider_config=google_prompt_provider_config,
        openai_prompt_provider_config=openai_prompt_provider_config,
        anthropic_prompt_provider_config=anthropic_prompt_provider_config,
        logger=get_logger() if logger is None else logger,
        telemetry_id=None,
        send_full_input_with_telemetry=False,
    )


def create_config_telemetry_with_full_input(
    *,
    google_prompt_provider_config: GooglePromptProviderConfig | None = None,
    openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None,
    anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None,
    logger: Logger | None = None,
    telemetry_id: str,
) -> EvaluatorConfig:
    """Create evaluator config with telemetry and full input sent with telemetry.

    ``logger=None`` selects the SDK package logger.
    """
    return EvaluatorConfig(
        google_prompt_provider_config=google_prompt_provider_config,
        openai_prompt_provider_config=openai_prompt_provider_config,
        anthropic_prompt_provider_config=anthropic_prompt_provider_config,
        logger=get_logger() if logger is None else logger,
        telemetry_id=telemetry_id,
        send_full_input_with_telemetry=True,
    )
google_prompt_provider_config: GooglePromptProviderConfig | None = None, + openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None, + anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None, + logger: Logger | None = None, + telemetry_id: str, +) -> EvaluatorConfig: + """Create evaluator config with telemetry and full input sent with telemetry.""" + return EvaluatorConfig( + google_prompt_provider_config=google_prompt_provider_config, + openai_prompt_provider_config=openai_prompt_provider_config, + anthropic_prompt_provider_config=anthropic_prompt_provider_config, + logger=get_logger() if logger is None else logger, + telemetry_id=telemetry_id, + send_full_input_with_telemetry=True, + ) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py new file mode 100644 index 0000000..470d371 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py @@ -0,0 +1,140 @@ +"""Custom error types for the Evaluators SDK.""" + +import re + + +class EvaluatorError(Exception): + """Base error class for all evaluator errors.""" + + def __init__(self, message: str, code: str | None = None): + super().__init__(message) + self.message = message + self.code = code + self.name = "EvaluatorError" + + +class EvaluatorRetryableError(EvaluatorError): + """Base for errors that may be retried (possibly with backoff). All other evaluator errors are non-retryable.""" + + def __init__(self, message: str, code: str | None = None): + super().__init__(message, code) + self.name = "EvaluatorRetryableError" + + +class ConfigurationError(EvaluatorError): + """Configuration error - e.g. missing API keys. Non-retryable.""" + + def __init__(self, message: str): + super().__init__(message, "CONFIGURATION_ERROR") + self.name = "ConfigurationError" + + +class ValidationError(EvaluatorError): + """Validation error - invalid input. 
Non-retryable.""" + + def __init__(self, message: str): + super().__init__(message, "VALIDATION_ERROR") + self.name = "ValidationError" + + +class APIError(EvaluatorError): + """Base API error - LLM API calls failed. Use subclasses or set retryable explicitly.""" + + def __init__( + self, + message: str, + status_code: int | None = None, + retryable: bool = False, + code: str | None = None, + ): + super().__init__(message, code) + self.status_code = status_code + self.retryable = retryable + self.name = "APIError" + + +class AuthenticationError(APIError): + """Invalid or missing API keys (401/403). Non-retryable.""" + + def __init__(self, message: str, status_code: int | None = None): + super().__init__(message, status_code, False, "AUTHENTICATION_ERROR") + self.name = "AuthenticationError" + + +class RateLimitError(APIError, EvaluatorRetryableError): + """Rate limit exceeded (429). Should be retried with backoff.""" + + def __init__(self, message: str, retry_after: int | None = None): + super().__init__(message, 429, True, "RATE_LIMIT_ERROR") + self.retry_after = retry_after # milliseconds + self.name = "RateLimitError" + + +class NetworkError(APIError, EvaluatorRetryableError): + """Network failure. May be retryable.""" + + def __init__(self, message: str, retryable: bool = True): + super().__init__(message, None, retryable, "NETWORK_ERROR") + self.name = "NetworkError" + + +class EvaluatorTimeoutError(APIError, EvaluatorRetryableError): + """Request timed out. 
Should be retried with caution.""" + + def __init__(self, message: str = "Request timed out"): + super().__init__(message, 408, True, "TIMEOUT_ERROR") + self.name = "EvaluatorTimeoutError" + + +def _parse_provider_error(error: BaseException) -> tuple[str, int | None, str | None]: + message = str(error) + status_code = None + code = getattr(error, "name", None) or type(error).__name__ + if code == "Error": + code = None + match = re.search(r"\b(4\d{2}|5\d{2})\b", message) + if match: + status_code = int(match.group(1)) + return message, status_code, code + + +def wrap_provider_error( + error: BaseException, default_message: str = "API request failed" +) -> APIError: + """Wrap a provider error into the appropriate SDK error type.""" + message, status_code, code = _parse_provider_error(error) + msg = message or default_message + + if status_code in (401, 403): + return AuthenticationError( + msg if "API key" in msg or "api key" in msg.lower() else "Invalid API key", + status_code, + ) + if status_code == 429: + retry_match = re.search(r"retry[- ]after[:\s]+(\d+)", msg, re.I) + retry_after = int(retry_match.group(1)) * 1000 if retry_match else None + return RateLimitError( + msg if "rate limit" in msg.lower() else "Rate limit exceeded", + retry_after, + ) + # Timeouts before generic "Connection" β€” many stacks use "Connection timed out" + if "timeout" in msg.lower() or "timed out" in msg.lower(): + return EvaluatorTimeoutError(msg) + if any( + x in msg + for x in ( + "ECONNREFUSED", + "ENOTFOUND", + "ETIMEDOUT", + "network", + "Network", + "Connection", + ) + ): + return NetworkError(msg) + return APIError( + msg, + status_code, + bool(status_code and status_code >= 500), + code, + ) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py new file mode 100644 index 0000000..1e54a57 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py @@ -0,0 +1,219 
@@ +"""Evaluation schemas.""" + +from abc import ABC, abstractmethod +from typing import Any, ClassVar, Generic, TypeVar + +__all__ = [ + "EvaluationAnswer", + "EvaluationExplanation", + "EvaluationInput", + "EvaluationResult", + "InputField", +] + +from pydantic import BaseModel, Field, model_validator + +from .errors import ConfigurationError, ValidationError +from .input_specs import InputSpec +from .metadata import EvaluationMetadata + +# Type variable for the raw Python value stored in an InputField (str, int, etc.) +_V = TypeVar("_V") + + +class InputField(BaseModel, Generic[_V], ABC): + """Abstract base class for all evaluator input fields. + + Pairs a typed ``value`` with an :class:`~.input_specs.InputSpec` that + describes its constraints. Subclasses must: + + - Redeclare ``spec`` with the appropriate :class:`~.input_specs.InputSpec` + subclass to narrow the base type. + - Provide concrete implementations of :meth:`validate` and + :meth:`input_metadata`. + + Type parameter: + _V: The Python type of the raw value (e.g. ``str`` for text fields, + ``int`` for grade fields). + + Example:: + + class TextInputField(InputField[str]): + spec: TextInputSpec # narrows InputField.spec: InputSpec + + def validate(self) -> None: + if len(self.value) < self.spec.min_text_length: + raise ValidationError(...) + + def input_metadata(self) -> dict[str, Any]: + return {"textLength": str(len(self.value))} + """ + + spec: InputSpec + value: _V + + @abstractmethod + def validate(self) -> None: + """Validate *value* against *spec* constraints. + + Raise :class:`~.errors.ValidationError` if the value is invalid. + """ + + @abstractmethod + def input_metadata(self) -> dict[str, Any]: + """Return safe, serialisable metadata describing this field. + + Must never expose raw values for fields that may contain PII (e.g. use + character counts rather than the text itself for free-form text fields). 
class EvaluationInput(BaseModel, ABC):
    """Abstract base class for evaluator inputs.

    Concrete subclasses declare their fields as :class:`InputField` subclasses
    and point ``_input_settings`` at the evaluator's settings dict::

        class ConventionalityEvaluationInput(EvaluationInput):
            _input_settings: ClassVar[dict] = _CONVENTIONALITY_CONFIG.evaluator_metadata.inputs

            text: TextInputField
            grade: GradeInputField

            def __init__(self, *, text: str, grade: int, **kwargs):
                super().__init__(text=text, grade=grade, **kwargs)

    The base class :meth:`_coerce_raw_to_input_fields` validator intercepts
    raw Python values before Pydantic field validation, looks up each spec from
    ``_input_settings``, validates the spec type, and constructs the appropriate
    :class:`InputField` instance. Subclasses only need the minimal
    ``__init__`` above for correct caller-facing type annotations; all
    spec-lookup and construction logic lives here.

    The base class also provides :meth:`validate`, :meth:`input_metadata`, and
    :meth:`input_values` by iterating over every Pydantic model field and
    collecting those that are :class:`InputField` instances.
    """

    # Maps field name -> spec; subclasses override this with their evaluator's specs.
    _input_settings: ClassVar[dict[str, InputSpec]] = {}
    model_config = {"arbitrary_types_allowed": True}

    @model_validator(mode="before")
    @classmethod
    def _coerce_raw_to_input_fields(cls, data: Any) -> Any:
        """Transform raw Python values into :class:`InputField` instances.

        For each field annotated as an :class:`InputField` subclass, looks up
        the spec from ``cls._input_settings`` and constructs the field. Raises
        :class:`~.errors.ConfigurationError` if the spec is missing or the wrong
        type. Non-:class:`InputField` fields and already-constructed
        :class:`InputField` instances are left unchanged.
        """
        if not isinstance(data, dict):
            return data

        # Work on a shallow copy so the caller's dict is never mutated.
        out = dict(data)
        for field_name, field_info in cls.model_fields.items():
            annotation = field_info.annotation
            # isinstance(annotation, type) rejects generic aliases like Optional[TextInputField]
            # or Union[TextInputField, None], which Pydantic represents as non-type objects.
            # All current InputField fields are declared as concrete subclasses, so this is safe.
            # If optional InputField fields are ever needed, this guard will need updating.
            if not (isinstance(annotation, type) and issubclass(annotation, InputField)):
                continue  # not a concrete InputField subclass - leave as-is
            if field_name not in out or isinstance(out[field_name], InputField):
                continue  # missing (let Pydantic handle) or already constructed

            spec = cls._input_settings.get(field_name)
            if spec is None:
                raise ConfigurationError(
                    f"Missing input spec for '{field_name}' in {cls.__name__}. "
                    "Check [[evaluator_metadata.inputs]] in evaluator settings."
                )

            # The field class narrows ``spec`` to a concrete InputSpec subclass;
            # use that declared type to reject a mismatched spec early.
            expected_spec_type = annotation.model_fields["spec"].annotation
            if not isinstance(expected_spec_type, type):
                raise ConfigurationError(
                    f"Cannot resolve spec type for field '{field_name}' in {cls.__name__}."
                )
            if not isinstance(spec, expected_spec_type):
                raise ConfigurationError(
                    f"Input spec for '{field_name}' in {cls.__name__} has unexpected type "
                    f"{type(spec).__name__!r}; expected {expected_spec_type.__name__}. "
                    "Check [[evaluator_metadata.inputs]] in evaluator settings."
                )

            out[field_name] = annotation(spec=spec, value=out[field_name])

        return out

    # NOTE(review): pydantic v2 still exposes a deprecated ``BaseModel.validate``
    # classmethod; this instance method overrides it - confirm nothing relies
    # on the classmethod form.
    def validate(self) -> None:
        """Validate all :class:`InputField` members, collecting every error before raising.

        Raises :class:`~.errors.ValidationError` if any field is invalid. The
        raised error's message lists each field's message, separated by ``"; "``.
        """
        errors: list[ValidationError] = []
        for name in type(self).model_fields:
            field_val = getattr(self, name)
            if isinstance(field_val, InputField):
                try:
                    field_val.validate()
                except ValidationError as e:
                    errors.append(e)
        if errors:
            # Join the individual messages; interpolating the list itself would
            # render each exception's repr() rather than its message.
            details = "; ".join(str(e) for e in errors)
            raise ValidationError(f"Validation errors: {details}")

    def input_metadata(self) -> dict[str, Any]:
        """Return a mapping of field name -> :meth:`InputField.input_metadata` for each field.

        Non-:class:`InputField` fields produce a ``None`` entry.
        """
        out: dict[str, Any] = {}
        for name in type(self).model_fields:
            field_val = getattr(self, name)
            if isinstance(field_val, InputField):
                out[name] = field_val.input_metadata()
            else:
                out[name] = None
        return out

    def input_values(self) -> dict[str, Any]:
        """Return a mapping of field name -> raw Python value.

        For :class:`InputField` members this is ``field.value``; for any other
        field it is the field value itself.
        """
        out: dict[str, Any] = {}
        for name in type(self).model_fields:
            v = getattr(self, name)
            out[name] = v.value if isinstance(v, InputField) else v
        return out
+ ) + + +class EvaluationExplanation(BaseModel): + """Explanation of the evaluation: summary (markdown) and optional keyed details.""" + + summary: str = Field(description="A summary of the evaluation in markdown format.") + details: dict[str, Any] = Field( + default_factory=dict, + description="Optional keyed details of the evaluation.", + ) + + +class EvaluationResult(BaseModel): + """Standard evaluation result: answer, explanation, and metadata.""" + + answer: EvaluationAnswer + explanation: EvaluationExplanation + metadata: EvaluationMetadata diff --git a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py new file mode 100644 index 0000000..c35a6cc --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py @@ -0,0 +1,126 @@ +"""Input spec types describing the constraints for evaluator input fields. + +A spec is defined once β€” in the evaluator's settings TOML β€” and attached to +its corresponding field when the input object is constructed. + +Hierarchy:: + + InputSpec base: name, type, description, required + β”œβ”€ TextInputSpec adds: min_text_length, max_text_length + └─ GradeInputSpec adds: allowed_grades + +**Adding a new input spec type β€” checklist** + +1. Define a new subclass of :class:`InputSpec` in *this file* with a + ``type: Literal["YourFieldType"]`` default. The registry is rebuilt + automatically via :func:`_populate_input_spec_registry`. +2. Add the new class to the explicit :data:`AnyInputSpec` union below + (required β€” ``AnyInputSpec`` is not rebuilt automatically). +3. Add the matching ``InputField`` concrete class in ``common_inputs.py``. +4. Run ``make generate-settings`` (from the ``sdks/python/`` directory) after + updating any TOML that uses the new type. 
+""" + +from collections.abc import Iterator +from typing import Annotated, Literal + +from pydantic import BaseModel, Field +from pydantic_core import PydanticUndefined + +__all__ = [ + "AnyInputSpec", + "GradeInputSpec", + "INPUT_SPEC_REGISTRY", + "InputSpec", + "TextInputSpec", +] + +# --------------------------------------------------------------------------- +# Registry: populated by _populate_input_spec_registry() after subclasses exist +# --------------------------------------------------------------------------- + +# Maps the ``type`` discriminator string to the concrete InputSpec class. +# Used by the TOML parser to instantiate the right subclass and to build +# the AnyInputSpec discriminated union below. +INPUT_SPEC_REGISTRY: dict[str, type["InputSpec"]] = {} + + +# --------------------------------------------------------------------------- +# Base and concrete spec classes +# --------------------------------------------------------------------------- + + +class InputSpec(BaseModel): + """Base spec for one evaluator input field. + + Identifies the field (``name``, ``type``) and carries optional metadata + (``description``, ``required``). Subclasses add type-specific constraints. + """ + + name: str + type: str + description: str = "" + required: bool = True + + +class TextInputSpec(InputSpec): + """Spec for a text input field. + + Constraint fields are optional; omitting them means no length limit is + enforced for that boundary. + """ + + type: Literal["TextInputField"] = "TextInputField" + min_text_length: int | None = None + max_text_length: int | None = None + + +class GradeInputSpec(InputSpec): + """Spec for a grade input field. + + When ``allowed_grades`` is set, only those values pass validation in + addition to the field's baseline 0–12 range check. 
def _discriminated_input_spec_subclasses(
    base: type[InputSpec],
) -> Iterator[type[InputSpec]]:
    """Yield every descendant of ``base`` whose ``type`` field has a string default.

    The walk is depth-first and yields a class's descendants before the class
    itself. A class is skipped (its descendants are still visited) when it has
    no ``type`` field, when that field is required, or when its default is
    undefined or not a ``str``.
    """
    for subclass in base.__subclasses__():
        # Descend first so deeper subclasses are emitted ahead of their parent.
        yield from _discriminated_input_spec_subclasses(subclass)

        type_field = subclass.model_fields.get("type")
        if type_field is not None and not type_field.is_required():
            default = type_field.default
            if default is not PydanticUndefined and isinstance(default, str):
                yield subclass
class TokenUsage(BaseModel):
    """Token usage for one step of an evaluation: provider type, model, and token counts."""

    provider_type: LlmProvider
    model: str
    input_tokens: int
    output_tokens: int
def prompt_settings_to_extras_value(settings: PromptSettings) -> dict[str, Any]:
    """Flatten *settings* into the JSON-friendly dict stored in :attr:`StepMetadata.extras`.

    The result is the value kept under :data:`PROMPT_STEP_EXTRA_PROMPT_SETTINGS`.
    It carries ``provider_type`` (the enum's ``.value``), ``model`` and
    ``temperature``, in that order.
    """
    extras: dict[str, Any] = {}
    extras["provider_type"] = settings.provider_type.value
    extras["model"] = settings.model
    extras["temperature"] = settings.temperature
    return extras
class TextComplexityAnswer(Enum):
    """Closed set of text complexity answers.

    Every member wraps an ``EvaluationAnswer``; the :attr:`score` and
    :attr:`label` properties expose that answer's score string and
    human-friendly label directly on the member.
    """

    SLIGHTLY_COMPLEX = EvaluationAnswer(score="slightly_complex", label="Slightly complex")
    MODERATELY_COMPLEX = EvaluationAnswer(score="moderately_complex", label="Moderately complex")
    VERY_COMPLEX = EvaluationAnswer(score="very_complex", label="Very complex")
    EXCEEDINGLY_COMPLEX = EvaluationAnswer(score="exceedingly_complex", label="Exceedingly complex")

    @property
    def score(self) -> str:
        """Score string of the wrapped answer."""
        answer = self.value
        return answer.score

    @property
    def label(self) -> str:
        """Human-friendly label of the wrapped answer."""
        answer = self.value
        return answer.label

    @classmethod
    def from_score(cls, score: str) -> "TextComplexityAnswer":
        """Return the member whose score equals *score*.

        Raises:
            ValueError: If no member carries that score.
        """
        found = next((member for member in cls if member.value.score == score), None)
        if found is None:
            raise ValueError(f"Unknown text complexity score: {score!r}")
        return found
+ +See :func:`~learning_commons_evaluators.settings.load_settings.shared_settings_root` +for the resolution order. +""" diff --git a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py new file mode 100644 index 0000000..ebaa4a9 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py @@ -0,0 +1,257 @@ +"""Load evaluator settings from TOML files.""" + +from __future__ import annotations + +import importlib.resources +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Generic, TypeVar, cast + +try: + import tomllib +except ImportError: + import tomli as tomllib # type: ignore[import-not-found,no-redef] # Python < 3.11 + +from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings +from learning_commons_evaluators.schemas.errors import ConfigurationError +from learning_commons_evaluators.schemas.input_specs import ( + INPUT_SPEC_REGISTRY, + AnyInputSpec, +) +from learning_commons_evaluators.schemas.metadata import ( + EvaluatorMaturity, + EvaluatorMetadata, +) + +T = TypeVar("T") + + +def shared_settings_root() -> Path: + """Return the path to the evaluator settings directory. + + Resolution order: + + 1. ``EVALUATORS_SETTINGS_DIR`` environment variable β€” point this at the + ``sdks/settings/`` directory when working inside the monorepo so the + SDK reads the shared copy (shared with TypeScript and other SDKs) + rather than the bundled copy inside the package:: + + export EVALUATORS_SETTINGS_DIR=/path/to/evaluators/sdks/settings + + Also useful in CI jobs that check out settings separately. + + 2. Bundled package data β€” resolved via :mod:`importlib.resources` from the + ``learning_commons_evaluators.settings`` sub-package. This is the path + taken by a normal ``pip install`` (editable or non-editable) when the + env var is not set. 
def _parse_evaluator_metadata(data: dict) -> EvaluatorMetadata:
    """Build EvaluatorMetadata from the TOML ``[evaluator_metadata]`` section.

    Args:
        data: The parsed TOML document.

    Returns:
        The populated ``EvaluatorMetadata``, including one input spec per valid
        ``[[evaluator_metadata.inputs]]`` entry, keyed by the entry's ``name``.

    Raises:
        ConfigurationError: If the section or a required field is missing, if
            ``maturity`` is not a valid string, if an input entry names an
            unknown ``type``, or if an input entry's values are rejected by its
            spec class.
    """
    em = data.get("evaluator_metadata")
    if not em or not isinstance(em, dict):
        raise ConfigurationError("Missing required section [evaluator_metadata].")
    section = "evaluator_metadata"
    id_val = em.get("id")
    _require("id", id_val, section)
    version = em.get("version")
    _require("version", version, section)
    # TOML may parse an unquoted version (e.g. 1.0) as a number; normalise to str.
    if not isinstance(version, str):
        version = str(version)
    name = em.get("name")
    _require("name", name, section)
    description = em.get("description")
    _require("description", description, section)
    maturity_val = em.get("maturity")
    _require("maturity", maturity_val, section)
    if not isinstance(maturity_val, str):
        raise ConfigurationError(f"Field 'maturity' in [{section}] must be a string.")
    maturity = _parse_maturity(maturity_val)
    # Parse [[evaluator_metadata.inputs]] into a dict keyed by field name.
    # Dispatch on ``type`` to create the correct InputSpec subclass so that
    # type-specific constraint fields (e.g. min_text_length) are preserved.
    inputs: dict[str, AnyInputSpec] = {}
    for spec_dict in em.get("inputs", []):
        # Entries that are not tables or carry no ``name`` are skipped.
        if not (isinstance(spec_dict, dict) and "name" in spec_dict):
            continue
        field_name = spec_dict["name"]
        type_key = spec_dict.get("type", "")
        # Guard the lookup: a non-string ``type`` (e.g. an array) is unhashable
        # and would otherwise surface as a bare TypeError.
        spec_cls = INPUT_SPEC_REGISTRY.get(type_key) if isinstance(type_key, str) else None
        if spec_cls is None:
            raise ConfigurationError(
                f"Unknown input type '{type_key}' in [[evaluator_metadata.inputs]] "
                f"for field '{field_name}'. Expected one of: {sorted(INPUT_SPEC_REGISTRY)}."
            )
        try:
            spec = spec_cls(**spec_dict)
        except (TypeError, ValueError) as e:
            # pydantic's ValidationError is a ValueError subclass; report it as
            # a settings problem like every other failure in this loader.
            raise ConfigurationError(
                f"Invalid input spec for field '{field_name}' in "
                f"[[evaluator_metadata.inputs]]: {e!s}"
            ) from e
        inputs[field_name] = cast(AnyInputSpec, spec)

    return EvaluatorMetadata(
        id=str(id_val).strip(),
        version=version.strip(),
        name=str(name).strip(),
        description=str(description).strip(),
        maturity=maturity,
        inputs=inputs,
    )
def load_evaluator_settings(path: Path | str, settings_cls: type[T]) -> EvaluatorSettingsResult[T]:
    """
    Load and type an evaluator's settings from a TOML file.

    The ``[evaluator_metadata]`` section becomes an EvaluatorMetadata, the
    ``[prompts]`` section becomes a dict of prompt templates, and the
    ``[evaluation_settings]`` section is passed as keyword arguments to
    ``settings_cls`` after each ``prompt_settings_*`` sub-table has been
    converted to a PromptSettings.

    Args:
        path: Path to the .toml file.
        settings_cls: Class for evaluation settings (e.g. ConventionalityEvaluationSettings).
            Must accept keyword arguments matching the TOML evaluation_settings keys.

    Returns:
        EvaluatorSettingsResult with evaluator_metadata, evaluation_settings (typed), and prompts.

    Raises:
        ConfigurationError: If a required section or field is missing, or if
            ``settings_cls`` rejects the parsed values.
    """
    document = load_settings(path)
    metadata = _parse_evaluator_metadata(document)
    prompt_templates = _parse_prompts(document)

    section = document.get("evaluation_settings")
    if not isinstance(section, dict):
        raise ConfigurationError("Missing required section [evaluation_settings].")

    # Build the constructor kwargs, swapping each prompt_settings_* sub-table
    # for its PromptSettings equivalent and passing every other value through.
    settings_kwargs = {
        key: (
            _parse_prompt_settings_step(value, f"evaluation_settings.{key}")
            if key.startswith("prompt_settings_") and isinstance(value, dict)
            else value
        )
        for key, value in section.items()
    }

    try:
        typed_settings = settings_cls(**settings_kwargs)
    except ConfigurationError:
        # Already the right error type - propagate untouched.
        raise
    except Exception as exc:
        raise ConfigurationError(
            f"Invalid [evaluation_settings]: {exc!s}",
        ) from exc

    return EvaluatorSettingsResult(
        evaluator_metadata=metadata,
        evaluation_settings=typed_settings,
        prompts=prompt_templates,
    )
EvaluatorMetadata( + id="test-evaluator", + version="0.1", + name="Test Evaluator", + description="Used in unit tests.", + maturity=EvaluatorMaturity.beta, + ) + + +@pytest.fixture +def evaluation_metadata(evaluator_metadata): + """EvaluationMetadata with sensible defaults for unit tests.""" + return EvaluationMetadata( + evaluator_metadata=evaluator_metadata, + evaluation_settings=_StubEvaluationSettings(), + input_metadata={}, + ) + + +@pytest.fixture +def prompt_settings_google(): + """PromptSettings configured for Google, usable in multiple test modules.""" + return PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ) + + +@pytest.fixture +def config(): + """EvaluatorConfig with no telemetry, suitable for unit tests.""" + return create_config_no_telemetry() diff --git a/sdks/python/tests/evaluators/__init__.py b/sdks/python/tests/evaluators/__init__.py new file mode 100644 index 0000000..718585c --- /dev/null +++ b/sdks/python/tests/evaluators/__init__.py @@ -0,0 +1 @@ +# Tests for learning_commons_evaluators.evaluators diff --git a/sdks/python/tests/providers/__init__.py b/sdks/python/tests/providers/__init__.py new file mode 100644 index 0000000..ff6bba4 --- /dev/null +++ b/sdks/python/tests/providers/__init__.py @@ -0,0 +1 @@ +# Tests for providers diff --git a/sdks/python/tests/providers/test_langchain.py b/sdks/python/tests/providers/test_langchain.py new file mode 100644 index 0000000..65f3f41 --- /dev/null +++ b/sdks/python/tests/providers/test_langchain.py @@ -0,0 +1,134 @@ +"""Tests for LangChain provider factory and token usage.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from learning_commons_evaluators.errors import ConfigurationError +from learning_commons_evaluators.providers.langchain import ( + create_provider, + token_usage_from_aimessage, +) +from learning_commons_evaluators.schemas.config import ( + EvaluatorConfig, + GooglePromptProviderConfig, + LlmProvider, + 
def _config(**kwargs) -> EvaluatorConfig:
    """Build an EvaluatorConfig whose provider configs default to None.

    Any keyword argument overrides the matching default (or is passed through
    as an extra field).
    """
    provider_fields = (
        "google_prompt_provider_config",
        "openai_prompt_provider_config",
        "anthropic_prompt_provider_config",
    )
    # Defaults first, caller overrides second - later keys win.
    merged: dict = {**dict.fromkeys(provider_fields), **kwargs}
    return EvaluatorConfig(**merged)
model="gemini-2.0-flash", temperature=0.0 + ) + with pytest.raises(ConfigurationError, match="Google provider config is not set"): + create_provider(settings, _config()) + + def test_raises_when_openai_config_missing(self): + settings = PromptSettings(provider_type=LlmProvider.OPENAI, model="gpt-4o", temperature=0.0) + with pytest.raises(ConfigurationError, match="OpenAI provider config is not set"): + create_provider(settings, _config()) + + def test_raises_when_anthropic_config_missing(self): + settings = PromptSettings( + provider_type=LlmProvider.ANTHROPIC, model="claude-3", temperature=0.0 + ) + with pytest.raises(ConfigurationError, match="Anthropic provider config is not set"): + create_provider(settings, _config()) + + def test_raises_configuration_error_for_unrecognized_provider_type(self): + """Unknown provider_type must surface as ConfigurationError, not ValueError.""" + mock_settings = MagicMock() + mock_settings.provider_type = object() + with pytest.raises(ConfigurationError, match="Unsupported LLM provider type"): + create_provider( + mock_settings, + _config(google_prompt_provider_config=GooglePromptProviderConfig(api_key="k")), + ) + + +# --------------------------------------------------------------------------- +# token_usage_from_aimessage +# --------------------------------------------------------------------------- + + +class TestTokenUsageFromAIMessage: + def test_returns_zero_usage_when_no_usage_metadata(self): + settings = PromptSettings( + provider_type=LlmProvider.GOOGLE, model="gemini-2.0-flash", temperature=0.0 + ) + usage = token_usage_from_aimessage(object(), settings) + assert usage.provider_type == LlmProvider.GOOGLE + assert usage.model == "gemini-2.0-flash" + assert usage.input_tokens == 0 + assert usage.output_tokens == 0 + + def test_uses_usage_metadata_when_present(self): + settings = PromptSettings(provider_type=LlmProvider.OPENAI, model="gpt-4o", temperature=0.0) + message = type("Msg", (), {"usage_metadata": {"input_tokens": 
100, "output_tokens": 50}})() + usage = token_usage_from_aimessage(message, settings) + assert usage.input_tokens == 100 + assert usage.output_tokens == 50 + + def test_falls_back_to_response_metadata_when_usage_metadata_absent(self): + settings = PromptSettings(provider_type=LlmProvider.GOOGLE, model="gemini", temperature=0.0) + message = type( + "Msg", + (), + {"response_metadata": {"usage_metadata": {"input_tokens": 10, "output_tokens": 20}}}, + )() + usage = token_usage_from_aimessage(message, settings) + assert usage.input_tokens == 10 + assert usage.output_tokens == 20 diff --git a/sdks/python/tests/schemas/__init__.py b/sdks/python/tests/schemas/__init__.py new file mode 100644 index 0000000..37270b5 --- /dev/null +++ b/sdks/python/tests/schemas/__init__.py @@ -0,0 +1 @@ +# Tests for learning_commons_evaluators.schemas diff --git a/sdks/python/tests/schemas/test_common_inputs.py b/sdks/python/tests/schemas/test_common_inputs.py new file mode 100644 index 0000000..904a83a --- /dev/null +++ b/sdks/python/tests/schemas/test_common_inputs.py @@ -0,0 +1,100 @@ +"""Tests for TextInputField and GradeInputField.""" + +import pytest + +from learning_commons_evaluators.schemas.common_inputs import ( + GradeInputField, + TextInputField, +) +from learning_commons_evaluators.schemas.errors import ValidationError +from learning_commons_evaluators.schemas.input_specs import ( + GradeInputSpec, + TextInputSpec, +) + +# --------------------------------------------------------------------------- +# Helpers: minimal specs for tests that don't exercise constraints +# --------------------------------------------------------------------------- + + +def _text_spec(**kwargs) -> TextInputSpec: + """Return a TextInputSpec with no constraints unless overridden.""" + return TextInputSpec(name="text", **kwargs) + + +def _grade_spec(**kwargs) -> GradeInputSpec: + """Return a GradeInputSpec with no constraints unless overridden.""" + return GradeInputSpec(name="grade", **kwargs) + + 
+class TestTextInputField: + def test_value_and_metadata(self): + field = TextInputField(spec=_text_spec(), value="Hello, world!") + assert field.value == "Hello, world!" + assert field.input_metadata() == {"textLength": "13"} + + def test_validate_passes_within_limits(self): + TextInputField( + spec=_text_spec(min_text_length=2, max_text_length=10), + value="Hello", + ).validate() + + def test_validate_passes_at_exact_min(self): + TextInputField(spec=_text_spec(min_text_length=2), value="ab").validate() + + def test_validate_raises_below_min(self): + with pytest.raises(ValidationError, match="below minimum"): + TextInputField(spec=_text_spec(min_text_length=2), value="a").validate() + + def test_validate_passes_at_exact_max(self): + TextInputField(spec=_text_spec(max_text_length=10), value="a" * 10).validate() + + def test_validate_raises_above_max(self): + with pytest.raises(ValidationError, match="exceeds maximum"): + TextInputField(spec=_text_spec(max_text_length=10), value="a" * 11).validate() + + def test_validate_no_max_by_default(self): + # A spec with no constraints should accept arbitrarily long text. 
+ TextInputField(spec=_text_spec(), value="x" * 1000).validate() + + def test_spec_is_accessible(self): + spec = _text_spec(min_text_length=50) + field = TextInputField(spec=spec, value="abc") + assert field.spec.min_text_length == 50 + + +class TestGradeInputField: + def test_value_and_metadata(self): + field = GradeInputField(spec=_grade_spec(), value=5) + assert field.value == 5 + assert field.input_metadata() == {"grade": 5} + + def test_validate_passes_at_boundaries(self): + GradeInputField(spec=_grade_spec(), value=0).validate() # lower boundary + GradeInputField(spec=_grade_spec(), value=12).validate() # upper boundary + + def test_validate_raises_below_0(self): + with pytest.raises(ValidationError, match="0-12"): + GradeInputField(spec=_grade_spec(), value=-1).validate() + + def test_validate_raises_above_12(self): + with pytest.raises(ValidationError, match="0-12"): + GradeInputField(spec=_grade_spec(), value=13).validate() + + def test_validate_passes_when_in_allowed_grades(self): + GradeInputField( + spec=_grade_spec(allowed_grades=[5, 6, 7]), + value=6, + ).validate() + + def test_validate_raises_when_not_in_allowed_grades(self): + with pytest.raises(ValidationError, match="not in allowed set"): + GradeInputField( + spec=_grade_spec(allowed_grades=[5, 6, 7]), + value=8, + ).validate() + + def test_spec_is_accessible(self): + spec = _grade_spec(allowed_grades=[3, 4, 5]) + field = GradeInputField(spec=spec, value=4) + assert field.spec.allowed_grades == [3, 4, 5] diff --git a/sdks/python/tests/schemas/test_config.py b/sdks/python/tests/schemas/test_config.py new file mode 100644 index 0000000..9175450 --- /dev/null +++ b/sdks/python/tests/schemas/test_config.py @@ -0,0 +1,87 @@ +"""Tests for EvaluatorConfig, PromptProviderConfig subclasses, and factory functions.""" + +import pytest + +from learning_commons_evaluators.logger import SDK_LOGGER_NAME, get_logger +from learning_commons_evaluators.schemas.config import ( + AnthropicPromptProviderConfig, + 
GooglePromptProviderConfig, + LlmProvider, + OpenAIPromptProviderConfig, + create_config, + create_config_no_telemetry, + create_config_telemetry_with_full_input, +) + + +class TestLlmProvider: + @pytest.mark.parametrize( + "member,value", + [ + (LlmProvider.ANTHROPIC, "anthropic"), + (LlmProvider.GOOGLE, "google"), + (LlmProvider.OPENAI, "openai"), + ], + ) + def test_provider_value(self, member, value): + assert member.value == value + + +class TestPromptProviderConfigs: + @pytest.mark.parametrize( + "cls,expected_type", + [ + (GooglePromptProviderConfig, LlmProvider.GOOGLE), + (OpenAIPromptProviderConfig, LlmProvider.OPENAI), + (AnthropicPromptProviderConfig, LlmProvider.ANTHROPIC), + ], + ) + def test_provider_config_default_type(self, cls, expected_type): + cfg = cls(api_key="test-key") + assert cfg.type == expected_type + assert cfg.api_key == "test-key" + + def test_openai_config_accepts_base_url(self): + cfg = OpenAIPromptProviderConfig(api_key="key", base_url="https://custom.example.com") + assert cfg.base_url == "https://custom.example.com" + + def test_provider_config_base_url_defaults_to_none(self): + cfg = GooglePromptProviderConfig(api_key="key") + assert cfg.base_url is None + + +class TestEvaluatorConfigFactory: + def test_create_config_no_telemetry_defaults(self): + config = create_config_no_telemetry() + assert config.telemetry_id is None + assert config.send_full_input_with_telemetry is False + assert config.logger.name == SDK_LOGGER_NAME + + def test_create_config_no_telemetry_accepts_providers(self): + config = create_config_no_telemetry( + google_prompt_provider_config=GooglePromptProviderConfig(api_key="gk"), + openai_prompt_provider_config=OpenAIPromptProviderConfig(api_key="ok"), + ) + assert config.google_prompt_provider_config.api_key == "gk" + assert config.openai_prompt_provider_config.api_key == "ok" + + def test_create_config_sets_telemetry_id(self): + config = create_config(telemetry_id="tid-123") + assert config.telemetry_id == 
"tid-123" + assert config.send_full_input_with_telemetry is False + + def test_create_config_telemetry_with_full_input_sets_flag(self): + config = create_config_telemetry_with_full_input(telemetry_id="tid") + assert config.telemetry_id == "tid" + assert config.send_full_input_with_telemetry is True + + def test_explicit_logger_is_preserved(self): + custom = get_logger("custom_test") + config = create_config_no_telemetry(logger=custom) + assert config.logger is custom + + def test_config_is_frozen(self): + """EvaluatorConfig is a frozen dataclass; mutation must raise.""" + config = create_config_no_telemetry() + with pytest.raises((AttributeError, TypeError)): + config.telemetry_id = "other" # type: ignore[misc] diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py new file mode 100644 index 0000000..d25cefd --- /dev/null +++ b/sdks/python/tests/schemas/test_errors.py @@ -0,0 +1,134 @@ +"""Tests for the error hierarchy and wrap_provider_error(). + +Each wrap_provider_error test targets a specific branch in the function so +that every routing decision is covered independently. 
+""" + +import pytest + +from learning_commons_evaluators.schemas.errors import ( + APIError, + AuthenticationError, + ConfigurationError, + EvaluatorError, + EvaluatorTimeoutError, + NetworkError, + RateLimitError, + ValidationError, + wrap_provider_error, +) + + +class TestErrorHierarchy: + """Verify that every error type carries the right code, name, and retryable flag.""" + + @pytest.mark.parametrize( + "exc,expected_code,expected_retryable", + [ + (ValidationError("bad"), "VALIDATION_ERROR", None), + (ConfigurationError("missing"), "CONFIGURATION_ERROR", None), + (AuthenticationError("denied", 401), "AUTHENTICATION_ERROR", False), + (RateLimitError("slow down"), "RATE_LIMIT_ERROR", True), + (NetworkError("no route"), "NETWORK_ERROR", True), + (EvaluatorTimeoutError(), "TIMEOUT_ERROR", True), + ], + ) + def test_error_code_and_retryable(self, exc, expected_code, expected_retryable): + assert exc.code == expected_code + if expected_retryable is not None: + assert exc.retryable is expected_retryable + + def test_evaluator_error_stores_message(self): + err = EvaluatorError("something broke", code="ERR") + assert str(err) == "something broke" + assert err.message == "something broke" + assert err.code == "ERR" + + def test_api_error_status_code_and_retryable(self): + err = APIError("server error", status_code=500, retryable=True) + assert err.status_code == 500 + assert err.retryable is True + + def test_rate_limit_error_carries_retry_after(self): + err = RateLimitError("too fast", retry_after=60_000) + assert err.status_code == 429 + assert err.retry_after == 60_000 + + +class TestWrapProviderError: + """One test per routing branch in wrap_provider_error().""" + + def test_401_with_api_key_text_returns_authentication_error(self): + wrapped = wrap_provider_error(Exception("401 Invalid API key")) + assert isinstance(wrapped, AuthenticationError) + assert wrapped.status_code == 401 + + def test_401_without_api_key_text_uses_fallback_message(self): + """When the 401 
message doesn't mention 'api key', wrap should still use AuthenticationError + with the generic 'Invalid API key' fallback rather than the raw message.""" + wrapped = wrap_provider_error(Exception("401 Unauthorized")) + assert isinstance(wrapped, AuthenticationError) + assert wrapped.status_code == 401 + assert "Invalid API key" in str(wrapped) + + def test_403_returns_authentication_error(self): + wrapped = wrap_provider_error(Exception("403 Forbidden")) + assert isinstance(wrapped, AuthenticationError) + assert wrapped.status_code == 403 + + def test_429_returns_rate_limit_error(self): + wrapped = wrap_provider_error(Exception("429 rate limit exceeded")) + assert isinstance(wrapped, RateLimitError) + assert wrapped.status_code == 429 + + def test_429_with_retry_after_header_parses_delay(self): + """Retry-After value in the message should be extracted and converted to ms.""" + wrapped = wrap_provider_error(Exception("429 rate limit exceeded. Retry-After: 5")) + assert isinstance(wrapped, RateLimitError) + assert wrapped.retry_after == 5000 # 5 seconds → 5000 ms + + def test_429_without_retry_after_sets_none(self): + wrapped = wrap_provider_error(Exception("429 rate limit exceeded")) + assert isinstance(wrapped, RateLimitError) + assert wrapped.retry_after is None + + @pytest.mark.parametrize( + "message", + [ + "ECONNREFUSED 127.0.0.1:443", + "ENOTFOUND api.example.com", + "ETIMEDOUT after 30s", + "Connection failed", + "Network error", + ], + ) + def test_network_keywords_return_network_error(self, message): + wrapped = wrap_provider_error(Exception(message)) + assert isinstance(wrapped, NetworkError) + + @pytest.mark.parametrize( + "message", + [ + "request timed out", + "Connection timed out after 10 seconds", + ], + ) + def test_timeout_phrases_return_timeout_error(self, message): + wrapped = wrap_provider_error(Exception(message)) + assert isinstance(wrapped, EvaluatorTimeoutError) + + def test_500_returns_retryable_api_error(self): + wrapped = 
wrap_provider_error(Exception("500 Internal Server Error")) + assert isinstance(wrapped, APIError) + assert wrapped.status_code == 500 + assert wrapped.retryable is True + + def test_unknown_exception_returns_non_retryable_api_error(self): + wrapped = wrap_provider_error(Exception("something completely unexpected")) + assert isinstance(wrapped, APIError) + assert wrapped.status_code is None + assert wrapped.retryable is False + + def test_empty_message_uses_default_message(self): + wrapped = wrap_provider_error(Exception(""), default_message="fallback message") + assert "fallback message" in str(wrapped) diff --git a/sdks/python/tests/schemas/test_input_specs.py b/sdks/python/tests/schemas/test_input_specs.py new file mode 100644 index 0000000..b672565 --- /dev/null +++ b/sdks/python/tests/schemas/test_input_specs.py @@ -0,0 +1,87 @@ +"""Tests for input spec types, the INPUT_SPEC_REGISTRY, and AnyInputSpec.""" + +from typing import Literal, get_args + +from learning_commons_evaluators.schemas.input_specs import ( + INPUT_SPEC_REGISTRY, + AnyInputSpec, + GradeInputSpec, + InputSpec, + TextInputSpec, + _populate_input_spec_registry, +) + + +class TestInputSpecRegistry: + def test_registry_contains_text_input_spec(self): + assert "TextInputField" in INPUT_SPEC_REGISTRY + assert INPUT_SPEC_REGISTRY["TextInputField"] is TextInputSpec + + def test_registry_contains_grade_input_spec(self): + assert "GradeInputField" in INPUT_SPEC_REGISTRY + assert INPUT_SPEC_REGISTRY["GradeInputField"] is GradeInputSpec + + def test_registry_only_contains_known_keys(self): + """Guard against accidental additions — update this if a new type is added.""" + assert set(INPUT_SPEC_REGISTRY.keys()) == {"TextInputField", "GradeInputField"} + + def test_new_subclass_is_picked_up_after_repopulation(self): + """A new InputSpec subclass appears in INPUT_SPEC_REGISTRY after _populate_input_spec_registry(). + + This verifies the registry mechanism works end-to-end. 
New subclasses must + also be added to AnyInputSpec manually (see module docstring checklist). + """ + + class _TestInputSpec(InputSpec): + type: Literal["_TestInputField"] = "_TestInputField" + + try: + _populate_input_spec_registry() + assert "_TestInputField" in INPUT_SPEC_REGISTRY + assert INPUT_SPEC_REGISTRY["_TestInputField"] is _TestInputSpec + finally: + # Restore the registry to the canonical state so other tests aren't affected. + INPUT_SPEC_REGISTRY.pop("_TestInputField", None) + _populate_input_spec_registry() + + +class TestAnyInputSpec: + def test_any_input_spec_includes_text_input_spec(self): + """AnyInputSpec must include TextInputSpec in its union members.""" + members = get_args(get_args(AnyInputSpec)[0]) + assert TextInputSpec in members + + def test_any_input_spec_includes_grade_input_spec(self): + """AnyInputSpec must include GradeInputSpec in its union members.""" + members = get_args(get_args(AnyInputSpec)[0]) + assert GradeInputSpec in members + + +class TestTextInputSpec: + def test_default_type_discriminator(self): + spec = TextInputSpec(name="text") + assert spec.type == "TextInputField" + + def test_optional_length_constraints(self): + spec = TextInputSpec(name="text", min_text_length=10, max_text_length=1000) + assert spec.min_text_length == 10 + assert spec.max_text_length == 1000 + + def test_no_length_constraints_by_default(self): + spec = TextInputSpec(name="text") + assert spec.min_text_length is None + assert spec.max_text_length is None + + +class TestGradeInputSpec: + def test_default_type_discriminator(self): + spec = GradeInputSpec(name="grade") + assert spec.type == "GradeInputField" + + def test_optional_allowed_grades(self): + spec = GradeInputSpec(name="grade", allowed_grades=[3, 4, 5]) + assert spec.allowed_grades == [3, 4, 5] + + def test_no_allowed_grades_by_default(self): + spec = GradeInputSpec(name="grade") + assert spec.allowed_grades is None diff --git a/sdks/python/tests/schemas/test_metadata.py 
b/sdks/python/tests/schemas/test_metadata.py new file mode 100644 index 0000000..408a39e --- /dev/null +++ b/sdks/python/tests/schemas/test_metadata.py @@ -0,0 +1,137 @@ +"""Unit tests for schemas/metadata.py. + +Covers: EvaluatorMaturity, Status, EvaluatorMetadata, TokenUsage, +StepMetadata, EvaluationMetadata, and prompt_settings_to_extras_value. +""" + +from datetime import timezone + +import pytest + +from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings +from learning_commons_evaluators.schemas.metadata import ( + PROMPT_STEP_EXTRA_PROMPT_SETTINGS, + PROMPT_STEP_EXTRA_TOKEN_USAGE, + EvaluatorMaturity, + EvaluatorMetadata, + Status, + StepMetadata, + TokenUsage, + prompt_settings_to_extras_value, +) + + +class TestEnums: + @pytest.mark.parametrize( + "member,value", + [ + (EvaluatorMaturity.alpha, "alpha"), + (EvaluatorMaturity.beta, "beta"), + (EvaluatorMaturity.rc, "rc"), + (EvaluatorMaturity.ga, "ga"), + ], + ) + def test_evaluator_maturity_values(self, member, value): + assert member.value == value + + @pytest.mark.parametrize( + "member,value", + [ + (Status.processing, "processing"), + (Status.succeeded, "succeeded"), + (Status.failed, "failed"), + ], + ) + def test_status_values(self, member, value): + assert member.value == value + + +class TestEvaluatorMetadata: + def test_fields_and_sdk_version(self): + meta = EvaluatorMetadata( + id="my-evaluator", + version="1.0", + name="My Evaluator", + description="Does stuff.", + maturity=EvaluatorMaturity.ga, + ) + assert meta.id == "my-evaluator" + assert meta.name == "My Evaluator" + assert meta.maturity == EvaluatorMaturity.ga + # sdk_version is auto-populated from the installed package version. 
+ assert "learning-commons-evaluators-python" in meta.sdk_version + + +class TestTokenUsage: + def test_fields(self): + usage = TokenUsage( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + input_tokens=100, + output_tokens=50, + ) + assert usage.provider_type == LlmProvider.GOOGLE + assert usage.input_tokens == 100 + assert usage.output_tokens == 50 + + def test_zero_tokens_are_valid(self): + usage = TokenUsage( + provider_type=LlmProvider.OPENAI, + model="gpt-4o-mini", + input_tokens=0, + output_tokens=0, + ) + assert usage.input_tokens == 0 + + +class TestStepMetadata: + def test_defaults(self): + step = StepMetadata(step_id="main") + assert step.status == Status.processing + assert step.error_details == "" + assert step.processing_time_ms == 0 + assert step.extras == {} + + def test_extras_dict_is_mutable(self): + step = StepMetadata(step_id="step1", extras={"a": 1}) + step.extras["b"] = 2 + assert step.extras == {"a": 1, "b": 2} + + def test_well_known_extra_key_constants(self): + assert PROMPT_STEP_EXTRA_PROMPT_SETTINGS == "prompt_settings" + assert PROMPT_STEP_EXTRA_TOKEN_USAGE == "token_usage" + + +class TestPromptSettingsToExtrasValue: + def test_produces_json_serialisable_dict(self): + """provider_type must be a plain string (not the enum) so the dict is JSON-safe.""" + settings = PromptSettings( + provider_type=LlmProvider.ANTHROPIC, + model="claude-3-haiku", + temperature=0.5, + ) + extras = prompt_settings_to_extras_value(settings) + assert isinstance(extras["provider_type"], str) + assert extras["provider_type"] == "anthropic" + assert extras["model"] == "claude-3-haiku" + assert extras["temperature"] == 0.5 + + +class TestEvaluationMetadata: + def test_timestamp_is_utc(self, evaluation_metadata): + assert evaluation_metadata.timestamp.tzinfo == timezone.utc + + def test_defaults(self, evaluation_metadata): + assert evaluation_metadata.status == Status.processing + assert evaluation_metadata.step_details == {} + assert 
evaluation_metadata.total_token_usage == {} + assert evaluation_metadata.error_details is None + + def test_status_can_be_mutated(self, evaluation_metadata): + evaluation_metadata.status = Status.succeeded + assert evaluation_metadata.status == Status.succeeded + + def test_step_details_can_be_populated(self, evaluation_metadata): + step = StepMetadata(step_id="main", status=Status.succeeded) + evaluation_metadata.step_details["main"] = step + assert evaluation_metadata.step_details["main"].status == Status.succeeded diff --git a/sdks/python/tests/schemas/test_text_complexity.py b/sdks/python/tests/schemas/test_text_complexity.py new file mode 100644 index 0000000..858f36f --- /dev/null +++ b/sdks/python/tests/schemas/test_text_complexity.py @@ -0,0 +1,84 @@ +"""Unit tests for schemas/text_complexity.py. + +Covers: TextComplexityAnswer (all members, from_score, score/label properties), +TextComplexityEvaluationInput, TextComplexityResult. +""" + +import pytest + +from learning_commons_evaluators.schemas.common_inputs import ( + GradeInputField, + TextInputField, +) +from learning_commons_evaluators.schemas.evaluator import EvaluationExplanation +from learning_commons_evaluators.schemas.input_specs import ( + GradeInputSpec, + TextInputSpec, +) +from learning_commons_evaluators.schemas.metadata import Status +from learning_commons_evaluators.schemas.text_complexity import ( + TextComplexityAnswer, + TextComplexityEvaluationInput, + TextComplexityResult, +) + +_ANSWER_CASES = [ + ("slightly_complex", "Slightly complex", TextComplexityAnswer.SLIGHTLY_COMPLEX), + ( + "moderately_complex", + "Moderately complex", + TextComplexityAnswer.MODERATELY_COMPLEX, + ), + ("very_complex", "Very complex", TextComplexityAnswer.VERY_COMPLEX), + ( + "exceedingly_complex", + "Exceedingly complex", + TextComplexityAnswer.EXCEEDINGLY_COMPLEX, + ), +] + + +class TestTextComplexityAnswer: + @pytest.mark.parametrize("score,label,member", _ANSWER_CASES) + def test_score_and_label(self, 
score, label, member): + assert member.score == score + assert member.label == label + + @pytest.mark.parametrize("score,_label,member", _ANSWER_CASES) + def test_from_score_round_trip(self, score, _label, member): + assert TextComplexityAnswer.from_score(score) is member + + def test_from_score_raises_on_unknown_score(self): + with pytest.raises(ValueError, match="Unknown text complexity score"): + TextComplexityAnswer.from_score("not_a_real_score") + + def test_from_score_is_case_sensitive(self): + """Scores are lowercase; the wrong case must not silently succeed.""" + with pytest.raises(ValueError): + TextComplexityAnswer.from_score("Slightly_Complex") + + +class TestTextComplexityEvaluationInput: + def test_input_values_returns_primitives(self): + inp = TextComplexityEvaluationInput( + text=TextInputField(spec=TextInputSpec(name="text"), value="Some text."), + grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=7), + ) + values = inp.input_values() + assert values["text"] == "Some text." 
+ assert values["grade_level"] == 7 + + +class TestTextComplexityResult: + def test_answer_and_metadata_status(self, evaluation_metadata): + result = TextComplexityResult( + answer=TextComplexityAnswer.VERY_COMPLEX, + explanation=EvaluationExplanation( + summary="Abstract language throughout.", + details={"conventionality_features": ["metaphor"]}, + ), + metadata=evaluation_metadata, + ) + assert result.answer is TextComplexityAnswer.VERY_COMPLEX + assert result.answer.score == "very_complex" + assert result.metadata.status == Status.processing diff --git a/sdks/python/tests/settings/__init__.py b/sdks/python/tests/settings/__init__.py new file mode 100644 index 0000000..0b4c93e --- /dev/null +++ b/sdks/python/tests/settings/__init__.py @@ -0,0 +1 @@ +# Tests for learning_commons_evaluators.settings diff --git a/sdks/python/tests/test_logger.py b/sdks/python/tests/test_logger.py new file mode 100644 index 0000000..5fba650 --- /dev/null +++ b/sdks/python/tests/test_logger.py @@ -0,0 +1,75 @@ +"""Tests for SDK logging helpers.""" + +import logging +from io import StringIO + +from learning_commons_evaluators.logger import ( + SDK_LOGGER_NAME, + create_logger, + create_silent_logger, + format_error, + get_logger, +) + + +class TestGetLogger: + def test_root_name(self) -> None: + log = get_logger() + assert log.name == SDK_LOGGER_NAME + + def test_child_name(self) -> None: + log = get_logger("evaluators") + assert log.name == f"{SDK_LOGGER_NAME}.evaluators" + + +class TestCreateLogger: + def test_returns_use_logger_unchanged(self) -> None: + custom = logging.getLogger("my_app.tests") + result = create_logger(use_logger=custom) + assert result is custom + + def test_default_adds_stream_handler(self) -> None: + child = get_logger("test_create_default") + # Isolate from other tests: fresh child name + for h in list(child.handlers): + child.removeHandler(h) + log = create_logger("test_create_default", level=logging.INFO) + assert isinstance(log.handlers[-1], 
logging.StreamHandler) + log.setLevel(logging.INFO) + + def test_stream_handler_emits(self, capsys) -> None: + buf = StringIO() + h = logging.StreamHandler(buf) + h.setFormatter(logging.Formatter("%(message)s")) + log = create_logger("test_emit", level=logging.DEBUG, handler=h) + log.info("hello") + assert "hello" in buf.getvalue() + + def test_respects_level_on_stream_handler(self) -> None: + buf = StringIO() + h = logging.StreamHandler(buf) + h.setLevel(logging.ERROR) + log = create_logger("test_level", level=logging.DEBUG, handler=h) + log.setLevel(logging.DEBUG) + log.info("nope") + log.error("yep") + out = buf.getvalue() + assert "nope" not in out + assert "yep" in out + + +class TestCreateSilentLogger: + def test_no_output(self, capsys) -> None: + log = create_silent_logger() + log.critical("should not appear") + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + + +class TestFormatError: + def test_format_error_returns_string(self) -> None: + err = ValueError("bad value") + s = format_error(err) + assert "ValueError" in s + assert "bad value" in s diff --git a/sdks/python/tests/test_package_imports.py b/sdks/python/tests/test_package_imports.py new file mode 100644 index 0000000..df987a2 --- /dev/null +++ b/sdks/python/tests/test_package_imports.py @@ -0,0 +1,34 @@ +"""Smoke tests for package public API imports.""" + + +def test_main_package_imports(): + """All advertised public symbols can be imported from the root package.""" + from learning_commons_evaluators import ( + BaseEvaluator, + __version__, + create_config_no_telemetry, + ) + + assert __version__ is not None + assert create_config_no_telemetry is not None + assert BaseEvaluator is not None + + +def test_errors_import(): + from learning_commons_evaluators import ( + APIError, + wrap_provider_error, + ) + + assert APIError is not None + assert wrap_provider_error is not None + + +def test_providers_import(): + from learning_commons_evaluators.providers import ( 
+ create_provider, + token_usage_from_aimessage, + ) + + assert create_provider is not None + assert token_usage_from_aimessage is not None From 8f07f58c782055d36ea93f54bf5fb45cab83ddd5 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 17:12:03 -0700 Subject: [PATCH 02/15] chore: PR comments - config validation todo, input_metadata only in logs --- .../evaluators/base.py | 6 +- sdks/python/tests/evaluators/test_base.py | 126 ++++++++++++++++++ sdks/python/tests/schemas/test_metadata.py | 1 + 3 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 sdks/python/tests/evaluators/test_base.py diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 47af736..81fd14f 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -56,6 +56,7 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]): def __init__(self, config: EvaluatorConfig) -> None: self.config = config + # TODO: validate config def evaluate( self, @@ -94,9 +95,7 @@ def evaluate( evaluation_metadata = EvaluationMetadata( evaluator_metadata=self.metadata, evaluation_settings=evaluation_settings, - input_metadata=input.input_values() - if self.config.send_full_input_with_telemetry - else input.input_metadata(), + input_metadata=input.input_metadata() ) self.config.logger.info( "evaluation start", @@ -118,6 +117,7 @@ def evaluate( "evaluation end", extra={"evaluation_metadata": evaluation_metadata}, ) + # TODO: add full input to telemetry if enabled # TODO: send_telemetry(evaluation_metadata) @abstractmethod diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py new file mode 100644 index 0000000..af1b2d9 --- /dev/null +++ b/sdks/python/tests/evaluators/test_base.py @@ -0,0 +1,126 @@ +"""Unit tests for 
:class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`. + +Covers ``evaluate`` wiring: :class:`~learning_commons_evaluators.schemas.metadata.EvaluationMetadata` +always receives ``input.input_metadata()`` (PII-safe field summaries), regardless of +``send_full_input_with_telemetry`` on config. Also covers ``execute_step`` and +``evaluate`` error propagation via a minimal concrete evaluator. +""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from learning_commons_evaluators import ( + BaseEvaluator, + EvaluationExplanation, + TextComplexityEvaluationInput, + create_config, +) +from learning_commons_evaluators.schemas.common_inputs import GradeInputField, TextInputField +from learning_commons_evaluators.schemas.config import EvaluationSettings +from learning_commons_evaluators.schemas.errors import ValidationError +from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec +from learning_commons_evaluators.schemas.metadata import ( + EvaluatorMaturity, + EvaluatorMetadata, + Status, +) +from learning_commons_evaluators.schemas.text_complexity import ( + TextComplexityAnswer, + TextComplexityResult, +) + + +class _StubSettings(EvaluationSettings): + """Minimal settings model for stub evaluator.""" + + +def _stub_input() -> TextComplexityEvaluationInput: + return TextComplexityEvaluationInput( + text=TextInputField(spec=TextInputSpec(name="text"), value="hello world"), + grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), + ) + + +class _StubEvaluator( + BaseEvaluator[TextComplexityEvaluationInput, TextComplexityResult, _StubSettings] +): + metadata = EvaluatorMetadata( + id="stub-evaluator", + version="0", + name="Stub", + description="Unit test stub.", + maturity=EvaluatorMaturity.beta, + ) + default_evaluation_settings = _StubSettings() + + def evaluate_impl( + self, + input: TextComplexityEvaluationInput, + evaluation_settings: 
_StubSettings, + evaluation_metadata, + ) -> TextComplexityResult: + return TextComplexityResult( + answer=TextComplexityAnswer.SLIGHTLY_COMPLEX, + explanation=EvaluationExplanation(summary="stub", details={}), + metadata=evaluation_metadata, + ) + + +@pytest.fixture +def stub_evaluator(config): + return _StubEvaluator(config) + + +class TestEvaluateInputMetadata: + def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator): + inp = _stub_input() + result = stub_evaluator.evaluate(inp) + assert result.metadata.input_metadata == inp.input_metadata() + assert result.metadata.input_metadata["text"] == {"textLength": "11"} + assert result.metadata.input_metadata["grade_level"] == {"grade": 3} + + def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator): + """``send_full_input_with_telemetry`` does not replace ``input_metadata`` with raw values.""" + # Re-bind evaluator with telemetry + full-input flag (same class, new config). + cfg = create_config(telemetry_id="test", send_full_input_with_telemetry=True) + ev = _StubEvaluator(cfg) + inp = _stub_input() + result = ev.evaluate(inp) + assert result.metadata.input_metadata == inp.input_metadata() + assert result.metadata.input_metadata["text"] == {"textLength": "11"} + assert result.metadata.input_metadata["grade_level"] == {"grade": 3} + + +class TestEvaluateErrorHandling: + def test_raises_validation_error_for_invalid_input(self, stub_evaluator): + inp = TextComplexityEvaluationInput( + text=TextInputField( + spec=TextInputSpec(name="text", min_text_length=100), + value="short", + ), + grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), + ) + with pytest.raises(ValidationError): + stub_evaluator.evaluate(inp) + + def test_propagates_evaluate_impl_exception(self, stub_evaluator): + with ( + patch.object(stub_evaluator, "evaluate_impl", side_effect=RuntimeError("boom")), + pytest.raises(RuntimeError, match="boom"), + ): + 
stub_evaluator.evaluate(_stub_input()) + + +class TestExecuteStep: + def test_returns_implementation_result(self, stub_evaluator, evaluation_metadata): + assert stub_evaluator.execute_step("s", evaluation_metadata, lambda: "ok") == "ok" + + def test_records_failed_status_on_exception(self, stub_evaluator, evaluation_metadata): + failing = MagicMock(side_effect=ValueError("boom")) + with pytest.raises(ValueError, match="boom"): + stub_evaluator.execute_step("s", evaluation_metadata, failing) + assert evaluation_metadata.step_details["s"].status == Status.failed + assert "boom" in evaluation_metadata.step_details["s"].error_details diff --git a/sdks/python/tests/schemas/test_metadata.py b/sdks/python/tests/schemas/test_metadata.py index 408a39e..c2b7590 100644 --- a/sdks/python/tests/schemas/test_metadata.py +++ b/sdks/python/tests/schemas/test_metadata.py @@ -126,6 +126,7 @@ def test_defaults(self, evaluation_metadata): assert evaluation_metadata.step_details == {} assert evaluation_metadata.total_token_usage == {} assert evaluation_metadata.error_details is None + assert evaluation_metadata.input_metadata == {} def test_status_can_be_mutated(self, evaluation_metadata): evaluation_metadata.status = Status.succeeded From 25d3ff0dbe3a87cae7078ea4310e226ea80569af Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 17:24:54 -0700 Subject: [PATCH 03/15] chore: execute_step implementation without unsafe casts --- .../evaluators/base.py | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 81fd14f..0a7f495 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -5,7 +5,7 @@ import time from abc import ABC, abstractmethod from collections.abc import Callable -from typing import Any, Generic, TypeVar, 
cast +from typing import Any, Generic, TypeVar, overload from pydantic import BaseModel from pydantic import ValidationError as PydanticValidationError @@ -41,7 +41,8 @@ InputT = TypeVar("InputT", bound=EvaluationInput) OutputT = TypeVar("OutputT", bound=EvaluationResult) SettingsT = TypeVar("SettingsT", bound=EvaluationSettings) -ResultT = TypeVar("ResultT", bound=BaseModel) +StepResultT = TypeVar("StepResultT") +ParsedT = TypeVar("ParsedT", bound=BaseModel) class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]): @@ -134,14 +135,17 @@ def execute_step( self, step_name: str, evaluation_metadata: EvaluationMetadata, - implementation_function: Callable[[], ResultT], + implementation_function: Callable[[], StepResultT], *, extras: dict[str, Any] | None = None, - ) -> ResultT: + ) -> StepResultT: """Run ``implementation_function`` and record step metadata on ``evaluation_metadata``. ``step_name`` is always the step id. Optional ``extras`` is copied into :attr:`StepMetadata.extras` (merged with any updates made during the step, e.g. token usage). + + The step may return any type (e.g. a Pydantic model, a plain ``str``, or ``None``); the same + type is returned to the caller. """ start = time.perf_counter() step_extras = dict(extras) if extras is not None else {} @@ -160,6 +164,28 @@ def execute_step( self.config.logger.info("step end", extra={"step_metadata": step_metadata}) evaluation_metadata.step_details[step_name] = step_metadata + @overload + def execute_prompt_chain_step( + self, + step_name: str, + prompt_settings: PromptSettings, + evaluation_metadata: EvaluationMetadata, + template: Any, + chain_inputs: dict[str, Any], + parser_output_type: None = None, + ) -> str: ... + + @overload + def execute_prompt_chain_step( + self, + step_name: str, + prompt_settings: PromptSettings, + evaluation_metadata: EvaluationMetadata, + template: Any, + chain_inputs: dict[str, Any], + parser_output_type: type[ParsedT], + ) -> ParsedT: ... 
+ def execute_prompt_chain_step( self, step_name: str, @@ -167,8 +193,8 @@ def execute_prompt_chain_step( evaluation_metadata: EvaluationMetadata, template: Any, chain_inputs: dict[str, Any], - parser_output_type: type[ResultT] | None = None, - ) -> ResultT | str: + parser_output_type: type[BaseModel] | None = None, + ) -> BaseModel | str: """Run a prompt chain (template | LLM), record metadata, and return the result. When ``parser_output_type`` is a Pydantic model class, the LLM response is @@ -201,7 +227,7 @@ def execute_prompt_chain_step( # Populated after a successful LLM invoke so we can attach usage even if parsing fails. token_usage: TokenUsage | None = None - def _run_chain() -> ResultT | str: + def _run_chain() -> BaseModel | str: nonlocal token_usage try: provider = create_provider(prompt_settings, self.config) @@ -227,18 +253,15 @@ def _run_chain() -> ResultT | str: raise wrap_provider_error(e) from e try: - return cast( - ResultT | str, - self.execute_step( - step_name, - evaluation_metadata, - extras={ - PROMPT_STEP_EXTRA_PROMPT_SETTINGS: prompt_settings_to_extras_value( - prompt_settings - ), - }, - implementation_function=cast(Callable[[], ResultT], _run_chain), - ), + return self.execute_step( + step_name, + evaluation_metadata, + extras={ + PROMPT_STEP_EXTRA_PROMPT_SETTINGS: prompt_settings_to_extras_value( + prompt_settings + ), + }, + implementation_function=_run_chain, ) finally: if token_usage is not None: From 7e9c989a8262d0d6e4bbd30f6101954dd3338b22 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 17:41:02 -0700 Subject: [PATCH 04/15] test: unit tests for base evaluator. 
somehow didn't make into this branch when I split the PR using AI --- sdks/python/tests/evaluators/test_base.py | 486 +++++++++++++++++++++- 1 file changed, 471 insertions(+), 15 deletions(-) diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index af1b2d9..ca21596 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -1,40 +1,66 @@ -"""Unit tests for :class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`. +"""Tests for :class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`. -Covers ``evaluate`` wiring: :class:`~learning_commons_evaluators.schemas.metadata.EvaluationMetadata` -always receives ``input.input_metadata()`` (PII-safe field summaries), regardless of -``send_full_input_with_telemetry`` on config. Also covers ``execute_step`` and -``evaluate`` error propagation via a minimal concrete evaluator. +Covers: ``__init__``, ``evaluate`` (metadata, settings override, success/failure), +``update_total_token_usage``, ``execute_step``, ``execute_prompt_chain_step``. 
""" from __future__ import annotations +import logging from unittest.mock import MagicMock, patch import pytest +from langchain_core.messages import AIMessage +from langchain_core.prompts import ChatPromptTemplate +from pydantic import BaseModel, Field +from pydantic import ValidationError as PydanticValidationError from learning_commons_evaluators import ( BaseEvaluator, EvaluationExplanation, TextComplexityEvaluationInput, create_config, + create_config_no_telemetry, ) +from learning_commons_evaluators.errors import ConfigurationError from learning_commons_evaluators.schemas.common_inputs import GradeInputField, TextInputField -from learning_commons_evaluators.schemas.config import EvaluationSettings -from learning_commons_evaluators.schemas.errors import ValidationError +from learning_commons_evaluators.schemas.config import ( + EvaluationSettings, + LlmProvider, + PromptSettings, +) +from learning_commons_evaluators.schemas.errors import APIError, EvaluatorError, ValidationError from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec from learning_commons_evaluators.schemas.metadata import ( + PROMPT_STEP_EXTRA_PROMPT_SETTINGS, + PROMPT_STEP_EXTRA_TOKEN_USAGE, EvaluatorMaturity, EvaluatorMetadata, Status, + TokenUsage, ) from learning_commons_evaluators.schemas.text_complexity import ( TextComplexityAnswer, TextComplexityResult, ) +_CHAIN_PATCH = "learning_commons_evaluators.evaluators.base.create_provider" + + +class _ChainOutput(BaseModel): + """Minimal LLM JSON payload model (stand-in for conventionality output models).""" + + label: str = Field(description="short label") + score: int = Field(description="numeric score") + + +_CHAIN_JSON = '{"label": "ok", "score": 7}' + class _StubSettings(EvaluationSettings): - """Minimal settings model for stub evaluator.""" + """Minimal settings; ``marker`` supports tests that pass explicit ``evaluation_settings``.""" + + marker: int = 0 def _stub_input() -> 
TextComplexityEvaluationInput: @@ -64,7 +90,10 @@ def evaluate_impl( ) -> TextComplexityResult: return TextComplexityResult( answer=TextComplexityAnswer.SLIGHTLY_COMPLEX, - explanation=EvaluationExplanation(summary="stub", details={}), + explanation=EvaluationExplanation( + summary="stub", + details={"marker": evaluation_settings.marker}, + ), metadata=evaluation_metadata, ) @@ -74,7 +103,37 @@ def stub_evaluator(config): return _StubEvaluator(config) +# --------------------------------------------------------------------------- +# BaseEvaluator.__init__ +# --------------------------------------------------------------------------- + + +class TestBaseEvaluatorInit: + def test_config_is_stored(self, config): + assert _StubEvaluator(config).config is config + + +# --------------------------------------------------------------------------- +# evaluate() +# --------------------------------------------------------------------------- + + +class TestEvaluateSuccess: + def test_sets_status_succeeded_and_processing_time(self, stub_evaluator): + result = stub_evaluator.evaluate(_stub_input()) + assert result.metadata.status == Status.succeeded + assert result.metadata.processing_time_ms >= 0.0 + + def test_passes_explicit_evaluation_settings(self, stub_evaluator): + custom = _StubSettings(marker=42) + result = stub_evaluator.evaluate(_stub_input(), evaluation_settings=custom) + assert result.metadata.evaluation_settings.marker == 42 + assert result.explanation.details.get("marker") == 42 + + class TestEvaluateInputMetadata: + """``input_metadata`` on :class:`EvaluationMetadata` always comes from ``input.input_metadata()``.""" + def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator): inp = _stub_input() result = stub_evaluator.evaluate(inp) @@ -83,8 +142,6 @@ def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator): assert result.metadata.input_metadata["grade_level"] == {"grade": 3} def 
test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator): - """``send_full_input_with_telemetry`` does not replace ``input_metadata`` with raw values.""" - # Re-bind evaluator with telemetry + full-input flag (same class, new config). cfg = create_config(telemetry_id="test", send_full_input_with_telemetry=True) ev = _StubEvaluator(cfg) inp = _stub_input() @@ -113,14 +170,413 @@ def test_propagates_evaluate_impl_exception(self, stub_evaluator): ): stub_evaluator.evaluate(_stub_input()) + def test_validation_failure_emits_end_log_with_failed_status(self, stub_evaluator): + captured: list = [] + + class _Capture(logging.Handler): + def emit(self, record: logging.LogRecord) -> None: + meta = getattr(record, "evaluation_metadata", None) + if meta is not None and record.getMessage() == "evaluation end": + captured.append(meta) + + h = _Capture() + stub_evaluator.config.logger.addHandler(h) + stub_evaluator.config.logger.setLevel(logging.INFO) + try: + inp = TextComplexityEvaluationInput( + text=TextInputField( + spec=TextInputSpec(name="text", min_text_length=100), + value="short", + ), + grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), + ) + with pytest.raises(ValidationError): + stub_evaluator.evaluate(inp) + finally: + stub_evaluator.config.logger.removeHandler(h) + + assert captured + assert captured[-1].status == Status.failed + assert captured[-1].error_details + + +# --------------------------------------------------------------------------- +# update_total_token_usage +# --------------------------------------------------------------------------- + + +class TestUpdateTotalTokenUsage: + def test_inserts_usage_for_new_provider(self, stub_evaluator, evaluation_metadata): + usage = TokenUsage( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + input_tokens=10, + output_tokens=5, + ) + stub_evaluator.update_total_token_usage(usage, evaluation_metadata) + stored = 
evaluation_metadata.total_token_usage[LlmProvider.GOOGLE] + assert stored.input_tokens == 10 + assert stored.output_tokens == 5 + + def test_accumulates_usage_for_existing_provider(self, stub_evaluator, evaluation_metadata): + evaluation_metadata.total_token_usage[LlmProvider.GOOGLE] = TokenUsage( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + input_tokens=10, + output_tokens=5, + ) + stub_evaluator.update_total_token_usage( + TokenUsage( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + input_tokens=20, + output_tokens=15, + ), + evaluation_metadata, + ) + stored = evaluation_metadata.total_token_usage[LlmProvider.GOOGLE] + assert stored.input_tokens == 30 + assert stored.output_tokens == 20 + + +# --------------------------------------------------------------------------- +# execute_step +# --------------------------------------------------------------------------- + class TestExecuteStep: def test_returns_implementation_result(self, stub_evaluator, evaluation_metadata): - assert stub_evaluator.execute_step("s", evaluation_metadata, lambda: "ok") == "ok" + assert ( + stub_evaluator.execute_step("s", evaluation_metadata, lambda: "the-result") + == "the-result" + ) + + def test_records_succeeded_status_on_success(self, stub_evaluator, evaluation_metadata): + stub_evaluator.execute_step("s", evaluation_metadata, lambda: None) + assert evaluation_metadata.step_details["s"].status == Status.succeeded - def test_records_failed_status_on_exception(self, stub_evaluator, evaluation_metadata): + def test_records_failed_status_and_error_on_exception( + self, stub_evaluator, evaluation_metadata + ): failing = MagicMock(side_effect=ValueError("boom")) with pytest.raises(ValueError, match="boom"): stub_evaluator.execute_step("s", evaluation_metadata, failing) - assert evaluation_metadata.step_details["s"].status == Status.failed - assert "boom" in evaluation_metadata.step_details["s"].error_details + step = 
evaluation_metadata.step_details["s"] + assert step.status == Status.failed + assert "boom" in step.error_details + + def test_re_raises_exception(self, stub_evaluator, evaluation_metadata): + failing = MagicMock(side_effect=RuntimeError("inner")) + with pytest.raises(RuntimeError, match="inner"): + stub_evaluator.execute_step("s", evaluation_metadata, failing) + + def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadata): + stub_evaluator.execute_step("s", evaluation_metadata, lambda: None, extras={"k": "v"}) + assert evaluation_metadata.step_details["s"].extras["k"] == "v" + + +# --------------------------------------------------------------------------- +# execute_prompt_chain_step +# --------------------------------------------------------------------------- + + +class TestExecutePromptChainStep: + """Mock ``create_provider`` so ``template | provider`` runs in-process (matches sdk_python tests).""" + + def test_returns_raw_string_when_parser_output_type_is_none(self, evaluation_metadata): + def _fake_llm(_pv): + return AIMessage(content="plain prose") + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + ev = _StubEvaluator(create_config_no_telemetry()) + with patch(_CHAIN_PATCH, return_value=_fake_llm): + out = ev.execute_prompt_chain_step( + step_name="raw", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "Hello"}, + parser_output_type=None, + ) + assert out == "plain prose" + + def test_returns_parsed_pydantic_output(self, stub_evaluator, evaluation_metadata): + def _fake_llm(_pv): + return AIMessage(content=_CHAIN_JSON) + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with patch(_CHAIN_PATCH, return_value=_fake_llm): + result = stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + 
provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "Hello"}, + parser_output_type=_ChainOutput, + ) + assert isinstance(result, _ChainOutput) + assert result.label == "ok" + assert result.score == 7 + + def test_parser_returning_model_instance_short_circuits_model_validate( + self, stub_evaluator, evaluation_metadata + ): + """When ``JsonOutputParser.invoke`` returns a model, ``isinstance`` path skips ``model_validate``.""" + prebuilt = _ChainOutput(label="direct", score=99) + + def _fake_llm(_pv): + return AIMessage(content="unused") + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch(_CHAIN_PATCH, return_value=_fake_llm), + patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls, + ): + mock_parser = MagicMock() + mock_parser.invoke.return_value = prebuilt + mock_parser_cls.return_value = mock_parser + result = stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "Hello"}, + parser_output_type=_ChainOutput, + ) + assert result is prebuilt + + def test_keyboard_interrupt_from_parser_propagates(self, stub_evaluator, evaluation_metadata): + def _fake_llm(_pv): + return AIMessage(content=_CHAIN_JSON) + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch(_CHAIN_PATCH, return_value=_fake_llm), + patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls, + ): + mock_parser = MagicMock() + mock_parser.invoke.side_effect = KeyboardInterrupt + mock_parser_cls.return_value = mock_parser + with pytest.raises(KeyboardInterrupt): + stub_evaluator.execute_prompt_chain_step( + step_name="main", + 
prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "Hello"}, + parser_output_type=_ChainOutput, + ) + + def test_system_exit_from_parser_propagates(self, stub_evaluator, evaluation_metadata): + def _fake_llm(_pv): + return AIMessage(content=_CHAIN_JSON) + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch(_CHAIN_PATCH, return_value=_fake_llm), + patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls, + ): + mock_parser = MagicMock() + mock_parser.invoke.side_effect = SystemExit(3) + mock_parser_cls.return_value = mock_parser + with pytest.raises(SystemExit) as exc_info: + stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "Hello"}, + parser_output_type=_ChainOutput, + ) + assert exc_info.value.code == 3 + + def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluation_metadata): + settings = PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ) + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + + with patch(_CHAIN_PATCH, return_value=lambda _pv: AIMessage(content=_CHAIN_JSON)): + stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=settings, + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "text"}, + parser_output_type=_ChainOutput, + ) + + step = evaluation_metadata.step_details["main"] + assert step.extras[PROMPT_STEP_EXTRA_PROMPT_SETTINGS]["model"] == "gemini-2.0-flash" + assert PROMPT_STEP_EXTRA_TOKEN_USAGE in step.extras + + def 
test_token_usage_recorded_when_llm_reports_usage(self, stub_evaluator, evaluation_metadata): + def _llm_with_usage(_pv): + return AIMessage( + content=_CHAIN_JSON, + usage_metadata={ + "input_tokens": 42, + "output_tokens": 17, + "total_tokens": 59, + }, + ) + + settings = PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ) + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + + with patch(_CHAIN_PATCH, return_value=_llm_with_usage): + stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=settings, + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "text"}, + parser_output_type=_ChainOutput, + ) + + step = evaluation_metadata.step_details["main"] + assert step.extras[PROMPT_STEP_EXTRA_TOKEN_USAGE]["input_tokens"] == 42 + assert step.extras[PROMPT_STEP_EXTRA_TOKEN_USAGE]["output_tokens"] == 17 + assert evaluation_metadata.total_token_usage[LlmProvider.GOOGLE].input_tokens == 42 + + def test_propagates_configuration_error_from_create_provider( + self, stub_evaluator, evaluation_metadata + ): + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch( + _CHAIN_PATCH, + side_effect=ConfigurationError("Google provider config is not set"), + ), + pytest.raises(ConfigurationError, match="Google provider config is not set"), + ): + stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "text"}, + parser_output_type=_ChainOutput, + ) + + def test_propagates_evaluator_error_without_wrapping(self, stub_evaluator, evaluation_metadata): + """``EvaluatorError`` subclasses raised inside the chain are re-raised unchanged.""" + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + 
patch(_CHAIN_PATCH, side_effect=EvaluatorError("bare evaluator error")), + pytest.raises(EvaluatorError, match="bare evaluator error"), + ): + stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "text"}, + parser_output_type=_ChainOutput, + ) + + def test_wraps_unexpected_chain_failure_as_api_error(self, stub_evaluator, evaluation_metadata): + def _boom(_pv): + raise ValueError("simulated provider failure") + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch(_CHAIN_PATCH, return_value=_boom), + pytest.raises(APIError, match="simulated provider failure"), + ): + stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "text"}, + parser_output_type=_ChainOutput, + ) + + def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluation_metadata): + """Invalid JSON from the LLM becomes :class:`APIError` via ``wrap_provider_error``.""" + + def _bad(_pv): + return AIMessage(content="not-json") + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch(_CHAIN_PATCH, return_value=_bad), + pytest.raises(APIError, match="Invalid json output"), + ): + stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "text"}, + parser_output_type=_ChainOutput, + ) + + def test_schema_mismatch_raises_pydantic_validation_error( + self, stub_evaluator, 
evaluation_metadata + ): + """Valid JSON that does not satisfy the output model raises Pydantic ``ValidationError``.""" + + def _partial(_pv): + return AIMessage(content='{"label": "only"}') + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch(_CHAIN_PATCH, return_value=_partial), + pytest.raises(PydanticValidationError), + ): + stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "text"}, + parser_output_type=_ChainOutput, + ) From daf7e7eccb3ba9930482ff9428314f56381b2cbb Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 17:52:40 -0700 Subject: [PATCH 05/15] chore: textLength as int --- .../src/learning_commons_evaluators/schemas/common_inputs.py | 2 +- .../src/learning_commons_evaluators/schemas/evaluator.py | 2 +- sdks/python/tests/evaluators/test_base.py | 4 ++-- sdks/python/tests/schemas/test_common_inputs.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py index 37a6aa0..446f7a4 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py @@ -53,7 +53,7 @@ def validate(self) -> None: def input_metadata(self) -> dict[str, Any]: """Return the character count — never the raw text (may contain PII).""" - return {"textLength": str(len(self.value))} + return {"textLength": len(self.value)} class GradeInputField(InputField[int]): diff --git a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py index 1e54a57..9b3c6d0 100644 ---
a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py @@ -46,7 +46,7 @@ def validate(self) -> None: raise ValidationError(...) def input_metadata(self) -> dict[str, Any]: - return {"textLength": str(len(self.value))} + return {"textLength": len(self.value)} """ spec: InputSpec diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index ca21596..ba0310b 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -138,7 +138,7 @@ def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator): inp = _stub_input() result = stub_evaluator.evaluate(inp) assert result.metadata.input_metadata == inp.input_metadata() - assert result.metadata.input_metadata["text"] == {"textLength": "11"} + assert result.metadata.input_metadata["text"] == {"textLength": 11} assert result.metadata.input_metadata["grade_level"] == {"grade": 3} def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator): @@ -147,7 +147,7 @@ def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, st inp = _stub_input() result = ev.evaluate(inp) assert result.metadata.input_metadata == inp.input_metadata() - assert result.metadata.input_metadata["text"] == {"textLength": "11"} + assert result.metadata.input_metadata["text"] == {"textLength": 11} assert result.metadata.input_metadata["grade_level"] == {"grade": 3} diff --git a/sdks/python/tests/schemas/test_common_inputs.py b/sdks/python/tests/schemas/test_common_inputs.py index 904a83a..bc6467c 100644 --- a/sdks/python/tests/schemas/test_common_inputs.py +++ b/sdks/python/tests/schemas/test_common_inputs.py @@ -31,7 +31,7 @@ class TestTextInputField: def test_value_and_metadata(self): field = TextInputField(spec=_text_spec(), value="Hello, world!") assert field.value == "Hello, world!" 
- assert field.input_metadata() == {"textLength": "13"} + assert field.input_metadata() == {"textLength": 13} def test_validate_passes_within_limits(self): TextInputField( From e5dd451a3783130ef7a96e18c98dd3139ae62669 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 18:09:37 -0700 Subject: [PATCH 06/15] feat: strip white space by default for text inputs --- .../schemas/common_inputs.py | 17 +++++++++++ .../schemas/input_specs.py | 7 ++++- .../tests/schemas/test_common_inputs.py | 29 +++++++++++++++++++ sdks/python/tests/schemas/test_input_specs.py | 4 +++ 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py index 446f7a4..4de9964 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py @@ -13,6 +13,8 @@ from typing import Any +from pydantic import model_validator + from learning_commons_evaluators.schemas.errors import ValidationError from learning_commons_evaluators.schemas.evaluator import InputField from learning_commons_evaluators.schemas.input_specs import ( @@ -34,12 +36,27 @@ class TextInputField(InputField[str]): Constraints (min/max text length) are read from ``spec`` rather than stored directly on the field, so the same spec object can be shared across many field instances. + + By default ``spec.strip_whitespace`` is true, so ``value`` is trimmed when the field is built. + Set it to false to preserve leading and trailing whitespace. """ # Narrows the abstract InputField.spec: InputSpec → TextInputSpec. # value: str is inherited from InputField[str] and does not need to be redeclared.
spec: TextInputSpec + @model_validator(mode="before") + @classmethod + def _strip_value_if_spec_requests(cls, data: Any) -> Any: + if not isinstance(data, dict): + return data + spec = data.get("spec") + value = data.get("value") + if not isinstance(value, str) or spec is None: + return data + if isinstance(spec, TextInputSpec) and spec.strip_whitespace: + return {**data, "value": value.strip()} + return data def validate(self) -> None: """Raise :class:`~.errors.ValidationError` if the value violates the spec constraints.""" text_length = len(self.value) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py index c35a6cc..bab6e06 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py @@ -6,7 +6,7 @@ Hierarchy:: InputSpec base: name, type, description, required - ├─ TextInputSpec adds: min_text_length, max_text_length + ├─ TextInputSpec adds: min_text_length, max_text_length, strip_whitespace └─ GradeInputSpec adds: allowed_grades **Adding a new input spec type — checklist** @@ -68,11 +68,16 @@ class TextInputSpec(InputSpec): Constraint fields are optional; omitting them means no length limit is enforced for that boundary. + + When ``strip_whitespace`` is true (the default), leading and trailing whitespace is removed + from the value when a :class:`~.common_inputs.TextInputField` is constructed + (before length validation). Set it to false to keep the raw string unchanged.
""" type: Literal["TextInputField"] = "TextInputField" min_text_length: int | None = None max_text_length: int | None = None + strip_whitespace: bool = True class GradeInputSpec(InputSpec): diff --git a/sdks/python/tests/schemas/test_common_inputs.py b/sdks/python/tests/schemas/test_common_inputs.py index bc6467c..2cc7d3e 100644 --- a/sdks/python/tests/schemas/test_common_inputs.py +++ b/sdks/python/tests/schemas/test_common_inputs.py @@ -62,6 +62,35 @@ def test_spec_is_accessible(self): field = TextInputField(spec=spec, value="abc") assert field.spec.min_text_length == 50 + def test_strip_whitespace_false_preserves_padding(self): + field = TextInputField(spec=_text_spec(strip_whitespace=False), value=" ab ") + assert field.value == " ab " + + def test_strip_whitespace_true_trims_value_by_default(self): + field = TextInputField(spec=_text_spec(), value=" ab ") + assert field.value == "ab" + + def test_strip_whitespace_applies_before_length_validation(self): + TextInputField( + spec=_text_spec(min_text_length=2), + value=" xx ", + ).validate() + + def test_strip_whitespace_explicit_true_trims_value(self): + field = TextInputField( + spec=_text_spec(strip_whitespace=True), + value=" cd ", + ) + assert field.value == "cd" + + def test_validate_raises_when_strip_shortens_below_min(self): + """Padding does not count toward ``min_text_length`` when stripping is on.""" + with pytest.raises(ValidationError, match="below minimum"): + TextInputField( + spec=_text_spec(min_text_length=5, strip_whitespace=True), + value=" ab ", + ).validate() + class TestGradeInputField: def test_value_and_metadata(self): diff --git a/sdks/python/tests/schemas/test_input_specs.py b/sdks/python/tests/schemas/test_input_specs.py index b672565..7c06d74 100644 --- a/sdks/python/tests/schemas/test_input_specs.py +++ b/sdks/python/tests/schemas/test_input_specs.py @@ -72,6 +72,10 @@ def test_no_length_constraints_by_default(self): assert spec.min_text_length is None assert spec.max_text_length is 
None + def test_strip_whitespace_defaults_true(self): + spec = TextInputSpec(name="text") + assert spec.strip_whitespace is True + class TestGradeInputSpec: def test_default_type_discriminator(self): From 6287e00ab186bf77785d9baa1a4d0126f7580f8c Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 19:02:05 -0700 Subject: [PATCH 07/15] chore: PR comments TODOs, remove redundant fields from PromptProviderConfig --- .../learning_commons_evaluators/schemas/config.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py index c451fb7..f3b1577 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/config.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/config.py @@ -16,6 +16,7 @@ # --- Prompt provider configs (for LLM calls in prompt steps) --- +# TODO: rename to LLMProvider class LlmProvider(str, Enum): """LLM provider identifier. Subclass of str so it compares and serializes as the provider name.""" @@ -24,22 +25,23 @@ class LlmProvider(str, Enum): OPENAI = "openai" +# TODO: rename to LLMProviderConfig and subclasses to GoogleLLMProviderConfig, OpenAILLMProviderConfig, AnthropicLLMProviderConfig. @dataclass(frozen=True) class PromptProviderConfig: """Base type for prompt provider configuration.""" api_key: str type: LlmProvider - base_url: str | None = ( - None # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy). Used only when type is OPENAI. - ) + # TODO: verify base_url functionality before enabling + # base_url: str | None = ( + # None # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy). Used only when type is OPENAI. + # ) @dataclass(frozen=True) class GooglePromptProviderConfig(PromptProviderConfig): """Google (Gemini) prompt provider config. 
Takes an API key.""" - api_key: str type: LlmProvider = LlmProvider.GOOGLE @@ -47,7 +49,6 @@ class GooglePromptProviderConfig(PromptProviderConfig): class OpenAIPromptProviderConfig(PromptProviderConfig): """OpenAI prompt provider config. Takes an API key. Optional base_url for custom endpoints.""" - api_key: str type: LlmProvider = LlmProvider.OPENAI @@ -55,7 +56,6 @@ class OpenAIPromptProviderConfig(PromptProviderConfig): class AnthropicPromptProviderConfig(PromptProviderConfig): """Anthropic (Claude) prompt provider config. Takes an API key.""" - api_key: str type: LlmProvider = LlmProvider.ANTHROPIC From 08c16d890aeaccb4fc379408c72547de6f3d59c0 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 19:18:34 -0700 Subject: [PATCH 08/15] chore: introduce TelemetryConfig class. --- sdks/python/README.md | 6 ++--- .../learning_commons_evaluators/__init__.py | 2 ++ .../src/learning_commons_evaluators/config.py | 2 ++ .../schemas/config.py | 25 +++++++++-------- sdks/python/tests/evaluators/test_base.py | 2 +- sdks/python/tests/schemas/test_config.py | 27 ++++++++++--------- sdks/python/tests/test_package_imports.py | 2 ++ 7 files changed, 39 insertions(+), 27 deletions(-) diff --git a/sdks/python/README.md b/sdks/python/README.md index 1630de4..49f54ad 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -148,7 +148,7 @@ logging.basicConfig(level=logging.INFO) # Create config with provider credentials config = create_config( google_prompt_provider_config=GooglePromptProviderConfig(api_key="your-google-key"), - telemetry_id="your-telemetry-id", + telemetry_partner_id="your-telemetry-id", ) # Create evaluator and run evaluation @@ -181,7 +181,7 @@ from learning_commons_evaluators import ( config = create_config( google_prompt_provider_config=GooglePromptProviderConfig(api_key="..."), - telemetry_id="your-telemetry-id", + telemetry_partner_id="your-telemetry-id", ) evaluator = ConventionalityEvaluator(config) @@ -225,7 +225,7 @@ from 
learning_commons_evaluators import create_config config = create_config( google_prompt_provider_config=google_config, openai_prompt_provider_config=openai_config, - telemetry_id="your-telemetry-id", + telemetry_partner_id="your-telemetry-id", logger=my_logger, # Optional: any standard logging.Logger (default: package logger) ) ``` diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py index 0b96e44..1d2228d 100644 --- a/sdks/python/src/learning_commons_evaluators/__init__.py +++ b/sdks/python/src/learning_commons_evaluators/__init__.py @@ -10,6 +10,7 @@ OpenAIPromptProviderConfig, PromptProviderConfig, PromptSettings, + TelemetryConfig, create_config, create_config_no_telemetry, create_config_telemetry_with_full_input, @@ -113,6 +114,7 @@ "RateLimitError", "SDK_LOGGER_NAME", "Status", + "TelemetryConfig", "TextComplexityEvaluationInput", "TextInputField", "TokenUsage", diff --git a/sdks/python/src/learning_commons_evaluators/config.py b/sdks/python/src/learning_commons_evaluators/config.py index f3dd635..dc5995f 100644 --- a/sdks/python/src/learning_commons_evaluators/config.py +++ b/sdks/python/src/learning_commons_evaluators/config.py @@ -7,6 +7,7 @@ OpenAIPromptProviderConfig, PromptProviderConfig, PromptSettings, + TelemetryConfig, create_config, create_config_no_telemetry, create_config_telemetry_with_full_input, @@ -19,6 +20,7 @@ "OpenAIPromptProviderConfig", "PromptProviderConfig", "PromptSettings", + "TelemetryConfig", "create_config", "create_config_no_telemetry", "create_config_telemetry_with_full_input", diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py index f3b1577..c083840 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/config.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/config.py @@ -77,6 +77,13 @@ class EvaluationSettings(BaseModel): model_config = 
ConfigDict(arbitrary_types_allowed=True) +@dataclass(frozen=True) +class TelemetryConfig: + """Config for telemetry.""" + + telemetry_partner_id: str | None = None + send_full_input_with_telemetry: bool = False + # --- Evaluator config --- @@ -98,8 +105,7 @@ class EvaluatorConfig: openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None logger: Logger = field(default_factory=get_logger) - telemetry_id: str | None = None - send_full_input_with_telemetry: bool = False + telemetry: TelemetryConfig = field(default_factory=TelemetryConfig) def create_config( @@ -108,17 +114,16 @@ def create_config( openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None, anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None, logger: Logger | None = None, - telemetry_id: str, + telemetry_partner_id: str, send_full_input_with_telemetry: bool = False, ) -> EvaluatorConfig: - """Create evaluator config with telemetry. telemetry_id is required.""" + """Create evaluator config with telemetry. 
telemetry_partner_id is required.""" return EvaluatorConfig( google_prompt_provider_config=google_prompt_provider_config, openai_prompt_provider_config=openai_prompt_provider_config, anthropic_prompt_provider_config=anthropic_prompt_provider_config, logger=get_logger() if logger is None else logger, - telemetry_id=telemetry_id, - send_full_input_with_telemetry=send_full_input_with_telemetry, + telemetry=TelemetryConfig(telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=send_full_input_with_telemetry), ) @@ -135,8 +140,7 @@ def create_config_no_telemetry( openai_prompt_provider_config=openai_prompt_provider_config, anthropic_prompt_provider_config=anthropic_prompt_provider_config, logger=get_logger() if logger is None else logger, - telemetry_id=None, - send_full_input_with_telemetry=False, + telemetry=TelemetryConfig(telemetry_partner_id=None, send_full_input_with_telemetry=False), ) @@ -146,7 +150,7 @@ def create_config_telemetry_with_full_input( openai_prompt_provider_config: OpenAIPromptProviderConfig | None = None, anthropic_prompt_provider_config: AnthropicPromptProviderConfig | None = None, logger: Logger | None = None, - telemetry_id: str, + telemetry_partner_id: str, ) -> EvaluatorConfig: """Create evaluator config with telemetry and full input sent with telemetry.""" return EvaluatorConfig( @@ -154,6 +158,5 @@ def create_config_telemetry_with_full_input( openai_prompt_provider_config=openai_prompt_provider_config, anthropic_prompt_provider_config=anthropic_prompt_provider_config, logger=get_logger() if logger is None else logger, - telemetry_id=telemetry_id, - send_full_input_with_telemetry=True, + telemetry=TelemetryConfig(telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=True), ) diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index ba0310b..32cc3b5 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ 
-142,7 +142,7 @@ def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator): assert result.metadata.input_metadata["grade_level"] == {"grade": 3} def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator): - cfg = create_config(telemetry_id="test", send_full_input_with_telemetry=True) + cfg = create_config(telemetry_partner_id="test", send_full_input_with_telemetry=True) ev = _StubEvaluator(cfg) inp = _stub_input() result = ev.evaluate(inp) diff --git a/sdks/python/tests/schemas/test_config.py b/sdks/python/tests/schemas/test_config.py index 9175450..3727ee3 100644 --- a/sdks/python/tests/schemas/test_config.py +++ b/sdks/python/tests/schemas/test_config.py @@ -1,5 +1,7 @@ """Tests for EvaluatorConfig, PromptProviderConfig subclasses, and factory functions.""" +from dataclasses import FrozenInstanceError + import pytest from learning_commons_evaluators.logger import SDK_LOGGER_NAME, get_logger @@ -8,6 +10,7 @@ GooglePromptProviderConfig, LlmProvider, OpenAIPromptProviderConfig, + TelemetryConfig, create_config, create_config_no_telemetry, create_config_telemetry_with_full_input, @@ -46,15 +49,15 @@ def test_openai_config_accepts_base_url(self): assert cfg.base_url == "https://custom.example.com" def test_provider_config_base_url_defaults_to_none(self): - cfg = GooglePromptProviderConfig(api_key="key") + cfg = OpenAIPromptProviderConfig(api_key="key") assert cfg.base_url is None class TestEvaluatorConfigFactory: def test_create_config_no_telemetry_defaults(self): config = create_config_no_telemetry() - assert config.telemetry_id is None - assert config.send_full_input_with_telemetry is False + assert config.telemetry.telemetry_partner_id is None + assert config.telemetry.send_full_input_with_telemetry is False assert config.logger.name == SDK_LOGGER_NAME def test_create_config_no_telemetry_accepts_providers(self): @@ -65,15 +68,15 @@ def test_create_config_no_telemetry_accepts_providers(self): assert 
config.google_prompt_provider_config.api_key == "gk" assert config.openai_prompt_provider_config.api_key == "ok" - def test_create_config_sets_telemetry_id(self): - config = create_config(telemetry_id="tid-123") - assert config.telemetry_id == "tid-123" - assert config.send_full_input_with_telemetry is False + def test_create_config_sets_telemetry_partner_id(self): + config = create_config(telemetry_partner_id="tid-123") + assert config.telemetry.telemetry_partner_id == "tid-123" + assert config.telemetry.send_full_input_with_telemetry is False def test_create_config_telemetry_with_full_input_sets_flag(self): - config = create_config_telemetry_with_full_input(telemetry_id="tid") - assert config.telemetry_id == "tid" - assert config.send_full_input_with_telemetry is True + config = create_config_telemetry_with_full_input(telemetry_partner_id="tid") + assert config.telemetry.telemetry_partner_id == "tid" + assert config.telemetry.send_full_input_with_telemetry is True def test_explicit_logger_is_preserved(self): custom = get_logger("custom_test") @@ -83,5 +86,5 @@ def test_explicit_logger_is_preserved(self): def test_config_is_frozen(self): """EvaluatorConfig is a frozen dataclass; mutation must raise.""" config = create_config_no_telemetry() - with pytest.raises((AttributeError, TypeError)): - config.telemetry_id = "other" # type: ignore[misc] + with pytest.raises(FrozenInstanceError): + config.telemetry = TelemetryConfig(telemetry_partner_id="x", send_full_input_with_telemetry=False) diff --git a/sdks/python/tests/test_package_imports.py b/sdks/python/tests/test_package_imports.py index df987a2..bee9dad 100644 --- a/sdks/python/tests/test_package_imports.py +++ b/sdks/python/tests/test_package_imports.py @@ -5,6 +5,7 @@ def test_main_package_imports(): """All advertised public symbols can be imported from the root package.""" from learning_commons_evaluators import ( BaseEvaluator, + TelemetryConfig, __version__, create_config_no_telemetry, ) @@ -12,6 +13,7 @@ def 
test_main_package_imports(): assert __version__ is not None assert create_config_no_telemetry is not None assert BaseEvaluator is not None + assert TelemetryConfig is not None def test_errors_import(): From be6908addfb8e4a2aa7475d0b8c07a0cb1c6216a Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 19:24:53 -0700 Subject: [PATCH 09/15] chore: remove custom LLM endpoints support for now. --- sdks/python/README.md | 2 +- .../providers/langchain.py | 2 -- .../schemas/config.py | 10 +++++----- sdks/python/tests/providers/test_langchain.py | 18 +----------------- sdks/python/tests/schemas/test_config.py | 8 -------- 5 files changed, 7 insertions(+), 33 deletions(-) diff --git a/sdks/python/README.md b/sdks/python/README.md index 49f54ad..4c27daa 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -211,7 +211,7 @@ from learning_commons_evaluators import ( ) google_config = GooglePromptProviderConfig(api_key="...") -openai_config = OpenAIPromptProviderConfig(api_key="...", base_url=None) # base_url for Azure/proxies +openai_config = OpenAIPromptProviderConfig(api_key="...") anthropic_config = AnthropicPromptProviderConfig(api_key="...") ``` diff --git a/sdks/python/src/learning_commons_evaluators/providers/langchain.py b/sdks/python/src/learning_commons_evaluators/providers/langchain.py index d4d07b7..9496831 100644 --- a/sdks/python/src/learning_commons_evaluators/providers/langchain.py +++ b/sdks/python/src/learning_commons_evaluators/providers/langchain.py @@ -56,8 +56,6 @@ def _create_openai_llm( "model": prompt_settings.model, "temperature": prompt_settings.temperature, } - if openai_config.base_url is not None: - kwargs["base_url"] = openai_config.base_url return ChatOpenAI(**kwargs) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py index c083840..5c16495 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/config.py +++ 
b/sdks/python/src/learning_commons_evaluators/schemas/config.py @@ -32,10 +32,6 @@ class PromptProviderConfig: api_key: str type: LlmProvider - # TODO: verify base_url functionality before enabling - # base_url: str | None = ( - # None # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy). Used only when type is OPENAI. - # ) @dataclass(frozen=True) @@ -47,9 +43,13 @@ class GooglePromptProviderConfig(PromptProviderConfig): @dataclass(frozen=True) class OpenAIPromptProviderConfig(PromptProviderConfig): - """OpenAI prompt provider config. Takes an API key. Optional base_url for custom endpoints.""" + """OpenAI prompt provider config. Takes an API key.""" type: LlmProvider = LlmProvider.OPENAI + # TODO: verify base_url functionality before enabling + # base_url: str | None = ( + # None # Optional; for OpenAI-compatible endpoints (e.g. Azure, proxy). Used only when type is OPENAI. + # ) @dataclass(frozen=True) diff --git a/sdks/python/tests/providers/test_langchain.py b/sdks/python/tests/providers/test_langchain.py index 65f3f41..b9f9c31 100644 --- a/sdks/python/tests/providers/test_langchain.py +++ b/sdks/python/tests/providers/test_langchain.py @@ -1,6 +1,6 @@ """Tests for LangChain provider factory and token usage.""" -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -53,22 +53,6 @@ def test_openai_provider_returns_model(self): ) assert create_provider(settings, config) is not None - def test_openai_provider_with_base_url_forwards_it_to_chat_model(self): - """When OpenAIPromptProviderConfig.base_url is set, it must be passed to ChatOpenAI.""" - config = _config( - openai_prompt_provider_config=OpenAIPromptProviderConfig( - api_key="test-key", - base_url="https://custom.example.com/v1", - ), - ) - settings = PromptSettings( - provider_type=LlmProvider.OPENAI, model="gpt-4o-mini", temperature=0.0 - ) - with patch("langchain_openai.ChatOpenAI") as mock_chat: - create_provider(settings, config) - _, kwargs = 
mock_chat.call_args - assert kwargs.get("base_url") == "https://custom.example.com/v1" - def test_raises_when_google_config_missing(self): settings = PromptSettings( provider_type=LlmProvider.GOOGLE, model="gemini-2.0-flash", temperature=0.0 diff --git a/sdks/python/tests/schemas/test_config.py b/sdks/python/tests/schemas/test_config.py index 3727ee3..264304f 100644 --- a/sdks/python/tests/schemas/test_config.py +++ b/sdks/python/tests/schemas/test_config.py @@ -44,14 +44,6 @@ def test_provider_config_default_type(self, cls, expected_type): assert cfg.type == expected_type assert cfg.api_key == "test-key" - def test_openai_config_accepts_base_url(self): - cfg = OpenAIPromptProviderConfig(api_key="key", base_url="https://custom.example.com") - assert cfg.base_url == "https://custom.example.com" - - def test_provider_config_base_url_defaults_to_none(self): - cfg = OpenAIPromptProviderConfig(api_key="key") - assert cfg.base_url is None - class TestEvaluatorConfigFactory: def test_create_config_no_telemetry_defaults(self): From eaa5cf54d90af47f7d7a071233ffc300cc3a52ab Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 11 May 2026 20:21:37 -0700 Subject: [PATCH 10/15] chore: errors TODOs --- .../python/src/learning_commons_evaluators/schemas/errors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py index 470d371..a14e80b 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py @@ -2,7 +2,7 @@ import re - +# TODO: rename name and message, and remove Evaluator prefix where appropriate class EvaluatorError(Exception): """Base error class for all evaluator errors.""" @@ -85,7 +85,7 @@ def __init__(self, message: str = "Request timed out"): super().__init__(message, 408, True, "TIMEOUT_ERROR") self.name = "EvaluatorTimeoutError" - +# 
TODO: OpenAI & Anthropic may return a status_code in the response. def _parse_provider_error(error: BaseException) -> tuple[str, int | None, str | None]: message = str(error) status_code = None @@ -120,6 +120,7 @@ def wrap_provider_error( # Timeouts before generic "Connection" — many stacks use "Connection timed out" if "timeout" in msg.lower() or "timed out" in msg.lower(): return EvaluatorTimeoutError(msg) + # TODO: confirm if these apply to Python too. Based on TypeScript SDK implementation. if any( x in msg for x in ( From 6086a917f9b8bda8c544c9a563e42e492b58300e Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Tue, 12 May 2026 00:59:56 -0700 Subject: [PATCH 11/15] chore: simplified load_settings and formatting --- sdks/python/pyproject.toml | 16 +- .../evaluators/base.py | 2 +- .../schemas/common_inputs.py | 1 + .../schemas/config.py | 10 +- .../schemas/errors.py | 2 + .../schemas/input_specs.py | 4 +- .../schemas/metadata.py | 46 +++- .../settings/load_settings.py | 191 +++++---------- sdks/python/tests/schemas/test_config.py | 4 +- sdks/python/tests/schemas/test_metadata.py | 35 ++- .../tests/settings/fixtures/__init__.py | 1 + .../tests/settings/fixtures/minimal.toml | 1 + .../tests/settings/test_load_settings.py | 228 ++++++++++++++++++ 13 files changed, 384 insertions(+), 157 deletions(-) create mode 100644 sdks/python/tests/settings/fixtures/__init__.py create mode 100644 sdks/python/tests/settings/fixtures/minimal.toml create mode 100644 sdks/python/tests/settings/test_load_settings.py diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index d48f934..ec69933 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -76,8 +76,8 @@ testpaths = ["tests"] target-version = "py310" line-length = 100 extend-exclude = [ - "src/learning_commons_evaluators/settings/_generated_conventionality_settings.py", - "src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py", + # Codegen output from ``make
generate-settings`` — not hand-edited. + "src/learning_commons_evaluators/settings/_generated_*_settings.py", ] [tool.ruff.lint] @@ -111,19 +111,7 @@ disable_error_code = ["override"] module = "learning_commons_evaluators.settings.load_settings" disable_error_code = ["import-untyped"] -[[tool.mypy.overrides]] -module = "contract_tests.loader" -disable_error_code = ["import-untyped"] - [[tool.mypy.overrides]] module = "learning_commons_evaluators.providers.langchain" # LangChain stubs lag the runtime API (e.g. ChatAnthropic kwargs). disable_error_code = ["call-arg", "arg-type"] - -[[tool.mypy.overrides]] -module = "learning_commons_evaluators.evaluators.conventionality" -disable_error_code = ["import-untyped", "call-arg"] - -[[tool.mypy.overrides]] -module = "learning_commons_evaluators.evaluators.vocabulary" -disable_error_code = ["import-untyped", "call-arg"] diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 0a7f495..9b48d99 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -96,7 +96,7 @@ def evaluate( evaluation_metadata = EvaluationMetadata( evaluator_metadata=self.metadata, evaluation_settings=evaluation_settings, - input_metadata=input.input_metadata() + input_metadata=input.input_metadata(), ) self.config.logger.info( "evaluation start", diff --git a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py index 4de9964..d916289 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py @@ -57,6 +57,7 @@ def _strip_value_if_spec_requests(cls, data: Any) -> Any: if isinstance(spec, TextInputSpec) and spec.strip_whitespace: return {**data, "value": value.strip()} return data + def
validate(self) -> None: """Raise :class:`~.errors.ValidationError` if the value violates the spec constraints.""" text_length = len(self.value) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py index 5c16495..94e44f2 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/config.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/config.py @@ -84,6 +84,7 @@ class TelemetryConfig: telemetry_partner_id: str | None = None send_full_input_with_telemetry: bool = False + # --- Evaluator config --- @@ -123,7 +124,10 @@ def create_config( openai_prompt_provider_config=openai_prompt_provider_config, anthropic_prompt_provider_config=anthropic_prompt_provider_config, logger=get_logger() if logger is None else logger, - telemetry=TelemetryConfig(telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=send_full_input_with_telemetry), + telemetry=TelemetryConfig( + telemetry_partner_id=telemetry_partner_id, + send_full_input_with_telemetry=send_full_input_with_telemetry, + ), ) @@ -158,5 +162,7 @@ def create_config_telemetry_with_full_input( openai_prompt_provider_config=openai_prompt_provider_config, anthropic_prompt_provider_config=anthropic_prompt_provider_config, logger=get_logger() if logger is None else logger, - telemetry=TelemetryConfig(telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=True), + telemetry=TelemetryConfig( + telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=True + ), ) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py index a14e80b..47e5a22 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py @@ -2,6 +2,7 @@ import re + # TODO: rename name and message, and remove Evaluator prefix where appropriate class 
EvaluatorError(Exception): """Base error class for all evaluator errors.""" @@ -85,6 +86,7 @@ def __init__(self, message: str = "Request timed out"): super().__init__(message, 408, True, "TIMEOUT_ERROR") self.name = "EvaluatorTimeoutError" + # TODO: OpenAI & Anthropic may return a status_code in the response. def _parse_provider_error(error: BaseException) -> tuple[str, int | None, str | None]: message = str(error) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py index bab6e06..0b6154b 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/input_specs.py @@ -40,8 +40,8 @@ # --------------------------------------------------------------------------- # Maps the ``type`` discriminator string to the concrete InputSpec class. -# Used by the TOML parser to instantiate the right subclass and to build -# the AnyInputSpec discriminated union below. +# Used when building :class:`~learning_commons_evaluators.schemas.metadata.EvaluatorMetadata` +# from TOML (via Pydantic's discriminated union on ``AnyInputSpec``). 
INPUT_SPEC_REGISTRY: dict[str, type["InputSpec"]] = {} diff --git a/sdks/python/src/learning_commons_evaluators/schemas/metadata.py b/sdks/python/src/learning_commons_evaluators/schemas/metadata.py index 3ca60ed..26de4dd 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/metadata.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/metadata.py @@ -4,7 +4,7 @@ from enum import Enum from typing import Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, TypeAdapter, field_validator, model_validator from learning_commons_evaluators._version import __version__ as sdk_version from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings @@ -31,14 +31,50 @@ class Status(Enum): class EvaluatorMetadata(BaseModel): """Evaluator metadata: id, version, name, description; maturity (alpha, beta, rc, ga); sdk_version.""" - id: str - version: str - name: str - description: str + id: str = Field(..., min_length=1) + version: str = Field(..., min_length=1) + name: str = Field(..., min_length=1) + description: str = Field(..., min_length=1) maturity: EvaluatorMaturity sdk_version: str = f"learning-commons-evaluators-python-{sdk_version}" inputs: dict[str, AnyInputSpec] = Field(default_factory=dict) + @model_validator(mode="before") + @classmethod + def _coerce_toml_inputs(cls, data: Any) -> Any: + """Turn ``[[evaluator_metadata.inputs]]`` list rows into ``inputs`` keyed by field name.""" + if not isinstance(data, dict): + return data + out = dict(data) + if "inputs" not in out: + return out + raw = out["inputs"] + if isinstance(raw, list): + adapter: TypeAdapter[Any] = TypeAdapter(AnyInputSpec) + parsed: dict[str, AnyInputSpec] = {} + for item in raw: + if not (isinstance(item, dict) and "name" in item): + continue + parsed[str(item["name"])] = adapter.validate_python(item) + out["inputs"] = parsed + elif raw is None: + out["inputs"] = {} + return out + + @field_validator("id", "version", "name", "description", 
mode="before") + @classmethod + def _strip_required_strings(cls, v: Any) -> Any: + if v is None: + return v + return str(v).strip() + + @field_validator("maturity", mode="before") + @classmethod + def _normalize_maturity(cls, v: Any) -> Any: + if isinstance(v, str): + return v.lower() + return v + class TokenUsage(BaseModel): """Token usage for a some step of an evaluation: provider type, model, and token counts.""" diff --git a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py index ebaa4a9..530dd9c 100644 --- a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py +++ b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py @@ -5,30 +5,31 @@ import importlib.resources import os from dataclasses import dataclass + +try: + from importlib.resources.abc import Traversable +except ImportError: + from importlib.abc import Traversable # Python < 3.11 from pathlib import Path -from typing import Generic, TypeVar, cast +from typing import Generic, TypeVar try: import tomllib except ImportError: import tomli as tomllib # type: ignore[import-not-found,no-redef] # Python < 3.11 -from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings +from pydantic import TypeAdapter +from pydantic import ValidationError as PydanticValidationError + +from learning_commons_evaluators.schemas.config import EvaluationSettings, PromptSettings from learning_commons_evaluators.schemas.errors import ConfigurationError -from learning_commons_evaluators.schemas.input_specs import ( - INPUT_SPEC_REGISTRY, - AnyInputSpec, -) -from learning_commons_evaluators.schemas.metadata import ( - EvaluatorMaturity, - EvaluatorMetadata, -) +from learning_commons_evaluators.schemas.metadata import EvaluatorMetadata -T = TypeVar("T") +T = TypeVar("T", bound=EvaluationSettings) -def shared_settings_root() -> Path: - """Return the path to the evaluator settings 
directory. +def shared_settings_root() -> Path | Traversable: + """Return the evaluator settings directory (filesystem or package Traversable). Resolution order: @@ -41,10 +42,13 @@ def shared_settings_root() -> Path: Also useful in CI jobs that check out settings separately. - 2. Bundled package data — resolved via :mod:`importlib.resources` from the - ``learning_commons_evaluators.settings`` sub-package. This is the path - taken by a normal ``pip install`` (editable or non-editable) when the - env var is not set. + 2. Bundled package data — a :class:`importlib.abc.Traversable` from + :func:`importlib.resources.files` for the ``learning_commons_evaluators.settings`` + sub-package (works from a wheel/zip without assuming a real directory path). + + Use :func:`load_settings` / :func:`load_evaluator_settings`, which accept a + ``Traversable``, or :func:`importlib.resources.as_file` if an API requires a + concrete :class:`pathlib.Path` on disk. The bundled copy is kept in sync with ``sdks/settings/`` — see the *Keeping settings in sync* section of the README. @@ -52,129 +56,55 @@ def shared_settings_root() -> Path: env = os.environ.get("EVALUATORS_SETTINGS_DIR") if env: return Path(env) - # importlib.resources.files() returns a Traversable; converting to Path - # works for both editable installs (a real directory) and zip/wheel installs - # (where Python 3.9+ extracts to a temp dir automatically).
- pkg = importlib.resources.files("learning_commons_evaluators.settings") - return Path(str(pkg)) + return importlib.resources.files("learning_commons_evaluators.settings") -def _require(key: str, value: object, section: str) -> None: - """Raise ConfigurationError if value is missing or empty.""" - if value is None or (isinstance(value, str) and not value.strip()): - raise ConfigurationError(f"Missing required field '{key}' in [{section}]") - - -def load_settings(path: Path | str) -> dict: +def load_settings(path: Path | str | Traversable) -> dict: """ Load raw settings from a TOML file. Args: - path: Path to the .toml file. + path: Path to the .toml file, or a :class:`~importlib.abc.Traversable` + (e.g. from :func:`shared_settings_root` when using bundled settings). Returns: Parsed TOML as a dict. """ - path = Path(path) - with path.open("rb") as f: + if isinstance(path, Traversable): + with path.open("rb") as f: + return tomllib.load(f) + with Path(path).open("rb") as f: return tomllib.load(f) -def _parse_maturity(value: str) -> EvaluatorMaturity: - """Map TOML maturity string to EvaluatorMaturity enum.""" +_prompt_settings_adapter = TypeAdapter(PromptSettings) + + +def _prepare_prompt_settings_dict(pm: dict) -> dict: + """Lowercase ``provider_type`` strings for TOML / enum matching.""" + d = dict(pm) + pt = d.get("provider_type") + if isinstance(pt, str): + d["provider_type"] = pt.lower().strip() + return d + + +def _validated_prompt_settings(pm: dict, step_name: str) -> PromptSettings: try: - return EvaluatorMaturity(value.lower()) - except ValueError as e: - raise ConfigurationError( - f"Invalid maturity '{value}' in [evaluator_metadata]; expected one of alpha, beta, rc, ga." 
- ) from e + return _prompt_settings_adapter.validate_python(_prepare_prompt_settings_dict(pm)) + except PydanticValidationError as e: + raise ConfigurationError(f"Invalid [{step_name}]: {e}") from e def _parse_evaluator_metadata(data: dict) -> EvaluatorMetadata: - """Build EvaluatorMetadata from TOML evaluator_metadata section. Raises ConfigurationError if a required field is missing.""" + """Build EvaluatorMetadata from TOML ``[evaluator_metadata]`` via Pydantic validation.""" em = data.get("evaluator_metadata") if not em or not isinstance(em, dict): raise ConfigurationError("Missing required section [evaluator_metadata].") - section = "evaluator_metadata" - id_val = em.get("id") - _require("id", id_val, section) - version = em.get("version") - _require("version", version, section) - if not isinstance(version, str): - version = str(version) - name = em.get("name") - _require("name", name, section) - description = em.get("description") - _require("description", description, section) - maturity_val = em.get("maturity") - _require("maturity", maturity_val, section) - if not isinstance(maturity_val, str): - raise ConfigurationError(f"Field 'maturity' in [{section}] must be a string.") - maturity = _parse_maturity(maturity_val) - # Parse [[evaluator_metadata.inputs]] into a dict keyed by field name. - # Dispatch on ``type`` to create the correct InputSpec subclass so that - # type-specific constraint fields (e.g. min_text_length) are preserved. - inputs: dict[str, AnyInputSpec] = {} - for spec_dict in em.get("inputs", []): - if not (isinstance(spec_dict, dict) and "name" in spec_dict): - continue - field_name = spec_dict["name"] - type_key = spec_dict.get("type", "") - spec_cls = INPUT_SPEC_REGISTRY.get(type_key) - if spec_cls is None: - raise ConfigurationError( - f"Unknown input type '{type_key}' in [[evaluator_metadata.inputs]] " - f"for field '{field_name}'. Expected one of: {sorted(INPUT_SPEC_REGISTRY)}." 
- ) - inputs[field_name] = cast(AnyInputSpec, spec_cls(**spec_dict)) - - return EvaluatorMetadata( - id=str(id_val).strip(), - version=version.strip(), - name=str(name).strip(), - description=str(description).strip(), - maturity=maturity, - inputs=inputs, - ) - - -def _parse_provider_type(value: str, step_name: str) -> LlmProvider: - """Map TOML provider type string to LlmProvider enum.""" - normalized = value.upper().strip() - if normalized == "GOOGLE": - return LlmProvider.GOOGLE - if normalized == "OPENAI": - return LlmProvider.OPENAI - if normalized == "ANTHROPIC": - return LlmProvider.ANTHROPIC - raise ConfigurationError( - f"Invalid provider type '{value}' in [{step_name}]; expected one of: google, openai, anthropic." - ) - - -def _parse_prompt_settings_step(pm: dict, step_name: str) -> PromptSettings: - """Build PromptSettings from a TOML prompt_settings_* subsection. Raises ConfigurationError if a required field is missing.""" - pt = pm.get("type") or pm.get("provider_type") - _require("type", pt, step_name) - if not isinstance(pt, str): - raise ConfigurationError(f"Field 'type' in [{step_name}] must be a string.") - provider_type = _parse_provider_type(pt, step_name) - model = pm.get("model") - _require("model", model, step_name) - if not isinstance(model, str): - raise ConfigurationError(f"Field 'model' in [{step_name}] must be a string.") - temp = pm.get("temperature") - if temp is None: - raise ConfigurationError(f"Missing required field 'temperature' in [{step_name}].") try: - temperature = float(temp) - except (TypeError, ValueError) as e: - raise ConfigurationError(f"Field 'temperature' in [{step_name}] must be a number.") from e - return PromptSettings( - provider_type=provider_type, - model=model.strip(), - temperature=temperature, - ) + return EvaluatorMetadata.model_validate(em) + except PydanticValidationError as e: + raise ConfigurationError(f"Invalid [evaluator_metadata]: {e}") from e def _normalize_prompt_whitespace(prompt: str) -> str: @@ 
-209,19 +139,22 @@ class EvaluatorSettingsResult(Generic[T]): prompts: dict[str, str] -def load_evaluator_settings(path: Path | str, settings_cls: type[T]) -> EvaluatorSettingsResult[T]: +def load_evaluator_settings( + path: Path | str | Traversable, settings_cls: type[T] +) -> EvaluatorSettingsResult[T]: """ Load evaluator settings from a TOML file. Parses evaluator_metadata into EvaluatorMetadata, evaluation_settings into an instance - of settings_cls (with prompt_settings_* subsections as PromptSettings), and extracts + of settings_cls (with ``prompt_settings_*`` subsections as :class:`~learning_commons_evaluators.schemas.config.PromptSettings` + using ``provider_type``, ``model``, and ``temperature``), and extracts prompt text (e.g. system_prompt) into a prompts dict. Raises ConfigurationError if any required field or section is missing. Args: - path: Path to the .toml file. + path: Path to the .toml file, or a :class:`~importlib.abc.Traversable` to it. settings_cls: Class for evaluation settings (e.g. ConventionalityEvaluationSettings). - Must accept keyword arguments matching the TOML evaluation_settings keys. + Must be a Pydantic :class:`~pydantic.BaseModel` subclass; validated with ``model_validate``. Returns: EvaluatorSettingsResult with evaluator_metadata, evaluation_settings (typed), and prompts. @@ -239,16 +172,12 @@ def load_evaluator_settings(path: Path | str, settings_cls: type[T]) -> Evaluato # Convert prompt_settings_* subsections to PromptSettings. 
for key in list(raw): if key.startswith("prompt_settings_") and isinstance(raw[key], dict): - raw[key] = _parse_prompt_settings_step(raw[key], f"evaluation_settings.{key}") + raw[key] = _validated_prompt_settings(raw[key], f"evaluation_settings.{key}") try: - evaluation_settings = settings_cls(**raw) - except Exception as e: - if isinstance(e, ConfigurationError): - raise - raise ConfigurationError( - f"Invalid [evaluation_settings]: {e!s}", - ) from e + evaluation_settings = settings_cls.model_validate(raw) + except PydanticValidationError as e: + raise ConfigurationError(f"Invalid [evaluation_settings]: {e}") from e return EvaluatorSettingsResult( evaluator_metadata=evaluator_metadata, diff --git a/sdks/python/tests/schemas/test_config.py b/sdks/python/tests/schemas/test_config.py index 264304f..6a50b09 100644 --- a/sdks/python/tests/schemas/test_config.py +++ b/sdks/python/tests/schemas/test_config.py @@ -79,4 +79,6 @@ def test_config_is_frozen(self): """EvaluatorConfig is a frozen dataclass; mutation must raise.""" config = create_config_no_telemetry() with pytest.raises(FrozenInstanceError): - config.telemetry = TelemetryConfig(telemetry_partner_id="x", send_full_input_with_telemetry=False) + config.telemetry = TelemetryConfig( + telemetry_partner_id="x", send_full_input_with_telemetry=False + ) diff --git a/sdks/python/tests/schemas/test_metadata.py b/sdks/python/tests/schemas/test_metadata.py index c2b7590..6ddbd81 100644 --- a/sdks/python/tests/schemas/test_metadata.py +++ b/sdks/python/tests/schemas/test_metadata.py @@ -7,8 +7,10 @@ from datetime import timezone import pytest +from pydantic import ValidationError from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings +from learning_commons_evaluators.schemas.input_specs import TextInputSpec from learning_commons_evaluators.schemas.metadata import ( PROMPT_STEP_EXTRA_PROMPT_SETTINGS, PROMPT_STEP_EXTRA_TOKEN_USAGE, @@ -61,8 +63,39 @@ def test_fields_and_sdk_version(self): # 
sdk_version is auto-populated from the installed package version. assert "learning-commons-evaluators-python" in meta.sdk_version + def test_model_validate_toml_like_inputs_and_maturity(self): + meta = EvaluatorMetadata.model_validate( + { + "id": " e ", + "version": 1.0, + "name": "N", + "description": "D", + "maturity": "GA", + "inputs": [ + {"name": "text", "type": "TextInputField"}, + ], + } + ) + assert meta.id == "e" + assert meta.version == "1.0" + assert meta.maturity == EvaluatorMaturity.ga + assert isinstance(meta.inputs["text"], TextInputSpec) + + def test_model_validate_unknown_input_type_raises(self): + with pytest.raises( + ValidationError, match="TextInputField|GradeInputField|Unable to extract" + ): + EvaluatorMetadata.model_validate( + { + "id": "e", + "version": "1", + "name": "N", + "description": "D", + "maturity": "ga", + "inputs": [{"name": "x", "type": "UnknownType"}], + } + ) -class TestTokenUsage: def test_fields(self): usage = TokenUsage( provider_type=LlmProvider.GOOGLE, diff --git a/sdks/python/tests/settings/fixtures/__init__.py b/sdks/python/tests/settings/fixtures/__init__.py new file mode 100644 index 0000000..0296240 --- /dev/null +++ b/sdks/python/tests/settings/fixtures/__init__.py @@ -0,0 +1 @@ +"""Static TOML fixtures for settings loader tests.""" diff --git a/sdks/python/tests/settings/fixtures/minimal.toml b/sdks/python/tests/settings/fixtures/minimal.toml new file mode 100644 index 0000000..662527e --- /dev/null +++ b/sdks/python/tests/settings/fixtures/minimal.toml @@ -0,0 +1 @@ +answer = 42 diff --git a/sdks/python/tests/settings/test_load_settings.py b/sdks/python/tests/settings/test_load_settings.py new file mode 100644 index 0000000..a572a5d --- /dev/null +++ b/sdks/python/tests/settings/test_load_settings.py @@ -0,0 +1,228 @@ +"""Tests for :mod:`learning_commons_evaluators.settings.load_settings`.""" + +from __future__ import annotations + +import importlib.resources +import textwrap +from pathlib import Path + +import 
pytest + +from learning_commons_evaluators.schemas.config import ( + EvaluationSettings, + LlmProvider, + PromptSettings, +) +from learning_commons_evaluators.schemas.errors import ConfigurationError +from learning_commons_evaluators.schemas.input_specs import TextInputSpec +from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity +from learning_commons_evaluators.settings.load_settings import ( + load_evaluator_settings, + load_settings, + shared_settings_root, +) + + +class _MiniSettings(EvaluationSettings): + """Minimal evaluation_settings block for loader tests.""" + + marker: int = 0 + + +class _WithPromptSettings(EvaluationSettings): + marker: int = 0 + prompt_settings_main: PromptSettings + + +def test_load_evaluator_settings_end_to_end(tmp_path: Path) -> None: + path = tmp_path / "eval.toml" + path.write_text( + textwrap.dedent( + """ + [evaluator_metadata] + id = "e" + version = "1.0" + name = "N" + description = "D" + maturity = "GA" + + [[evaluator_metadata.inputs]] + name = "text" + type = "TextInputField" + + [evaluation_settings] + marker = 7 + """ + ).strip() + ) + result = load_evaluator_settings(path, _MiniSettings) + assert result.evaluator_metadata.maturity == EvaluatorMaturity.ga + assert isinstance(result.evaluator_metadata.inputs["text"], TextInputSpec) + assert result.evaluation_settings.marker == 7 + + +def test_load_evaluator_settings_wraps_metadata_validation(tmp_path: Path) -> None: + path = tmp_path / "bad.toml" + path.write_text( + textwrap.dedent( + """ + [evaluator_metadata] + id = "" + version = "1" + name = "N" + description = "D" + maturity = "ga" + + [evaluation_settings] + marker = 0 + """ + ).strip() + ) + with pytest.raises(ConfigurationError, match="Invalid \\[evaluator_metadata\\]"): + load_evaluator_settings(path, _MiniSettings) + + +def test_load_settings_path_and_str_same_result(tmp_path: Path) -> None: + path = tmp_path / "raw.toml" + path.write_text('mode = "test"\n') + by_path = load_settings(path) + 
by_str = load_settings(str(path)) + assert by_path == by_str == {"mode": "test"} + + +def test_load_settings_traversable_package_file() -> None: + root = importlib.resources.files("tests.settings.fixtures") + tom = root.joinpath("minimal.toml") + assert load_settings(tom) == {"answer": 42} + + +def test_shared_settings_root_env_overrides( + monkeypatch: pytest.MonkeyPatch, tmp_path: Path +) -> None: + custom = tmp_path / "custom_settings" + custom.mkdir() + monkeypatch.setenv("EVALUATORS_SETTINGS_DIR", str(custom)) + assert shared_settings_root() == custom + + +def test_shared_settings_root_bundled_points_at_package(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("EVALUATORS_SETTINGS_DIR", raising=False) + root = shared_settings_root() + assert root.joinpath("load_settings.py").is_file() + + +def test_load_evaluator_settings_missing_evaluator_metadata(tmp_path: Path) -> None: + path = tmp_path / "no_meta.toml" + path.write_text( + textwrap.dedent( + """ + [evaluation_settings] + marker = 0 + """ + ).strip() + ) + with pytest.raises( + ConfigurationError, match="Missing required section \\[evaluator_metadata\\]" + ): + load_evaluator_settings(path, _MiniSettings) + + +def test_load_evaluator_settings_missing_evaluation_settings(tmp_path: Path) -> None: + path = tmp_path / "no_eval.toml" + path.write_text( + textwrap.dedent( + """ + [evaluator_metadata] + id = "e" + version = "1.0" + name = "N" + description = "D" + maturity = "ga" + """ + ).strip() + ) + with pytest.raises( + ConfigurationError, match="Missing required section \\[evaluation_settings\\]" + ): + load_evaluator_settings(path, _MiniSettings) + + +def test_load_evaluator_settings_invalid_evaluation_settings(tmp_path: Path) -> None: + path = tmp_path / "bad_eval.toml" + path.write_text( + textwrap.dedent( + """ + [evaluator_metadata] + id = "e" + version = "1.0" + name = "N" + description = "D" + maturity = "ga" + + [evaluation_settings] + marker = "not-int" + """ + ).strip() + ) + with 
pytest.raises(ConfigurationError, match="Invalid \\[evaluation_settings\\]"): + load_evaluator_settings(path, _MiniSettings) + + +def test_load_evaluator_settings_prompt_settings_and_prompts(tmp_path: Path) -> None: + path = tmp_path / "prompts.toml" + path.write_text( + textwrap.dedent( + r''' + [evaluator_metadata] + id = "e" + version = "1.0" + name = "N" + description = "D" + maturity = "ga" + + [evaluation_settings] + marker = 1 + + [evaluation_settings.prompt_settings_main] + provider_type = "GOOGLE" + model = "gemini-2.0-flash" + temperature = 0.25 + + [prompts] + system_prompt = """hello + \nworld""" + ''' + ).strip() + ) + result = load_evaluator_settings(path, _WithPromptSettings) + assert result.evaluation_settings.prompt_settings_main.provider_type == LlmProvider.GOOGLE + assert result.evaluation_settings.prompt_settings_main.model == "gemini-2.0-flash" + assert result.evaluation_settings.prompt_settings_main.temperature == 0.25 + assert result.prompts["system_prompt"] == "hello\n\nworld" + + +def test_load_evaluator_settings_invalid_prompt_settings_block(tmp_path: Path) -> None: + path = tmp_path / "bad_prompt.toml" + path.write_text( + textwrap.dedent( + """ + [evaluator_metadata] + id = "e" + version = "1.0" + name = "N" + description = "D" + maturity = "ga" + + [evaluation_settings] + marker = 0 + + [evaluation_settings.prompt_settings_main] + provider_type = "google" + model = "m" + """ + ).strip() + ) + with pytest.raises( + ConfigurationError, match="Invalid \\[evaluation_settings.prompt_settings_main\\]" + ): + load_evaluator_settings(path, _WithPromptSettings) From 71c04450aafce11407bce3057896434bfdfde351 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Tue, 12 May 2026 01:10:14 -0700 Subject: [PATCH 12/15] chore: update make --- sdks/python/Makefile | 13 ++++++------- sdks/python/README.md | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/sdks/python/Makefile b/sdks/python/Makefile index bf97dfe..0cf88c1 100644 --- 
a/sdks/python/Makefile +++ b/sdks/python/Makefile @@ -1,8 +1,7 @@ -VENV := ../../.venv -PYTHON := $(VENV)/bin/python -PYTEST := $(VENV)/bin/pytest -RUFF := $(VENV)/bin/ruff -MYPY := $(VENV)/bin/mypy +PYTHON ?= python3 +PYTEST := $(PYTHON) -m pytest +RUFF := $(PYTHON) -m ruff +MYPY := $(PYTHON) -m mypy .PHONY: help test unit-test lint format format-check typecheck pip-check verify coverage @@ -14,7 +13,7 @@ help: @echo " format-check Fail if Ruff would reformat any file" @echo " typecheck Mypy on src package + tests" @echo " pip-check pip dependency consistency check" - @echo " verify lint + format-check + typecheck + pip-check + unit-test" + @echo " verify lint + format-check + typecheck + pip-check + test (unit + contract)" @echo " coverage Unit tests with coverage report (requires dev install)" @echo "" @echo " test Run the full test suite" @@ -37,7 +36,7 @@ typecheck: pip-check: $(PYTHON) -m pip check -verify: lint format-check typecheck pip-check unit-test +verify: lint format-check typecheck pip-check test coverage: $(PYTEST) tests/ -q --ignore=tests/contract_tests --cov=learning_commons_evaluators --cov-report=term-missing diff --git a/sdks/python/README.md b/sdks/python/README.md index 4c27daa..15dc866 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -32,10 +32,10 @@ source .venv/bin/activate # macOS/Linux # Install in editable mode with dev dependencies pip install -e ".[dev]" -# Static checks + unit tests (same gate as CI for the Python SDK) +# Static checks + full test suite (unit + contract; same gate as CI for the Python SDK) make verify -# Run the full test suite (unit + contract) +# Tests only (same pytest invocation as the verify step) make test ``` From 32b974b47fc86a2ccf2a6bf34d72e36791c61440 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Tue, 12 May 2026 01:30:19 -0700 Subject: [PATCH 13/15] ci: fix CI? 
--- sdks/python/pyproject.toml | 4 +++ .../settings/load_settings.py | 28 +++++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index ec69933..c739073 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -40,11 +40,15 @@ dev = [ "pytest-cov>=5.0.0", "ruff>=0.9.0", "mypy>=1.14.0", + # Mypy is configured for Python 3.10 even when dev tooling runs on newer Python, + # so it needs the 3.10 tomllib backport available for type checking. + "tomli>=2.0.0", ] [project.urls] Homepage = "https://github.com/learning-commons-org/evaluators" Repository = "https://github.com/learning-commons-org/evaluators/tree/main/sdks/python" +Documentation = "https://docs.learningcommons.org/evaluators" "Bug Tracker" = "https://github.com/learning-commons-org/evaluators/issues" [tool.setuptools.packages.find] diff --git a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py index 530dd9c..ad0d3cf 100644 --- a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py +++ b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py @@ -2,21 +2,27 @@ from __future__ import annotations -import importlib.resources import os +import sys from dataclasses import dataclass - -try: - from importlib.resources.abc import Traversable -except ImportError: - from importlib.abc import Traversable # Python < 3.11 +from importlib import resources from pathlib import Path -from typing import Generic, TypeVar +from typing import TYPE_CHECKING, Generic, TypeVar + +if TYPE_CHECKING: + from importlib.abc import Traversable +else: + # Runtime location moved to importlib.resources.abc in newer Python versions, + # while mypy's Python 3.10 types expose Traversable from importlib.abc. 
+ try: + from importlib.resources.abc import Traversable + except ImportError: + from importlib.abc import Traversable -try: +if sys.version_info >= (3, 11): import tomllib -except ImportError: - import tomli as tomllib # type: ignore[import-not-found,no-redef] # Python < 3.11 +else: + import tomli as tomllib from pydantic import TypeAdapter from pydantic import ValidationError as PydanticValidationError @@ -56,7 +62,7 @@ def shared_settings_root() -> Path | Traversable: env = os.environ.get("EVALUATORS_SETTINGS_DIR") if env: return Path(env) - return importlib.resources.files("learning_commons_evaluators.settings") + return resources.files("learning_commons_evaluators.settings") def load_settings(path: Path | str | Traversable) -> dict: From f39864218ed7bcada1a172757a770ebced754d1d Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Tue, 12 May 2026 16:49:28 -0700 Subject: [PATCH 14/15] chore: addressing PR comments --- .../evaluators/base.py | 2 +- .../src/learning_commons_evaluators/logger.py | 5 ++-- .../schemas/evaluator.py | 7 +++--- .../settings/load_settings.py | 14 +++++++---- sdks/python/tests/schemas/test_input_specs.py | 8 ++++--- .../tests/settings/test_load_settings.py | 24 +++++++++++++++++++ sdks/python/tests/test_logger.py | 16 +++++++++++++ 7 files changed, 62 insertions(+), 14 deletions(-) diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 9b48d99..49a4713 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -91,7 +91,7 @@ def evaluate( evaluation end log record; no result object is returned because this method re-raises.
""" if evaluation_settings is None: - evaluation_settings = self.default_evaluation_settings + evaluation_settings = self.default_evaluation_settings.model_copy(deep=True) start = time.perf_counter() evaluation_metadata = EvaluationMetadata( evaluator_metadata=self.metadata, diff --git a/sdks/python/src/learning_commons_evaluators/logger.py b/sdks/python/src/learning_commons_evaluators/logger.py index 4cf5f18..7a40645 100644 --- a/sdks/python/src/learning_commons_evaluators/logger.py +++ b/sdks/python/src/learning_commons_evaluators/logger.py @@ -87,8 +87,9 @@ def create_logger( def create_silent_logger() -> logging.Logger: """Logger that discards all records (no propagation). Use for ``logger=`` opt-out.""" logger = logging.getLogger(f"{SDK_LOGGER_NAME}.silent") - if not logger.handlers: - logger.addHandler(logging.NullHandler()) + for h in list(logger.handlers): + logger.removeHandler(h) + logger.addHandler(logging.NullHandler()) logger.propagate = False return logger diff --git a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py index 9b3c6d0..4a7d78f 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py @@ -152,16 +152,17 @@ def validate(self) -> None: Raises :class:`~.errors.ValidationError` if any field is invalid.
""" - errors: list[ValidationError] = [] + errors: list[tuple[str, ValidationError]] = [] for name in type(self).model_fields: field_val = getattr(self, name) if isinstance(field_val, InputField): try: field_val.validate() except ValidationError as e: - errors.append(e) + errors.append((name, e)) if errors: - raise ValidationError(f"Validation errors: {errors}") + parts = [f"{field}: {err.message}" for field, err in errors] + raise ValidationError("Validation errors: " + "; ".join(parts)) def input_metadata(self) -> dict[str, Any]: """Return a mapping of field name → :meth:`InputField.input_metadata` for each field. diff --git a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py index ad0d3cf..a869f3c 100644 --- a/sdks/python/src/learning_commons_evaluators/settings/load_settings.py +++ b/sdks/python/src/learning_commons_evaluators/settings/load_settings.py @@ -126,14 +126,18 @@ def _normalize_prompt_whitespace(prompt: str) -> str: def _parse_prompts(data: dict) -> dict[str, str]: - """Build prompts dict from TOML [prompts] section. Optional; any string values are included.""" + """Build prompts dict from TOML ``[prompts]`` section. Optional; values must be strings.""" prompts_section = data.get("prompts") if prompts_section is None or not isinstance(prompts_section, dict): return {} - return { - k: _normalize_prompt_whitespace(v if isinstance(v, str) else str(v)) - for k, v in prompts_section.items() - } + out: dict[str, str] = {} + for k, v in prompts_section.items(): + if not isinstance(v, str): + raise ConfigurationError( + f"Invalid [prompts].{k}: expected a string prompt, got {type(v).__name__}."
+ ) + out[k] = _normalize_prompt_whitespace(v) + return out @dataclass(frozen=True) diff --git a/sdks/python/tests/schemas/test_input_specs.py b/sdks/python/tests/schemas/test_input_specs.py index 7c06d74..f43e999 100644 --- a/sdks/python/tests/schemas/test_input_specs.py +++ b/sdks/python/tests/schemas/test_input_specs.py @@ -31,6 +31,7 @@ def test_new_subclass_is_picked_up_after_repopulation(self): This verifies the registry mechanism works end-to-end. New subclasses must also be added to AnyInputSpec manually (see module docstring checklist). """ + canonical = dict(INPUT_SPEC_REGISTRY) class _TestInputSpec(InputSpec): type: Literal["_TestInputField"] = "_TestInputField" @@ -40,9 +41,10 @@ class _TestInputSpec(InputSpec): assert "_TestInputField" in INPUT_SPEC_REGISTRY assert INPUT_SPEC_REGISTRY["_TestInputField"] is _TestInputSpec finally: - # Restore the registry to the canonical state so other tests aren't affected. - INPUT_SPEC_REGISTRY.pop("_TestInputField", None) - _populate_input_spec_registry() + # Repopulating would re-register _TestInputSpec while the class still exists. + # Restore a snapshot so the rest of the session sees the canonical registry. 
+ INPUT_SPEC_REGISTRY.clear() + INPUT_SPEC_REGISTRY.update(canonical) class TestAnyInputSpec: diff --git a/sdks/python/tests/settings/test_load_settings.py b/sdks/python/tests/settings/test_load_settings.py index a572a5d..dae9dff 100644 --- a/sdks/python/tests/settings/test_load_settings.py +++ b/sdks/python/tests/settings/test_load_settings.py @@ -201,6 +201,30 @@ def test_load_evaluator_settings_prompt_settings_and_prompts(tmp_path: Path) -> assert result.prompts["system_prompt"] == "hello\n\nworld" +def test_load_evaluator_settings_rejects_non_string_prompt_value(tmp_path: Path) -> None: + path = tmp_path / "bad_prompt_type.toml" + path.write_text( + textwrap.dedent( + """ + [evaluator_metadata] + id = "e" + version = "1.0" + name = "N" + description = "D" + maturity = "ga" + + [evaluation_settings] + marker = 0 + + [prompts] + n = 3 + """ + ).strip() + ) + with pytest.raises(ConfigurationError, match="Invalid \\[prompts\\].n"): + load_evaluator_settings(path, _MiniSettings) + + def test_load_evaluator_settings_invalid_prompt_settings_block(tmp_path: Path) -> None: path = tmp_path / "bad_prompt.toml" path.write_text( diff --git a/sdks/python/tests/test_logger.py b/sdks/python/tests/test_logger.py index 5fba650..10d128f 100644 --- a/sdks/python/tests/test_logger.py +++ b/sdks/python/tests/test_logger.py @@ -66,6 +66,22 @@ def test_no_output(self, capsys) -> None: assert captured.out == "" assert captured.err == "" + def test_clears_pre_existing_handlers(self, capsys) -> None: + log = logging.getLogger("learning_commons_evaluators.silent") + for h in list(log.handlers): + log.removeHandler(h) + noisy = logging.StreamHandler() + log.addHandler(noisy) + log.setLevel(logging.DEBUG) + silent = create_silent_logger() + assert silent is log + silent.info("still silent") + captured = capsys.readouterr() + assert captured.out == "" + assert captured.err == "" + for h in log.handlers: + assert isinstance(h, logging.NullHandler) + class TestFormatError: def 
test_format_error_returns_string(self) -> None: From d8db3cdeb4656b03b677504f7d42e16a56f74897 Mon Sep 17 00:00:00 2001 From: Fred Sisenda <121206410+czi-fsisenda@users.noreply.github.com> Date: Thu, 14 May 2026 02:22:25 -0700 Subject: [PATCH 15/15] feat(python-sdk): conventionality evaluator (#38) --- .github/workflows/test-sdk-python.yml | 4 +- sdks/python/Makefile | 76 ++- sdks/python/README.md | 43 +- sdks/python/pyproject.toml | 2 +- sdks/python/scripts/capture.py | 329 +++++++++ sdks/python/scripts/generate_settings.py | 542 +++++++++++++++ .../learning_commons_evaluators/__init__.py | 24 + .../evaluators/__init__.py | 14 + .../evaluators/base.py | 51 +- .../evaluators/conventionality.py | 117 ++++ .../evaluators/vocabulary.py | 195 ++++++ .../schemas/__init__.py | 6 + .../schemas/conventionality.py | 32 + .../schemas/vocabulary.py | 68 ++ .../_generated_conventionality_settings.py | 101 +++ .../_generated_vocabulary_settings.py | 404 +++++++++++ .../settings/conventionality/__init__.py | 0 .../settings/conventionality/contracts.toml | 130 ++++ .../settings/vocabulary/__init__.py | 0 .../settings/vocabulary/contracts.toml | 641 ++++++++++++++++++ sdks/python/tests/conftest.py | 29 +- sdks/python/tests/contract_tests/__init__.py | 14 + .../tests/contract_tests/conventionality.py | 87 +++ sdks/python/tests/contract_tests/harness.py | 229 +++++++ sdks/python/tests/contract_tests/loader.py | 169 +++++ .../contract_tests/test_conventionality.py | 84 +++ .../tests/contract_tests/test_vocabulary.py | 136 ++++ .../python/tests/contract_tests/vocabulary.py | 146 ++++ sdks/python/tests/evaluators/test_base.py | 98 ++- .../tests/evaluators/test_conventionality.py | 143 ++++ .../tests/evaluators/test_vocabulary.py | 353 ++++++++++ .../tests/schemas/test_evaluator_schemas.py | 253 +++++++ .../tests/settings/test_load_settings.py | 127 +++- sdks/python/tests/test_package_imports.py | 45 +- sdks/settings/conventionality/contracts.toml | 130 ++++ 
sdks/settings/conventionality/settings.toml | 78 +++ sdks/settings/vocabulary/contracts.toml | 641 ++++++++++++++++++ sdks/settings/vocabulary/settings.toml | 410 +++++++++++ 38 files changed, 5857 insertions(+), 94 deletions(-) create mode 100644 sdks/python/scripts/capture.py create mode 100644 sdks/python/scripts/generate_settings.py create mode 100644 sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py create mode 100644 sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/conventionality.py create mode 100644 sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py create mode 100644 sdks/python/src/learning_commons_evaluators/settings/_generated_conventionality_settings.py create mode 100644 sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py create mode 100644 sdks/python/src/learning_commons_evaluators/settings/conventionality/__init__.py create mode 100644 sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml create mode 100644 sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py create mode 100644 sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml create mode 100644 sdks/python/tests/contract_tests/__init__.py create mode 100644 sdks/python/tests/contract_tests/conventionality.py create mode 100644 sdks/python/tests/contract_tests/harness.py create mode 100644 sdks/python/tests/contract_tests/loader.py create mode 100644 sdks/python/tests/contract_tests/test_conventionality.py create mode 100644 sdks/python/tests/contract_tests/test_vocabulary.py create mode 100644 sdks/python/tests/contract_tests/vocabulary.py create mode 100644 sdks/python/tests/evaluators/test_conventionality.py create mode 100644 sdks/python/tests/evaluators/test_vocabulary.py create mode 100644 sdks/python/tests/schemas/test_evaluator_schemas.py create 
mode 100644 sdks/settings/conventionality/contracts.toml create mode 100644 sdks/settings/conventionality/settings.toml create mode 100644 sdks/settings/vocabulary/contracts.toml create mode 100644 sdks/settings/vocabulary/settings.toml diff --git a/.github/workflows/test-sdk-python.yml b/.github/workflows/test-sdk-python.yml index af35a78..1d44f1a 100644 --- a/.github/workflows/test-sdk-python.yml +++ b/.github/workflows/test-sdk-python.yml @@ -6,13 +6,11 @@ on: - main paths: - "sdks/python/**" - - "scripts/generate_settings.py" - "sdks/settings/**" - ".github/workflows/test-sdk-python.yml" pull_request: paths: - "sdks/python/**" - - "scripts/generate_settings.py" - "sdks/settings/**" - ".github/workflows/test-sdk-python.yml" @@ -32,7 +30,7 @@ jobs: working-directory: sdks/python strategy: matrix: - python-version: ["3.10", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - name: Checkout uses: actions/checkout@v6 diff --git a/sdks/python/Makefile b/sdks/python/Makefile index 0cf88c1..72d8b4b 100644 --- a/sdks/python/Makefile +++ b/sdks/python/Makefile @@ -3,32 +3,66 @@ PYTEST := $(PYTHON) -m pytest RUFF := $(PYTHON) -m ruff MYPY := $(PYTHON) -m mypy -.PHONY: help test unit-test lint format format-check typecheck pip-check verify coverage +# Bundled settings directory inside the package. Two things are maintained here: +# _generated_*_settings.py β€” generated from settings.toml by `make generate-settings` +# contracts.toml β€” copied from sdks/settings/ by `make sync-settings` +# +# Tests always read contracts.toml from the bundled copy (via importlib.resources +# fallback in shared_settings_root()). EVALUATORS_SETTINGS_DIR is intentionally NOT +# exported here so that dev and installed-package runs behave identically. 
+SETTINGS_DST := src/learning_commons_evaluators/settings + +.PHONY: help build check-build test unit-test contract-test \ + generate-settings check-generated sync-settings check-sync \ + lint format format-check typecheck pip-check verify coverage help: @echo "Usage: make " @echo "" - @echo " lint Ruff linter (src, tests)" + @echo " build Prepare the package: generate settings + sync contracts TOML" + @echo " check-build Verify build artifacts are up to date (use in CI)" + @echo "" + @echo " lint Ruff linter (src/, tests/, scripts/)" @echo " format Apply Ruff formatter" @echo " format-check Fail if Ruff would reformat any file" @echo " typecheck Mypy on src package + tests" @echo " pip-check pip dependency consistency check" - @echo " verify lint + format-check + typecheck + pip-check + test (unit + contract)" + @echo " verify check-build + lint + format-check + typecheck + pip-check + test (unit + contract)" @echo " coverage Unit tests with coverage report (requires dev install)" @echo "" - @echo " test Run the full test suite" - @echo " unit-test Run unit tests only" + @echo " test Run the full test suite (unit + contract)" + @echo " unit-test Run unit tests only (skip contract tests)" + @echo " contract-test Run contract tests only" + @echo "" + @echo " generate-settings Re-generate _generated_*_settings.py from sdks/settings/" + @echo " check-generated Verify generated .py files match sdks/settings/ (exits 1 if stale)" + @echo " sync-settings Copy contracts.toml from sdks/settings/ into the package" + @echo " check-sync Verify bundled contracts.toml matches sdks/settings/" + @echo "" + @echo "Typical workflows:" + @echo " After editing evaluator settings TOML: make generate-settings && make test" + @echo " After editing contracts TOML: make sync-settings && make contract-test" + @echo " Fresh checkout / before release: make build && make verify" + @echo " CI: make verify" + +# ── Build ───────────────────────────────────────────────────────────────────── +# 
Prepares all package build artifacts in one step. Run after editing any +# file under sdks/settings/, or on a fresh checkout before running tests. + +build: generate-settings sync-settings + +check-build: check-generated check-sync # ── Static checks (Ruff + Mypy) ─────────────────────────────────────────────── lint: - $(RUFF) check src tests + $(RUFF) check src tests scripts format: - $(RUFF) format src tests + $(RUFF) format src tests scripts format-check: - $(RUFF) format --check src tests + $(RUFF) format --check src tests scripts typecheck: $(MYPY) src/learning_commons_evaluators tests @@ -36,7 +70,7 @@ typecheck: pip-check: $(PYTHON) -m pip check -verify: lint format-check typecheck pip-check test +verify: check-build lint format-check typecheck pip-check test coverage: $(PYTEST) tests/ -q --ignore=tests/contract_tests --cov=learning_commons_evaluators --cov-report=term-missing @@ -48,3 +82,27 @@ test: unit-test: $(PYTEST) tests/ -v --ignore=tests/contract_tests + +contract-test: + $(PYTEST) tests/contract_tests/ -v + +# ── Code generation ─────────────────────────────────────────────────────────── +# Re-run when an evaluator settings TOML under sdks/settings/ changes. +# The generator sets EVALUATORS_SETTINGS_DIR itself to point at sdks/settings/. + +generate-settings: + $(PYTHON) scripts/generate_settings.py + +check-generated: + $(PYTHON) scripts/generate_settings.py --check + +# ── Contract test bundling ──────────────────────────────────────────────────── +# contracts.toml is bundled so contract tests run consistently whether using +# a development checkout or a pip-installed package. The sync command is +# handled by the Python generator script. 
+ +sync-settings: + $(PYTHON) scripts/generate_settings.py --sync + +check-sync: + $(PYTHON) scripts/generate_settings.py --check-sync diff --git a/sdks/python/README.md b/sdks/python/README.md index 15dc866..9a1e80c 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -230,6 +230,42 @@ config = create_config( ) ``` +### Per-instance default evaluation settings + +Every `BaseEvaluator` subclass defines **class-level** `default_evaluation_settings` +(the bundled evaluators load these from generated settings). You can override that +default for a single evaluator instance by passing the same keyword to the +constructor: + +```python +from learning_commons_evaluators import ConventionalityEvaluator, create_config + +config = create_config(...) +# Start from the bundled defaults, then change what your deployment needs (models, +# temperatures, etc. live on nested PromptSettings). +settings = ConventionalityEvaluator.default_evaluation_settings.model_copy(deep=True) +settings.prompt_settings_step_conventionality_evaluation = ( + settings.prompt_settings_step_conventionality_evaluation.model_copy( + update={"temperature": 0.2} + ) +) +evaluator = ConventionalityEvaluator( + config, + default_evaluation_settings=settings, +) + +# Uses the instance default (a deep copy is taken inside evaluate) +result = evaluator.evaluate(input) + +# Per-call override still wins +result = evaluator.evaluate(input, evaluation_settings=other_settings) +``` + +If you omit `default_evaluation_settings` at construction, attribute lookup uses the +subclass class attribute, same as before. Whenever you call `evaluate()` without +`evaluation_settings`, the SDK uses `model_copy(deep=True)` of the resolved default, +so the object you keep on the instance is not mutated by a run. + ### Logging The SDK uses Python's standard `logging` module. 
By default, `EvaluatorConfig` uses the @@ -294,7 +330,10 @@ On evaluation failure, `metadata.status` and `error_details` are set on the in-m ## Creating custom evaluators -Extend `BaseEvaluator` to create custom evaluators: +Extend `BaseEvaluator` to create custom evaluators. Set **class-level** +`default_evaluation_settings` for the usual defaults; callers may still construct +`MyEvaluator(config, default_evaluation_settings=...)` to pin different defaults on a +specific instance (see [Per-instance default evaluation settings](#per-instance-default-evaluation-settings)). ```python from learning_commons_evaluators import BaseEvaluator, EvaluatorConfig @@ -329,6 +368,8 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]): return EvaluationResult(answer=..., explanation=..., metadata=evaluation_metadata) ``` +If you override `__init__` on the subclass, accept the same keyword-only argument and forward it: `super().__init__(config, default_evaluation_settings=default_evaluation_settings)`. + ## License MIT diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index c739073..0e1d03e 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -97,7 +97,7 @@ select = [ ignore = ["E501"] # line length β€” formatter wraps where it matters [tool.ruff.lint.per-file-ignores] -"../../scripts/generate_settings.py" = ["E402"] +"scripts/generate_settings.py" = ["E402"] # ── Mypy ────────────────────────────────────────────────────────────────────── diff --git a/sdks/python/scripts/capture.py b/sdks/python/scripts/capture.py new file mode 100644 index 0000000..b2c158c --- /dev/null +++ b/sdks/python/scripts/capture.py @@ -0,0 +1,329 @@ +"""Contract test capture utilities for evaluator notebooks. + +Three-step workflow for notebook authors +----------------------------------------- +1. 
Wrap every model in your chain with ``capture_llm()``: + + chain = prompt | capture_llm("step_name", my_model) | JsonOutputParser() + + The prefix (``"step_name"``) becomes the step key in ``prompt_steps`` in + the TOML output. Use a short, stable name per step (e.g. ``"main"``, + ``"bk"``, ``"vocab"``). + +2. Immediately after each test-case evaluation, call ``capture_case()`` to save a + point-in-time copy of what was captured. Pass the evaluator's input dict + and output dict directly β€” no manual field extraction needed:: + + case_input = {"text": my_text, "grade": 4} + case_output = run_evaluator(**case_input) + + _cap = capture_case( + name="my_case", + input=case_input, + llm_call_captures=["step_name"], # prefixes, in call order + expected_result=case_output, + description="…", # optional human-readable label + ) + + String values in ``input`` (e.g. ``text``) are ``.strip()``'d before writing + TOML so contract ``[input]`` matches common evaluator behavior. Run your + chains with the same stripped strings so captured ``user_prompt`` matches. + +3. Print the TOML block and paste it into ``contracts.toml`` (for example, + ``sdks/settings//contracts.toml``): + + print(build_contract_toml(_cap_one, _cap_two)) + +Resetting between runs +----------------------- +Call ``reset_captures()`` at the start of each evaluation to avoid stale data from a +previous run leaking into the next capture_case:: + + reset_captures() + output = run_evaluator(text, grade) + _cap = capture_case( + name="my_case", + input={"text": text, "grade": grade}, + llm_call_captures=["main"], + expected_result=output, + ) + +Async chains +------------- +``capture_llm()`` works in both sync (``invoke``) and async (``ainvoke``) +chains with no extra configuration. 
# ---------------------------------------------------------------------------
# Internal state
# ---------------------------------------------------------------------------

# Flat dict populated by capture_llm() on every chain invocation.
# Keys follow the pattern "{prefix}_{field}" (e.g. "bk_user_prompt").
_captures: dict[str, Any] = {}


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def reset_captures() -> None:
    """Clear all captured data. Call at the start of each evaluation run."""
    _captures.clear()


def capture_llm(prefix: str, model: Any) -> RunnableLambda:
    """Return a wrapped model that records the formatted prompt and raw response.

    Drop it into a LangChain chain wherever you have a bare model::

        # single-step evaluator
        chain = prompt | capture_llm("main", llm) | JsonOutputParser()

        # multi-step evaluator
        bk_chain = bk_prompt | capture_llm("bk", bk_model)
        vocab_chain = vocab_prompt | capture_llm("vocab", vocab_model) | JsonOutputParser()

    After the chain runs, ``_captures`` contains:

    * ``"{prefix}_system_prompt"`` — system message content (``""`` if none)
    * ``"{prefix}_user_prompt"`` — human message content
    * ``"{prefix}_raw_response"`` — text content of the LLM response, normalised
      to a plain string by ``_extract_text_content``
    * ``"{prefix}_model"`` — ``model.model`` or ``model.model_name`` (``""`` if
      neither is set)
    * ``"{prefix}_temperature"`` — ``model.temperature`` as a float; ``0.0`` when
      the attribute is missing or ``None``

    Supports both ``invoke`` (sync) and ``ainvoke`` (async) chains.

    Args:
        prefix: Step key used to namespace the captured values.
        model: The chat model to wrap; must provide ``invoke`` and ``ainvoke``.

    Returns:
        A ``RunnableLambda`` that forwards to *model* and records each call.
    """
    model_name = getattr(model, "model", None) or getattr(model, "model_name", None) or ""
    # A model object may carry ``temperature = None`` (no explicit value set).
    # ``float(None)`` raises TypeError, so treat None like a missing attribute.
    raw_temperature = getattr(model, "temperature", None)
    temperature = 0.0 if raw_temperature is None else float(raw_temperature)

    def _record(prompt_value: Any, ai_message: Any) -> None:
        """Store the prompt/response pair for this step in ``_captures``."""
        if isinstance(prompt_value, str):
            # Plain string passed directly to the model — treat it as a lone
            # human message with no system prompt. This happens when the
            # caller formats a prompt template themselves and passes the result
            # as a string rather than going through a ChatPromptTemplate chain.
            system = ""
            human = prompt_value
        else:
            messages = (
                prompt_value.to_messages()
                if hasattr(prompt_value, "to_messages")
                else list(prompt_value)
            )
            # Only the first message of each role is recorded.
            system = next(
                (str(m.content) for m in messages if getattr(m, "type", None) == "system"), ""
            )
            human = next(
                (str(m.content) for m in messages if getattr(m, "type", None) == "human"), ""
            )
        _captures[f"{prefix}_system_prompt"] = system
        _captures[f"{prefix}_user_prompt"] = human
        _captures[f"{prefix}_model"] = model_name
        _captures[f"{prefix}_temperature"] = temperature
        _captures[f"{prefix}_raw_response"] = _extract_text_content(ai_message.content)

    def _invoke(prompt_value: Any) -> Any:
        ai_message = model.invoke(prompt_value)
        _record(prompt_value, ai_message)
        return ai_message

    async def _ainvoke(prompt_value: Any) -> Any:
        ai_message = await model.ainvoke(prompt_value)
        _record(prompt_value, ai_message)
        return ai_message

    return RunnableLambda(_invoke, afunc=_ainvoke)


def capture_case(
    *,
    name: str,
    input: dict[str, Any],
    llm_call_captures: list[str],
    expected_result: Any = None,
    description: str | None = None,
) -> dict[str, Any]:
    """Return a point-in-time copy of ``_captures`` merged with structured context.

    Args:
        name: Case identifier used as the TOML key (e.g. ``"marco_polo_grade3"``).
        input: The evaluator's input dict (e.g. ``{"text": ..., "grade": 4}``).
            String values are ``.strip()``'d; keys are otherwise unchanged.
        llm_call_captures: Ordered list of capture prefixes to include as
            ``prompt_steps`` in the TOML. Must match the prefixes passed to
            ``capture_llm()`` during this run, in call order. The list is
            copied, so later mutation by the caller does not affect the case.
        expected_result: The evaluator's output. Pass the whole output — no
            need to extract individual fields. A plain dict, an object with
            ``model_dump()`` (Pydantic v2), an object with ``dict()``
            (Pydantic v1) or anything ``dict()`` accepts is normalised to a
            plain dict. Written to the ``[expected_result]`` TOML section.
        description: Optional human-readable label for this test case.

    Returns:
        A new dict holding every captured ``"{prefix}_{field}"`` value plus the
        keys ``name``, ``input``, ``llm_call_captures`` and, when given,
        ``expected_result`` and ``description``.

    Example::

        case_input = {"text": text, "grade": 3}
        case_output = run_evaluator(**case_input)

        _cap = capture_case(
            name="marco_polo_grade3",
            input=case_input,
            llm_call_captures=["bk", "vocab"],
            expected_result=case_output,
            description="Marco Polo passage, grade 3",
        )
    """
    data: dict[str, Any] = dict(_captures)
    data["name"] = name
    data["input"] = _strip_string_values(dict(input))
    # Copy so the snapshot cannot change if the caller reuses the list.
    data["llm_call_captures"] = list(llm_call_captures)
    if expected_result is not None:
        # Normalise to a plain dict so the captured case is always serializable.
        if hasattr(expected_result, "model_dump"):
            expected_result = expected_result.model_dump()
        elif hasattr(expected_result, "dict"):
            expected_result = expected_result.dict()
        data["expected_result"] = dict(expected_result)
    if description is not None:
        data["description"] = description
    return data


def build_contract_toml(*cases: dict[str, Any]) -> str:
    """Build the contract TOML block for one or more test cases.

    Args:
        *cases: One or more dicts as returned by :func:`capture_case`.

    Returns:
        TOML string ready to paste into ``contracts.toml`` (for example,
        ``sdks/settings/<evaluator>/contracts.toml``).

    Example::

        print(build_contract_toml(_cap_grade3, _cap_grade7))
    """
    return "\n".join(_build_case(c) for c in cases)


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _strip_string_values(inp: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of *inp* with leading/trailing whitespace removed from str values."""
    return {k: v.strip() if isinstance(v, str) else v for k, v in inp.items()}


def _extract_text_content(content: Any) -> str:
    """Extract a plain-text string from an AIMessage content value.

    A ``str`` is returned unchanged. For a list of content blocks such as::

        [{"type": "text", "text": "...the model's reply...", "extras": {...}}]

    the ``"text"`` value of the first dict block whose ``"type"`` is ``"text"``
    is returned. Anything else falls back to ``str(content)``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        for block in content:
            # Skip malformed text blocks instead of raising KeyError.
            if isinstance(block, dict) and block.get("type") == "text" and "text" in block:
                return str(block["text"])
    return str(content)


def _toml_basic_string(s: str) -> str:
    """Return *s* as a single-line TOML basic string (double-quoted, escaped).

    ``ensure_ascii=False`` keeps non-ASCII text verbatim: the ASCII-only JSON
    form writes characters outside the BMP as surrogate-pair escapes, which
    TOML rejects. JSON leaves U+007F unescaped, so it is escaped here.
    """
    return _json.dumps(s, ensure_ascii=False).replace("\x7f", "\\u007f")


def _toml_key(key: str) -> str:
    """Return *key* as a bare TOML key when possible, otherwise as a quoted key."""
    text = str(key)
    if text and all(ch.isascii() and (ch.isalnum() or ch in "_-") for ch in text):
        return text
    return _toml_basic_string(text)


def _needs_basic_string(s: str) -> bool:
    """Return True when *s* cannot be written verbatim as a ``'''`` literal string."""
    if "'''" in s:
        return True
    # Literal strings take no escapes, so control characters other than tab
    # and newline (including a carriage return) must be written escaped.
    return any((ch < " " and ch not in "\t\n") or ch == "\x7f" for ch in s)


def _toml_array_item(item: Any) -> str:
    """Serialize one array element; strings stay on a single line."""
    if isinstance(item, str):
        return _toml_basic_string(item)
    return _toml_value(item)


def _toml_value(v: Any) -> str:
    """Serialize *v* to a TOML literal.

    Strings become multi-line literal strings when that is lossless and basic
    (escaped) strings otherwise. Booleans, ints, floats and lists map to their
    TOML forms. Any other value is written as the basic string of ``str(v)``.
    """
    if isinstance(v, str):
        if v == "":
            return '""'
        if _needs_basic_string(v):
            return _toml_basic_string(v)
        # Prefer multiline LITERAL strings ('''...''') — they are verbatim so
        # backslashes and double-quotes need no escaping.
        #
        # IMPORTANT: no "\n" before the closing ''' — that would add a spurious
        # trailing newline to every parsed value (TOML only trims the *first*
        # newline after the opening delimiter, not the last one before closing).
        return "'''\n" + v + "'''"
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(v, bool):
        return "true" if v else "false"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        return "[" + ", ".join(_toml_array_item(item) for item in v) + "]"
    # Fallback: stringify whatever it is.
    return _toml_basic_string(str(v))


def _build_case(c: dict[str, Any]) -> str:
    """Render one captured case (see :func:`capture_case`) as TOML tables."""
    name = _toml_key(c["name"])
    lines: list[str] = []

    # ── optional top-level description ───────────────────────────────────────
    description = c.get("description")
    if description:
        lines += [
            f"[cases.{name}]",
            f"description = {_toml_basic_string(description)}",
            "",
        ]

    # ── input section ────────────────────────────────────────────────────────
    lines.append(f"[cases.{name}.input]")
    for field, val in c.get("input", {}).items():
        # ``capture_case`` applies ``.strip()`` to string fields in ``input``.
        lines.append(f"{_toml_key(field)} = {_toml_value(val)}")
    lines.append("")

    # ── prompt_steps sections ─────────────────────────────────────────────────
    for prefix in c["llm_call_captures"]:
        lines += [
            f"[cases.{name}.prompt_steps.{_toml_key(prefix)}]",
            f"model = {_toml_basic_string(str(c.get(prefix + '_model', '')))}",
            f"temperature = {_toml_value(c.get(prefix + '_temperature', 0))}",
            f"system_prompt = {_toml_value(c.get(prefix + '_system_prompt', ''))}",
            f"user_prompt = {_toml_value(c.get(prefix + '_user_prompt', ''))}",
            f"llm_response = {_toml_value(c.get(prefix + '_raw_response', ''))}",
            "",
        ]

    # ── expected_result section ───────────────────────────────────────────────
    expected = c.get("expected_result")
    if expected:
        lines.append(f"[cases.{name}.expected_result]")
        for field, value in expected.items():
            lines.append(f"{_toml_key(field)} = {_toml_value(value)}")
        lines.append("")

    return "\n".join(lines)
+ +Each such directory typically also has:: + + sdks/settings//contracts.toml β€” captured LLM interactions for contract tests + +Usage (from ``sdks/python/``, e.g. after ``cd sdks/python`` or ``make generate-settings``):: + + python scripts/generate_settings.py + +From the repository root:: + + python sdks/python/scripts/generate_settings.py + +Other flags:: + + # Check whether generated files are stale (exits 1 if any differ): + python scripts/generate_settings.py --check + + # Copy contracts.toml from sdks/settings/ β†’ bundled package: + python scripts/generate_settings.py --sync + + # Verify bundled contracts.toml matches canonical sdks/settings/: + python scripts/generate_settings.py --check-sync + +Typical CI configuration (``working-directory: sdks/python``):: + + - name: Check settings are up to date + run: python scripts/generate_settings.py --check + - name: Check contracts are in sync + run: python scripts/generate_settings.py --check-sync + +When to regenerate:: + + Any time you edit settings.toml under sdks/settings//, + run this script and commit the updated generated file alongside the TOML. +""" + +from __future__ import annotations + +import argparse +import difflib +import importlib +import os +import sys +import types +from dataclasses import MISSING, dataclass, fields, is_dataclass +from enum import Enum +from pathlib import Path +from string import Template +from typing import Any + +# --------------------------------------------------------------------------- +# Path setup β€” resolve repo root and add SDK src to sys.path so we can import +# the SDK without a full install. This file lives under sdks/python/scripts/. 
+# --------------------------------------------------------------------------- + +_SCRIPT_DIR = Path(__file__).resolve().parent +_PYTHON_SDK_ROOT = _SCRIPT_DIR.parent +_REPO_ROOT = _PYTHON_SDK_ROOT.parent.parent +_SDK_SRC = _PYTHON_SDK_ROOT / "src" +_SETTINGS_DIR = _REPO_ROOT / "sdks" / "settings" +_GENERATED_DIR = _SDK_SRC / "learning_commons_evaluators" / "settings" + +_LINE_WRAP = 88 + +sys.path.insert(0, str(_SDK_SRC)) + +# Point load_settings at the canonical settings directory so the generator +# always reads the source-of-truth TOML, not the bundled copy. +os.environ.setdefault("EVALUATORS_SETTINGS_DIR", str(_SETTINGS_DIR)) + +# Pre-register the package without running learning_commons_evaluators/__init__.py. +# That __init__ imports evaluators, which import generated settings β€” invalid while +# this script is regenerating those files. Submodules (schemas, settings, …) load +# normally via __path__. +if "learning_commons_evaluators" not in sys.modules: + _lce_pkg = types.ModuleType("learning_commons_evaluators") + _lce_pkg.__path__ = [str(_SDK_SRC / "learning_commons_evaluators")] + sys.modules["learning_commons_evaluators"] = _lce_pkg + +# --------------------------------------------------------------------------- +# SDK imports (after path setup) +# --------------------------------------------------------------------------- + +from pydantic import BaseModel # noqa: E402 +from pydantic_core import PydanticUndefined # noqa: E402 + +from learning_commons_evaluators.schemas.config import ( # noqa: E402 + EvaluationSettings, +) +from learning_commons_evaluators.settings.load_settings import ( # noqa: E402 + EvaluatorSettingsResult, + load_evaluator_settings, +) + +_LCE_PACKAGE = "learning_commons_evaluators" + +# --------------------------------------------------------------------------- +# Emit Python source literals (no third-party codegen: must handle Enum, +# Pydantic, stdlib dataclass, and readable wrapping). 
+# --------------------------------------------------------------------------- + + +def _emit_string(s: str) -> str: + """Emit a string literal, using triple-quotes for multiline / long strings.""" + if "\n" in s or len(s) > _LINE_WRAP: + content = s.replace('"""', '""\\"') + return f'"""{content}"""' + return repr(s) + + +def _emit_value(obj: Any, indent: int = 0) -> str: + """Recursively emit a Python value as a source-code string.""" + pad = " " * indent + inner = " " * (indent + 1) + + if obj is None: + return "None" + if isinstance(obj, bool): + return "True" if obj else "False" + if isinstance(obj, Enum): + return f"{type(obj).__name__}.{obj.name}" + if isinstance(obj, int): + return repr(obj) + if isinstance(obj, float): + return f"{obj:.1f}" if obj == int(obj) else repr(obj) + if isinstance(obj, str): + return _emit_string(obj) + if isinstance(obj, list): + if not obj: + return "[]" + items = [_emit_value(v, indent + 1) for v in obj] + single = f"[{', '.join(items)}]" + if len(single) <= _LINE_WRAP - len(pad) and "\n" not in single: + return single + body = "\n".join(f"{inner}{item}," for item in items) + return f"[\n{body}\n{pad}]" + if isinstance(obj, dict): + if not obj: + return "{}" + pairs = [(repr(k), _emit_value(v, indent + 1)) for k, v in obj.items()] + single = "{" + ", ".join(f"{k}: {v}" for k, v in pairs) + "}" + if len(single) <= _LINE_WRAP - len(pad) and "\n" not in single: + return single + body = "\n".join(f"{inner}{k}: {v}," for k, v in pairs) + return f"{{\n{body}\n{pad}}}" + if is_dataclass(obj) and not isinstance(obj, type): + return _emit_dataclass(obj, indent) + if isinstance(obj, BaseModel): + return _emit_model(obj, indent) + raise TypeError(f"Cannot emit {type(obj).__name__}: {obj!r}") + + +def _format_constructor(cls_name: str, kw_args: list[tuple[str, str]], indent: int) -> str: + """Format ``ClsName(a=..., b=...)`` with optional line wrapping.""" + pad = " " * indent + inner = " " * (indent + 1) + if not kw_args: + return 
f"{cls_name}()" + single = f"{cls_name}({', '.join(f'{n}={v}' for n, v in kw_args)})" + if len(single) <= _LINE_WRAP - len(pad) and "\n" not in single: + return single + body = "\n".join(f"{inner}{n}={v}," for n, v in kw_args) + return f"{cls_name}(\n{body}\n{pad})" + + +def _pydantic_kw_args(obj: BaseModel, indent: int) -> list[tuple[str, str]]: + cls = type(obj) + out: list[tuple[str, str]] = [] + for field_name, field_info in cls.model_fields.items(): + val = getattr(obj, field_name) + if field_name == "type" and not field_info.is_required(): + continue + default = field_info.default + if default is not PydanticUndefined and val == default: + continue + out.append((field_name, _emit_value(val, indent + 1))) + return out + + +def _dataclass_kw_args(obj: Any, indent: int) -> list[tuple[str, str]]: + out: list[tuple[str, str]] = [] + for f in fields(obj): + val = getattr(obj, f.name) + if f.default is not MISSING and val == f.default: + continue + out.append((f.name, _emit_value(val, indent + 1))) + return out + + +def _emit_model(obj: BaseModel, indent: int = 0) -> str: + return _format_constructor(type(obj).__name__, _pydantic_kw_args(obj, indent), indent) + + +def _emit_dataclass(obj: Any, indent: int = 0) -> str: + return _format_constructor(type(obj).__name__, _dataclass_kw_args(obj, indent), indent) + + +# --------------------------------------------------------------------------- +# Import block: walk values and import learning_commons_evaluators types used. 
+# --------------------------------------------------------------------------- + + +def _collect_lce_types(obj: Any, found: set[type]) -> None: + if isinstance(obj, Enum): + found.add(type(obj)) + elif is_dataclass(obj) and not isinstance(obj, type): + found.add(type(obj)) + for f in fields(obj): + _collect_lce_types(getattr(obj, f.name), found) + elif isinstance(obj, BaseModel): + found.add(type(obj)) + for field_name in type(obj).model_fields: + _collect_lce_types(getattr(obj, field_name), found) + elif isinstance(obj, dict): + for v in obj.values(): + _collect_lce_types(v, found) + elif isinstance(obj, list): + for v in obj: + _collect_lce_types(v, found) + + +def _build_import_block( + config: EvaluatorSettingsResult, + settings_cls: type[EvaluationSettings], +) -> str: + found: set[type] = set() + _collect_lce_types(config.evaluator_metadata, found) + _collect_lce_types(config.evaluation_settings, found) + _collect_lce_types(config.prompts, found) + found.add(settings_cls) + found.add(EvaluatorSettingsResult) + + prefix = f"{_LCE_PACKAGE}." + by_module: dict[str, list[str]] = {} + for cls in found: + mod = cls.__module__ + if not mod.startswith(prefix): + continue + by_module.setdefault(mod, []).append(cls.__name__) + + lines: list[str] = [] + for mod in sorted(by_module): + syms = sorted(set(by_module[mod])) + if len(syms) == 1: + lines.append(f"from {mod} import {syms[0]}") + else: + lines.append(f"from {mod} import {', '.join(syms)}") + return "\n".join(lines) + + +_MODULE_TEMPLATE = Template( + """# !! AUTO-GENERATED β€” do not edit directly. 
+# Source: $rel_toml +# Regenerate : python scripts/generate_settings.py +# Staleness check: python scripts/generate_settings.py --check + +from __future__ import annotations + +$imports + +# ── Evaluator metadata ──────────────────────────────────────────────────────── + +_EVALUATOR_METADATA = $metadata_code + +# ── Prompt templates ────────────────────────────────────────────────────────── + +_PROMPTS: dict[str, str] = $prompts_code + +# ── Evaluation settings ─────────────────────────────────────────────────────── + +_EVALUATION_SETTINGS = $settings_code + +# ── Public config object (imported by evaluator modules) ────────────────────── + +CONFIG: EvaluatorSettingsResult[$settings_cls_name] = EvaluatorSettingsResult( + evaluator_metadata=_EVALUATOR_METADATA, + evaluation_settings=_EVALUATION_SETTINGS, + prompts=_PROMPTS, +) +""" +) + + +def generate_module( + _evaluator_name: str, + toml_path: Path, + settings_cls: type[EvaluationSettings], +) -> str: + """Parse *toml_path* and return the content of the generated Python module.""" + config = load_evaluator_settings(toml_path, settings_cls) + settings_cls_name = settings_cls.__name__ + + return _MODULE_TEMPLATE.substitute( + rel_toml=str(toml_path.relative_to(_REPO_ROOT)), + imports=_build_import_block(config, settings_cls), + metadata_code=_emit_model(config.evaluator_metadata), + prompts_code=_emit_value(config.prompts), + settings_code=_emit_model(config.evaluation_settings), + settings_cls_name=settings_cls_name, + ) + + +# --------------------------------------------------------------------------- +# Evaluator discovery +# --------------------------------------------------------------------------- + + +def _snake_to_pascal(name: str) -> str: + return "".join(part.capitalize() for part in name.split("_")) + + +def _resolve_settings_class(evaluator_name: str) -> type[EvaluationSettings]: + """Import ``EvaluationSettings`` from ``learning_commons_evaluators.schemas.``.""" + if not evaluator_name.isidentifier(): 
+ raise SystemExit( + f"Evaluator folder name {evaluator_name!r} is not a valid Python identifier; " + "rename the directory under sdks/settings/." + ) + class_name = f"{_snake_to_pascal(evaluator_name)}EvaluationSettings" + module_name = f"{_LCE_PACKAGE}.schemas.{evaluator_name}" + try: + mod = importlib.import_module(module_name) + except ModuleNotFoundError as e: + raise SystemExit( + f"No Python module {module_name!r} for evaluator {evaluator_name!r} " + f"(expected class {class_name}). Add schemas/{evaluator_name}.py or align the folder name." + ) from e + try: + cls = getattr(mod, class_name) + except AttributeError as e: + raise SystemExit( + f"Module {module_name!r} has no attribute {class_name!r}. " + f"Define {class_name} there (subclass of EvaluationSettings), or align names." + ) from e + if not isinstance(cls, type) or not issubclass(cls, EvaluationSettings): + raise SystemExit(f"{module_name}.{class_name} must be a subclass of EvaluationSettings.") + return cls + + +@dataclass(frozen=True) +class _EvaluatorTarget: + """One evaluator with a canonical TOML and its generated module path.""" + + name: str + settings_cls: type[EvaluationSettings] + toml_path: Path + output_path: Path + + +def _discover_evaluators() -> list[_EvaluatorTarget]: + if not _SETTINGS_DIR.is_dir(): + return [] + out: list[_EvaluatorTarget] = [] + for child in sorted(_SETTINGS_DIR.iterdir()): + if not child.is_dir(): + continue + toml_path = child / "settings.toml" + if not toml_path.is_file(): + continue + name = child.name + settings_cls = _resolve_settings_class(name) + output = _GENERATED_DIR / f"_generated_{name}_settings.py" + out.append( + _EvaluatorTarget( + name=name, + settings_cls=settings_cls, + toml_path=toml_path, + output_path=output, + ) + ) + return out + + +def _contracts_toml(evaluator_name: str) -> Path: + return _SETTINGS_DIR / evaluator_name / "contracts.toml" + + +# --------------------------------------------------------------------------- +# Commands +# 
def cmd_generate() -> None:
    """Regenerate every ``_generated_<name>_settings.py`` module from its settings TOML."""
    targets = _discover_evaluators()
    if not targets:
        print(f"No evaluators found under {_SETTINGS_DIR} (add */settings.toml).")
        return
    for t in targets:
        content = generate_module(t.name, t.toml_path, t.settings_cls)
        # Create the destination first, as cmd_sync does, so a missing
        # directory does not abort generation.
        t.output_path.parent.mkdir(parents=True, exist_ok=True)
        t.output_path.write_text(content, encoding="utf-8")
        print(f" generated {t.output_path.relative_to(_REPO_ROOT)}")
    print("Done.")


def cmd_check() -> int:
    """Compare each generated module on disk with a fresh render.

    A missing output file is treated as empty text, so it is reported as stale.

    Returns:
        ``0`` when everything matches (or nothing was discovered), ``1`` when at
        least one module differs; a unified diff is printed for each stale module.
    """
    targets = _discover_evaluators()
    if not targets:
        print(f"No evaluators found under {_SETTINGS_DIR} (nothing to check).")
        return 0
    stale: list[str] = []
    for t in targets:
        expected = generate_module(t.name, t.toml_path, t.settings_cls)
        actual = t.output_path.read_text(encoding="utf-8") if t.output_path.exists() else ""
        if expected == actual:
            continue
        rel_output = t.output_path.relative_to(_REPO_ROOT)
        diff = "".join(
            difflib.unified_diff(
                actual.splitlines(keepends=True),
                expected.splitlines(keepends=True),
                fromfile=str(rel_output),
                tofile="(regenerated)",
                n=3,
            )
        )
        print(f"STALE: {rel_output}\n{diff}")
        stale.append(t.name)

    if stale:
        print(f"\nStale evaluators: {stale}")
        print("Run: python scripts/generate_settings.py")
        return 1

    print("All generated settings are up to date.")
    return 0


def cmd_sync() -> None:
    """Copy each canonical ``contracts.toml`` into the bundled package directory.

    Evaluators without a canonical contracts file are skipped with a warning.
    """
    targets = _discover_evaluators()
    if not targets:
        print(f"No evaluators found under {_SETTINGS_DIR} (nothing to sync).")
        return
    for t in targets:
        src = _contracts_toml(t.name)
        if not src.exists():
            print(f" WARNING: canonical {src.relative_to(_REPO_ROOT)} not found — skipping")
            continue
        dst_dir = _GENERATED_DIR / t.name
        dst_dir.mkdir(parents=True, exist_ok=True)
        dst = dst_dir / "contracts.toml"
        # Byte copy: the check in cmd_check_sync compares raw bytes.
        dst.write_bytes(src.read_bytes())
        print(f" copied {src.relative_to(_REPO_ROOT)} → {dst.relative_to(_REPO_ROOT)}")
    print("Done.")


def cmd_check_sync() -> int:
    """Verify that bundled ``contracts.toml`` files match their canonical copies.

    Evaluators without a canonical contracts file are skipped with a warning and
    do not count as errors.

    Returns:
        ``0`` when every bundled copy exists and is byte-identical (or nothing was
        discovered), ``1`` otherwise; one message is printed per problem.
    """
    targets = _discover_evaluators()
    if not targets:
        print(f"No evaluators found under {_SETTINGS_DIR} (nothing to verify).")
        return 0
    errors: list[str] = []
    for t in targets:
        canonical = _contracts_toml(t.name)
        if not canonical.exists():
            print(f" WARNING: canonical {canonical.relative_to(_REPO_ROOT)} not found — skipping")
            continue
        bundled = _GENERATED_DIR / t.name / "contracts.toml"
        if not bundled.exists():
            errors.append(
                f"MISSING bundled: {bundled.relative_to(_REPO_ROOT)} "
                f"(run: python scripts/generate_settings.py --sync)"
            )
        elif canonical.read_bytes() != bundled.read_bytes():
            errors.append(
                f"OUT OF SYNC: {t.name}/contracts.toml "
                f"(run: python scripts/generate_settings.py --sync)"
            )

    if errors:
        for msg in errors:
            print(msg)
        return 1

    print("All contracts TOMLs are in sync.")
    return 0


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------


def main(argv: "list[str] | None" = None) -> None:
    """Command-line entry point.

    With no flag the settings modules are regenerated.  ``--check``, ``--sync``
    and ``--check-sync`` each select one alternative action and cannot be
    combined; argparse rejects a combination with exit status 2 instead of
    silently running only the first one.

    Args:
        argv: Argument list to parse; ``None`` (the default) uses ``sys.argv[1:]``.

    Raises:
        SystemExit: With the status of ``cmd_check`` / ``cmd_check_sync`` for the
            two checking modes, or from argparse on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Generate (or check) pre-built settings modules from evaluator TOML files, "
            "and sync bundled contracts TOMLs."
        )
    )
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument(
        "--check",
        action="store_true",
        help=(
            "Check generated _generated_*_settings.py files for staleness; "
            "exit 1 if any differ from what would be generated."
        ),
    )
    mode.add_argument(
        "--sync",
        action="store_true",
        help=(
            "Copy contracts.toml from sdks/settings/ into the bundled package "
            "directory so contract tests work after pip install."
        ),
    )
    mode.add_argument(
        "--check-sync",
        action="store_true",
        dest="check_sync",
        help=(
            "Verify bundled contracts.toml files match the canonical sdks/settings/ "
            "copies; exit 1 if any are missing or differ."
        ),
    )
    args = parser.parse_args(argv)

    if args.check:
        sys.exit(cmd_check())
    elif args.sync:
        cmd_sync()
    elif args.check_sync:
        sys.exit(cmd_check_sync())
    else:
        cmd_generate()


if __name__ == "__main__":
    main()
"normalize_complexity_output", "create_config", "create_config_no_telemetry", "create_config_telemetry_with_full_input", diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py index a22ec2b..5bb9fc1 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py @@ -7,9 +7,23 @@ """ from learning_commons_evaluators.evaluators.base import BaseEvaluator, InputT, OutputT +from learning_commons_evaluators.evaluators.conventionality import ( + ConventionalityEvaluationInput, + ConventionalityEvaluator, +) +from learning_commons_evaluators.evaluators.vocabulary import ( + VocabularyEvaluationInput, + VocabularyEvaluator, +) +from learning_commons_evaluators.schemas.vocabulary import normalize_complexity_output __all__ = [ "BaseEvaluator", + "ConventionalityEvaluationInput", + "ConventionalityEvaluator", "InputT", "OutputT", + "VocabularyEvaluationInput", + "VocabularyEvaluator", + "normalize_complexity_output", ] diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 49a4713..509b2e1 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -49,14 +49,25 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]): """ Abstract base class for all evaluators. Subclasses must set metadata, default_evaluation_settings, and implement evaluate_impl(). + + Pass ``default_evaluation_settings`` at construction to override the class-level + defaults for that instance (used when :meth:`evaluate` is called without + ``evaluation_settings``). 
""" config: EvaluatorConfig metadata: EvaluatorMetadata default_evaluation_settings: SettingsT - def __init__(self, config: EvaluatorConfig) -> None: + def __init__( + self, + config: EvaluatorConfig, + *, + default_evaluation_settings: SettingsT | None = None, + ) -> None: self.config = config + if default_evaluation_settings is not None: + self.default_evaluation_settings = default_evaluation_settings # TODO: validate config def evaluate( @@ -68,13 +79,14 @@ def evaluate( Validates the input, delegates to :meth:`evaluate_impl`, records timing and status on the returned metadata, and logs start/end events via the - configured logger. If ``evaluation_settings`` is ``None``, the - evaluator's :attr:`default_evaluation_settings` is used. + configured logger. If ``evaluation_settings`` is ``None``, a deep copy of + the instance's :attr:`default_evaluation_settings` is used (from the + constructor keyword when given, otherwise the subclass class attribute). Args: input: Typed input for this evaluator. evaluation_settings: Optional override for evaluation settings. - Defaults to :attr:`default_evaluation_settings`. + Defaults to :attr:`default_evaluation_settings` (constructor or class). Returns: A typed result whose ``metadata.status`` is @@ -184,6 +196,7 @@ def execute_prompt_chain_step( template: Any, chain_inputs: dict[str, Any], parser_output_type: type[ParsedT], + json_dict_normalizer: Callable[[dict], dict] | None = None, ) -> ParsedT: ... def execute_prompt_chain_step( @@ -194,14 +207,15 @@ def execute_prompt_chain_step( template: Any, chain_inputs: dict[str, Any], parser_output_type: type[BaseModel] | None = None, + json_dict_normalizer: Callable[[dict], dict] | None = None, ) -> BaseModel | str: """Run a prompt chain (template | LLM), record metadata, and return the result. When ``parser_output_type`` is a Pydantic model class, the LLM response is - parsed as JSON and returned as an instance of that class. 
When - ``parser_output_type`` is ``None``, the raw response content is returned as - a plain ``str`` β€” use this for steps that produce unstructured prose (e.g. a - background-knowledge assumption). + parsed as JSON and returned as an instance of that class. When it is + ``None`` (the default), the raw response content is returned as a plain + ``str`` (no JSON parser) β€” use that for steps that produce unstructured prose + (e.g. a background-knowledge assumption). Provider config (e.g. API key) is resolved from ``self.config`` by ``prompt_settings.provider_type``. @@ -215,15 +229,24 @@ def execute_prompt_chain_step( chain_inputs: Variables to format the template and invoke the chain. parser_output_type: Pydantic model class for JSON parsing, or ``None`` to return the raw text response. + json_dict_normalizer: When set with ``parser_output_type``, parse the + model response as JSON into a plain dict (no Pydantic parse), + apply this function (e.g. notebook-style ``normalize_complexity_output``), + then validate with ``parser_output_type``. Format instructions for the + prompt should still be built from the same ``parser_output_type`` via + :class:`~langchain_core.output_parsers.JsonOutputParser`. Returns: - Parsed instance of ``parser_output_type`` when a type is given; plain - ``str`` when ``None``. + Parsed instance of ``parser_output_type`` when it is a model class; plain + ``str`` when ``parser_output_type`` is omitted or ``None``. Raises: ConfigurationError: No provider config for prompt_settings.provider_type. EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged. + ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted. 
""" + if json_dict_normalizer is not None and parser_output_type is None: + raise ValueError("json_dict_normalizer requires parser_output_type to be set") # Populated after a successful LLM invoke so we can attach usage even if parsing fails. token_usage: TokenUsage | None = None @@ -238,6 +261,14 @@ def _run_chain() -> BaseModel | str: return str(ai_message.content) from langchain_core.output_parsers.json import JsonOutputParser + if json_dict_normalizer is not None: + loose = JsonOutputParser() + parsed_dict = loose.invoke(ai_message) + if not isinstance(parsed_dict, dict): + parsed_dict = dict(parsed_dict) + normalized = json_dict_normalizer(parsed_dict) + return parser_output_type.model_validate(normalized) + parser = JsonOutputParser(pydantic_object=parser_output_type) raw = parser.invoke(ai_message) if isinstance(raw, parser_output_type): diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py new file mode 100644 index 0000000..1ae7032 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py @@ -0,0 +1,117 @@ +"""Conventionality evaluator: evaluates text for conventionality of language.""" + +from __future__ import annotations + +from typing import ClassVar + +import textstat # type: ignore[import-untyped] +from langchain_core.output_parsers import JsonOutputParser +from langchain_core.prompts import ChatPromptTemplate +from pydantic import Field + +from learning_commons_evaluators.evaluators.base import BaseEvaluator +from learning_commons_evaluators.schemas.common_inputs import ( + GradeInputField, + TextInputField, +) +from learning_commons_evaluators.schemas.conventionality import ( + ConventionalityEvaluationSettings, + ConventionalityOutput, +) +from learning_commons_evaluators.schemas.evaluator import ( + EvaluationExplanation, + EvaluationInput, +) +from learning_commons_evaluators.schemas.metadata import ( + 
# Input specs from the bundled settings; the input model below exposes them
# through ``_input_settings``.
_INPUT_SETTINGS = _CONVENTIONALITY_CONFIG.evaluator_metadata.inputs


class ConventionalityEvaluationInput(EvaluationInput):
    """Input for a conventionality evaluation.

    Constraints (min/max text length, allowed grades) are sourced from
    ``[[evaluator_metadata.inputs]]`` in evaluator settings and applied
    automatically — callers supply raw values, not field objects.

    Example::

        inp = ConventionalityEvaluationInput(text="The quick brown fox...", grade=5)
    """

    _input_settings: ClassVar[dict] = _INPUT_SETTINGS

    text: TextInputField = Field(description="The text to evaluate.")
    grade: GradeInputField = Field(description="The grade level of the text.")

    def __init__(self, *, text: str, grade: int, **kwargs):
        # Keyword-only signature over plain ``str`` / ``int`` values; everything
        # is forwarded to the base class unchanged.
        super().__init__(text=text, grade=grade, **kwargs)


class ConventionalityEvaluator(
    BaseEvaluator[
        ConventionalityEvaluationInput,
        TextComplexityResult,
        ConventionalityEvaluationSettings,
    ]
):
    """Evaluates text for conventionality (idioms, metaphors, implied meaning) relative to grade."""

    metadata: EvaluatorMetadata = _CONVENTIONALITY_CONFIG.evaluator_metadata
    default_evaluation_settings: ConventionalityEvaluationSettings = (
        _CONVENTIONALITY_CONFIG.evaluation_settings
    )

    def evaluate_impl(
        self,
        input: ConventionalityEvaluationInput,
        evaluation_settings: ConventionalityEvaluationSettings,
        evaluation_metadata: EvaluationMetadata,
    ) -> TextComplexityResult:
        """Run the single-step conventionality evaluation.

        The prompt receives the input values plus ``fk_score``, the
        Flesch-Kincaid grade of the text rounded to two decimals.  The model
        reply is parsed as ``ConventionalityOutput``.

        Returns:
            ``TextComplexityResult`` whose answer comes from the parsed
            ``complexity_score``, whose summary is the parsed ``reasoning`` and
            whose details carry the remaining parsed fields.
        """
        step_settings = evaluation_settings.prompt_settings_step_conventionality_evaluation

        readability = textstat.flesch_kincaid_grade(input.text.value)
        chain_inputs = input.input_values()
        chain_inputs["fk_score"] = round(readability, 2)

        # The parser is used here only to produce format instructions for the
        # prompt; parsing of the reply happens inside execute_prompt_chain_step.
        instructions_source = JsonOutputParser(pydantic_object=ConventionalityOutput)
        prompt_texts = _CONVENTIONALITY_CONFIG.prompts
        messages = [
            ("system", prompt_texts["system_prompt"]),
            ("human", prompt_texts["human_prompt"]),
        ]
        template = ChatPromptTemplate.from_messages(messages).partial(
            format_instructions=instructions_source.get_format_instructions()
        )

        parsed = self.execute_prompt_chain_step(
            step_name="conventionality_evaluation",
            prompt_settings=step_settings,
            evaluation_metadata=evaluation_metadata,
            template=template,
            chain_inputs=chain_inputs,
            parser_output_type=ConventionalityOutput,
        )

        details = {
            "conventionality_features": parsed.conventionality_features,
            "grade_context": parsed.grade_context,
            "instructional_insights": parsed.instructional_insights,
        }
        return TextComplexityResult(
            answer=TextComplexityAnswer.from_score(parsed.complexity_score),
            explanation=EvaluationExplanation(summary=parsed.reasoning, details=details),
            metadata=evaluation_metadata,
        )
# Input specs from the bundled settings; the input model below exposes them
# through ``_input_settings``.
_INPUT_SETTINGS = _VOCABULARY_CONFIG.evaluator_metadata.inputs

# Grades 3–4 use the Gemini-based rubric prompt; grades 5–12 use the GPT-based annotation guide prompt.
_GRADES_3_4 = frozenset({3, 4})


class VocabularyEvaluationInput(EvaluationInput):
    """Input for a vocabulary evaluation.

    Constraints (allowed grade levels) are sourced from
    ``[[evaluator_metadata.inputs]]`` in evaluator settings and applied
    automatically — callers supply raw values, not field objects.

    Example::

        inp = VocabularyEvaluationInput(text="The quick brown fox...", grade=5)
    """

    _input_settings: ClassVar[dict] = _INPUT_SETTINGS

    text: TextInputField = Field(description="The text to evaluate.")
    grade: GradeInputField = Field(description="The grade level of the text (3–12).")

    def __init__(self, *, text: str, grade: int, **kwargs):
        # Keyword-only signature over plain ``str`` / ``int`` values; everything
        # is forwarded to the base class unchanged.
        super().__init__(text=text, grade=grade, **kwargs)


class VocabularyEvaluator(
    BaseEvaluator[VocabularyEvaluationInput, TextComplexityResult, VocabularyEvaluationSettings]
):
    """Evaluates text for vocabulary complexity relative to the target grade level.

    The evaluation runs in two steps:

    1. **Background knowledge** – the model writes a short, unstructured
       assumption about what students at the target grade already know about
       the topic.  The text is passed verbatim into step 2.
    2. **Vocabulary complexity** – a grade-specific prompt and prompt-settings
       pair produces JSON.  The raw JSON dict goes through
       :func:`~learning_commons_evaluators.schemas.vocabulary.normalize_complexity_output`
       and is then validated as ``VocabularyComplexityOutput``.  Grades 3 and 4
       use the ``vocab_grades_3_4`` prompts and also receive ``fk_level``; every
       other grade uses the ``vocab_other_grades`` prompts.

    Which model serves each step is decided by the evaluation settings, not by
    this class.
    """

    metadata: EvaluatorMetadata = _VOCABULARY_CONFIG.evaluator_metadata
    default_evaluation_settings: VocabularyEvaluationSettings = (
        _VOCABULARY_CONFIG.evaluation_settings
    )

    def evaluate_impl(
        self,
        input: VocabularyEvaluationInput,
        evaluation_settings: VocabularyEvaluationSettings,
        evaluation_metadata: EvaluationMetadata,
    ) -> TextComplexityResult:
        """Run the two-step vocabulary evaluation and return a TextComplexityResult.

        No grade validation happens here; it is expected to have been done on
        the input before this method is called (NOTE(review): confirm in
        ``BaseEvaluator.evaluate``).

        Args:
            input: Text and grade to evaluate.
            evaluation_settings: Prompt settings for the three steps.
            evaluation_metadata: Metadata object handed to every chain step and
                returned on the result.

        Returns:
            Result carrying the normalised answer, the explanation and
            *evaluation_metadata*.
        """
        ps_bk = evaluation_settings.prompt_settings_step_background_knowledge
        ps_34 = evaluation_settings.prompt_settings_step_vocab_grades_3_4
        ps_og = evaluation_settings.prompt_settings_step_vocab_other_grades

        grade = input.grade.value
        text = input.text.value
        prompts = _VOCABULARY_CONFIG.prompts
        uses_grades_3_4_prompt = grade in _GRADES_3_4

        # Only the grades 3–4 prompt is given ``fk_level``, so the readability
        # score is not computed for other grades.  It is still computed before
        # any model call so a textstat failure costs no tokens.
        fk_score = (
            round(textstat.flesch_kincaid_grade(text), 2) if uses_grades_3_4_prompt else None
        )

        # Step 1: background knowledge (plain-text reply, no JSON parsing).
        bk_template = ChatPromptTemplate.from_messages(
            [("human", prompts["background_knowledge_prompt"])]
        )
        background_knowledge: str = self.execute_prompt_chain_step(
            step_name="background_knowledge",
            prompt_settings=ps_bk,
            evaluation_metadata=evaluation_metadata,
            template=bk_template,
            chain_inputs={"text": text, "grade": grade},
            parser_output_type=None,
        )

        # Step 2: vocabulary complexity (grade-specific prompts, shared output shape).
        chain_inputs: dict[str, Any] = {
            "text": text,
            "student_grade_level": grade,
            "student_background_knowledge": background_knowledge,
        }
        if uses_grades_3_4_prompt:
            chain_inputs["fk_level"] = fk_score
            prompt_settings_vocab = ps_34
            prompt_key = "vocab_grades_3_4"
        else:
            prompt_settings_vocab = ps_og
            prompt_key = "vocab_other_grades"

        answer, explanation = self._run_vocab_complexity_chain(
            chain_inputs=chain_inputs,
            evaluation_metadata=evaluation_metadata,
            prompt_settings_vocab=prompt_settings_vocab,
            system_prompt=prompts[f"{prompt_key}_system_prompt"],
            user_prompt_template=prompts[f"{prompt_key}_user_prompt"],
        )

        return TextComplexityResult(
            answer=answer,
            explanation=explanation,
            metadata=evaluation_metadata,
        )

    def _run_vocab_complexity_chain(
        self,
        *,
        chain_inputs: dict[str, Any],
        evaluation_metadata: EvaluationMetadata,
        prompt_settings_vocab: PromptSettings,
        system_prompt: str,
        user_prompt_template: str,
    ) -> tuple[TextComplexityAnswer, EvaluationExplanation]:
        """Run the complexity prompt and convert its output to answer + explanation.

        Args:
            chain_inputs: Variables for the prompt template.
            evaluation_metadata: Metadata object passed on to the chain step.
            prompt_settings_vocab: Prompt settings for the selected grade band.
            system_prompt: System message template.
            user_prompt_template: Human message template.

        Returns:
            The normalised answer and an explanation whose details hold the four
            word lists returned by the model.
        """
        # Used only to build format instructions; the reply itself is parsed
        # loosely, normalised, then validated inside execute_prompt_chain_step.
        parser = JsonOutputParser(pydantic_object=VocabularyComplexityOutput)
        template = ChatPromptTemplate.from_messages(
            [
                ("system", system_prompt),
                ("human", user_prompt_template),
            ]
        ).partial(format_instructions=parser.get_format_instructions())

        output = self.execute_prompt_chain_step(
            step_name="complexity_evaluation",
            prompt_settings=prompt_settings_vocab,
            evaluation_metadata=evaluation_metadata,
            template=template,
            chain_inputs=chain_inputs,
            parser_output_type=VocabularyComplexityOutput,
            json_dict_normalizer=normalize_complexity_output,
        )

        # Normalise the score string: the prompt may return spaces ("very complex")
        # but TextComplexityAnswer expects underscores ("very_complex").
        score = output.complexity_score.lower().replace(" ", "_")
        answer = TextComplexityAnswer.from_score(score)
        explanation = EvaluationExplanation(
            summary=output.reasoning,
            details={
                "tier_2_words": output.tier_2_words,
                "tier_3_words": output.tier_3_words,
                "archaic_words": output.archaic_words,
                "other_complex_words": output.other_complex_words,
            },
        )
        return answer, explanation
description="List of the specific language features driving the complexity (e.g., idioms, metaphors, implied meaning) with direct quotes from the text." + ) + grade_context: str = Field( + description="How the conventionality demands compare to general expectations for the provided target grade." + ) + instructional_insights: str = Field( + description="Actionable pedagogical suggestions for scaffolding the unconventional language features in the classroom." + ) + complexity_score: Literal[ + "slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex" + ] = Field(description="The conventionality complexity level of the text") + reasoning: str = Field(description="A synthesis of why the text fits the chosen rubric level.") diff --git a/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py new file mode 100644 index 0000000..64cde9f --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/schemas/vocabulary.py @@ -0,0 +1,68 @@ +"""Vocabulary evaluator schemas.""" + +from pydantic import BaseModel, Field + +from learning_commons_evaluators.schemas.config import ( + EvaluationSettings, + PromptSettings, +) + + +class VocabularyEvaluationSettings(EvaluationSettings): + """Settings for a vocabulary complexity evaluation. + + Requires three prompt step configurations: + - ``prompt_settings_step_background_knowledge``: generates a student background knowledge + assumption (used as context for the vocabulary complexity step). + - ``prompt_settings_step_vocab_grades_3_4``: vocabulary complexity for grades 3–4. + - ``prompt_settings_step_vocab_other_grades``: vocabulary complexity for grades 5–12. 
def normalize_complexity_output(output: dict) -> dict:
    """Mirror ``evals/vocabulary_evaluator.ipynb`` ``normalize_complexity_output``.

    Works on a shallow copy; *output* itself is never modified.

    * Missing or ``None`` ``tier_2_words`` / ``tier_3_words`` / ``archaic_words`` /
      ``other_complex_words`` entries become empty strings so minimal JSON still
      validates as ``VocabularyComplexityOutput``.
    * When an ``answer`` key is present, ``complexity_score`` is overwritten:
      1–4 (as a number, or as a decimal-digit string with optional surrounding
      whitespace) map to the notebook labels; anything else is stored as
      ``str(answer)``.  The ``answer`` key itself is kept.
    * When ``answer`` is absent, ``complexity_score`` is left unchanged.

    Args:
        output: Parsed JSON object returned by the model.

    Returns:
        A new dict with the adjustments above applied.
    """
    result = dict(output)
    for key in ("tier_2_words", "tier_3_words", "archaic_words", "other_complex_words"):
        if result.get(key) is None:
            result[key] = ""

    mapping = {
        1: "Slightly Complex",
        2: "Moderately Complex",
        3: "Very Complex",
        4: "Exceedingly Complex",
    }
    if "answer" in result:
        value = result["answer"]
        if isinstance(value, str):
            digits = value.strip()
            # isdecimal(), unlike isdigit(), is true only for characters int()
            # accepts; isdigit() also admits e.g. superscript "²", which made
            # int() raise ValueError.
            if digits.isdecimal():
                value = int(digits)
        try:
            label = mapping.get(value)
        except TypeError:
            # Unhashable JSON value (list / object): fall back to its text form
            # and let downstream validation reject it.
            label = None
        result["complexity_score"] = label if label is not None else str(value)
    return result
allowed_grades=[3, 4, 5, 6, 7, 8, 9, 10, 11, 12]), + }, +) + +# ── Prompt templates ────────────────────────────────────────────────────────── + +_PROMPTS: dict[str, str] = { + 'system_prompt': """Role +You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level. + +Objective +Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade. + +Complexity Levels +- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level. +- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging. +- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear. 
+- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices. + +Essential Evaluation Rules +1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex." + +2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group. + +3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two. + +4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex." + +5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing. + +6. Sustained vs. 
Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge. + +7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex. + +8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students. + +9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary. + +Input Format +You will receive: +- text: The passage to evaluate. +- grade: The target student grade level. +- fk_score: The Flesch-Kincaid readability score. + +Output Format +Provide a JSON object containing ONLY the following keys: +- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'. +- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural). +- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text. 
+- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade. +- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom. + +{format_instructions} +""", + 'human_prompt': """Analyze: +Text: {text} +Grade: {grade} +FK Score: {fk_score} +""", +} + +# ── Evaluation settings ─────────────────────────────────────────────────────── + +_EVALUATION_SETTINGS = ConventionalityEvaluationSettings( + prompt_settings_step_conventionality_evaluation=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model='gemini-3-flash-preview', + temperature=0.0, + ), +) + +# ── Public config object (imported by evaluator modules) ────────────────────── + +CONFIG: EvaluatorSettingsResult[ConventionalityEvaluationSettings] = EvaluatorSettingsResult( + evaluator_metadata=_EVALUATOR_METADATA, + evaluation_settings=_EVALUATION_SETTINGS, + prompts=_PROMPTS, +) diff --git a/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py b/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py new file mode 100644 index 0000000..82562f8 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/settings/_generated_vocabulary_settings.py @@ -0,0 +1,404 @@ +# !! AUTO-GENERATED β€” do not edit directly. 
+# Source: sdks/settings/vocabulary/settings.toml +# Regenerate : python scripts/generate_settings.py +# Staleness check: python scripts/generate_settings.py --check + +from __future__ import annotations + +from learning_commons_evaluators.schemas.config import LlmProvider, PromptSettings +from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec +from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity, EvaluatorMetadata +from learning_commons_evaluators.schemas.vocabulary import VocabularyEvaluationSettings +from learning_commons_evaluators.settings.load_settings import EvaluatorSettingsResult + +# ── Evaluator metadata ──────────────────────────────────────────────────────── + +_EVALUATOR_METADATA = EvaluatorMetadata( + id='vocabulary', + version='0.1', + name='Vocabulary', + description='Vocabulary Complexity Evaluator', + maturity=EvaluatorMaturity.alpha, + inputs={ + 'text': TextInputSpec( + name='text', + description='The text to evaluate for vocabulary complexity.', + ), + 'grade': GradeInputSpec( + name='grade', + description='The grade level of the text (3–12).', + allowed_grades=[3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + ), + }, +) + +# ── Prompt templates ────────────────────────────────────────────────────────── + +_PROMPTS: dict[str, str] = { + 'background_knowledge_prompt': """Review the following text, which is an educational text written for students in the following grade band: {grade}. + +Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with. + +Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.). 
+ +Here's an example: +[START EXAMPLE] +Grade Band: 11th +Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not +learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was +not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to +know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me, +are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it +is the chief end of man here to β€œglorify God and enjoy him forever.” + +Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy. +[END EXAMPLE] + +You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else. + +You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. 
And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have: +[BEGIN TOPICS] +[ + K: [ + "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees", + "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?", + "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?", + "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together", + "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?", + "How Has Our World Changed?", "Why Do People Have Jobs?" + ], + 1: [ + "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds", + "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?", + "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?", + "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World", + "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?", + "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?" 
+ ], + 2: [ + "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators", + "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?", + "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?", + "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are", + "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?", + "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?" + ], + "3": [ + "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics", + "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist", + "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate", + "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?", + "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?", + "How Do People in a Community Meet Their Wants and Needs?" 
+ ], + 4: [ + "Poetry", "Animal Defense Mechanisms", "The American Revolution", + "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)", + "A Great Heart: What does it mean to have a great heart, literally and figuratively?", + "Extreme Settings: How does a challenging setting or physical environment change a person?", + "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light", + "Earth's Features", "Waves, Energy, and Information", "Regions of the United States", + "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?", + "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?", + "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?" + ], + 5: [ + "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change", + "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?", + "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?", + "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky", + "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation", + "How Were the Lives of Native Peoples Influenced by Where They Lived?", + "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?", + "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?", + "How Does the Constitution Help Us Understand What It Means to Be an American?", + "What Do the Early Years of the United States Reveal About the Character of the Nation?", + "What Was the Effect of the Civil War on U.S. Society?" 
+ ], + 6: [ + "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools", + "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?", + "A Hero’s Journey: What is the significance and power of the hero’s journey?", + "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?", + "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?", + "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy", + "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate", + "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)", + "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution", + "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)", + "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)", + "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II", + "The Cold War", "Civil Rights and American Society", "America Since the 1970s" + ], + 7: [ + "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution", + "Identity in the Middle Ages: How does society both support and limit the development of identity?", + "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?", + "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering", + "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources", + "Matter and Energy in 
Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent", + "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan", + "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires", + "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking", + "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World" + ], + 8: [ + "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment", + "The Poetics and Power of Storytelling: What is the power of storytelling?", + "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?", + "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy", + "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun", + "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms", + "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography", + "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses", + "Government and the Economy", "The Global Economy" + ] +] +[END TOPICS] + +Here is the text: +[BEGIN TEXT] +{text} +[END TEXT] +""", + 'vocab_grades_3_4_system_prompt': """You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level. + +You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric. +IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.) 
+ +**Resource 1: Qualitative Text Complexity rubric (SAP)** +1. **Level 1: Slightly complex** + * Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0. + * Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here! +2. **Level 2: Moderately complex** + * Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK + * Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur. +3. **Level 3: Very complex** + * Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic + * Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text. +4. **Level 4: Exceedingly complex** + * Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading. + * Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret. + +**Resource 2: Flesch-Kincaid Grade Level** +Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. 
The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text. +* grade 2-3: 1.98-5.34 +* grade 4-5: 4.51-7.73 +* grade 6-8: 6.51-10.34 +* grade 9-10: 8.32-12.12 +* grade 11-College: 10.34-14.2 + +**Guidelines for Interpretation and Reasoning** + +Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment: + +1. **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`. +2. **Contextual Scaffolding:** Assess how the text supports new vocabulary. + * Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")? + * Is the surrounding language simple and conversational, making the meaning of new words easier to infer? + * Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`. +3. **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`). +4. **Conceptual Load:** Consider the cognitive load of the vocabulary. 
A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once. +5. **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`. + * **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort. + * **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology. +6. **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body). + +**Final Analysis Format** + +Provide these information as your final analysis: +1. **Complex vocabulary:** + * Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings. + * Tier 3 words: Overly academic or domain-specific words. + * Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., β€œThe jury retired to deliberate on their verdict." The use of β€œretire” to mean withdrawing to a private place is an archaic use. + * Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary). +2. **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex +3. 
**Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above. +""", + 'vocab_grades_3_4_user_prompt': """Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task. + +- It is intended for grade {student_grade_level}. + +- You can assume the student has the following background knowledge about the text β€” this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge} + +- Text Flesch-Kincaid grade level: {fk_level} + +- Text to evaluate: [BEGIN TEXT] +{text} +[END TEXT] + +{format_instructions} +""", + 'vocab_other_grades_system_prompt': """You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text. + +You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric. + +IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.) +IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary. + +Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float). +""", + 'vocab_other_grades_user_prompt': """Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly β€” do not introduce any additional criteria or lenses for judging the complexity of the text. 
+ +[BEGIN ANNOTATION GUIDE AND RUBRIC] +Instructions +For the following task, please assume that: + - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example). + - The student is moving through a common progression of topics (detailed here). + - The student is fluent in speaking English. + - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum. + - The student will use this material for independent reading/work, without direct instruction. + - The text is reasonable for the given grade level. + +Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket"). + +Please do be sure to consider: +- all of the different types of vocabulary (listed below) +- the overall proportion of complex words in the text - including repeated complex words. +- the resulting holistic complexity of the vocabulary (described in the Summary section below). + +Level 1: +Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0. + +Level 2: +Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1. + +Level 3: +Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic + +Level 4: +Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading + +And here are some relevant definitions: + - Conversational: Everyday language. 
+ - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge. + - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way. + - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data"). + - Note: + - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar. + - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb." + - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word. + - Unfamiliar proper nouns: + - A person's name, even if unfamiliar, generally does not add to complexity. + - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity. + +- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content. +- Overly-academic: Words that are excessively formal, complex, or specialized. + - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture" +- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way. + - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict." + - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use. + + +Examples +The student is on-grade-level: +- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. 
They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level. If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar. + +Figurative Language +- Kicked the bucket. +- The pen is mightier than the sword. +- The classroom was a zoo. +- He ran faster than the speed of light. +[END ANNOTATION GUIDE AND RUBRIC] + +Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars: +[BEGIN EXAMPLES] + +*** EXAMPLE 1 *** +The following text was intended for grade level 11 and received a complexity level of 1. + +Here is the background knowledge assumption for that text: N/A + +Here is the text: +// START TEXT // +"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect. + +He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them. + +I grew up in a small town in New Mexico, and we had very few books in our household. 
The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish. + +I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me. + +Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized." +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. 
If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one. + +It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader. +// END REASONING // +*** EXAMPLE 2 *** +The following text was intended for grade level 5 and received a complexity level of 2. 
+ +Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone). + +Here is the text: +// START TEXT // +Great whirling storms roar out of the oceans in many parts of the world. They are called by several namesβ€”hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. + +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes. +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. 
The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4. +// END REASONING // + +*** EXAMPLE 3 *** +The following text was intended for grade level 6 and received a complexity level of 3. 
+ +Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology. + +Here is the text: +// START TEXT // +Geology is the scientific study of Earth. Geologists study the planetβ€”its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology. + +Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions. + +Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. 
Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans. +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating. I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification. 
+// END REASONING // +[END EXAMPLES] + +Below is the text you need to evaluate. It is intended for grade {student_grade_level}. + +As you read the text, you can assume the student has the following background knowledge about the text β€” this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge} + +[BEGIN TEXT] +{text} +[END TEXT] + +In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2"). + +{format_instructions} +""", +} + +# ── Evaluation settings ─────────────────────────────────────────────────────── + +_EVALUATION_SETTINGS = VocabularyEvaluationSettings( + prompt_settings_step_background_knowledge=PromptSettings( + provider_type=LlmProvider.OPENAI, + model='gpt-4o-2024-11-20', + temperature=0.0, + ), + prompt_settings_step_vocab_grades_3_4=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model='gemini-2.5-pro', + temperature=0.0, + ), + prompt_settings_step_vocab_other_grades=PromptSettings(provider_type=LlmProvider.OPENAI, model='gpt-4.1', temperature=0.0), +) + +# ── Public config object (imported by evaluator modules) ────────────────────── + +CONFIG: EvaluatorSettingsResult[VocabularyEvaluationSettings] = EvaluatorSettingsResult( + evaluator_metadata=_EVALUATOR_METADATA, + evaluation_settings=_EVALUATION_SETTINGS, + prompts=_PROMPTS, +) diff --git a/sdks/python/src/learning_commons_evaluators/settings/conventionality/__init__.py b/sdks/python/src/learning_commons_evaluators/settings/conventionality/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml b/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml new file mode 100644 index 0000000..47bac4c --- /dev/null +++ 
b/sdks/python/src/learning_commons_evaluators/settings/conventionality/contracts.toml @@ -0,0 +1,130 @@ +# Contract tests for the Conventionality evaluator. +# +# This file captures the exact LLM interaction for known inputs so that +# tests can verify the SDK sends the same request as the notebook and +# produces the same result from the same response. +# +# HOW TO REGENERATE +# ----------------- +# Run the notebook "evals/Final ship - Conventionality Experimental Evaluator.ipynb" +# from the repo root with GOOGLE_API_KEY set. It loads prompts from the evaluator +# settings TOML (same as the SDK) and prints a TOML block — paste into this file +# and run `make sync-settings` from sdks/python/ to update the bundled copy. +# +# WHAT BELONGS HERE +# ----------------- +# Only data that originates from external sources: +# - input: what the caller provides to the evaluator +# - prompt_steps: what the SDK sends to the LLM (fully formatted messages, +# model, temperature) and the raw response the LLM returned +# - expected_result: the structured output the LLM returned (notebook format) +# +# Deterministic values computed by library code (e.g. FK score from textstat, +# format_instructions from LangChain's JsonOutputParser) are NOT stored as +# standalone fields — they appear inside the fully formatted prompts, where they +# are implicitly verified by the prompt-request assertion in the contract test. + +[cases.turnip] +description = "Grade 4 classroom narrative (Henry and the turnip)" + +[cases.turnip.input] +text = ''' + +"Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden.
And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip." +''' +grade = 4 + +[cases.turnip.prompt_steps.main] +model = "gemini-3-flash-preview" +temperature = 0.0 +system_prompt = '''Role +You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level. + +Objective +Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade. + +Complexity Levels +- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level. +- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. 
May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging. +- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear. +- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices. + +Essential Evaluation Rules +1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex." + +2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group. + +3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." 
Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two. + +4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex." + +5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing. + +6. Sustained vs. Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge. + +7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex. + +8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students. + +9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary. + +Input Format +You will receive: +- text: The passage to evaluate. +- grade: The target student grade level. +- fk_score: The Flesch-Kincaid readability score. 
+ +Output Format +Provide a JSON object containing ONLY the following keys: +- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'. +- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural). +- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text. +- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade. +- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom. + +STRICT OUTPUT FORMAT: +- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators. +- Do not wrap the JSON in Markdown or code fences (no ``` or ```json). +- Do not prepend or append any text (e.g., do not write "Here is the JSON:"). +- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments. + +The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. 
The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted. + +Here is the output schema (shown in a code block for readability only β€” do not include any backticks or Markdown in your output): +``` +{"properties": {"conventionality_features": {"description": "List of the specific language features driving the complexity (e.g., idioms, metaphors, implied meaning) with direct quotes from the text.", "items": {"type": "string"}, "title": "Conventionality Features", "type": "array"}, "grade_context": {"description": "How the conventionality demands compare to general expectations for the provided target grade.", "title": "Grade Context", "type": "string"}, "instructional_insights": {"description": "Actionable pedagogical suggestions for scaffolding the unconventional language features in the classroom.", "title": "Instructional Insights", "type": "string"}, "complexity_score": {"description": "The conventionality complexity level of the text", "enum": ["slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex"], "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "A synthesis of why the text fits the chosen rubric level.", "title": "Reasoning", "type": "string"}}, "required": ["conventionality_features", "grade_context", "instructional_insights", "complexity_score", "reasoning"]} +``` +''' +user_prompt = '''Analyze: +Text: "Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. 
Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip." +Grade: 4 +FK Score: 3.75 +''' +llm_response = ''' +```json +{ + "complexity_score": "slightly_complex", + "reasoning": "The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.", + "conventionality_features": [ + "literal narrative: 'Henry took his slate and went out.'", + "concrete actions: 'he wrote the word turnip on his slate.'", + "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", + "straightforward meaning: 'The teacher was surprised and pleased.'" + ], + "grade_context": "For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.", + "instructional_insights": "Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. 
Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology." +} +```''' + +[cases.turnip.expected_result] +complexity_score = ''' +slightly_complex''' +reasoning = ''' +The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.''' +conventionality_features = ["literal narrative: 'Henry took his slate and went out.'", "concrete actions: 'he wrote the word turnip on his slate.'", "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", "straightforward meaning: 'The teacher was surprised and pleased.'"] +grade_context = ''' +For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.''' +instructional_insights = ''' +Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. 
A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology.''' diff --git a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml new file mode 100644 index 0000000..8420e01 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/settings/vocabulary/contracts.toml @@ -0,0 +1,641 @@ +# Contract tests for the Vocabulary evaluator. +# +# This file captures the exact LLM interactions for known inputs so that tests +# can verify the SDK sends the same requests as the notebook and produces the +# same results from the same responses. +# +# HOW TO REGENERATE +# ----------------- +# Run the notebook "evals/vocabulary_evaluator.ipynb" with valid API keys. +# The final cells output a TOML block that you paste in here to replace the +# placeholder values below. +# +# WHAT BELONGS HERE +# ----------------- +# Only data that originates from external sources: +# - input: what the caller provides to the evaluator +# - prompt_steps: what the SDK sends to the LLM (fully formatted messages, +# model, temperature) and the raw response the LLM returned. +# Steps must be listed in call order: +# 1. background_knowledge (plain text response) +# 2. vocab_complexity (JSON response) +# - expected_result: the structured output parsed from the vocab_complexity +# llm_response (notebook format), before SDK result mapping. +# +# Deterministic values computed by library code (FK score, format_instructions) +# are NOT stored as standalone fields — they appear inside the fully formatted +# prompts, where they are implicitly verified by the prompt-request assertion.
+# +# TWO CASES ARE INCLUDED +# ----------------------- +# marco_polo_grade3 — grade 3, uses the grades 3–4 path (Gemini + rubric labels +# + word-level breakdown in expected_result). +# hurricanes_grade7 — grade 7, uses the grades 5–12 path (GPT; expected_result +# holds ``complexity_score`` as a string plus word-list fields, +# matching the notebook ``Output`` shape. The SDK also accepts +# legacy integer ``answer`` in raw JSON like ``normalize_complexity_output``.) + +# =========================================================================== +# Case 1: marco_polo_grade3 (grades 3–4 path) +# =========================================================================== + +[cases.marco_polo_grade3] +description = "Marco Polo passage, grade 3 (grades 3-4 Gemini path)" + +[cases.marco_polo_grade3.input] +text = ''' +Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all. +Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk. +His father had borrowed money and bought a ship.
He became wealthy because of his trading in the near East.''' +grade = 3 + +[cases.marco_polo_grade3.prompt_steps.background_knowledge] +model = "gpt-4o-2024-11-20" +temperature = 0.0 +system_prompt = "" +user_prompt = ''' + +Review the following text, which is an educational text written for students in the following grade band: 3. + +Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with. + +Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.). + +Here's an example: +[START EXAMPLE] +Grade Band: 11th +Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not +learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was +not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to +know it by experience, and be able to give a true account of it in my next excursion. 
For most men, it appears to me, +are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it +is the chief end of man here to β€œglorify God and enjoy him forever.” + +Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy. +[END EXAMPLE] + +You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else. + +You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have: +[BEGIN TOPICS] +[ + K: [ + "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees", + "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?", + "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?", + "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together", + "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?", + "How Has Our World Changed?", "Why Do People Have Jobs?" 
+ ], + 1: [ + "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds", + "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?", + "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?", + "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World", + "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?", + "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?" + ], + 2: [ + "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators", + "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?", + "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?", + "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are", + "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?", + "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?" 
+ ], + "3": [ + "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics", + "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist", + "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate", + "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?", + "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?", + "How Do People in a Community Meet Their Wants and Needs?" + ], + 4: [ + "Poetry", "Animal Defense Mechanisms", "The American Revolution", + "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)", + "A Great Heart: What does it mean to have a great heart, literally and figuratively?", + "Extreme Settings: How does a challenging setting or physical environment change a person?", + "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light", + "Earth's Features", "Waves, Energy, and Information", "Regions of the United States", + "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?", + "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?", + "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?" 
+ ], + 5: [ + "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change", + "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?", + "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?", + "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky", + "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation", + "How Were the Lives of Native Peoples Influenced by Where They Lived?", + "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?", + "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?", + "How Does the Constitution Help Us Understand What It Means to Be an American?", + "What Do the Early Years of the United States Reveal About the Character of the Nation?", + "What Was the Effect of the Civil War on U.S. Society?" 
+ ], + 6: [ + "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools", + "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?", + "A Hero’s Journey: What is the significance and power of the hero’s journey?", + "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?", + "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?", + "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy", + "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate", + "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)", + "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution", + "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)", + "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)", + "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II", + "The Cold War", "Civil Rights and American Society", "America Since the 1970s" + ], + 7: [ + "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution", + "Identity in the Middle Ages: How does society both support and limit the development of identity?", + "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?", + "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering", + "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources", + "Matter and Energy in 
Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent", + "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan", + "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires", + "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking", + "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World" + ], + 8: [ + "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment", + "The Poetics and Power of Storytelling: What is the power of storytelling?", + "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?", + "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy", + "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun", + "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms", + "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography", + "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses", + "Government and the Economy", "The Global Economy" + ] +] +[END TOPICS] + +Here is the text: +[BEGIN TEXT] +Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. 
There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all. +Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk. +His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East. +[END TEXT] +''' +llm_response = ''' +Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.''' + +[cases.marco_polo_grade3.prompt_steps.vocab_complexity] +model = "gemini-2.5-pro" +temperature = 0.0 +system_prompt = ''' + +You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level. + +You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric. +IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.) + +**Resource 1: Qualitative Text Complexity rubric (SAP)** +1. **Level 1: Slightly complex** + * Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0. 
+ * Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here! +2. **Level 2: Moderately complex** + * Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK + * Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur. +3. **Level 3: Very complex** + * Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic + * Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text. +4. **Level 4: Exceedingly complex** + * Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading. + * Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret. + +**Resource 2: Flesch-Kincaid Grade Level** +Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text. +* grade 2-3: 1.98-5.34 +* grade 4-5: 4.51-7.73 +* grade 6-8: 6.51-10.34 +* grade 9-10: 8.32-12.12 +* grade 11-College: 10.34-14.2 + +**Guidelines for Interpretation and Reasoning** + +Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. 
You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment: + +1. **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`. +2. **Contextual Scaffolding:** Assess how the text supports new vocabulary. + * Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")? + * Is the surrounding language simple and conversational, making the meaning of new words easier to infer? + * Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`. +3. **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`). +4. **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once. +5. **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`. + * **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort. 
+ * **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology. +6. **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body). + +**Final Analysis Format** + +Provide these information as your final analysis: +1. **Complex vocabulary:** + * Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings. + * Tier 3 words: Overly academic or domain-specific words. + * Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., β€œThe jury retired to deliberate on their verdict." The use of β€œretire” to mean withdrawing to a private place is an archaic use. + * Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary). +2. **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex +3. **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above. +''' +user_prompt = ''' + +Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task. + +- It is intended for grade 3. 
+ +- You can assume the student has the following background knowledge about the text β€” this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions. + +- Text Flesch-Kincaid grade level: 6.59 + +- Text to evaluate: [BEGIN TEXT] +Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all. +Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk. +His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East. +[END TEXT] + +STRICT OUTPUT FORMAT: +- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators. +- Do not wrap the JSON in Markdown or code fences (no ``` or ```json). +- Do not prepend or append any text (e.g., do not write "Here is the JSON:"). 
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments. + +The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted. + +Here is the output schema (shown in a code block for readability only β€” do not include any backticks or Markdown in your output): +``` +{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]} +``` +''' +llm_response = ''' +```json +{ + "tier_2_words": "served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers", + "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain", + "archaic_words": "None", + "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East", + "complexity_score": "very complex", + "reasoning": "The vocabulary is 
rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level." +} +```''' + +[cases.marco_polo_grade3.expected_result] +tier_2_words = ''' +served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers''' +tier_3_words = ''' +Mongol Dynasty, Kublai Khan, ivory, jade, porcelain''' +archaic_words = ''' +None''' +other_complex_words = ''' +Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East''' +complexity_score = ''' +very complex''' +reasoning = ''' +The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). 
These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.''' + + +# =========================================================================== +# Case 2: hurricanes_grade7 (grades 5–12 path) +# =========================================================================== + +[cases.hurricanes_grade7] +description = "Hurricane formation passage, grade 7 (grades 5-12 GPT path)" + +[cases.hurricanes_grade7.input] +text = ''' +Great whirling storms roar out of the oceans in many parts of the world. They are called by several names β€” hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. + +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. 
Low-pressure areas over warm oceans give birth to hurricanes.''' +grade = 7 + +[cases.hurricanes_grade7.prompt_steps.background_knowledge] +model = "gpt-4o-2024-11-20" +temperature = 0.0 +system_prompt = "" +user_prompt = ''' + +Review the following text, which is an educational text written for students in the following grade band: 7. + +Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with. + +Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.). + +Here's an example: +[START EXAMPLE] +Grade Band: 11th +Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not +learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was +not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to +know it by experience, and be able to give a true account of it in my next excursion. 
For most men, it appears to me, +are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it +is the chief end of man here to β€œglorify God and enjoy him forever.” + +Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy. +[END EXAMPLE] + +You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else. + +You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have: +[BEGIN TOPICS] +[ + K: [ + "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees", + "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?", + "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?", + "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together", + "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?", + "How Has Our World Changed?", "Why Do People Have Jobs?" 
+ ], + 1: [ + "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds", + "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?", + "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?", + "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World", + "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?", + "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?" + ], + 2: [ + "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators", + "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?", + "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?", + "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are", + "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?", + "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?" 
+ ], + "3": [ + "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics", + "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist", + "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate", + "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?", + "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?", + "How Do People in a Community Meet Their Wants and Needs?" + ], + 4: [ + "Poetry", "Animal Defense Mechanisms", "The American Revolution", + "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)", + "A Great Heart: What does it mean to have a great heart, literally and figuratively?", + "Extreme Settings: How does a challenging setting or physical environment change a person?", + "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light", + "Earth's Features", "Waves, Energy, and Information", "Regions of the United States", + "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?", + "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?", + "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?" 
+ ], + 5: [ + "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change", + "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?", + "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?", + "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky", + "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation", + "How Were the Lives of Native Peoples Influenced by Where They Lived?", + "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?", + "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?", + "How Does the Constitution Help Us Understand What It Means to Be an American?", + "What Do the Early Years of the United States Reveal About the Character of the Nation?", + "What Was the Effect of the Civil War on U.S. Society?" 
+ ], + 6: [ + "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools", + "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?", + "A Hero’s Journey: What is the significance and power of the hero’s journey?", + "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?", + "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?", + "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy", + "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate", + "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)", + "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution", + "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)", + "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)", + "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II", + "The Cold War", "Civil Rights and American Society", "America Since the 1970s" + ], + 7: [ + "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution", + "Identity in the Middle Ages: How does society both support and limit the development of identity?", + "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?", + "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering", + "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources", + "Matter and Energy in 
Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent", + "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan", + "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires", + "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking", + "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World" + ], + 8: [ + "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment", + "The Poetics and Power of Storytelling: What is the power of storytelling?", + "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?", + "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy", + "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun", + "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms", + "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography", + "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses", + "Government and the Economy", "The Global Economy" + ] +] +[END TOPICS] + +Here is the text: +[BEGIN TEXT] +Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms.
+ +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes. +[END TEXT] +''' +llm_response = ''' +Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.''' + +[cases.hurricanes_grade7.prompt_steps.vocab_complexity] +model = "gpt-4.1" +temperature = 0.0 +system_prompt = ''' + +You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text. + +You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric. + +IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.) +IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary. + +Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float). +''' +user_prompt = ''' + +Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. 
Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text. + +[BEGIN ANNOTATION GUIDE AND RUBRIC] +Instructions +For the following task, please assume that: + - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example). + - The student is moving through a common progression of topics (detailed here). + - The student is fluent in speaking English. + - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum. + - The student will use this material for independent reading/work, without direct instruction. + - The text is reasonable for the given grade level. + +Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket"). + +Please do be sure to consider: +- all of the different types of vocabulary (listed below) +- the overall proportion of complex words in the text - including repeated complex words. +- the resulting holistic complexity of the vocabulary (described in the Summary section below). + +Level 1: +Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0. + +Level 2: +Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1.
+ +Level 3: +Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic + +Level 4: +Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading + +And here are some relevant definitions: + - Conversational: Everyday language. + - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge. + - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way. + - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data"). + - Note: + - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar. + - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb." + - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word. + - Unfamiliar proper nouns: + - A person's name, even if unfamiliar, generally does not add to complexity. + - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity. + +- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content. +- Overly-academic: Words that are excessively formal, complex, or specialized. + - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture" +- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way. + - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict." 
+ - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use. + + +Examples +The student is on-grade-level: +- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level. If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar. + +Figurative Language +- Kicked the bucket. +- The pen is mightier than the sword. +- The classroom was a zoo. +- He ran faster than the speed of light. +[END ANNOTATION GUIDE AND RUBRIC] + +Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars: +[BEGIN EXAMPLES] + +*** EXAMPLE 1 *** +The following text was intended for grade level 11 and received a complexity level of 1. + +Here is the background knowledge assumption for that text: N/A + +Here is the text: +// START TEXT // +"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect. + +He goes on to say that he had kissed many books before he had kissed a girl. 
Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them. + +I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish. + +I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me. + +Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized." +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. 
In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one. + +It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. 
reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader. +// END REASONING // +*** EXAMPLE 2 *** +The following text was intended for grade level 5 and received a complexity level of 2. + +Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone). + +Here is the text: +// START TEXT // +Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. + +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes.
+// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4. +// END REASONING // + +*** EXAMPLE 3 *** +The following text was intended for grade level 6 and received a complexity level of 3. + +Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology. + +Here is the text: +// START TEXT // +Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology. + +Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes.
Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions. + +Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans. +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating. I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. 
After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification. +// END REASONING // +[END EXAMPLES] + +Below is the text you need to evaluate. It is intended for grade 7. + +As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions. + +[BEGIN TEXT] +Great whirling storms roar out of the oceans in many parts of the world. They are called by several names — hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. + +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure.
There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes. +[END TEXT] + +In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2"). + +STRICT OUTPUT FORMAT: +- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators. +- Do not wrap the JSON in Markdown or code fences (no ``` or ```json). +- Do not prepend or append any text (e.g., do not write "Here is the JSON:"). +- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments. + +The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted. 
+ +Here is the output schema (shown in a code block for readability only — do not include any backticks or Markdown in your output): +``` +{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]} +``` +''' +llm_response = ''' +{"tier_2_words": "roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist", "archaic_words": "", "other_complex_words": "ashore", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity.
There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2."}''' + +[cases.hurricanes_grade7.expected_result] +tier_2_words = ''' +roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing''' +tier_3_words = ''' +hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist''' +archaic_words = "" +other_complex_words = ''' +ashore''' +complexity_score = ''' +slightly complex''' +reasoning = ''' +The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. 
Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.''' diff --git a/sdks/python/tests/conftest.py b/sdks/python/tests/conftest.py index 15f6ff8..34951ec 100644 --- a/sdks/python/tests/conftest.py +++ b/sdks/python/tests/conftest.py @@ -3,11 +3,7 @@ import pytest from learning_commons_evaluators import create_config_no_telemetry -from learning_commons_evaluators.schemas.config import ( - EvaluationSettings, - LlmProvider, - PromptSettings, -) +from learning_commons_evaluators.schemas.config import EvaluationSettings from learning_commons_evaluators.schemas.metadata import ( EvaluationMetadata, EvaluatorMaturity, @@ -15,10 +11,6 @@ ) -class _StubEvaluationSettings(EvaluationSettings): - """Minimal concrete EvaluationSettings for fixtures; replaced once a real evaluator lands.""" - - @pytest.fixture def evaluator_metadata(): """Minimal EvaluatorMetadata suitable for use in tests.""" @@ -33,24 +25,19 @@ def evaluator_metadata(): @pytest.fixture def evaluation_metadata(evaluator_metadata): - """EvaluationMetadata with sensible defaults for unit tests.""" + """EvaluationMetadata with sensible defaults for unit tests. + + Uses base :class:`EvaluationSettings` so this stays evaluator-agnostic; tests for a + concrete evaluator should build that evaluator's settings (or a dedicated fixture) + when they need specific fields. 
+ """ return EvaluationMetadata( evaluator_metadata=evaluator_metadata, - evaluation_settings=_StubEvaluationSettings(), + evaluation_settings=EvaluationSettings(), input_metadata={}, ) -@pytest.fixture -def prompt_settings_google(): - """PromptSettings configured for Google, usable in multiple test modules.""" - return PromptSettings( - provider_type=LlmProvider.GOOGLE, - model="gemini-2.0-flash", - temperature=0.0, - ) - - @pytest.fixture def config(): """EvaluatorConfig with no telemetry, suitable for unit tests.""" diff --git a/sdks/python/tests/contract_tests/__init__.py b/sdks/python/tests/contract_tests/__init__.py new file mode 100644 index 0000000..569ad77 --- /dev/null +++ b/sdks/python/tests/contract_tests/__init__.py @@ -0,0 +1,14 @@ +"""Contract tests: verify SDK fidelity to the evaluation notebooks. + +Each evaluator has a ``contracts.toml`` in its settings folder that records: + - ``input``: the evaluator inputs used for the test case + - ``prompt_steps``: the exact LLM request (formatted messages, model, temperature) + and raw response captured from a real notebook run + - ``expected_result``: the structured LLM output (notebook format) + +Tests load these artifacts, run the evaluator with the LLM mocked to return the +captured response, and assert that: + 1. The SDK sends the same prompt as the notebook (same formatted messages, + model, and temperature). + 2. The SDK produces the same result from that response as the notebook. +""" diff --git a/sdks/python/tests/contract_tests/conventionality.py b/sdks/python/tests/contract_tests/conventionality.py new file mode 100644 index 0000000..86283d6 --- /dev/null +++ b/sdks/python/tests/contract_tests/conventionality.py @@ -0,0 +1,87 @@ +"""Conventionality-specific helpers for contract tests. + +Provides: + - Named case loaders (one function per test case in the TOML). 
# ---------------------------------------------------------------------------
# Case loaders
# ---------------------------------------------------------------------------


def load_conventionality_turnip_case() -> ContractCase:
    """Return the ``turnip`` contract case recorded for the conventionality evaluator."""
    return load_contract_case("conventionality", "turnip")


# ---------------------------------------------------------------------------
# Result mapper
# ---------------------------------------------------------------------------

# Keys of the notebook output that are copied verbatim into ``explanation.details``.
_DETAIL_KEYS = ("conventionality_features", "grade_context", "instructional_insights")


def _placeholder_metadata() -> EvaluationMetadata:
    """Build throwaway metadata so the expected result object is constructible.

    The values are fixed literals; nothing here is derived from the contract case.
    """
    return EvaluationMetadata(
        evaluator_metadata=EvaluatorMetadata(
            id="conventionality",
            version="0.1",
            name="Conventionality",
            description="Contract-test placeholder metadata (not compared).",
            maturity=EvaluatorMaturity.beta,
        ),
        evaluation_settings=None,  # type: ignore[arg-type]
        input_metadata={},
        status=Status.succeeded,
    )


def conventionality_notebook_to_sdk_result(case: ContractCase) -> TextComplexityResult:
    """Build the ``TextComplexityResult`` expected for a notebook-format contract output.

    ``case.expected_result`` is the plain dict the notebook obtained from
    ``JsonOutputParser``. This function maps it as follows:

    - ``complexity_score`` → ``answer`` (via ``TextComplexityAnswer.from_score``)
    - ``reasoning`` → ``explanation.summary``
    - ``conventionality_features``, ``grade_context`` and
      ``instructional_insights`` → ``explanation.details``

    Args:
        case: A loaded contract case whose ``expected_result`` contains the
            keys listed above.

    Returns:
        A ``TextComplexityResult`` whose ``answer`` and ``explanation`` come
        from the contract. Its ``metadata`` is a fixed placeholder and is not
        meant to be compared.

    Raises:
        KeyError: If ``expected_result`` lacks one of the required keys.
    """
    notebook_output = case.expected_result
    return TextComplexityResult(
        answer=TextComplexityAnswer.from_score(notebook_output["complexity_score"]),
        explanation=EvaluationExplanation(
            summary=notebook_output["reasoning"],
            details={key: notebook_output[key] for key in _DETAIL_KEYS},
        ),
        metadata=_placeholder_metadata(),
    )
+ +Prompt strings (system and user) are compared after normalizing line endings +(``\\r\\n`` β†’ ``\\n``) and stripping leading/trailing whitespace, so minor TOML +multiline formatting differences do not fail tests. +""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import patch + +from .loader import ContractCase, PromptStepContract + +# The module-level name that ``base.py`` imports ``create_provider`` under. +_CREATE_PROVIDER_TARGET = "learning_commons_evaluators.evaluators.base.create_provider" + + +class ContractTestHarness: + """Sets up a mocked LLM for a contract test and captures what was sent to it. + + The harness patches ``create_provider`` so that no real API calls are made. + Each call to ``create_provider`` pops the next response from the case's + ``prompt_steps`` (in order) and returns a lightweight callable that: + + - Records the formatted messages it receives. + - Returns an ``AIMessage`` with the stored ``llm_response`` content. + + After the ``with`` block, use :meth:`assert_prompt_step` to verify each + captured request matches the stored contract. + """ + + def __init__(self, case: ContractCase) -> None: + self.case = case + # Populated during the evaluation run; indexed by call order. + self._captured: list[_CapturedCall] = [] + self._patch: Any = None + + def __enter__(self) -> ContractTestHarness: + if not self.case.is_populated(): + raise ValueError( + "Contract case still contains PLACEHOLDER_* values in prompt_steps; " + "capture a real notebook run and refresh contracts.toml before running " + "contract tests." + ) + # Build a FIFO list of (step_name, response) pairs in definition order. 
+ self._response_queue: list[tuple[str, str]] = [ + (name, step.llm_response) for name, step in self.case.prompt_steps.items() + ] + self._captured = [] + + self._patch = patch(_CREATE_PROVIDER_TARGET, side_effect=self._make_fake_provider) + self._patch.start() + return self + + def __exit__(self, *args: Any) -> None: + if self._patch is not None: + self._patch.stop() + + # ------------------------------------------------------------------ + # Assertions + # ------------------------------------------------------------------ + + def assert_prompt_step(self, step_name: str) -> None: + """Assert that the request captured for *step_name* matches the contract. + + Raises: + AssertionError: If the captured messages, model, or temperature + differ from the stored contract. + IndexError: If no call was captured for the given step. + """ + step_names = list(self.case.prompt_steps.keys()) + if step_name not in step_names: + raise ValueError(f"Step '{step_name}' not in contract. Available steps: {step_names}") + step_index = step_names.index(step_name) + + if step_index >= len(self._captured): + raise AssertionError( + f"No LLM call captured for step '{step_name}' " + f"(only {len(self._captured)} call(s) were made)." + ) + + captured = self._captured[step_index] + contract = self.case.prompt_steps[step_name] + _assert_prompt_matches(captured, contract, step_name) + + # ------------------------------------------------------------------ + # Internal + # ------------------------------------------------------------------ + + def _make_fake_provider(self, prompt_settings: Any, evaluator_config: Any) -> Any: + """Return a callable that records input messages and returns the next mock response.""" + from langchain_core.messages import AIMessage + + if not self._response_queue: + raise RuntimeError( + "ContractTestHarness: more LLM calls were made than there are " + "prompt steps in the contract case." 
+ ) + _step_name, response_content = self._response_queue.pop(0) + captured_list = self._captured + + def _fake_llm(prompt_value: Any) -> AIMessage: + # ``prompt_value`` is a ChatPromptValue produced by the template. + messages = ( + prompt_value.to_messages() + if hasattr(prompt_value, "to_messages") + else list(prompt_value) + ) + system_content = _message_content(messages, "system") + human_content = _message_content(messages, "human") + captured_list.append( + _CapturedCall( + system_prompt=system_content, + user_prompt=human_content, + model=prompt_settings.model, + temperature=prompt_settings.temperature, + ) + ) + return AIMessage(content=response_content) + + return _fake_llm + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +class _CapturedCall: + """One captured LLM invocation.""" + + def __init__( + self, + *, + system_prompt: str, + user_prompt: str, + model: str, + temperature: float, + ) -> None: + self.system_prompt = system_prompt + self.user_prompt = user_prompt + self.model = model + self.temperature = temperature + + +def _message_content(messages: list[Any], message_type: str) -> str: + """Extract content from the first message of the given type.""" + for msg in messages: + if getattr(msg, "type", None) == message_type: + return str(msg.content) + return "" + + +def _normalize_prompt_snapshot(s: str) -> str: + """Normalize prompt text for contract comparison. + + Hand-edited or pasted TOML multiline strings may differ from runtime only by + CRLF vs LF line endings, or a spurious leading newline after an opening + ``'''`` delimiter (TOML trims only the first newline). Normalizing avoids + those false failures while preserving internal content. 
+ """ + return s.replace("\r\n", "\n").strip() + + +def _assert_prompt_matches( + captured: _CapturedCall, + contract: PromptStepContract, + step_name: str, +) -> None: + assert captured.model == contract.model, ( + f"Step '{step_name}': model mismatch.\n" + f" SDK sent: {captured.model!r}\n" + f" Contract: {contract.model!r}" + ) + assert captured.temperature == contract.temperature, ( + f"Step '{step_name}': temperature mismatch.\n" + f" SDK sent: {captured.temperature}\n" + f" Contract: {contract.temperature}" + ) + sys_sdk = _normalize_prompt_snapshot(captured.system_prompt) + sys_contract = _normalize_prompt_snapshot(contract.system_prompt) + assert sys_sdk == sys_contract, ( + f"Step '{step_name}': system_prompt mismatch.\n" + f" First diff at char {_first_diff_index(sys_sdk, sys_contract)}.\n" + f" SDK sent (first 200): {sys_sdk[:200]!r}\n" + f" Contract (first 200): {sys_contract[:200]!r}" + ) + user_sdk = _normalize_prompt_snapshot(captured.user_prompt) + user_contract = _normalize_prompt_snapshot(contract.user_prompt) + assert user_sdk == user_contract, ( + f"Step '{step_name}': user_prompt mismatch.\n" + f" SDK sent: {user_sdk!r}\n" + f" Contract: {user_contract!r}" + ) + + +def _first_diff_index(a: str, b: str) -> int: + for i, (ca, cb) in enumerate(zip(a, b, strict=False)): + if ca != cb: + return i + return min(len(a), len(b)) diff --git a/sdks/python/tests/contract_tests/loader.py b/sdks/python/tests/contract_tests/loader.py new file mode 100644 index 0000000..85cabcd --- /dev/null +++ b/sdks/python/tests/contract_tests/loader.py @@ -0,0 +1,169 @@ +"""Load contract test cases from evaluator settings folders. + +Each evaluator's ``contracts.toml`` lives next to its ``settings.toml`` in +``sdks/settings//`` (e.g. +``sdks/settings/conventionality/contracts.toml``). This module provides the data +models and a loader that reads those files into structured objects usable from +tests. 
# Sentinel used to detect placeholder values left by the TOML template.
_PLACEHOLDER_PREFIX = "PLACEHOLDER"

# Keys every ``[cases.<name>.prompt_steps.<step>]`` table must define.
_PROMPT_STEP_KEYS = ("system_prompt", "user_prompt", "model", "temperature", "llm_response")


@dataclass
class PromptStepContract:
    """Contract for one LLM prompt step: the exact request sent and response received.

    ``system_prompt`` and ``user_prompt`` are the *fully formatted* messages —
    all template placeholders have been substituted (including library-computed
    values like ``{format_instructions}`` and ``{fk_score}``).

    ``llm_response`` is the raw string content returned by the LLM, as captured
    from the notebook run. It is used to mock the LLM in the contract test.
    """

    system_prompt: str
    user_prompt: str
    model: str
    temperature: float
    llm_response: str

    def is_populated(self) -> bool:
        """Return False if any string field still holds a placeholder value.

        ``temperature`` is numeric and therefore cannot carry the sentinel.
        """
        return not any(
            str(value).startswith(_PLACEHOLDER_PREFIX)
            for value in (
                self.system_prompt,
                self.user_prompt,
                self.model,
                self.llm_response,
            )
        )


@dataclass
class ContractCase:
    """One test case for a contract test.

    Attributes:
        name: Identifier matching the ``[cases.<name>]`` TOML key.
        description: Human-readable label (optional).
        input: Raw evaluator input values (e.g. ``{"text": ..., "grade": 4}``).
        prompt_steps: Ordered mapping of step name → :class:`PromptStepContract`.
            The order matches the order of LLM calls made during evaluation.
        expected_result: Parsed LLM output in notebook format (i.e. the dict produced by
            ``JsonOutputParser``, before SDK result mapping). Used to verify
            the SDK produces the same structured result.
    """

    name: str
    description: str
    input: dict[str, Any]
    prompt_steps: dict[str, PromptStepContract]
    expected_result: dict[str, Any]

    def is_populated(self) -> bool:
        """Return False if any prompt step still holds a placeholder value."""
        return all(step.is_populated() for step in self.prompt_steps.values())

    def llm_responses_in_order(self) -> list[str]:
        """Return LLM responses for all steps, in call order."""
        return [step.llm_response for step in self.prompt_steps.values()]


def load_contract_case(evaluator_name: str, case_name: str) -> ContractCase:
    """Load a named test case from the evaluator's ``contracts.toml``.

    Resolved via :func:`~learning_commons_evaluators.settings.load_settings.shared_settings_root`
    (bundled ``settings/<evaluator>/contracts.toml`` in the package; kept in sync
    with the canonical ``sdks/settings/`` copy by ``make sync-settings``).

    Args:
        evaluator_name: Name of the evaluator (e.g. ``"conventionality"``).
        case_name: Name of the case within the TOML (e.g. ``"turnip"``).

    Returns:
        A :class:`ContractCase` loaded from the TOML.

    Raises:
        FileNotFoundError: If ``contracts.toml`` does not exist.
        KeyError: If ``case_name`` is not found in the TOML, or one of its
            prompt steps lacks a required key.
    """
    toml_path = _settings_path(evaluator_name) / "contracts.toml"
    if not toml_path.is_file():
        raise FileNotFoundError(
            f"contracts.toml not found for evaluator '{evaluator_name}' (expected at {toml_path})"
        )

    # ``Path`` and ``Traversable`` both expose ``open(mode)``, so one call covers both.
    with toml_path.open("rb") as fh:
        data = tomllib.load(fh)

    cases = data.get("cases", {})
    if case_name not in cases:
        available = ", ".join(cases.keys()) or "(none)"
        raise KeyError(f"Case '{case_name}' not found in {toml_path}. Available cases: {available}")

    return _parse_case(case_name, cases[case_name])


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _settings_path(evaluator_name: str) -> "Path | Traversable":
    """Resolve the settings directory for an evaluator from the shared settings root."""
    # Imported lazily, as in the rest of this helper module's package lookups.
    from learning_commons_evaluators.settings.load_settings import (
        shared_settings_root,  # noqa: PLC0415
    )

    return shared_settings_root() / evaluator_name


def _parse_case(name: str, raw: dict[str, Any]) -> ContractCase:
    """Convert one raw ``[cases.<name>]`` table into a :class:`ContractCase`.

    Missing ``description``, ``input``, ``prompt_steps`` and
    ``expected_result`` tables default to empty values; prompt steps keep the
    order in which they appear in *raw*.
    """
    prompt_steps_raw = raw.get("prompt_steps", {})
    prompt_steps: dict[str, PromptStepContract] = {
        step_name: _parse_prompt_step(step_data, step_name)
        for step_name, step_data in prompt_steps_raw.items()
    }
    return ContractCase(
        name=name,
        description=raw.get("description", ""),
        input=raw.get("input", {}),
        prompt_steps=prompt_steps,
        expected_result=raw.get("expected_result", {}),
    )


def _parse_prompt_step(raw: dict[str, Any], step_name: str = "") -> PromptStepContract:
    """Convert one raw prompt-step table into a :class:`PromptStepContract`.

    Args:
        raw: The TOML table for the step.
        step_name: Name of the step, used only to make error messages useful.

    Raises:
        KeyError: If any required key is absent; the message lists every
            missing key and the step name when given.
    """
    missing = [key for key in _PROMPT_STEP_KEYS if key not in raw]
    if missing:
        label = f" '{step_name}'" if step_name else ""
        raise KeyError(f"Prompt step{label} is missing required key(s): {', '.join(missing)}")
    return PromptStepContract(
        system_prompt=str(raw["system_prompt"]),
        user_prompt=str(raw["user_prompt"]),
        model=str(raw["model"]),
        temperature=float(raw["temperature"]),
        llm_response=str(raw["llm_response"]),
    )
class TestConventionalityContract:
    """Contract cases for ``ConventionalityEvaluator``."""

    def test_turnip_grade4(self) -> None:
        """Turnip classroom narrative, grade 4.

        Runs the evaluator with the LLM replaced by the recorded response and
        checks that:
        - the request captured for the ``main`` step matches the contract
          (system prompt, user prompt, model, temperature);
        - the answer and explanation equal those built from the contract's
          expected result.
        """
        case = load_conventionality_turnip_case()
        evaluator = ConventionalityEvaluator(create_config_no_telemetry())
        evaluation_input = ConventionalityEvaluationInput(
            text=case.input["text"],
            grade=case.input["grade"],
        )

        with ContractTestHarness(case) as harness:
            result = evaluator.evaluate(evaluation_input)

        # --- Prompt fidelity ---
        # The single LLM call must carry the same model, temperature, and
        # formatted messages as the recorded request.
        harness.assert_prompt_step("main")

        # --- Result fidelity ---
        # Only status, answer, and explanation are compared; the remaining
        # metadata is left out of the comparison.
        expected = conventionality_notebook_to_sdk_result(case)
        assert result.metadata.status == Status.succeeded

        actual_answer = result.answer
        expected_answer = expected.answer
        assert actual_answer.score == expected_answer.score, (
            f"answer.score: SDK={result.answer.score!r}, notebook={expected.answer.score!r}"
        )
        assert actual_answer.label == expected_answer.label, (
            f"answer.label: SDK={result.answer.label!r}, notebook={expected.answer.label!r}"
        )

        actual_explanation = result.explanation
        expected_explanation = expected.explanation
        assert actual_explanation.summary == expected_explanation.summary, (
            "explanation.summary (reasoning) differs between SDK and notebook"
        )
        assert actual_explanation.details == expected_explanation.details, (
            "explanation.details differs between SDK and notebook"
        )
Add a loader function to ``contract_tests/vocabulary.py``. +3. Add a test function here following the pattern below. + +HOW TO REFRESH CONTRACT DATA +----------------------------- +Run the notebook ``evals/vocabulary_evaluator.ipynb`` with valid API keys. +The final cells output a TOML block; paste it into ``contracts.toml`` +replacing the placeholder values (canonical copy under ``sdks/settings/vocabulary/``). + +Then run ``make sync-settings`` from ``sdks/python`` so the bundled +``src/learning_commons_evaluators/settings/vocabulary/contracts.toml`` matches; +contract tests load contracts from that package copy, not from ``sdks/settings/`` alone. + +IMPORTANT: ``system_prompt = ""`` for the background_knowledge step is the +correct value (the prompt has no system message), not a placeholder. Only +the ``user_prompt`` and ``llm_response`` fields need to be populated. +""" + +from learning_commons_evaluators import ( + VocabularyEvaluationInput, + VocabularyEvaluator, + create_config_no_telemetry, +) +from learning_commons_evaluators.schemas.metadata import Status + +from .harness import ContractTestHarness +from .vocabulary import ( + load_vocabulary_grade34_case, + load_vocabulary_other_grades_case, + vocabulary_grade34_notebook_to_sdk_result, + vocabulary_other_grades_notebook_to_sdk_result, +) + + +class TestVocabularyContractGrades34: + def test_marco_polo_grade3(self) -> None: + """Marco Polo passage, grade 3 β€” grades 3–4 Gemini path. + + Verifies: + - Both LLM requests (background_knowledge and vocab_complexity) match + the fully-formatted prompts captured from the notebook. + - Given the notebook's LLM responses, the SDK returns the same answer, + reasoning, and word-breakdown details as the notebook. 
+ """ + case = load_vocabulary_grade34_case() + + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput( + text=case.input["text"], + grade=case.input["grade"], + ) + + with ContractTestHarness(case) as harness: + result = evaluator.evaluate(inp) + + # --- Prompt fidelity --- + # Both steps are asserted: model, temperature, and formatted messages + # must match what the notebook sent to the LLM. + harness.assert_prompt_step("background_knowledge") + harness.assert_prompt_step("vocab_complexity") + + # --- Result fidelity --- + expected = vocabulary_grade34_notebook_to_sdk_result(case) + assert result.metadata.status == Status.succeeded + assert result.answer.score == expected.answer.score, ( + f"answer.score: SDK={result.answer.score!r}, notebook={expected.answer.score!r}" + ) + assert result.answer.label == expected.answer.label, ( + f"answer.label: SDK={result.answer.label!r}, notebook={expected.answer.label!r}" + ) + assert result.explanation.summary == expected.explanation.summary, ( + "explanation.summary (reasoning) differs between SDK and notebook" + ) + assert result.explanation.details == expected.explanation.details, ( + "explanation.details (word breakdown) differs between SDK and notebook" + ) + + +class TestVocabularyContractOtherGrades: + def test_hurricanes_grade7(self) -> None: + """Hurricane formation passage, grade 7 β€” grades 5–12 GPT path. + + Verifies: + - Both LLM requests match the notebook. + - Given the notebook's LLM responses, the SDK maps ``complexity_score`` + to the correct ``TextComplexityAnswer`` and returns the same reasoning + and word-breakdown details as the notebook. 
+ """ + case = load_vocabulary_other_grades_case() + + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput( + text=case.input["text"], + grade=case.input["grade"], + ) + + with ContractTestHarness(case) as harness: + result = evaluator.evaluate(inp) + + # --- Prompt fidelity --- + harness.assert_prompt_step("background_knowledge") + harness.assert_prompt_step("vocab_complexity") + + # --- Result fidelity --- + expected = vocabulary_other_grades_notebook_to_sdk_result(case) + assert result.metadata.status == Status.succeeded + assert result.answer.score == expected.answer.score, ( + f"answer.score: SDK={result.answer.score!r}, notebook={expected.answer.score!r}" + ) + assert result.answer.label == expected.answer.label, ( + f"answer.label: SDK={result.answer.label!r}, notebook={expected.answer.label!r}" + ) + assert result.explanation.summary == expected.explanation.summary, ( + "explanation.summary (reasoning) differs between SDK and notebook" + ) + assert result.explanation.details == expected.explanation.details, ( + "explanation.details (word breakdown) differs between SDK and notebook" + ) diff --git a/sdks/python/tests/contract_tests/vocabulary.py b/sdks/python/tests/contract_tests/vocabulary.py new file mode 100644 index 0000000..26e56b1 --- /dev/null +++ b/sdks/python/tests/contract_tests/vocabulary.py @@ -0,0 +1,146 @@ +"""Vocabulary-specific helpers for contract tests. + +Provides: + - Named case loaders (one function per test case in the TOML). + - Result mappers: convert the notebook-format expected result (raw + ``JsonOutputParser`` dict) to the ``TextComplexityResult`` that the SDK + should produce. + +Both grade paths use the same ``Output``-style schema (word-level breakdown + +string ``complexity_score``), so both mappers apply the same normalisation: + ``complexity_score.lower().replace(" ", "_")`` β†’ ``TextComplexityAnswer``. 
+""" + +from __future__ import annotations + +from learning_commons_evaluators.schemas.evaluator import EvaluationExplanation +from learning_commons_evaluators.schemas.metadata import ( + EvaluationMetadata, + EvaluatorMaturity, + EvaluatorMetadata, + Status, +) +from learning_commons_evaluators.schemas.text_complexity import ( + TextComplexityAnswer, + TextComplexityResult, +) + +from .loader import ContractCase, load_contract_case + +# --------------------------------------------------------------------------- +# Case loaders +# --------------------------------------------------------------------------- + + +def load_vocabulary_grade34_case() -> ContractCase: + """Load the 'marco_polo_grade3' contract test case (grades 3–4 path).""" + return load_contract_case("vocabulary", "marco_polo_grade3") + + +def load_vocabulary_other_grades_case() -> ContractCase: + """Load the 'hurricanes_grade7' contract test case (grades 5–12 path).""" + return load_contract_case("vocabulary", "hurricanes_grade7") + + +# --------------------------------------------------------------------------- +# Result mappers +# --------------------------------------------------------------------------- + + +def _placeholder_metadata() -> EvaluationMetadata: + """Minimal metadata placeholder for result comparison objects. + + Only ``answer`` and ``explanation`` are compared in contract assertions; + metadata contains non-deterministic fields (timing, evaluation ID, etc.) + and is intentionally excluded. 
+ """ + return EvaluationMetadata( + evaluator_metadata=EvaluatorMetadata( + id="vocabulary", + version="0.1", + name="Vocabulary", + description="Contract test placeholder.", + maturity=EvaluatorMaturity.alpha, + ), + evaluation_settings=None, # type: ignore[arg-type] + input_metadata={}, + status=Status.succeeded, + ) + + +def vocabulary_grade34_notebook_to_sdk_result( + case: ContractCase, +) -> TextComplexityResult: + """Convert a grades 3–4 ``expected_result`` dict to a ``TextComplexityResult``. + + The notebook outputs a plain dict from ``JsonOutputParser``; the SDK wraps + that into ``TextComplexityResult``. This function performs the same mapping + so tests can assert equality. + + Only ``answer`` and ``explanation`` are compared β€” ``metadata`` is a + placeholder because it contains non-deterministic fields. + + Args: + case: A loaded :class:`~loader.ContractCase` with a populated + ``expected_result`` (grades 3–4 format). + + Returns: + A ``TextComplexityResult`` built from the contract's expected output. + """ + r = case.expected_result + # Normalise the score string: the notebook may return "very complex" (spaces). + raw_score = r["complexity_score"].lower().replace(" ", "_") + answer = TextComplexityAnswer.from_score(raw_score) + explanation = EvaluationExplanation( + summary=r["reasoning"], + details={ + "tier_2_words": r["tier_2_words"], + "tier_3_words": r["tier_3_words"], + "archaic_words": r["archaic_words"], + "other_complex_words": r["other_complex_words"], + }, + ) + return TextComplexityResult( + answer=answer, + explanation=explanation, + metadata=_placeholder_metadata(), + ) + + +def vocabulary_other_grades_notebook_to_sdk_result( + case: ContractCase, +) -> TextComplexityResult: + """Convert a grades 5–12 ``expected_result`` dict to a ``TextComplexityResult``. + + The OTHER_GRADES path uses the same Output-style schema as grades 3–4, so + ``expected_result`` contains a string ``complexity_score`` (e.g. 
"slightly + complex") which is normalised to underscore form before mapping, plus the + same word-list fields as the notebook dict. + + Only ``answer`` and ``explanation`` are compared — ``metadata`` is a + placeholder because it contains non-deterministic fields. + + Args: + case: A loaded :class:`~loader.ContractCase` with a populated + ``expected_result`` (grades 5–12 format). + + Returns: + A ``TextComplexityResult`` built from the contract's expected output. + """ + r = case.expected_result + score = r["complexity_score"].lower().replace(" ", "_") + answer = TextComplexityAnswer.from_score(score) + explanation = EvaluationExplanation( + summary=r["reasoning"], + details={ + "tier_2_words": r["tier_2_words"], + "tier_3_words": r["tier_3_words"], + "archaic_words": r["archaic_words"], + "other_complex_words": r["other_complex_words"], + }, + ) + return TextComplexityResult( + answer=answer, + explanation=explanation, + metadata=_placeholder_metadata(), + ) diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index 32cc3b5..333e5c1 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -1,7 +1,10 @@ """Tests for :class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`. -Covers: ``__init__``, ``evaluate`` (metadata, settings override, success/failure), -``update_total_token_usage``, ``execute_step``, ``execute_prompt_chain_step``. +Covers ``__init__``, ``evaluate`` (metadata, settings override, success/failure, telemetry), +``update_total_token_usage``, ``execute_step``, and ``execute_prompt_chain_step``. +``EvaluationMetadata`` always uses ``input.input_metadata()`` (including when +``send_full_input_with_telemetry`` is enabled). Helpers use both a minimal stub evaluator +and conventionality-oriented fixtures where useful. 
""" from __future__ import annotations @@ -112,6 +115,15 @@ class TestBaseEvaluatorInit: def test_config_is_stored(self, config): assert _StubEvaluator(config).config is config + def test_constructor_default_evaluation_settings_overrides_class_default(self, config): + instance_default = _StubSettings(marker=99) + ev = _StubEvaluator(config, default_evaluation_settings=instance_default) + assert ev.default_evaluation_settings is instance_default + + def test_omitted_constructor_default_falls_back_to_class_attribute(self, config): + ev = _StubEvaluator(config) + assert ev.default_evaluation_settings is _StubEvaluator.default_evaluation_settings + # --------------------------------------------------------------------------- # evaluate() @@ -130,6 +142,20 @@ def test_passes_explicit_evaluation_settings(self, stub_evaluator): assert result.metadata.evaluation_settings.marker == 42 assert result.explanation.details.get("marker") == 42 + def test_constructor_default_used_when_evaluate_settings_omitted(self, config): + ev = _StubEvaluator(config, default_evaluation_settings=_StubSettings(marker=77)) + result = ev.evaluate(_stub_input()) + assert result.metadata.evaluation_settings.marker == 77 + assert result.explanation.details.get("marker") == 77 + + def test_evaluate_explicit_settings_override_constructor_default(self, config): + ev = _StubEvaluator( + config, + default_evaluation_settings=_StubSettings(marker=1), + ) + result = ev.evaluate(_stub_input(), evaluation_settings=_StubSettings(marker=2)) + assert result.explanation.details.get("marker") == 2 + class TestEvaluateInputMetadata: """``input_metadata`` on :class:`EvaluationMetadata` always comes from ``input.input_metadata()``.""" @@ -142,6 +168,7 @@ def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator): assert result.metadata.input_metadata["grade_level"] == {"grade": 3} def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator): + 
"""``send_full_input_with_telemetry`` does not replace ``input_metadata`` with raw values.""" cfg = create_config(telemetry_partner_id="test", send_full_input_with_telemetry=True) ev = _StubEvaluator(cfg) inp = _stub_input() @@ -151,7 +178,7 @@ def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, st assert result.metadata.input_metadata["grade_level"] == {"grade": 3} -class TestEvaluateErrorHandling: +class TestStubEvaluateErrorHandling: def test_raises_validation_error_for_invalid_input(self, stub_evaluator): inp = TextComplexityEvaluationInput( text=TextInputField( @@ -281,9 +308,15 @@ def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadat class TestExecutePromptChainStep: - """Mock ``create_provider`` so ``template | provider`` runs in-process (matches sdk_python tests).""" + """Mock ``create_provider`` so ``template | provider`` runs in-process. + + Fake LLMs return real ``AIMessage`` values so ``JsonOutputParser`` and + ``token_usage_from_aimessage`` exercise the real code paths where applicable. 
+ """ - def test_returns_raw_string_when_parser_output_type_is_none(self, evaluation_metadata): + def test_returns_raw_string_when_parser_output_type_is_none( + self, stub_evaluator, evaluation_metadata + ): def _fake_llm(_pv): return AIMessage(content="plain prose") @@ -304,6 +337,25 @@ def _fake_llm(_pv): ) assert out == "plain prose" + def test_json_dict_normalizer_without_parser_type_raises( + self, stub_evaluator, evaluation_metadata + ): + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with pytest.raises(ValueError, match="json_dict_normalizer requires"): + stub_evaluator.execute_prompt_chain_step( + step_name="raw", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "Hello"}, + parser_output_type=None, + json_dict_normalizer=lambda d: d, + ) + def test_returns_parsed_pydantic_output(self, stub_evaluator, evaluation_metadata): def _fake_llm(_pv): return AIMessage(content=_CHAIN_JSON) @@ -326,6 +378,42 @@ def _fake_llm(_pv): assert result.label == "ok" assert result.score == 7 + def test_json_dict_normalizer_parses_dict_then_normalizes_then_validates( + self, stub_evaluator, evaluation_metadata + ): + """Optional ``json_dict_normalizer``: loose JSON → dict → user fn → ``model_validate``.""" + + def _fake_llm(_pv): + return AIMessage(content='{"n": 1}') + + class _Out(BaseModel): + n: int = Field(description="n") + doubled: int = Field(description="doubled") + + def _double(d: dict) -> dict: + d = dict(d) + d["doubled"] = int(d["n"]) * 2 + return d + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with patch(_CHAIN_PATCH, return_value=_fake_llm): + result = stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + 
evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "Hello"}, + parser_output_type=_Out, + json_dict_normalizer=_double, + ) + assert isinstance(result, _Out) + assert result.n == 1 + assert result.doubled == 2 + def test_parser_returning_model_instance_short_circuits_model_validate( self, stub_evaluator, evaluation_metadata ): diff --git a/sdks/python/tests/evaluators/test_conventionality.py b/sdks/python/tests/evaluators/test_conventionality.py new file mode 100644 index 0000000..e339da0 --- /dev/null +++ b/sdks/python/tests/evaluators/test_conventionality.py @@ -0,0 +1,143 @@ +"""Tests for ConventionalityEvaluator and conventionality helpers.""" + +from unittest.mock import patch + +import pytest + +from learning_commons_evaluators import ( + ConventionalityEvaluationInput, + ConventionalityEvaluator, + create_config_no_telemetry, +) +from learning_commons_evaluators.schemas.conventionality import ConventionalityOutput +from learning_commons_evaluators.schemas.errors import ConfigurationError +from learning_commons_evaluators.schemas.metadata import Status + +# Long sample text (well above ``min_text_length`` from conventionality settings TOML). +_SAMPLE_TEXT = ( + "Marco Polo was a Venetian merchant and explorer who traveled through Asia " + "in the late 13th century. He spent nearly two decades at the court of " + "Kublai Khan, the Mongol ruler of China, and described his experiences in " + "a book that introduced Europeans to the Far East." 
+) + + +def _make_mock_output(): + return ConventionalityOutput( + conventionality_features=["idioms"], + grade_context="Grade-appropriate.", + instructional_insights="Consider scaffolding.", + complexity_score="moderately_complex", + reasoning="The text uses some conventional language.", + ) + + +class TestConventionalityEvaluator: + def test_evaluate_returns_evaluation_result(self): + config = create_config_no_telemetry() + evaluator = ConventionalityEvaluator(config) + inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5) + with patch.object(evaluator, "execute_prompt_chain_step", return_value=_make_mock_output()): + result = evaluator.evaluate(inp) + assert result.answer.score == "moderately_complex" + assert result.answer.label == "Moderately complex" + assert result.explanation.summary is not None + assert result.metadata.status == Status.succeeded + assert result.metadata.evaluator_metadata.id == "conventionality" + + def test_evaluate_with_explicit_settings(self): + from learning_commons_evaluators.schemas.config import ( + LlmProvider, + PromptSettings, + ) + from learning_commons_evaluators.schemas.conventionality import ( + ConventionalityEvaluationSettings, + ) + + config = create_config_no_telemetry() + evaluator = ConventionalityEvaluator(config) + settings = ConventionalityEvaluationSettings( + prompt_settings_step_conventionality_evaluation=PromptSettings( + provider_type=LlmProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ) + ) + inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=3) + with patch.object(evaluator, "execute_prompt_chain_step", return_value=_make_mock_output()): + result = evaluator.evaluate(inp, evaluation_settings=settings) + assert result.metadata.status == Status.succeeded + + def test_metadata_and_default_settings(self): + evaluator = ConventionalityEvaluator(create_config_no_telemetry()) + assert evaluator.metadata.id == "conventionality" + assert evaluator.metadata.version == "0.1" + 
assert evaluator.default_evaluation_settings is not None + + +class TestConventionalityEvaluationInputConfiguration: + """Tests that ConventionalityEvaluationInput fails loudly on bad configuration. + + These tests patch ``ConventionalityEvaluationInput._input_settings`` directly + because the ClassVar is bound at class-definition time. Patching the + module-level ``_INPUT_SETTINGS`` name would rebind the module variable but + leave the class variable pointing at the original dict. + """ + + def test_missing_text_spec_raises_configuration_error(self, monkeypatch): + """If 'text' is absent from _input_settings, ConfigurationError is raised immediately.""" + monkeypatch.setattr(ConventionalityEvaluationInput, "_input_settings", {}) + with pytest.raises(ConfigurationError, match="'text'"): + ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5) + + def test_missing_grade_spec_raises_configuration_error(self, monkeypatch): + """If 'grade' is absent from _input_settings, ConfigurationError is raised immediately.""" + from learning_commons_evaluators.schemas.input_specs import TextInputSpec + + monkeypatch.setattr( + ConventionalityEvaluationInput, + "_input_settings", + {"text": TextInputSpec(name="text")}, + ) + with pytest.raises(ConfigurationError, match="'grade'"): + ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5) + + def test_wrong_text_spec_type_raises_configuration_error(self, monkeypatch): + """If the 'text' spec has the wrong type, ConfigurationError names the type mismatch.""" + from learning_commons_evaluators.schemas.input_specs import GradeInputSpec + + monkeypatch.setattr( + ConventionalityEvaluationInput, + "_input_settings", + {"text": GradeInputSpec(name="text")}, + ) + with pytest.raises(ConfigurationError, match="TextInputSpec"): + ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5) + + def test_wrong_grade_spec_type_raises_configuration_error(self, monkeypatch): + """If the 'grade' spec has the wrong type, ConfigurationError 
names the type mismatch.""" + from learning_commons_evaluators.schemas.input_specs import TextInputSpec + + monkeypatch.setattr( + ConventionalityEvaluationInput, + "_input_settings", + { + "text": TextInputSpec(name="text"), + "grade": TextInputSpec(name="grade"), # wrong type + }, + ) + with pytest.raises(ConfigurationError, match="GradeInputSpec"): + ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5) + + +class TestConventionalityOutput: + def test_conventionality_output_literal_score(self): + out = ConventionalityOutput( + conventionality_features=[], + grade_context="", + instructional_insights="", + complexity_score="slightly_complex", + reasoning="Test.", + ) + assert out.complexity_score == "slightly_complex" + assert out.reasoning == "Test." diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py new file mode 100644 index 0000000..a6eacf0 --- /dev/null +++ b/sdks/python/tests/evaluators/test_vocabulary.py @@ -0,0 +1,353 @@ +"""Tests for VocabularyEvaluator and vocabulary helpers.""" + +from unittest.mock import patch + +import pytest + +from learning_commons_evaluators import ( + VocabularyEvaluationInput, + VocabularyEvaluator, + create_config_no_telemetry, +) +from learning_commons_evaluators.schemas.errors import ConfigurationError, ValidationError +from learning_commons_evaluators.schemas.metadata import Status +from learning_commons_evaluators.schemas.vocabulary import ( + VocabularyComplexityOutput, + normalize_complexity_output, +) + +_SAMPLE_TEXT = "The cat sat on the mat." + +# ── Mock helpers ────────────────────────────────────────────────────────────── + +_MOCK_BACKGROUND_KNOWLEDGE = "Students are familiar with household pets and basic domestic scenes." 
+ + +def _make_grades34_output( + score: str = "moderately_complex", +) -> VocabularyComplexityOutput: + return VocabularyComplexityOutput( + tier_2_words="sat", + tier_3_words="none", + archaic_words="none", + other_complex_words="none", + complexity_score=score, + reasoning="Most words are simple and familiar.", + ) + + +_OTHER_GRADES_SCORE_MAP: dict[int, str] = { + 1: "slightly complex", + 2: "moderately complex", + 3: "very complex", + 4: "exceedingly complex", +} + + +def _make_other_grades_output(answer: int = 2) -> VocabularyComplexityOutput: + """Build mock complexity output from a convenience integer rubric level (1–4).""" + + return VocabularyComplexityOutput( + tier_2_words="sat", + tier_3_words="none", + archaic_words="none", + other_complex_words="none", + complexity_score=_OTHER_GRADES_SCORE_MAP[answer], + reasoning="Most words are simple and familiar.", + ) + + +def _patch_steps(evaluator, bk_return, vocab_return): + """Patch execute_prompt_chain_step with side_effect=[bk_return, vocab_return]. + + Both steps (background_knowledge and vocab_complexity) go through the same + method; side_effect returns them in call order. 
+ """ + return patch.object( + evaluator, + "execute_prompt_chain_step", + side_effect=[bk_return, vocab_return], + ) + + +# ── Grade 3–4 path ──────────────────────────────────────────────────────────── + + +class TestVocabularyEvaluatorGrades34: + def test_evaluate_grade_3_returns_result(self): + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3) + with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()): + result = evaluator.evaluate(inp) + + assert result.answer.score == "moderately_complex" + assert result.answer.label == "Moderately complex" + assert result.metadata.status == Status.succeeded + assert "tier_2_words" in result.explanation.details + + def test_evaluate_grade_4_returns_result(self): + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=4) + with _patch_steps( + evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output("very_complex") + ): + result = evaluator.evaluate(inp) + + assert result.answer.score == "very_complex" + + def test_grades34_score_with_spaces_is_normalised(self): + """The grades 3–4 prompt may return "slightly complex" (spaces); normalise to underscores.""" + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3) + # The evaluator calls .lower().replace(" ", "_") before from_score(), + # so we feed a space-separated label and assert it survives the path. 
+ output = _make_grades34_output("slightly complex") + + with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, output): + result = evaluator.evaluate(inp) + + assert result.answer.score == "slightly_complex" + + def test_evaluate_grades34_explanation_has_word_breakdown(self): + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3) + with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()): + result = evaluator.evaluate(inp) + + details = result.explanation.details + assert "tier_2_words" in details + assert "tier_3_words" in details + assert "archaic_words" in details + assert "other_complex_words" in details + + +# ── Grades 5–12 path ────────────────────────────────────────────────────────── + + +class TestVocabularyEvaluatorOtherGrades: + @pytest.mark.parametrize( + "score_label, expected_score", + [ + (1, "slightly_complex"), + (2, "moderately_complex"), + (3, "very_complex"), + (4, "exceedingly_complex"), + ], + ) + def test_all_complexity_scores_map_correctly(self, score_label, expected_score): + """Each complexity label (passed as convenience int 1–4) maps to the right score.""" + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=7) + with _patch_steps( + evaluator, + _MOCK_BACKGROUND_KNOWLEDGE, + _make_other_grades_output(score_label), + ): + result = evaluator.evaluate(inp) + + assert result.answer.score == expected_score + + def test_evaluate_grade_12_returns_result(self): + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=12) + with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(1)): + result = evaluator.evaluate(inp) + + assert result.metadata.status == Status.succeeded + assert result.answer.score == "slightly_complex" + + def 
test_other_grades_explanation_includes_word_breakdown(self): + """Grades 5–12 mirror the notebook: word lists live in ``explanation.details``.""" + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=8) + with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)): + result = evaluator.evaluate(inp) + + details = result.explanation.details + assert details["tier_2_words"] == "sat" + assert details["tier_3_words"] == "none" + assert details["archaic_words"] == "none" + assert details["other_complex_words"] == "none" + + def test_other_grades_legacy_integer_answer_normalizes_like_notebook(self): + """``normalize_complexity_output`` then validate (same order as the notebook).""" + parsed = VocabularyComplexityOutput.model_validate( + normalize_complexity_output( + {"answer": 3, "reasoning": "Dense technical terms throughout."} + ) + ) + assert parsed.complexity_score == "Very Complex" + assert parsed.tier_2_words == "" + assert parsed.tier_3_words == "" + + def test_other_grades_legacy_string_digit_answer(self): + parsed = VocabularyComplexityOutput.model_validate( + normalize_complexity_output({"answer": "2", "reasoning": "Accessible vocabulary."}) + ) + assert parsed.complexity_score == "Moderately Complex" + + def test_other_grades_unexpected_digit_answer_raises(self): + """Out-of-range rubric digit normalizes to a bare string; ``from_score`` rejects it.""" + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=7) + # Same ``complexity_score`` as ``normalize_complexity_output({"answer": 9, ...})``. 
+ unexpected = VocabularyComplexityOutput( + tier_2_words="sat", + tier_3_words="none", + archaic_words="none", + other_complex_words="none", + complexity_score="9", + reasoning="Model returned an invalid rubric code.", + ) + with ( + _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, unexpected), + pytest.raises(ValueError, match=r"Unknown text complexity score: '9'"), + ): + evaluator.evaluate(inp) + + +class TestNormalizeComplexityOutput: + def test_preserves_complexity_score_when_answer_absent(self): + row = normalize_complexity_output( + { + "tier_2_words": "a", + "tier_3_words": "b", + "archaic_words": "c", + "other_complex_words": "d", + "complexity_score": "slightly complex", + "reasoning": "r", + } + ) + assert row["complexity_score"] == "slightly complex" + + def test_answer_overwrites_or_sets_complexity_score(self): + row = normalize_complexity_output({"answer": 1, "reasoning": "x"}) + assert row["complexity_score"] == "Slightly Complex" + + def test_unexpected_digit_answer_falls_back_to_string_value(self): + """Rubric is 1–4; other digits (int or string) become ``complexity_score == str(value)``.""" + assert ( + normalize_complexity_output({"answer": 5, "reasoning": "x"})["complexity_score"] == "5" + ) + assert ( + normalize_complexity_output({"answer": "9", "reasoning": "x"})["complexity_score"] + == "9" + ) + + +# ── Grade validation via framework ──────────────────────────────────────────── + + +class TestVocabularyEvaluationInputValidation: + def test_allowed_grades_set_from_toml(self): + """VocabularyEvaluationInput picks up allowed_grades from the TOML spec.""" + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5) + assert set(inp.grade.spec.allowed_grades) == frozenset(range(3, 13)) + + @pytest.mark.parametrize("unsupported_grade", [0, 1, 2]) + def test_unsupported_grade_raises_via_framework(self, unsupported_grade): + """BaseEvaluator.evaluate() calls input.validate(), which catches the bad grade.""" + config = 
create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=unsupported_grade) + # The base evaluator catches the ValidationError, sets status=failed, then re-raises. + with pytest.raises(ValidationError): + evaluator.evaluate(inp) + + def test_unsupported_grade_sets_status_failed(self): + """Metadata status is set to failed when grade validation fails.""" + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=2) + with pytest.raises(ValidationError): + evaluator.evaluate(inp) + + +# ── Metadata and settings ───────────────────────────────────────────────────── + + +class TestVocabularyEvaluatorMetadata: + def test_evaluator_metadata(self): + evaluator = VocabularyEvaluator(create_config_no_telemetry()) + assert evaluator.metadata.id == "vocabulary" + assert evaluator.metadata.version == "0.1" + + def test_default_settings_has_all_prompt_steps(self): + evaluator = VocabularyEvaluator(create_config_no_telemetry()) + settings = evaluator.default_evaluation_settings + assert settings.prompt_settings_step_background_knowledge is not None + assert settings.prompt_settings_step_vocab_grades_3_4 is not None + assert settings.prompt_settings_step_vocab_other_grades is not None + + def test_evaluate_succeeds_and_records_metadata(self): + config = create_config_no_telemetry() + evaluator = VocabularyEvaluator(config) + inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5) + with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)): + result = evaluator.evaluate(inp) + + assert result.metadata.status == Status.succeeded + assert result.metadata.evaluator_metadata.id == "vocabulary" + + +class TestVocabularyEvaluationInputConfiguration: + """Tests that VocabularyEvaluationInput fails loudly on bad configuration. 
+ + These tests patch ``VocabularyEvaluationInput._input_settings`` directly + because the ClassVar is bound at class-definition time. Patching the + module-level ``_INPUT_SETTINGS`` name would rebind the module variable but + leave the class variable pointing at the original dict. + """ + + def test_missing_text_spec_raises_configuration_error(self, monkeypatch): + """If 'text' is absent from _input_settings, ConfigurationError is raised immediately.""" + monkeypatch.setattr(VocabularyEvaluationInput, "_input_settings", {}) + with pytest.raises(ConfigurationError, match="'text'"): + VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5) + + def test_missing_grade_spec_raises_configuration_error(self, monkeypatch): + """If 'grade' is absent from _input_settings, ConfigurationError is raised.""" + from learning_commons_evaluators.schemas.input_specs import TextInputSpec + + monkeypatch.setattr( + VocabularyEvaluationInput, + "_input_settings", + {"text": TextInputSpec(name="text")}, + ) + with pytest.raises(ConfigurationError, match="'grade'"): + VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5) + + def test_wrong_text_spec_type_raises_configuration_error(self, monkeypatch): + """If the 'text' spec has the wrong type, ConfigurationError names the type mismatch.""" + from learning_commons_evaluators.schemas.input_specs import GradeInputSpec + + monkeypatch.setattr( + VocabularyEvaluationInput, + "_input_settings", + {"text": GradeInputSpec(name="text")}, + ) + with pytest.raises(ConfigurationError, match="TextInputSpec"): + VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5) + + def test_wrong_grade_spec_type_raises_configuration_error(self, monkeypatch): + """If the 'grade' spec has the wrong type, ConfigurationError names the mismatch.""" + from learning_commons_evaluators.schemas.input_specs import TextInputSpec + + monkeypatch.setattr( + VocabularyEvaluationInput, + "_input_settings", + { + "text": TextInputSpec(name="text"), + "grade": 
TextInputSpec(name="grade"), # wrong type + }, + ) + with pytest.raises(ConfigurationError, match="GradeInputSpec"): + VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5) diff --git a/sdks/python/tests/schemas/test_evaluator_schemas.py b/sdks/python/tests/schemas/test_evaluator_schemas.py new file mode 100644 index 0000000..433fa66 --- /dev/null +++ b/sdks/python/tests/schemas/test_evaluator_schemas.py @@ -0,0 +1,253 @@ +"""Tests for EvaluationInput, EvaluationAnswer, EvaluationExplanation, and EvaluationResult. + +Uses a minimal :class:`_ExampleEvaluationInput` (text + grade) wired to explicit +:class:`~learning_commons_evaluators.schemas.input_specs.TextInputSpec` / +:class:`~learning_commons_evaluators.schemas.input_specs.GradeInputSpec` +instances so nothing depends on a real evaluator's TOML or class names. +""" + +from __future__ import annotations + +from typing import ClassVar + +import pytest + +from learning_commons_evaluators.schemas.common_inputs import ( + GradeInputField, + TextInputField, +) +from learning_commons_evaluators.schemas.errors import ( + ConfigurationError, + ValidationError, +) +from learning_commons_evaluators.schemas.evaluator import ( + EvaluationAnswer, + EvaluationExplanation, + EvaluationInput, + EvaluationResult, + InputField, +) +from learning_commons_evaluators.schemas.input_specs import ( + GradeInputSpec, + TextInputSpec, +) +from learning_commons_evaluators.schemas.metadata import ( + Status, +) + +# Long sample text (well above ``min_text_length`` on :attr:`_EXAMPLE_TEXT_SPEC`). +_LONG_TEXT = ( + "Marco Polo was a Venetian merchant and explorer who traveled through Asia " + "in the late 13th century. He spent nearly two decades at the court of " + "Kublai Khan, the Mongol ruler of China, and described his experiences in " + "a book that introduced Europeans to the Far East." 
+) + +_EXAMPLE_TEXT_SPEC = TextInputSpec(name="text", min_text_length=10) +_EXAMPLE_GRADE_SPEC = GradeInputSpec(name="grade") +# Unconstrained text spec for tests that only need an :class:`InputField` instance. +_BARE_TEXT_SPEC = TextInputSpec(name="text") + + +class _ExampleEvaluationInput(EvaluationInput): + """Minimal concrete :class:`EvaluationInput` for schema unit tests.""" + + _input_settings: ClassVar[dict] = { + "text": _EXAMPLE_TEXT_SPEC, + "grade": _EXAMPLE_GRADE_SPEC, + } + text: TextInputField + grade: GradeInputField + + def __init__( + self, + *, + text: str | TextInputField, + grade: int | GradeInputField, + **kwargs, + ): + super().__init__(text=text, grade=grade, **kwargs) + + +# --------------------------------------------------------------------------- +# Mixes a proper InputField with a plain float — exercises the +# isinstance(..., InputField) false branches on :class:`EvaluationInput`. +# --------------------------------------------------------------------------- + + +class _MixedInput(EvaluationInput): + """EvaluationInput with one proper InputField and one plain Python value.""" + + text: TextInputField + weight: float # no validate(), no input_metadata(), no .value attribute + + +class TestEvaluationInput: + # --- happy-path construction, validation, and metadata --- + + def test_validate_and_input_metadata(self): + inp = _ExampleEvaluationInput(text=_LONG_TEXT, grade=5) + inp.validate() + meta = inp.input_metadata() + assert meta["text"] == {"textLength": len(_LONG_TEXT)} + assert meta["grade"] == {"grade": 5} + + def test_input_values_returns_primitive_values(self): + """input_values() should unwrap .value from each InputField.""" + inp = _ExampleEvaluationInput(text=_LONG_TEXT, grade=7) + values = inp.input_values() + assert values["text"] == _LONG_TEXT + assert values["grade"] == 7 + + # --- validation error paths --- + + def test_validate_raises_on_invalid_grade(self): + inp = _ExampleEvaluationInput(text=_LONG_TEXT, grade=99) + 
with pytest.raises(ValidationError): + inp.validate() + + def test_validate_raises_on_invalid_text_length(self): + inp = _ExampleEvaluationInput(text="x", grade=5) + with pytest.raises(ValidationError): + inp.validate() + + def test_validate_collects_all_errors_before_raising(self): + """All field errors are collected; a single ValidationError is raised at the end.""" + inp = _ExampleEvaluationInput(text="x", grade=99) + with pytest.raises(ValidationError) as exc_info: + inp.validate() + msg = str(exc_info.value) + assert "below minimum" in msg + assert "0-12" in msg + + # --- isinstance False branches: field without protocol methods --- + + def test_validate_skips_non_inputfield_fields(self): + """Fields that don't implement the InputField protocol are silently skipped.""" + inp = _MixedInput(text=TextInputField(spec=_BARE_TEXT_SPEC, value="hello"), weight=7.5) + inp.validate() # should not raise even though weight is not an InputField + + def test_input_metadata_returns_none_for_non_inputfield_fields(self): + """Fields that are not InputFields produce a None entry in the output dict.""" + inp = _MixedInput(text=TextInputField(spec=_BARE_TEXT_SPEC, value="hello"), weight=7.5) + meta = inp.input_metadata() + assert meta["text"] == {"textLength": 5} + assert meta["weight"] is None # fallback for non-protocol fields + + def test_input_values_returns_field_itself_for_non_inputfield(self): + """Fields that are not InputFields are returned as-is from input_values().""" + inp = _MixedInput(text=TextInputField(spec=_BARE_TEXT_SPEC, value="hello"), weight=7.5) + values = inp.input_values() + assert values["text"] == "hello" + assert values["weight"] == 7.5 # not an InputField; the float is returned directly + + # --- InputField subclass isinstance checks --- + + def test_text_input_field_is_inputfield_subclass(self): + """TextInputField must be an InputField subclass (checked via isinstance).""" + field = TextInputField(spec=_BARE_TEXT_SPEC, value="hello") + assert 
isinstance(field, InputField) + + def test_grade_input_field_is_inputfield_subclass(self): + """GradeInputField must be an InputField subclass (checked via isinstance).""" + spec = GradeInputSpec(name="grade") + field = GradeInputField(spec=spec, value=5) + assert isinstance(field, InputField) + + def test_plain_value_is_not_an_inputfield(self): + """Plain Python values must not be treated as InputField instances.""" + assert not isinstance(7.5, InputField) + assert not isinstance("raw string", InputField) + + def test_inputfield_cannot_be_instantiated_directly(self): + """InputField is abstract and must not be instantiable without implementing validate() and input_metadata().""" + with pytest.raises(TypeError, match="abstract"): + InputField(spec=TextInputSpec(name="text"), value="hello") # type: ignore[abstract] + + +class TestCoerceRawToInputFields: + """Direct unit tests for :meth:`EvaluationInput._coerce_raw_to_input_fields`.""" + + _COERCE_TEXT_SPEC = TextInputSpec(name="text") + + def test_raw_values_are_wrapped_into_input_fields(self): + """Raw str/int values should be wrapped into the declared InputField types.""" + inp = _ExampleEvaluationInput(text="hello world", grade=5) + assert isinstance(inp.text, TextInputField) + assert inp.text.value == "hello world" + assert isinstance(inp.grade, GradeInputField) + assert inp.grade.value == 5 + + def test_already_constructed_input_field_is_not_rewrapped(self): + """Passing a fully-constructed InputField instance bypasses construction.""" + pre_built = TextInputField(spec=self._COERCE_TEXT_SPEC, value="pre-built") + inp = _ExampleEvaluationInput(text=pre_built, grade=3) + assert inp.text is pre_built # same object, not a copy + + def test_non_inputfield_field_is_left_unchanged(self): + """Plain (non-InputField) fields are passed through without modification.""" + + class _WithPlainField(EvaluationInput): + _input_settings: ClassVar[dict] = {"text": TextInputSpec(name="text")} + text: TextInputField + weight: float 
+ + def __init__(self, *, text: str, weight: float, **kwargs): + super().__init__(text=text, weight=weight, **kwargs) + + inp = _WithPlainField(text="hello", weight=1.5) + assert inp.weight == 1.5 # plain float, untouched + + def test_missing_spec_raises_configuration_error(self, monkeypatch): + """ConfigurationError is raised when a required spec is absent from _input_settings.""" + monkeypatch.setattr(_ExampleEvaluationInput, "_input_settings", {}) + with pytest.raises(ConfigurationError, match="'text'"): + _ExampleEvaluationInput(text="hello", grade=5) + + def test_wrong_spec_type_raises_configuration_error(self, monkeypatch): + """ConfigurationError is raised when the spec type doesn't match the field's expectation.""" + monkeypatch.setattr( + _ExampleEvaluationInput, + "_input_settings", + {"text": GradeInputSpec(name="text"), "grade": _EXAMPLE_GRADE_SPEC}, + ) + with pytest.raises(ConfigurationError, match="TextInputSpec"): + _ExampleEvaluationInput(text="hello", grade=5) + + def test_error_message_includes_class_and_field_name(self, monkeypatch): + """ConfigurationError messages name both the class and the missing field.""" + monkeypatch.setattr(_ExampleEvaluationInput, "_input_settings", {}) + with pytest.raises(ConfigurationError) as exc_info: + _ExampleEvaluationInput(text="hello", grade=5) + msg = str(exc_info.value) + assert "_ExampleEvaluationInput" in msg + assert "'text'" in msg + + +class TestEvaluationAnswer: + def test_score_and_label(self): + answer = EvaluationAnswer(score="moderately_complex", label="Moderately complex") + assert answer.score == "moderately_complex" + assert answer.label == "Moderately complex" + + +class TestEvaluationExplanation: + def test_summary_and_details(self): + explanation = EvaluationExplanation(summary="Reasoning.", details={"key": "value"}) + assert explanation.summary == "Reasoning." 
+ assert explanation.details["key"] == "value" + + def test_details_defaults_to_empty_dict(self): + explanation = EvaluationExplanation(summary="Short.") + assert explanation.details == {} + + +class TestEvaluationResult: + def test_construction_and_status(self, evaluation_metadata): + result = EvaluationResult( + answer=EvaluationAnswer(score="slightly_complex", label="Slightly complex"), + explanation=EvaluationExplanation(summary="Summary"), + metadata=evaluation_metadata, + ) + assert result.answer.score == "slightly_complex" + assert result.metadata.status == Status.processing diff --git a/sdks/python/tests/settings/test_load_settings.py b/sdks/python/tests/settings/test_load_settings.py index dae9dff..25dc88b 100644 --- a/sdks/python/tests/settings/test_load_settings.py +++ b/sdks/python/tests/settings/test_load_settings.py @@ -3,8 +3,10 @@ from __future__ import annotations import importlib.resources +import os import textwrap from pathlib import Path +from unittest.mock import patch import pytest @@ -13,6 +15,9 @@ LlmProvider, PromptSettings, ) +from learning_commons_evaluators.schemas.conventionality import ( + ConventionalityEvaluationSettings, +) from learning_commons_evaluators.schemas.errors import ConfigurationError from learning_commons_evaluators.schemas.input_specs import TextInputSpec from learning_commons_evaluators.schemas.metadata import EvaluatorMaturity @@ -226,27 +231,121 @@ def test_load_evaluator_settings_rejects_non_string_prompt_value(tmp_path: Path) def test_load_evaluator_settings_invalid_prompt_settings_block(tmp_path: Path) -> None: + """Unknown ``[[evaluator_metadata.inputs]].type`` must raise a clear configuration error.""" path = tmp_path / "bad_prompt.toml" path.write_text( textwrap.dedent( """ [evaluator_metadata] - id = "e" - version = "1.0" - name = "N" - description = "D" - maturity = "ga" + id = "x" + version = "0.1" + name = "X" + description = "X" + maturity = "beta" - [evaluation_settings] - marker = 0 + 
[[evaluator_metadata.inputs]] + name = "mystery" + type = "UnknownInputField" - [evaluation_settings.prompt_settings_main] - provider_type = "google" - model = "m" + [evaluation_settings.prompt_settings_step_conventionality_evaluation] + provider_type = "GOOGLE" + model = "gemini-2.0-flash" + temperature = 0 """ ).strip() ) - with pytest.raises( - ConfigurationError, match="Invalid \\[evaluation_settings.prompt_settings_main\\]" - ): - load_evaluator_settings(path, _WithPromptSettings) + with pytest.raises(ConfigurationError, match="UnknownInputField"): + load_evaluator_settings(path, ConventionalityEvaluationSettings) + + +# --------------------------------------------------------------------------- +# shared_settings_root +# --------------------------------------------------------------------------- + + +class TestSharedSettingsRoot: + def test_env_var_overrides_bundled_path(self, tmp_path: Path) -> None: + with patch.dict(os.environ, {"EVALUATORS_SETTINGS_DIR": str(tmp_path)}): + assert shared_settings_root() == tmp_path + + def test_importlib_fallback_resolves_to_existing_directory(self) -> None: + env = {k: v for k, v in os.environ.items() if k != "EVALUATORS_SETTINGS_DIR"} + with patch.dict(os.environ, env, clear=True): + root = shared_settings_root() + assert root.is_dir(), f"shared_settings_root() resolved to non-directory: {root}" + + def test_bundled_contract_tests_are_present(self) -> None: + """The bundled package must contain contracts.toml for each evaluator. + + The evaluator settings TOML is intentionally NOT bundled (evaluators use _generated_*_settings.py). + contracts.toml IS bundled so contract tests work against an installed package. 
+ """ + env = {k: v for k, v in os.environ.items() if k != "EVALUATORS_SETTINGS_DIR"} + with patch.dict(os.environ, env, clear=True): + root = shared_settings_root() + for evaluator in ("conventionality",): + assert (root / evaluator / "contracts.toml").is_file(), ( + f"Bundled {evaluator}/contracts.toml not found β€” " + f"run 'python scripts/generate_settings.py --sync'" + ) + + +# --------------------------------------------------------------------------- +# Settings bundle sync guard +# --------------------------------------------------------------------------- +# +# What is and isn't bundled in the package: +# +# BUNDLED: contracts.toml — needed so contract tests run against an +# installed package without access to sdks/settings/. +# +# NOT BUNDLED: settings.toml — not needed at runtime; evaluators import +# from _generated_*_settings.py (pre-built at generation time). +# The canonical copy lives in sdks/settings/ and is the input to +# `make generate-settings`. +# +# If a sync test fails, run from the repo root: +# python scripts/generate_settings.py --sync — copies contracts.toml canonical → bundled +# python scripts/generate_settings.py --check — verifies generated .py files are up to date + + +def _bundled_settings_root() -> Path: + """Return the importlib.resources path for bundled settings, bypassing EVALUATORS_SETTINGS_DIR.""" + pkg = importlib.resources.files("learning_commons_evaluators.settings") + return Path(str(pkg)) + + +def _canonical_settings_root() -> Path | None: + """Return sdks/settings/ relative to this file, or None if not in the monorepo.""" + # This file lives at: sdks/python/tests/settings/test_load_settings.py + # parents[3] = sdks/ + candidate = Path(__file__).parents[3] / "settings" + return candidate if candidate.is_dir() else None + + +@pytest.mark.parametrize("evaluator", ["conventionality", "vocabulary"]) +def test_bundled_contract_tests_match_canonical(evaluator: str) -> None: + """Bundled contracts.toml must be
byte-for-byte identical to sdks/settings/. + + Skipped when running outside the monorepo (e.g., from an installed package). + If this fails, run ``python scripts/generate_settings.py --sync`` from the repo root. + """ + canonical_root = _canonical_settings_root() + if canonical_root is None: + pytest.skip("sdks/settings/ not found β€” running outside the monorepo") + + canonical = canonical_root / evaluator / "contracts.toml" + if not canonical.exists(): + pytest.skip(f"Canonical file not found: {canonical}") + + bundled = _bundled_settings_root() / evaluator / "contracts.toml" + assert bundled.exists(), ( + f"Bundled {evaluator}/contracts.toml not found.\n" + f"Run: python scripts/generate_settings.py --sync" + ) + assert canonical.read_bytes() == bundled.read_bytes(), ( + f"{evaluator}/contracts.toml is out of sync.\n" + f" canonical: {canonical}\n" + f" bundled: {bundled}\n" + f"Fix: python scripts/generate_settings.py --sync" + ) diff --git a/sdks/python/tests/test_package_imports.py b/sdks/python/tests/test_package_imports.py index bee9dad..cece087 100644 --- a/sdks/python/tests/test_package_imports.py +++ b/sdks/python/tests/test_package_imports.py @@ -1,36 +1,21 @@ """Smoke tests for package public API imports.""" +import importlib -def test_main_package_imports(): - """All advertised public symbols can be imported from the root package.""" - from learning_commons_evaluators import ( - BaseEvaluator, - TelemetryConfig, - __version__, - create_config_no_telemetry, - ) - assert __version__ is not None - assert create_config_no_telemetry is not None - assert BaseEvaluator is not None - assert TelemetryConfig is not None +def test_root_package_public_api(): + """Every name in ``__all__`` is defined on the root package (single source of truth).""" + pkg = importlib.import_module("learning_commons_evaluators") + missing = [name for name in pkg.__all__ if not hasattr(pkg, name)] + assert not missing, f"__all__ lists undefined names: {missing}" + for name in 
pkg.__all__: + assert getattr(pkg, name) is not None, name -def test_errors_import(): - from learning_commons_evaluators import ( - APIError, - wrap_provider_error, - ) - - assert APIError is not None - assert wrap_provider_error is not None - - -def test_providers_import(): - from learning_commons_evaluators.providers import ( - create_provider, - token_usage_from_aimessage, - ) - - assert create_provider is not None - assert token_usage_from_aimessage is not None +def test_providers_public_api(): + """Every name in ``providers.__all__`` is defined on the submodule.""" + providers = importlib.import_module("learning_commons_evaluators.providers") + missing = [name for name in providers.__all__ if not hasattr(providers, name)] + assert not missing, f"providers.__all__ lists undefined names: {missing}" + for name in providers.__all__: + assert getattr(providers, name) is not None, name diff --git a/sdks/settings/conventionality/contracts.toml b/sdks/settings/conventionality/contracts.toml new file mode 100644 index 0000000..47bac4c --- /dev/null +++ b/sdks/settings/conventionality/contracts.toml @@ -0,0 +1,130 @@ +# Contract tests for the Conventionality evaluator. +# +# This file captures the exact LLM interaction for known inputs so that +# tests can verify the SDK sends the same request as the notebook and +# produces the same result from the same response. +# +# HOW TO REGENERATE +# ----------------- +# Run the notebook "evals/Final ship - Conventionality Experimental Evaluator.ipynb" +# from the repo root with GOOGLE_API_KEY set. It loads prompts from the evaluator +# settings TOML (same as the SDK) and prints a TOML block — paste into this file +# and run `make sync-settings` from sdks/python/ to update the bundled copy.
+# +# WHAT BELONGS HERE +# ----------------- +# Only data that originates from external sources: +# - input: what the caller provides to the evaluator +# - prompt_steps: what the SDK sends to the LLM (fully formatted messages, +# model, temperature) and the raw response the LLM returned +# - expected_result: the structured output the LLM returned (notebook format) +# +# Deterministic values computed by library code (e.g. FK score from textstat, +# format_instructions from LangChain's JsonOutputParser) are NOT stored as +# standalone fields — they appear inside the fully formatted prompts, where they +# are implicitly verified by the prompt-request assertion in the contract test. + +[cases.turnip] +description = "Grade 4 classroom narrative (Henry and the turnip)" + +[cases.turnip.input] +text = ''' + +"Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip."
+''' +grade = 4 + +[cases.turnip.prompt_steps.main] +model = "gemini-3-flash-preview" +temperature = 0.0 +system_prompt = '''Role +You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level. + +Objective +Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade. + +Complexity Levels +- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level. +- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging. +- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear. +- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. 
Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices. + +Essential Evaluation Rules +1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex." + +2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group. + +3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two. + +4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex." + +5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing. + +6. Sustained vs. 
Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge. + +7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex. + +8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students. + +9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary. + +Input Format +You will receive: +- text: The passage to evaluate. +- grade: The target student grade level. +- fk_score: The Flesch-Kincaid readability score. + +Output Format +Provide a JSON object containing ONLY the following keys: +- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'. +- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural). +- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text. 
+- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade. +- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom. + +STRICT OUTPUT FORMAT: +- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators. +- Do not wrap the JSON in Markdown or code fences (no ``` or ```json). +- Do not prepend or append any text (e.g., do not write "Here is the JSON:"). +- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments. + +The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted. 
+ +Here is the output schema (shown in a code block for readability only β€” do not include any backticks or Markdown in your output): +``` +{"properties": {"conventionality_features": {"description": "List of the specific language features driving the complexity (e.g., idioms, metaphors, implied meaning) with direct quotes from the text.", "items": {"type": "string"}, "title": "Conventionality Features", "type": "array"}, "grade_context": {"description": "How the conventionality demands compare to general expectations for the provided target grade.", "title": "Grade Context", "type": "string"}, "instructional_insights": {"description": "Actionable pedagogical suggestions for scaffolding the unconventional language features in the classroom.", "title": "Instructional Insights", "type": "string"}, "complexity_score": {"description": "The conventionality complexity level of the text", "enum": ["slightly_complex", "moderately_complex", "very_complex", "exceedingly_complex"], "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "A synthesis of why the text fits the chosen rubric level.", "title": "Reasoning", "type": "string"}}, "required": ["conventionality_features", "grade_context", "instructional_insights", "complexity_score", "reasoning"]} +``` +''' +user_prompt = '''Analyze: +Text: "Well, then," said the teacher, "you may take your slate and go out behind the schoolhouse for half an hour. Think of something to write about, and write the word on your slate. Then try to tell what it is, what it is like, what it is good for, and what is done with it. That is the way to write a composition." Henry took his slate and went out. Just behind the schoolhouse was Mr. Finney's barn. Quite close to the barn was a garden. And in the garden, Henry saw a turnip. "Well, I know what that is," he said to himself; and he wrote the word turnip on his slate. Then he tried to tell what it was like, what it was good for, and what was done with it. 
Before the half hour was ended he had written a very neat composition on his slate. He then went into the house, and waited while the teacher read it. The teacher was surprised and pleased. He said, "Henry Longfellow, you have done very well. Today you may stand up before the school and read what you have written about the turnip." +Grade: 4 +FK Score: 3.75 +''' +llm_response = ''' +```json +{ + "complexity_score": "slightly_complex", + "reasoning": "The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.", + "conventionality_features": [ + "literal narrative: 'Henry took his slate and went out.'", + "concrete actions: 'he wrote the word turnip on his slate.'", + "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", + "straightforward meaning: 'The teacher was surprised and pleased.'" + ], + "grade_context": "For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.", + "instructional_insights": "Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. 
Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology." +} +```''' + +[cases.turnip.expected_result] +complexity_score = ''' +slightly_complex''' +reasoning = ''' +The text is a straightforward, literal narrative that is entirely explicit and easy to understand. The meaning remains on the surface, focusing on concrete actions and observable materials (a slate, a schoolhouse, a turnip). The teacher's instructions are procedural and clear, providing a step-by-step guide for writing a composition. There is no symbolic, ironic, or figurative language that requires interpretation. While the setting is historical (using a 'slate' and 'schoolhouse'), these terms do not create a barrier to understanding the plot or the underlying message, which is a simple account of a student completing a task.''' +conventionality_features = ["literal narrative: 'Henry took his slate and went out.'", "concrete actions: 'he wrote the word turnip on his slate.'", "procedural instructions: 'tell what it is, what it is like, what it is good for, and what is done with it.'", "straightforward meaning: 'The teacher was surprised and pleased.'"] +grade_context = ''' +For a 4th-grade student, this text is highly accessible and falls well within the expected range for conventionality. The Flesch-Kincaid score of 3.75 aligns with the text's simple sentence structures and concrete subject matter, making it an easy read for students at this level.''' +instructional_insights = ''' +Since the text is highly conventional, teachers can use it to model the 'procedural' writing steps mentioned in the story. Students can practice the teacher's four-step method (what it is, what it is like, what it is good for, what is done with it) on modern objects. 
A brief historical note explaining that a 'slate' was a precursor to a notebook or tablet would provide sufficient context for the dated terminology.''' diff --git a/sdks/settings/conventionality/settings.toml b/sdks/settings/conventionality/settings.toml new file mode 100644 index 0000000..5d29e96 --- /dev/null +++ b/sdks/settings/conventionality/settings.toml @@ -0,0 +1,78 @@ +[evaluator_metadata] +id = "conventionality" +version = "0.1" +name = "Conventionality" +description = "Conventionality Evaluator" +maturity = "beta" + +[[evaluator_metadata.inputs]] +name = "text" +type = "TextInputField" +min_text_length = 10 +max_text_length = 10000 + +[[evaluator_metadata.inputs]] +name = "grade" +type = "GradeInputField" +allowed_grades = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + +[prompts] +system_prompt = """ +Role +You are an expert reading teacher and text complexity evaluator. Your task is to evaluate the "Conventionality" of a text and assign it a complexity level based on a 4-point scale, carefully factoring in the target grade level. + +Objective +Measure how explicit, literal, and straightforward the text's meaning is, versus how abstract, ironic, figurative, or archaic it is. Focus on the hiddenness of the meaning, the use of conceptual framing, the reliance on abstract reasoning, and the familiarity of the expression for the target grade. + +Complexity Levels +- Slightly Complex: Explicit, literal, straightforward, easy to understand. Meaning is entirely on the surface. The language is concrete, and the meaning is clear and procedural, mostly referring to observable materials and actions. Contains no symbolic or ironic language, and conceptual interpretation is not required. Contains limited figurative language that is common and easy to comprehend at the target grade level. +- Moderately Complex: Largely explicit and easy to understand with some occasions for more complex meaning. 
May contain a noticeable amount of archaic/dated phrasing, formal historical prose, vocabulary demands, background knowledge requirements, or expressions that are less familiar to the target grade level, which might make the text feel vague or slightly challenging. +- Very Complex: Fairly complex; contains sustained abstract language, conceptual framing, rhetorical idealization, ironic comparisons, or central metaphors that drive the meaning of the text. Addresses concepts, beliefs, and abstract qualities rather than just concrete objects. The tone or underlying message requires interpretation, even if the surface message is clear. +- Exceedingly Complex: Dense and complex; contains considerable abstract, ironic, and/or figurative language. Meaning is heavily hidden, deeply conceptual, or relies heavily on complex rhetorical devices. + +Essential Evaluation Rules +1. Concrete & Procedural Texts: Texts that are highly concrete, clear, and procedural (e.g., describing observable materials, mechanical processes, or physical actions) should typically be rated "Slightly Complex." + +2. Grade-Level Anchoring and Vague Narratives: Always consider the target grade. A literal historical narrative that might be straightforward for older students can be "Moderately Complex" for younger students (e.g., 4th graders) if it involves less familiar expressions, older contexts (e.g., wagon loads, traveling by horseback), vocabulary demands, and background knowledge requirements that make the text feel vague or slightly demanding for that age group. + +3. Rhetorical Idealization and Abstract Qualities: If an entire argument or narrative is built around abstract qualities (e.g., national character, bravery, liberty) and uses repeated figurative language or personification to portray a subject in a certain idealized way, rate the text as "Very Complex." 
Even if the figurative language is easy to interpret, the need to interpret the rhetorical tone and sustained abstract focus elevates the complexity beyond level two. + +4. Common Idioms and Grade-Level Appropriateness: Do NOT elevate a text to "Moderately Complex" simply because it contains a few common idiomatic expressions. If these expressions are widely known and easy for the target grade to understand without making the text feel vague, the text remains "Slightly Complex." + +5. Conversational and Hypothetical Framing: Using a second-person conversational hook (e.g., "Imagine you are...") to explain a concept is a standard, literal device for engaging readers. It does not constitute complex conceptual framing. + +6. Sustained vs. Occasional Impact: If abstract language, figurative phrasing, irony, or conceptual framing is sustained throughout the text and central to the argument/meaning, the text is Very Complex. Reserve Moderately Complex for texts where the explicit meaning dominates but the expression, vocabulary, or archaic language provides a moderate conventionality challenge. + +7. Central Metaphors and Conceptual Framing: When an author uses a central metaphor to explain a concept or uses figurative phrasing to explain how things "work," this abstract reasoning drives the meaning, elevating the text to Very Complex. + +8. Irony and Abstract Comparisons: Texts that rely on sustained irony, especially through comparative arguments, are inherently Very Complex for younger students. + +9. Isolate Conventionality from Vocabulary: Do not inflate the Conventionality score just because the text uses archaic, dated, or highly academic vocabulary. + +Input Format +You will receive: +- text: The passage to evaluate. +- grade: The target student grade level. +- fk_score: The Flesch-Kincaid readability score. 
+ +Output Format +Provide a JSON object containing ONLY the following keys: +- complexity_score: (String) One of the 4 scale levels exactly as formatted: 'slightly_complex', 'moderately_complex', 'very_complex', or 'exceedingly_complex'. +- reasoning: (String) A detailed explanation of the rating, citing specific features in the text and referencing the expert guardrails (e.g., noting if the text relies on abstract qualities/rhetorical idealization, if vocabulary/background knowledge demands make a literal text vague for the grade level, or if it is strictly concrete/procedural). +- conventionality_features: (List of Strings) The specific language features driving the complexity (e.g., literal narrative, concrete actions, less familiar expressions, sustained irony, abstract qualities, rhetorical idealization, archaic phrasing) with direct quotes from the text. +- grade_context: (String) How the conventionality demands compare to general expectations for the provided target grade. +- instructional_insights: (String) Actionable pedagogical suggestions for scaffolding the conventionality features in the classroom. + +{format_instructions} +""" +human_prompt = """ +Analyze: +Text: {text} +Grade: {grade} +FK Score: {fk_score} +""" + +[evaluation_settings.prompt_settings_step_conventionality_evaluation] +provider_type = "GOOGLE" +model = "gemini-3-flash-preview" +temperature = 0 diff --git a/sdks/settings/vocabulary/contracts.toml b/sdks/settings/vocabulary/contracts.toml new file mode 100644 index 0000000..8420e01 --- /dev/null +++ b/sdks/settings/vocabulary/contracts.toml @@ -0,0 +1,641 @@ +# Contract tests for the Vocabulary evaluator. +# +# This file captures the exact LLM interactions for known inputs so that tests +# can verify the SDK sends the same requests as the notebook and produces the +# same results from the same responses. +# +# HOW TO REGENERATE +# ----------------- +# Run the notebook "evals/vocabulary_evaluator.ipynb" with valid API keys. 
+# The final cells output a TOML block that you paste in here to replace the +# placeholder values below. +# +# WHAT BELONGS HERE +# ----------------- +# Only data that originates from external sources: +# - input: what the caller provides to the evaluator +# - prompt_steps: what the SDK sends to the LLM (fully formatted messages, +# model, temperature) and the raw response the LLM returned. +# Steps must be listed in call order: +# 1. background_knowledge (plain text response) +# 2. vocab_complexity (JSON response) +# - expected_result: the structured output parsed from the vocab_complexity +# llm_response (notebook format), before SDK result mapping. +# +# Deterministic values computed by library code (FK score, format_instructions) +# are NOT stored as standalone fields — they appear inside the fully formatted +# prompts, where they are implicitly verified by the prompt-request assertion. +# +# TWO CASES ARE INCLUDED +# ----------------------- +# marco_polo_grade3 — grade 3, uses the grades 3–4 path (Gemini + rubric labels +# + word-level breakdown in expected_result). +# hurricanes_grade7 — grade 7, uses the grades 5–12 path (GPT; expected_result +# holds ``complexity_score`` as a string plus word-list fields, +# matching the notebook ``Output`` shape. The SDK also accepts +# legacy integer ``answer`` in raw JSON like ``normalize_complexity_output``.) + +# =========================================================================== +# Case 1: marco_polo_grade3 (grades 3–4 path) +# =========================================================================== + +[cases.marco_polo_grade3] +description = "Marco Polo passage, grade 3 (grades 3-4 Gemini path)" + +[cases.marco_polo_grade3.input] +text = ''' +Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing.
On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all. +Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk. +His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East.''' +grade = 3 + +[cases.marco_polo_grade3.prompt_steps.background_knowledge] +model = "gpt-4o-2024-11-20" +temperature = 0.0 +system_prompt = "" +user_prompt = ''' + +Review the following text, which is an educational text written for students in the following grade band: 3. + +Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with. + +Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.). + +Here's an example: +[START EXAMPLE] +Grade Band: 11th +Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not +learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was +not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. 
I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to +know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me, +are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it +is the chief end of man here to β€œglorify God and enjoy him forever.” + +Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy. +[END EXAMPLE] + +You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else. + +You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. 
And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have: +[BEGIN TOPICS] +[ + K: [ + "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees", + "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?", + "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?", + "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together", + "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?", + "How Has Our World Changed?", "Why Do People Have Jobs?" + ], + 1: [ + "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds", + "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?", + "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?", + "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World", + "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?", + "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?" 
+ ], + 2: [ + "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators", + "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?", + "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?", + "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are", + "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?", + "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?" + ], + "3": [ + "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics", + "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist", + "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate", + "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?", + "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?", + "How Do People in a Community Meet Their Wants and Needs?" 
+ ], + 4: [ + "Poetry", "Animal Defense Mechanisms", "The American Revolution", + "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)", + "A Great Heart: What does it mean to have a great heart, literally and figuratively?", + "Extreme Settings: How does a challenging setting or physical environment change a person?", + "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light", + "Earth's Features", "Waves, Energy, and Information", "Regions of the United States", + "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?", + "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?", + "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?" + ], + 5: [ + "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change", + "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?", + "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?", + "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky", + "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation", + "How Were the Lives of Native Peoples Influenced by Where They Lived?", + "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?", + "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?", + "How Does the Constitution Help Us Understand What It Means to Be an American?", + "What Do the Early Years of the United States Reveal About the Character of the Nation?", + "What Was the Effect of the Civil War on U.S. Society?" 
+ ], + 6: [ + "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools", + "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?", + "A Hero’s Journey: What is the significance and power of the hero’s journey?", + "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?", + "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?", + "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy", + "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate", + "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)", + "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution", + "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)", + "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)", + "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II", + "The Cold War", "Civil Rights and American Society", "America Since the 1970s" + ], + 7: [ + "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution", + "Identity in the Middle Ages: How does society both support and limit the development of identity?", + "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?", + "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering", + "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources", + "Matter and Energy in 
Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent", + "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan", + "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires", + "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking", + "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World" + ], + 8: [ + "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment", + "The Poetics and Power of Storytelling: What is the power of storytelling?", + "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?", + "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy", + "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun", + "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms", + "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography", + "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses", + "Government and the Economy", "The Global Economy" + ] +] +[END TOPICS] + +Here is the text: +[BEGIN TEXT] +Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. 
There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all. +Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk. +His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East. +[END TEXT] +''' +llm_response = ''' +Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions.''' + +[cases.marco_polo_grade3.prompt_steps.vocab_complexity] +model = "gemini-2.5-pro" +temperature = 0.0 +system_prompt = ''' + +You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level. + +You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric. +IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.) + +**Resource 1: Qualitative Text Complexity rubric (SAP)** +1. **Level 1: Slightly complex** + * Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0. 
+ * Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here! +2. **Level 2: Moderately complex** + * Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK + * Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur. +3. **Level 3: Very complex** + * Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic + * Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text. +4. **Level 4: Exceedingly complex** + * Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading. + * Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret. + +**Resource 2: Flesch-Kincaid Grade Level** +Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text. +* grade 2-3: 1.98-5.34 +* grade 4-5: 4.51-7.73 +* grade 6-8: 6.51-10.34 +* grade 9-10: 8.32-12.12 +* grade 11-College: 10.34-14.2 + +**Guidelines for Interpretation and Reasoning** + +Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. 
You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment: + +1. **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`. +2. **Contextual Scaffolding:** Assess how the text supports new vocabulary. + * Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")? + * Is the surrounding language simple and conversational, making the meaning of new words easier to infer? + * Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`. +3. **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`). +4. **Conceptual Load:** Consider the cognitive load of the vocabulary. A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once. +5. **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`. + * **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort. 
+ * **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology. +6. **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body). + +**Final Analysis Format** + +Provide these information as your final analysis: +1. **Complex vocabulary:** + * Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings. + * Tier 3 words: Overly academic or domain-specific words. + * Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., β€œThe jury retired to deliberate on their verdict." The use of β€œretire” to mean withdrawing to a private place is an archaic use. + * Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary). +2. **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex +3. **Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above. +''' +user_prompt = ''' + +Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task. + +- It is intended for grade 3. 
+ +- You can assume the student has the following background knowledge about the text β€” this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic geography (e.g., continents, oceans, and major regions like Asia and Europe) and the concept of historical exploration. They are unlikely to be familiar with the Mongol Dynasty, Kublai Khan, or Marco Polo's specific travels and contributions. + +- Text Flesch-Kincaid grade level: 6.59 + +- Text to evaluate: [BEGIN TEXT] +Polo went on a 24-year trip to China with his father and uncle during the Mongol Dynasty. He left Venice at the age of 17 on a boat that went through the Mediterranean Sea, Ayas, Tabriz and Kerman. Then he travelled across Asia getting as far as Beijing. On the way there he had to go over mountains and through terrible deserts, across hot burning lands and places where the cold was horrible. He served in Kublai Khan's court for 17 years. He left the Far East and returned to Venice by sea. There was sickness on board and 600 passengers and crew died and some say pirates attacked. Nevertheless, Marco Polo survived it all. +Some scholars believe that while Marco Polo did go to China, he did not go to all of the other places described in his book. He brought noodles back from China and the Italians came up with different sizes and shapes and called it pasta. Polo returned to Venice with treasures like ivory, jade, jewels, porcelain and silk. +His father had borrowed money and bought a ship. He became wealthy because of his trading in the near East. +[END TEXT] + +STRICT OUTPUT FORMAT: +- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators. +- Do not wrap the JSON in Markdown or code fences (no ``` or ```json). +- Do not prepend or append any text (e.g., do not write "Here is the JSON:"). 
+- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments. + +The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted. + +Here is the output schema (shown in a code block for readability only β€” do not include any backticks or Markdown in your output): +``` +{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]} +``` +''' +llm_response = ''' +```json +{ + "tier_2_words": "served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers", + "tier_3_words": "Mongol Dynasty, Kublai Khan, ivory, jade, porcelain", + "archaic_words": "None", + "other_complex_words": "Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East", + "complexity_score": "very complex", + "reasoning": "The vocabulary is 
rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level." +} +```''' + +[cases.marco_polo_grade3.expected_result] +tier_2_words = ''' +served, nevertheless, scholars, described, treasures, wealthy, trading, survived, passengers''' +tier_3_words = ''' +Mongol Dynasty, Kublai Khan, ivory, jade, porcelain''' +archaic_words = ''' +None''' +other_complex_words = ''' +Venice, Ayas, Tabriz, Kerman, Mediterranean Sea, Far East, near East''' +complexity_score = ''' +very complex''' +reasoning = ''' +The vocabulary is rated as very complex due to the high density and conceptual load of unfamiliar terms for a 3rd grader, combined with a near-total lack of contextual scaffolding. The text introduces a large number of proper nouns that function as new vocabulary concepts (Mongol Dynasty, Kublai Khan, Venice, Ayas, Tabriz, Kerman) in a very short space. In addition, it includes several Tier 3 words for unfamiliar goods (ivory, jade, porcelain) and challenging Tier 2 words (nevertheless, scholars). 
These terms are presented without definition or explanation, requiring the student to process multiple new concepts simultaneously. This density of unfamiliar vocabulary will often present challenges and significantly slow down comprehension, even if the student can grasp the general story of travel and trade. The cumulative effect of these words pushes the text beyond moderately complex into the very complex category for this grade level.''' + + +# =========================================================================== +# Case 2: hurricanes_grade7 (grades 5–12 path) +# =========================================================================== + +[cases.hurricanes_grade7] +description = "Hurricane formation passage, grade 7 (grades 5-12 GPT path)" + +[cases.hurricanes_grade7.input] +text = ''' +Great whirling storms roar out of the oceans in many parts of the world. They are called by several names β€” hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. + +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure.
Low-pressure areas over warm oceans give birth to hurricanes.''' +grade = 7 + +[cases.hurricanes_grade7.prompt_steps.background_knowledge] +model = "gpt-4o-2024-11-20" +temperature = 0.0 +system_prompt = "" +user_prompt = ''' + +Review the following text, which is an educational text written for students in the following grade band: 7. + +Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with. + +Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.). + +Here's an example: +[START EXAMPLE] +Grade Band: 11th +Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not +learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was +not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to +know it by experience, and be able to give a true account of it in my next excursion. 
For most men, it appears to me, +are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it +is the chief end of man here to β€œglorify God and enjoy him forever.” + +Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy. +[END EXAMPLE] + +You should assume that the student is an average US public school who is learning from common core curriculum. When you respond, just respond with the background knowledge assumption and nothing else. + +You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have: +[BEGIN TOPICS] +[ + K: [ + "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees", + "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?", + "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?", + "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together", + "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?", + "How Has Our World Changed?", "Why Do People Have Jobs?" 
+ ], + 1: [ + "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds", + "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?", + "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?", + "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World", + "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?", + "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?" + ], + 2: [ + "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators", + "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?", + "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?", + "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are", + "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?", + "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?" 
+ ], + "3": [ + "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics", + "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist", + "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate", + "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?", + "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?", + "How Do People in a Community Meet Their Wants and Needs?" + ], + 4: [ + "Poetry", "Animal Defense Mechanisms", "The American Revolution", + "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)", + "A Great Heart: What does it mean to have a great heart, literally and figuratively?", + "Extreme Settings: How does a challenging setting or physical environment change a person?", + "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light", + "Earth's Features", "Waves, Energy, and Information", "Regions of the United States", + "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?", + "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?", + "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?" 
+ ], + 5: [ + "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change", + "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?", + "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?", + "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky", + "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation", + "How Were the Lives of Native Peoples Influenced by Where They Lived?", + "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?", + "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?", + "How Does the Constitution Help Us Understand What It Means to Be an American?", + "What Do the Early Years of the United States Reveal About the Character of the Nation?", + "What Was the Effect of the Civil War on U.S. Society?" 
+ ], + 6: [ + "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools", + "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?", + "A Hero’s Journey: What is the significance and power of the hero’s journey?", + "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?", + "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?", + "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy", + "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate", + "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)", + "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution", + "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)", + "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)", + "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II", + "The Cold War", "Civil Rights and American Society", "America Since the 1970s" + ], + 7: [ + "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution", + "Identity in the Middle Ages: How does society both support and limit the development of identity?", + "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?", + "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering", + "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources", + "Matter and Energy in 
Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent", + "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan", + "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires", + "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking", + "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World" + ], + 8: [ + "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment", + "The Poetics and Power of Storytelling: What is the power of storytelling?", + "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?", + "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy", + "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun", + "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms", + "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography", + "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses", + "Government and the Economy", "The Global Economy" + ] +] +[END TOPICS] + +Here is the text: +[BEGIN TEXT] +Great whirling storms roar out of the oceans in many parts of the world. They are called by several names β€” hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. 
+ +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes. +[END TEXT] +''' +llm_response = ''' +Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions.''' + +[cases.hurricanes_grade7.prompt_steps.vocab_complexity] +model = "gpt-4.1" +temperature = 0.0 +system_prompt = ''' + +You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text. + +You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric. + +IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.) +IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary. + +Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float). +''' +user_prompt = ''' + +Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. 
Stick to the rubric and annotation guide exactly β€” do not introduce any additional criteria or lenses for judging the complexity of the text. + +[BEGIN ANNOTATION GUIDE AND RUBRIC] +Instructions +For the following task, please assume that: + - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example). + - The student is moving through a common progression of topics (detailed here). + - The student is fluent in speaking English. + - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum. + - The student will use this material for independent reading/work, without direct instruction. + - The text is reasonable for the given grade level. + +Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket"). + +Please do be sure to consider: +- all of the different types of vocabulary (listed below) +- the overall proportion of complex words in the text - including repeated complex words. +- the resulting holistic complexity of the vocabulary (described in the Summary section below). + +Level 1: +Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0. + +Level 2: +Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1. 
+ +Level 3: +Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic + +Level 4: +Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading + +And here are some relevant definitions: + - Conversational: Everyday language. + - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge. + - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way. + - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data"). + - Note: + - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar. + - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb." + - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word. + - Unfamiliar proper nouns: + - A person's name, even if unfamiliar, generally does not add to complexity. + - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity. + +- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content. +- Overly-academic: Words that are excessively formal, complex, or specialized. + - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture" +- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way. + - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict." 
+ - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use. + + +Examples +The student is on-grade-level: +- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level. If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar. + +Figurative Language +- Kicked the bucket. +- The pen is mightier than the sword. +- The classroom was a zoo. +- He ran faster than the speed of light. +[END ANNOTATION GUIDE AND RUBRIC] + +Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars: +[BEGIN EXAMPLES] + +*** EXAMPLE 1 *** +The following text was intended for grade level 11 and received a complexity level of 1. + +Here is the background knowledge assumption for that text: N/A + +Here is the text: +// START TEXT // +"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect. + +He goes on to say that he had kissed many books before he had kissed a girl. 
Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them. + +I grew up in a small town in New Mexico, and we had very few books in our household. The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish. + +I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me. + +Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized." +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. 
In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one. + +It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. 
reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader. +// END REASONING // +*** EXAMPLE 2 *** +The following text was intended for grade level 5 and received a complexity level of 2. + +Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone). + +Here is the text: +// START TEXT // +Great whirling storms roar out of the oceans in many parts of the world. They are called by several namesβ€”hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. + +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes. 
+// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. The specific formation process and the vocabulary used to explain the processes are also subject-specfiic but not famliar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down to due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whiring storms" and "born" / "giving birth" to storm (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. 
Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that they text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4. +// END REASONING // + +*** EXAMPLE 3 *** +The following text was intended for grade level 6 and received a complexity level of 3. + +Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology. + +Here is the text: +// START TEXT // +Geology is the scientific study of Earth. Geologists study the planetβ€”its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology. + +Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. 
Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions. + +Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived. Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans. +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, archaic, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating. I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. 
After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification. +// END REASONING // +[END EXAMPLES] + +Below is the text you need to evaluate. It is intended for grade 7. + +As you read the text, you can assume the student has the following background knowledge about the text β€” this background knowledge influences which words from the text are familiar versus unfamiliar for the student: Background Knowledge Assumption: Students are likely familiar with basic weather concepts, including storms, air pressure, and the water cycle, but may not yet have detailed knowledge of hurricanes, their formation, or their global naming conventions. + +[BEGIN TEXT] +Great whirling storms roar out of the oceans in many parts of the world. They are called by several names β€” hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. + +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. 
There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes. +[END TEXT] + +In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2"). + +STRICT OUTPUT FORMAT: +- Return only the JSON value that conforms to the schema. Do not include any additional text, explanations, headings, or separators. +- Do not wrap the JSON in Markdown or code fences (no ``` or ```json). +- Do not prepend or append any text (e.g., do not write "Here is the JSON:"). +- The response must be a single top-level JSON value exactly as required by the schema (object/array/etc.), with no trailing commas or comments. + +The output should be formatted as a JSON instance that conforms to the JSON schema below. + +As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]} the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted. 
+ +Here is the output schema (shown in a code block for readability only β€” do not include any backticks or Markdown in your output): +``` +{"properties": {"tier_2_words": {"description": "List of Tier 2 words", "title": "Tier 2 Words", "type": "string"}, "tier_3_words": {"description": "List of Tier 3 words", "title": "Tier 3 Words", "type": "string"}, "archaic_words": {"description": "List of Archaic words", "title": "Archaic Words", "type": "string"}, "other_complex_words": {"description": "List of Other Complex words", "title": "Other Complex Words", "type": "string"}, "complexity_score": {"description": "the complexity of the text, one of: slightly complex, moderately complex, very complex, or exceedingly complex", "title": "Complexity Score", "type": "string"}, "reasoning": {"description": "your reasoning for your answer", "title": "Reasoning", "type": "string"}}, "required": ["tier_2_words", "tier_3_words", "archaic_words", "other_complex_words", "complexity_score", "reasoning"]} +``` +''' +llm_response = ''' +{"tier_2_words": "roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing", "tier_3_words": "hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist", "archaic_words": "", "other_complex_words": "ashore", "complexity_score": "slightly complex", "reasoning": "The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. 
There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2."}''' + +[cases.hurricanes_grade7.expected_result] +tier_2_words = ''' +roar, fury, envelope, damage, mightiest, develop, match, familiar, surface, rising, sinking, pressing''' +tier_3_words = ''' +hurricane, typhoon, cyclone, atmosphere, high pressure, low pressure, tropical, moist''' +archaic_words = "" +other_complex_words = ''' +ashore''' +complexity_score = ''' +slightly complex''' +reasoning = ''' +The vocabulary in this text is mostly contemporary, conversational, and familiar for a 7th grader with the assumed background knowledge. Words like 'hurricane,' 'typhoon,' 'cyclone,' 'atmosphere,' 'high pressure,' and 'low pressure' are subject-specific (Tier 3), but these are introduced with context and are not overly academic or dense. There are a few Tier 2 words such as 'roar,' 'fury,' 'envelope,' and 'damage,' but these are likely familiar from prior curriculum or everyday language. The word 'ashore' may be less familiar, but it is not enough to raise the overall complexity. There are no archaic or overly academic words. The proportion of complex words is low, and the text is accessible for independent reading at this grade level. 
Therefore, the vocabulary complexity is best described as 'slightly complex,' corresponding to a rubric level 2.''' diff --git a/sdks/settings/vocabulary/settings.toml b/sdks/settings/vocabulary/settings.toml new file mode 100644 index 0000000..295444f --- /dev/null +++ b/sdks/settings/vocabulary/settings.toml @@ -0,0 +1,410 @@ +[evaluator_metadata] +id = "vocabulary" +version = "0.1" +name = "Vocabulary" +description = "Vocabulary Complexity Evaluator" +maturity = "alpha" + +[[evaluator_metadata.inputs]] +name = "text" +type = "TextInputField" +description = "The text to evaluate for vocabulary complexity." + +[[evaluator_metadata.inputs]] +name = "grade" +type = "GradeInputField" +description = "The grade level of the text (3–12)." +allowed_grades = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12] + +# ── Prompt templates ─────────────────────────────────────────────────────────── +# +# Five prompt keys, in three groups, are used by the evaluator: +# background_knowledge_prompt – step 1 (all grades): produces a plain-text +# student background knowledge assumption. +# vocab_grades_3_4_system_prompt / vocab_grades_3_4_user_prompt – step 2 for +# grades 3–4: rubric-based, returns a string label +# and word-level breakdown. +# vocab_other_grades_system_prompt / vocab_other_grades_user_prompt – step 2 +# for grades 5–12: annotation-guide-based, returns +# a string complexity_score and reasoning. +# +# These prompts are the source of truth for what the evaluator sends to the LLM. +# They match the corresponding prompts in evals/prompts/vocab_prompts.py. +# If you update a prompt here, update the notebook file too (or vice versa). + +[prompts] +background_knowledge_prompt = """ +Review the following text, which is an educational text written for students in the following grade band: {grade}. 
+ +Your job is to give me a background knowledge assumption; that is: what topics, if any, from the text students are likely to be familiar with based on a standard progression of topics in US public school education, as well as topics, if any the student is not likely to be familiar with. + +Make sure your response is concise (between 1 - 3 lines max) and is about the topics themselves, not about any other aspect of the text (e.g. flowery language, complicated sentence structure, etc.). + +Here's an example: +[START EXAMPLE] +Grade Band: 11th +Text: I went to the woods because I wished to live deliberately, to front only the essential facts of life, and see if I could not +learn what it had to teach, and not, when I came to die, discover that I had not lived. I did not wish to live what was +not life, living is so dear; nor did I wish to practise resignation, unless it was quite necessary. I wanted to live deep and suck out all the marrow of life, to live so sturdily and Spartan-like as to put to rout all that was not life, to cut a broad swath and shave close, to drive life into a corner, and reduce it to its lowest terms, and, if it proved to be mean, why then to get the whole and genuine meanness of it, and publish its meanness to the world; or if it were sublime, to +know it by experience, and be able to give a true account of it in my next excursion. For most men, it appears to me, +are in a strange uncertainty about it, whether it is of the devil or of God, and have somewhat hastily concluded that it +is the chief end of man here to “glorify God and enjoy him forever.” + +Background Knowledge Assumption: Assume they’ve studied American Transcendentalists like Thoreau and Emerson, including the mid-19th-century context of nature-focused philosophy. +[END EXAMPLE] + +You should assume that the student is an average US public school who is learning from common core curriculum. 
When you respond, just respond with the background knowledge assumption and nothing else. + +You can use the following list of topics that we know are covered for each grade level, although use your best judgement if you know there are other topics out there that students are likely to have covered. And this doesn't cover higher grade levels, so you'll have to again use your judgement for, say, what background knowledge a 9th grader is likely to have: +[BEGIN TOPICS] +[ + K: [ + "Toys and Play", "Weather Wonders", "Trees are Alive", "Enjoying and Appreciating Trees", + "The Five Senses: How do our senses help us learn?", "Once Upon a Farm: What makes a good story?", + "America, Then and Now: How has life in America changed over time?", "The Continents: What makes the world fascinating?", + "Needs of Plants and Animals", "Pushes and Pulls", "Sunlight and Weather", "Learning and Working Together", + "How Do People Learn and Work Together?", "Where Do We Live?", "What Does it Mean to Be an American?", + "How Has Our World Changed?", "Why Do People Have Jobs?" + ], + 1: [ + "Tools and Work", "A Study of the Sun, Moon, and Stars", "Birds' Amazing Bodies", "Caring for Birds", + "A World of Books: How do books change lives around the world?", "Creature Features: What can we discover about animals’ unique features?", + "Powerful Forces: How do people respond to the powerful force of the wind?", "Cinderella Stories: Why do people around the world admire Cinderella?", + "Animal and Plant Defenses", "Light and Sounds", "Spinning Earth", "Our Place in the World", + "What Are the Rights and Responsibilities of Citizens?", "How Can We Describe Where We Live?", + "How Do We Celebrate Our Country?", "How Does the Past Shape Our Lives?", "Why Do People Work?" 
+ ], + 2: [ + "Schools and Community", "Fossils Tell of Earth's Changes", "The Secret World of Pollination", "Providing for Pollinators", + "A Season of Change: How does change impact people and nature?", "The American West: What was life like in the West for early Americans?", + "Civil Rights Heroes: How can people respond to injustice?", "Good Eating: How does food nourish us?", + "Plant and Animal Relationships", "Properties of Matter", "Changing Landforms", "Exploring Who We Are", + "Why Is It Important to Learn About the Past?", "How Does Geography Help Us Understand Our World?", + "How Do We Get What We Want and Need?", "Why Do We Need Government?", "How Can People Make a Difference in Our World?" + ], + "3": [ + "Overcoming Learning Challenges Near and Far", "Adaptations and the Wide World of Frogs", "Exploring Literary Classics", + "Water Around the World", "Ocean/Sea Exploration", "Outer Space", "Immigration", "Art/Being an Artist", + "Balancing Forces", "Inheritance and Traits", "Environments and Survival", "Weather and Climate", + "Communities", "Why Does It Matter Where We Live?", "What Is Our Relationship With Our Environment?", + "What Makes a Community Unique?", "How Does the Past Impact the Present?", "Why Do Governments and Citizens Need Each Other?", + "How Do People in a Community Meet Their Wants and Needs?" 
+ ], + 4: [ + "Poetry", "Animal Defense Mechanisms", "The American Revolution", + "Responding to Inequality: Ratifying the 19th Amendment (covers gender and racial inequality)", + "A Great Heart: What does it mean to have a great heart, literally and figuratively?", + "Extreme Settings: How does a challenging setting or physical environment change a person?", + "American Revolution/Multiple Perspectives", "Myths/Myth Making", "Energy Conversions", "Vision and Light", + "Earth's Features", "Waves, Energy, and Information", "Regions of the United States", + "How Does America Use Its Strengths and Face Its Challenges?", "Why Have People Moved to and From the Northeast?", + "How Has the Southeast Changed Over Time?", "How Does the Midwest Reflect the Spirit of America?", + "How Does the Southwest Reflect Its Diverse Past and Unique Environment?", "What Draws People to the West?" + ], + 5: [ + "Human Rights", "Biodiversity in the Rainforest", "Athlete Leaders of Social Change", + "Impact of Natural Disasters", "Cultures in Conflict: How do cultural beliefs and values guide people?", + "Word Play: How and why do writers play with words?", "A War Between Us: How did the Civil War impact people?", + "Breaking Barriers: How can sports influence individuals and societies?", "Patterns of Earth and Sky", + "Modeling Matter", "The Earth System", "Ecosystem Restoration", "U.S. History: Making a New Nation", + "How Were the Lives of Native Peoples Influenced by Where They Lived?", + "What Happened When Diverse Cultures Crossed Paths?", "What Is the Impact of People Settling in a New Place?", + "Why Would a Nation Want to Become Independent?", "What Does the Revolutionary Era Tell Us About Our Nation Today?", + "How Does the Constitution Help Us Understand What It Means to Be an American?", + "What Do the Early Years of the United States Reveal About the Character of the Nation?", + "What Was the Effect of the Civil War on U.S. Society?" 
+ ], + 6: [ + "Greek Mythology", "Critical Problems and Design Solutions", "American Indian Boarding Schools", + "Remarkable Accomplishments in Space Science", "Resilience in the Great Depression: How can enduring tremendous hardship contribute to personal transformation?", + "A Hero’s Journey: What is the significance and power of the hero’s journey?", + "Narrating the Unknown: How did the social and environmental factors in the unknown world of Jamestown shape its development and decline?", + "Courage in Crisis: How can the challenges of a hostile environment inspire heroism?", + "Microbiome", "Metabolism", "Metabolism Engineering", "Traits and Reproduction", "Thermal Energy", + "Ocean, Atmosphere, and Climate", "Weather Patterns", "Earth's Changing Climate", + "Earth's Changing Climate: Engineering Internship", "The First Americans (up to 1492)", + "Exploration and Colonization", "English Colonies", "American Revolution", "First Governments and the Constitution", + "The Early American Republic", "Political and Geographic Changes (1828-1850)", "Life in the North and South (1820-1860)", + "Division and Civil War (1821-1865)", "Reconstruction (1865-1896)", "The West (1858-1896)", + "New Industry and a Changing Society", "Expansion and War", "The 1920s and 1930s", "World War II", + "The Cold War", "Civil Rights and American Society", "America Since the 1970s" + ], + 7: [ + "The Lost Children of Sudan (Genocide, Genocide in Sudan)", "Epidemics", "Harlem Renaissance", "Plastic Pollution", + "Identity in the Middle Ages: How does society both support and limit the development of identity?", + "Americans All: How did World War II affect individuals?", "Language and Power: What is the power of language?", + "Fever: How can times of crisis affect citizens and society?", "Geology on Mars", "Plane Motion", "Plane Motion Engineering", + "Rock Formations", "Phase Change", "Phase Change Engineering", "Chemical Reactions", "Populations and Resources", + "Matter and Energy in 
Ecosystems", "Early Humans and Agricultural Revolution", "Fertile Crescent", + "Ancient Egypt and Kush", "The Israelites", "Ancient Greece", "Ancient South Asia", "Early China, Korea, and Japan", + "Ancient Rome", "Rise of Christian Kingdoms", "The Americas", "Medieval Europe", "The Rise of Islamic Empires", + "China in the Middle Ages", "Korea and Japan in the Middle Ages", "African Civilizations", "New Ways of Thinking", + "Age of Exploration and Trade", "Revolutions and Empires", "The Modern World" + ], + 8: [ + "Folklore of Latin America", "Food Choices", "The Holocaust", "Japanese American Internment", + "The Poetics and Power of Storytelling: What is the power of storytelling?", + "The Great War: How do literature and art illuminate the effects of World War I?", "What Is Love?", + "Teens as Change Agents: How do people effect social change?", "Harnessing Human Energy", + "Force and Motion", "Force and Motion Engineering", "Magnetic Fields", "Light Waves", "Earth, Moon, and Sun", + "Natural Selection", "Natural Selection Engineering", "Evolutionary History", "The World in Spatial Terms", + "Places and Regions", "Physical Geography", "Population Geography", "Economic Geography", + "Political Geography", "Human-Environment Geography", "What is Economics?", "Markets, Money, and Businesses", + "Government and the Economy", "The Global Economy" + ] +] +[END TOPICS] + +Here is the text: +[BEGIN TEXT] +{text} +[END TEXT] +""" + +vocab_grades_3_4_system_prompt = """ +You are an expert curriculum designer. Your job is to rate the complexity of a text's vocabulary relative to the grade level. + +You will be given a rubric (with levels from least to most complex: slightly complex, moderately complex, very complex, exceedingly complex) as well as guidelines for interpreting the rubric. +IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meaning, etc.) 
+ +**Resource 1: Qualitative Text Complexity rubric (SAP)** +1. **Level 1: Slightly complex** + * Original Definition: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. A very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0. + * Summary definition: Overall, vocabulary is easy to understand and does not impede comprehension of the bulk of the text (including main idea and supporting claims). 1-2 quick pauses for processing by the student are ok here! +2. **Level 2: Moderately complex** + * Original Definition: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK + * Summary definition: Overall, vocabulary generally allows students to comprehend the bulk of the text with little difficulty, though there may be occasional pauses for clarification. Several quick pauses or occasional prolonged pauses may occur. +3. **Level 3: Very complex** + * Original Definition: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic + * Summary definition: Overall, vocabulary often presents challenges that may slow down comprehension but does not completely block the comprehension of the bulk of the text. +4. **Level 4: Exceedingly complex** + * Original Definition: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading. + * Summary definition: Overall, vocabulary is so complex that it makes comprehension of the bulk of the text very challenging and requires careful effort to interpret. + +**Resource 2: Flesch-Kincaid Grade Level** +Use the Flesch-Kincaid (FK) Grade Level as light guidance of the approximate grade level based on readability. 
The metric alone does not provide final information of vocabulary complexity, but a ballpark of the difficulty of the entire text. +* grade 2-3: 1.98-5.34 +* grade 4-5: 4.51-7.73 +* grade 6-8: 6.51-10.34 +* grade 9-10: 8.32-12.12 +* grade 11-College: 10.34-14.2 + +**Guidelines for Interpretation and Reasoning** + +Your reasoning is the most critical part of your analysis. It's not enough to simply count complex words. You must analyze their impact on a student at the specified grade level. Use the following principles to guide your judgment: + +1. **Density and Cumulative Effect:** Do not just count complex words; evaluate their concentration. A short text with a high density of challenging Tier 2 words (e.g., `peculiar`, `mischievous`, `courageous` for a 4th grader) can be more overwhelming than a longer text with a few scattered Tier 3 words. A constant barrage of unfamiliar words can elevate complexity from `very` to `exceedingly`. +2. **Contextual Scaffolding:** Assess how the text supports new vocabulary. + * Are new, complex terms explicitly defined or explained with simple examples (e.g., "volume... to see if it is big enough to hold a liter of food")? + * Is the surrounding language simple and conversational, making the meaning of new words easier to infer? + * Strong scaffolding can lower the complexity rating. A text with many Tier 3 words that are well-explained might only be `moderately complex`. +3. **Abstract vs. Concrete Vocabulary:** Differentiate between words for abstract concepts and words for concrete things. A text built on abstract Tier 2 words (e.g., `relationships`, `performance`, `non-physical`) can be more challenging than a text that introduces Tier 3 labels for concrete things or people (e.g., `Sumerians`, `polonium`). +4. **Conceptual Load:** Consider the cognitive load of the vocabulary. 
A list of many new, multi-syllabic, conceptually-heavy terms (e.g., `Paleolithic`, `Mesolithic`, `Neolithic` for a 3rd grader) can be `very complex` even if the terms are briefly defined, because the student must process multiple new concepts at once. +5. **Calibrating the Top Levels:** Be precise in your use of `very complex` vs. `exceedingly complex`. + * **Very complex:** The vocabulary creates significant hurdles and slows the reader down, but the main ideas of the text are still accessible with effort. + * **Exceedingly complex:** The vocabulary is so dense, technical, or abstract that it acts as a barrier, making it nearly impossible for the target student to grasp the bulk of the text's meaning without extensive outside help. Reserve this for texts saturated with advanced terminology. +6. **Consider Background Knowledge:** Pay close attention to the provided `student_background_knowledge`. Do not classify a word as complex if the student is likely to be familiar with it (e.g., 'oxygen' for a 3rd grader who has learned about the human body). + +**Final Analysis Format** + +Provide these information as your final analysis: +1. **Complex vocabulary:** + * Tier 2 words: Words that are commonly used in academic settings and more complex than colloquial, or everyday language and often have multiple meanings. + * Tier 3 words: Overly academic or domain-specific words. + * Archaic words: Words, or uses of words that are not commonly used in modern conversational language. E.g., “The jury retired to deliberate on their verdict." The use of “retire” to mean withdrawing to a private place is an archaic use. + * Other complex words: All other words that can increase complexity of the text (e.g., idioms, unfamiliar proper nouns that function as vocabulary). +2. **Vocabulary complexity:** one of: slightly complex, moderately complex, very complex, exceedingly complex +3. 
**Your reasoning of the complexity:** A detailed explanation of your rating, referencing the principles above. +""" + +vocab_grades_3_4_user_prompt = """ +Below is the text you need to evaluate. Let's think step by step in order to predict the output of the vocabulary complexity task. + +- It is intended for grade {student_grade_level}. + +- You can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge} + +- Text Flesch-Kincaid grade level: {fk_level} + +- Text to evaluate: [BEGIN TEXT] +{text} +[END TEXT] + +{format_instructions} +""" + +vocab_other_grades_system_prompt = """ +You are an expert curriculum designer. Your job involves reading text snippets intended for students in K-12 and evaluating the complexity of the vocabulary in each text. + +You will be given a rubric (with options 1, 2, 3, 4) as well as guidelines for interpreting the rubric. + +IMPORTANT: You should only pay attention to the vocabulary. Do not evaluate any other element of the text's complexity (e.g. sentence structure, meainng, etc.) +IMPORTANT: Rely on the supplied rubric and annotation guidelines along. Do not introduce any new crtieria for evaluating the complexity of a text's vocabulary. + +Please first reason out loud about the vocabulary complexity of the text and then provide an answer between 1 and 4 (whole numbers only). Provide the answer as an integer (not a float). +""" + +vocab_other_grades_user_prompt = """ +Your job is to rate the complexity of a text's vocabulary (relative to the intended level of the text) according to a rubric and annotation guide. Stick to the rubric and annotation guide exactly — do not introduce any additional criteria or lenses for judging the complexity of the text. 
+ +[BEGIN ANNOTATION GUIDE AND RUBRIC] +Instructions +For the following task, please assume that: + - The student is on grade level and proficient in all core content areas, including reading fluency, comprehension, science, & social studies. (example). + - The student is moving through a common progression of topics (detailed here). + - The student is fluent in speaking English. + - The student has an "average" amount of background knowledge on topics not commonly covered in curriculum. + - The student will use this material for independent reading/work, without direct instruction. + - The text is reasonable for the given grade level. + +Please do not consider the presence of figurative language when scoring Vocabulary. For example: with a phrase like "kicked the bucket," consider only the qualities of the words themselves ("kicked", "the" and "bucket"). + +Please do be sure to consider: +- all of the different types of vocabulary (listed below) +- the overall proportion of complex words in the text - including repeated complex words. +- the resulting holistic complexity of the vocabulary (described in the Summary section below). + +Level 1: +Rubric: Vocabulary that is almost entirely not complex: contemporary, conversational, and/or familiar. That said, a very low proportion of complex words (archaic, subject-specific, academic) is OK -- i.e. doesn't need to be 0. + +Level 2: +Rubric: Vocabulary that is mostly not complex: contemporary, conversational, and/or familiar. A low proportion of complex words (archaic, subject-specific, academic) is OK, but if it's very low, the text is probably level 1. + +Level 3: +Rubric: Vocabulary that is often complex: unfamiliar, archaic, subject-specific, and/or overly academic + +Level 4: +Rubric: Vocabulary that is mostly complex: unfamiliar, archaic, subject-specific, and/or overly academic. May be ambiguous or purposefully misleading + +And here are some relevant definitions: + - Conversational: Everyday language. 
+ - Familiar: Words that the student is likely to have seen/heard, from everyday life or their curriculum. Reminder: assume an "average" level of background knowledge. + - Unfamiliar: Words the student has probably not heard, or are being used in an unfamiliar way. + - For ex: 4th graders are familiar with the word "table" but may not be familiar with the use of the word with respect to data ("a table of data"). + - Note: + - Words with in-line definitions (via appositives, or because they can be easily inferred from other parts of the text) should be evaluated as less unfamiliar. + - For ex: "The pharaoh, a powerful ruler of ancient Egypt, was buried in a grand tomb." + - The word "pharaoh" might be unfamiliar or subject-specific, but since is defined within the text, you can consider it a more familiar word. + - Unfamiliar proper nouns: + - A person's name, even if unfamiliar, generally does not add to complexity. + - Other unfamiliar proper nouns (eg locations, organizations) do add to complexity. + +- Subject-specific: Words that are specific to a subject or field of study that are essential for understanding concepts and engaging with the content. +- Overly-academic: Words that are excessively formal, complex, or specialized. + - For ex: "The agrarian societal structure of the Neolithic Revolution precipitated a paradigm shift in agriculture" +- Archaic: A word that was common in the past but is now rarely/almost never used. Could also be a word used in an archaic way. + - For ex: "After a long day of court proceedings, the jury 'retired' to deliberate on their verdict." + - The word "retire" meaning to stop working may be familiar to a student, but "retire" meaning "withdrawing to a private place" is an archaic use. + + +Examples +The student is on-grade-level: +- Consider a 6th grade passage about earth systems. Per NGSS standards, students are introduced to earth systems starting in 2nd grade. 
They encounter words like: wind, water, river, lake, solids, and liquids. For our rating purposes, we would assume most students following 2nd have encountered these words. In 5th grade, they dive more fully into earth systems concepts, learning vocabulary words like geosphere, sediment, biosphere, atmosphere, ecosystems, organisms and climate. While rating, we would consider the words listed in the NGSS standards as more familiar following that grade level. If the same passage were intended for 3rd graders, though, then the subject-specific vocabulary is likely to be unfamiliar. + +Figurative Language +- Kicked the bucket. +- The pen is mightier than the sword. +- The classroom was a zoo. +- He ran faster than the speed of light. +[END ANNOTATION GUIDE AND RUBRIC] + +Here are a couple examples of texts that have already been scored along with justification for their scores, which you can use as exemplars: +[BEGIN EXAMPLES] + +*** EXAMPLE 1 *** +The following text was intended for grade level 11 and received a complexity level of 1. + +Here is the background knowledge assumption for that text: N/A + +Here is the text: +// START TEXT // +"In a recent lecture, "Is Nothing Sacred?", Salman Rushdie, one of the most censored authors of our time, talked about the importance of books. He grew up in a household in India where books were as sacred as bread. If anyone in the household dropped a piece of bread or a book, the person not only picked it up, but also kissed the object by way of apologizing for clumsy disrespect. + +He goes on to say that he had kissed many books before he had kissed a girl. Bread and books were for his household, and for many like his, food for the body and the soul. This image of the kissing of the book one had accidentally dropped made an impression on me. It speaks to the love and respect many people have for them. + +I grew up in a small town in New Mexico, and we had very few books in our household. 
The first one I remember reading was my catechism book. Before I went to school to learn English, my mother taught me catechism in Spanish. + +I remember the questions and answers I had to learn, and I remember the well-thumbed, frayed volume which was sacred to me. + +Growing up with few books in the house created in me a desire and a need for them. When I started school, I remember visiting the one room library of our town and standing in front of the dusty shelves. In reality there were only a few shelves and not over a thousand books, but I wanted to read them all. There was food for my soul in the books, that much I realized." +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +This text is a 1 for vocabulary, because the vocabulary that is used is familiar and accessible for a proficient 11th grader. Most of the words used in the text are very common everyday vocabulary for describing growing up, family life, and the importance of reading. A few examples of these very common words are: small town, book, school, learn, food, kissed, image, respect, love, speaks. There are many more in the text. In this text there are only a few "juicier" or more complex words, you can think of those as words that are less familiar, have a more abstract or nuanced meaning, or carry a very large concept. Less commonly spoken words that were used in the text were: frayed, volume, censored, clumsy, sacred. These are still well within reach of a proficient 11th grader, and would still be considered familiar, because they will have encountered them in past reading or academic studies. In the text there are a couple of words that are outliers, but they are not essential to the understanding of the larger text. One of these words or hyphenated compound phrase is well-frayed. A compound phrase is a phrase consisting of multiple words that work together to create a specific meaning or idea, often acting as a single unit in a sentence. 
If the meaning of individual words is familiar, it is typically quite easy for proficient readers to generalize the larger meaning that the author is implying with their word choice. In this case, proficient students will be accustomed to the phrase well, with the secondary meaning of very, rather than a description of positivity or health; and they will be accustomed to the use frayed, as in worn, aged, or damaged from use. Making the leap to identify the meaning of "well-frayed" as a book that is very used, will take only moments for a proficient 11th grader. Another word that stands out in the text is the word catechism, which might be new for many students based on their personal background or location, but a full understanding of what a catechism book contains is not essential for understanding the paragraph or whole text. The reader can make it through using minimum context clues to know that the catechism must be something important to his family. The type of book he learned to read before going to school is not critical for comprehension, it's enough to understand that reading was so important in his family, his mother started instruction before he even started school. Additionally, it's important to know that having one unknown word for an 11th grade reading, does not merit a rating higher than one. + +It is worth noting that another reason this text is a 1, is that the content or topic of the passage is so familiar and covered extensively in K-12 education, i.e. reading is important, loving books, growing up; that coupled with the simple vocabulary choices, getting to the meaning of the overall text, and even the paragraphs, would be incredibly easy for a proficient 11th grader. +// END REASONING // +*** EXAMPLE 2 *** +The following text was intended for grade level 5 and received a complexity level of 2. 
+ +Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with the concept of natural disasters, including hurricanes, and basic atmospheric concepts like high and low pressure from their studies on weather and climate. They may not be familiar with the specific formation processes of hurricanes or the global terminology differences (hurricane, typhoon, cyclone). + +Here is the text: +// START TEXT // +Great whirling storms roar out of the oceans in many parts of the world. They are called by several names—hurricane, typhoon, and cyclone are the three most familiar ones. But no matter what they are called, they are all the same sort of storm. They are born in the same way, in tropical waters. They develop the same way, feeding on warm, moist air. And they do the same kind of damage, both ashore and at sea. Other storms may cover a bigger area or have higher winds, but none can match both the size and the fury of hurricanes. They are earth's mightiest storms. + +Like all storms, they take place in the atmosphere, the envelope of air that surrounds the earth and presses on its surface. The pressure at any one place is always changing. There are days when air is sinking and the atmosphere presses harder on the surface. These are the times of high pressure. There are days when a lot of air is rising and the atmosphere does not press down as hard. These are times of low pressure. Low-pressure areas over warm oceans give birth to hurricanes. +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +I scored this a 2 because of the density of subject-specific vocabulary related to weather and climate, which is often covered in lower grade levels. This adds to the complexity above a 1, but it is not a level 3 because of the familiarity with the topic, which implies some familiarity with the vocabulary as well. 
The specific formation process and the vocabulary used to explain the processes are also subject-specific but not familiar, which would make the second paragraph a level 3 in the rubric language, but when considering the language used in the overall SUMMARY below the rubric, this new content and vocabulary would cause quick pauses and/or occasional prolonged pauses but would not cause the reader to slow down due to challenging overall comprehension of the key ideas and supporting claims. This is especially the case because the second paragraph builds upon prior knowledge and familiar vocabulary use, so it is not entirely new information and vocabulary. While there is subject-specific vocabulary used, overly academic vocabulary is NOT used and is more conversational in nature, such as "great whirling storms" and "born" / "giving birth" to storm (although this is the way storms are described!) rather than more technical terms which made comprehension easier due to the accessibility of the vocabulary (even if used in other contexts before reading this text). Words such as "a lot" and "bigger" are more conversational, and while technical, unfamiliar words are provided, such as "hurricane," "typhoon," and "cyclone," knowing and understanding their differences is not necessary to grasp the main idea. The processes by which they are formed are what need to be retained while reading the entire text, and familiarity with the bulk of the vocabulary used would allow for that to happen without too much struggle to make meaning of it. Additionally, the text does not contain any archaic vocabulary or ambiguous words, which prevents it from reaching a rating of 4, although it is not necessary that the text have such vocabulary to meet a level 4, the frequent inclusion of such vocabulary makes it more likely to land at least a 3 or 4. +// END REASONING // + +*** EXAMPLE 3 *** +The following text was intended for grade level 6 and received a complexity level of 3.
+ +Here is the background knowledge assumption for that text: Background Knowledge Assumption: Students are likely familiar with basic Earth science concepts such as rocks, minerals, and fossils, as well as natural processes like volcanic eruptions and earthquakes. They may not be familiar with more advanced topics like plate tectonics or the specific branches of geology such as mineralogy, petrology, and seismology. + +Here is the text: +// START TEXT // +Geology is the scientific study of Earth. Geologists study the planet—its formation, its internal structure, its materials, its chemical and physical processes, and its history. Mountains, valleys, plains, sea floors, minerals, rocks, fossils, and the processes that create and destroy each of these are all the domain of the geologist. Geology is divided into two broad categories of study: physical geology and historical geology. + +Physical geology is concerned with the processes occurring on or below the surface of Earth and the materials on which they operate. These processes include volcanic eruptions, landslides, earthquakes, and floods. Materials include rocks, air, seawater, soils, and sediment. Physical geology further divides into more specific branches, each of which deals with its own part of Earth's materials, landforms, and processes. Mineralogy and petrology investigate the composition and origin of minerals and rocks. Volcanologists study lava, rocks, and gases on live, dormant, and extinct volcanoes. Seismologists use instruments to monitor and predict earthquakes and volcanic eruptions. + +Historical geology is concerned with the chronology of events, both physical and biological, that have taken place in Earth's history. Paleontologists study fossils (remains of ancient life) for evidence of the evolution of life on Earth. Fossils not only relate evolution, but also speak of the environment in which the organism lived.
Corals in rocks at the top of the Grand Canyon in Arizona, for example, show a shallow sea flooded the area around 290 million years ago. In addition, by determining the ages and types of rocks around the world, geologists piece together continental and oceanic history over the past few billion years. Plate tectonics (the study of the movement of the sections of Earth's crust) adds to Earth's story with details of the changing configuration of the continents and oceans. +// END TEXT // + +Here is the reasoning for that complexity level: +// START REASONING // +To determine the complexity rating of this text based on the vocabulary present, I used the annotation guide, scoring rubric, and examples to set the expectations for rating. During the first read of the text, I "bolded" and categorized the more challenging vocabulary words according to the following complexity groupings: archaic, unfamiliar, subject-specific, and/or overly academic. On the second read, I considered the main idea or "gist" that students need to acquire understanding of. I then referenced the previously mentioned tools–annotation guide, scoring rubric, and examples to remind myself of the expectations for rating. I agreed that readers would have familiarity with basic concepts of geology; however, I also considered the definitions provided for words such as Geology, Geologists, Physical Geology, Historical Geology, Mineralogy, and Petrology. I considered how students might pause for clarification and for how long. After reviewing the Annotation Guide while considering, I narrowed the rating down because the definitions provided throughout the text of more complex words should make the meaning of the text more accessible for readers, which is why although the words are subject-specific, I rated this text as a 3 instead of a 2-less complex or a 4–more complex. I read the text one final time to ensure clarity around my rating, scored and wrote the justification.
+// END REASONING // +[END EXAMPLES] + +Below is the text you need to evaluate. It is intended for grade {student_grade_level}. + +As you read the text, you can assume the student has the following background knowledge about the text — this background knowledge influences which words from the text are familiar versus unfamiliar for the student: {student_background_knowledge} + +[BEGIN TEXT] +{text} +[END TEXT] + +In your response, when specifying the level of complexity, be sure to use only a single integer (e.g. 2) and don't include any other text (e.g. don't say "level 2"). + +{format_instructions} +""" + +# ── Evaluation settings ──────────────────────────────────────────────────────── +# +# Three LLM steps, each with its own provider and model: +# background_knowledge – GPT-4o for the background knowledge generation step +# vocab_grades_3_4 – Gemini for the grades 3–4 complexity step +# vocab_other_grades – GPT-4.1 for the grades 5–12 complexity step + +[evaluation_settings] + +[evaluation_settings.prompt_settings_step_background_knowledge] +provider_type = "OPENAI" +model = "gpt-4o-2024-11-20" +temperature = 0 + +[evaluation_settings.prompt_settings_step_vocab_grades_3_4] +provider_type = "GOOGLE" +model = "gemini-2.5-pro" +temperature = 0 + +[evaluation_settings.prompt_settings_step_vocab_other_grades] +provider_type = "OPENAI" +model = "gpt-4.1" +temperature = 0