Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test-sdk-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
working-directory: sdks/python
strategy:
matrix:
python-version: ["3.10", "3.12", "3.13"]
python-version: ["3.10", "3.11", "3.12", "3.13"]
steps:
- name: Checkout
uses: actions/checkout@v6
Expand Down
59 changes: 50 additions & 9 deletions sdks/python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,48 +3,74 @@ PYTEST := $(PYTHON) -m pytest
RUFF := $(PYTHON) -m ruff
MYPY := $(PYTHON) -m mypy

.PHONY: help test unit-test generate-settings check-generated \
# Bundled settings directory inside the package. Two things are maintained here:
# _generated_*_settings.py — generated from settings.toml by `make generate-settings`
# contracts.toml — copied from sdks/settings/ by `make sync-settings`
#
# Tests always read contracts.toml from the bundled copy (via importlib.resources
# fallback in shared_settings_root()). EVALUATORS_SETTINGS_DIR is intentionally NOT
# exported here so that dev and installed-package runs behave identically.
SETTINGS_DST := src/learning_commons_evaluators/settings

.PHONY: help build check-build test unit-test contract-test \
generate-settings check-generated sync-settings check-sync \
lint format format-check typecheck pip-check verify coverage

help:
@echo "Usage: make <target>"
@echo ""
@echo " lint Ruff linter (src, tests, settings generator script)"
@echo " build Prepare the package: generate settings + sync contracts TOML"
@echo " check-build Verify build artifacts are up to date (use in CI)"
@echo ""
@echo " lint Ruff linter (src/, tests/, scripts/)"
@echo " format Apply Ruff formatter"
@echo " format-check Fail if Ruff would reformat any file"
@echo " typecheck Mypy on src package + tests"
@echo " pip-check pip dependency consistency check"
@echo " verify check-generated + lint + format-check + typecheck + pip-check + test (unit + contract)"
@echo " verify check-build + lint + format-check + typecheck + pip-check + test (unit + contract)"
@echo " coverage Unit tests with coverage report (requires dev install)"
@echo ""
@echo " test Run the full test suite"
@echo " unit-test Run unit tests only"
@echo " test Run the full test suite (unit + contract)"
@echo " unit-test Run unit tests only (skip contract tests)"
@echo " contract-test Run contract tests only"
@echo ""
@echo " generate-settings Re-generate _generated_*_settings.py from sdks/settings/"
@echo " check-generated Verify generated .py files match sdks/settings/ (exits 1 if stale)"
@echo " sync-settings Copy contracts.toml from sdks/settings/ into the package"
@echo " check-sync Verify bundled contracts.toml matches sdks/settings/"
@echo ""
@echo "Typical workflows:"
@echo " After editing evaluator settings TOML: make generate-settings && make test"
@echo " After editing contracts TOML: make sync-settings && make contract-test"
@echo " Fresh checkout / before release: make build && make verify"
@echo " CI: make verify"

# ── Build ─────────────────────────────────────────────────────────────────────
# Prepares all package build artifacts in one step. Run after editing any
# file under sdks/settings/, or on a fresh checkout before running tests.

build: generate-settings sync-settings

check-build: check-generated check-sync

# ── Static checks (Ruff + Mypy) ───────────────────────────────────────────────

lint:
$(RUFF) check src tests scripts/generate_settings.py
$(RUFF) check src tests scripts

format:
$(RUFF) format src tests scripts/generate_settings.py
$(RUFF) format src tests scripts

format-check:
$(RUFF) format --check src tests scripts/generate_settings.py
$(RUFF) format --check src tests scripts

typecheck:
$(MYPY) src/learning_commons_evaluators tests

pip-check:
$(PYTHON) -m pip check

verify: check-generated lint format-check typecheck pip-check test
verify: check-build lint format-check typecheck pip-check test

coverage:
$(PYTEST) tests/ -q --ignore=tests/contract_tests --cov=learning_commons_evaluators --cov-report=term-missing
Expand All @@ -57,11 +83,26 @@ test:
unit-test:
$(PYTEST) tests/ -v --ignore=tests/contract_tests

contract-test:
Comment thread
czi-fsisenda marked this conversation as resolved.
$(PYTEST) tests/contract_tests/ -v

# ── Code generation ───────────────────────────────────────────────────────────
# Re-run when an evaluator settings TOML under sdks/settings/ changes.
# The generator sets EVALUATORS_SETTINGS_DIR itself to point at sdks/settings/.

generate-settings:
$(PYTHON) scripts/generate_settings.py

check-generated:
$(PYTHON) scripts/generate_settings.py --check

# ── Contract test bundling ────────────────────────────────────────────────────
# contracts.toml is bundled so contract tests run consistently whether using
# a development checkout or a pip-installed package. The sync command is
# handled by the Python generator script.

sync-settings:
$(PYTHON) scripts/generate_settings.py --sync

check-sync:
$(PYTHON) scripts/generate_settings.py --check-sync
43 changes: 42 additions & 1 deletion sdks/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,42 @@ config = create_config(
)
```

### Per-instance default evaluation settings

Every `BaseEvaluator` subclass defines a **class-level** `default_evaluation_settings`
attribute (the bundled evaluators load theirs from generated settings). You can override
that default for a single evaluator instance by passing `default_evaluation_settings=...`
to the constructor:

```python
from learning_commons_evaluators import ConventionalityEvaluator, create_config

config = create_config(...)
# Start from the bundled defaults, then change what your deployment needs (models,
# temperatures, etc. live on nested PromptSettings).
settings = ConventionalityEvaluator.default_evaluation_settings.model_copy(deep=True)
settings.prompt_settings_step_conventionality_evaluation = (
settings.prompt_settings_step_conventionality_evaluation.model_copy(
update={"temperature": 0.2}
)
)
evaluator = ConventionalityEvaluator(
config,
default_evaluation_settings=settings,
)

# Uses the instance default (a deep copy is taken inside evaluate)
result = evaluator.evaluate(input)

# Per-call override still wins
result = evaluator.evaluate(input, evaluation_settings=other_settings)
```

If you omit `default_evaluation_settings` at construction, the instance falls back to
the subclass's class-level attribute, exactly as before. Whenever you call `evaluate()`
without `evaluation_settings`, the SDK uses a `model_copy(deep=True)` of the resolved
default, so the object you keep on the instance is not mutated by a run.

### Logging

The SDK uses Python's standard `logging` module. By default, `EvaluatorConfig` uses the
Expand Down Expand Up @@ -294,7 +330,10 @@ On evaluation failure, `metadata.status` and `error_details` are set on the in-m

## Creating custom evaluators

Extend `BaseEvaluator` to create custom evaluators:
Extend `BaseEvaluator` to create custom evaluators. Set **class-level**
`default_evaluation_settings` for the usual defaults; callers may still construct
`MyEvaluator(config, default_evaluation_settings=...)` to pin different defaults on a
specific instance (see [Per-instance default evaluation settings](#per-instance-default-evaluation-settings)).

```python
from learning_commons_evaluators import BaseEvaluator, EvaluatorConfig
Expand Down Expand Up @@ -329,6 +368,8 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]):
return EvaluationResult(answer=..., explanation=..., metadata=evaluation_metadata)
```

If your subclass overrides `__init__`, accept the keyword-only `default_evaluation_settings` argument and forward it to the base class: `super().__init__(config, default_evaluation_settings=default_evaluation_settings)`.

## License

MIT
Loading