learning-commons-org · czi-fsisenda · May 14, 2026 · Apr 30, 2026 · May 13, 2026 · May 13, 2026
diff --git a/.github/workflows/test-sdk-python.yml b/.github/workflows/test-sdk-python.yml
@@ -30,7 +30,7 @@ jobs:
         working-directory: sdks/python
     strategy:
       matrix:
-        python-version: ["3.10", "3.12", "3.13"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
     steps:
       - name: Checkout
         uses: actions/checkout@v6

diff --git a/sdks/python/Makefile b/sdks/python/Makefile
@@ -3,48 +3,74 @@ PYTEST  := $(PYTHON) -m pytest
 RUFF    := $(PYTHON) -m ruff
 MYPY    := $(PYTHON) -m mypy
 
-.PHONY: help test unit-test generate-settings check-generated \
+# Directory inside the package that holds the bundled settings.  It contains:
+#   _generated_*_settings.py  — generated from the evaluator settings TOML by `make generate-settings`
+#   contracts.toml             — copied from sdks/settings/ by `make sync-settings`
+# NOTE(review): no rule visible in this diff references SETTINGS_DST — confirm it is used.
+# Tests always read contracts.toml from the bundled copy (via importlib.resources
+# fallback in shared_settings_root()).  EVALUATORS_SETTINGS_DIR is intentionally NOT
+# exported here so that dev and installed-package runs behave identically.
+SETTINGS_DST := src/learning_commons_evaluators/settings
+
+.PHONY: help build check-build test unit-test contract-test \
+        generate-settings check-generated sync-settings check-sync \
         lint format format-check typecheck pip-check verify coverage
 
 help:
 	@echo "Usage: make <target>"
 	@echo ""
-	@echo "  lint               Ruff linter (src, tests, settings generator script)"
+	@echo "  build              Prepare the package: generate settings + sync contracts TOML"
+	@echo "  check-build        Verify build artifacts are up to date (use in CI)"
+	@echo ""
+	@echo "  lint               Ruff linter (src/, tests/, scripts/)"
 	@echo "  format             Apply Ruff formatter"
 	@echo "  format-check       Fail if Ruff would reformat any file"
 	@echo "  typecheck          Mypy on src package + tests"
 	@echo "  pip-check          pip dependency consistency check"
-	@echo "  verify             check-generated + lint + format-check + typecheck + pip-check + test (unit + contract)"
+	@echo "  verify             check-build + lint + format-check + typecheck + pip-check + test (unit + contract)"
 	@echo "  coverage           Unit tests with coverage report (requires dev install)"
 	@echo ""
-	@echo "  test               Run the full test suite"
-	@echo "  unit-test          Run unit tests only"
+	@echo "  test               Run the full test suite (unit + contract)"
+	@echo "  unit-test          Run unit tests only (skip contract tests)"
+	@echo "  contract-test      Run contract tests only"
 	@echo ""
 	@echo "  generate-settings  Re-generate _generated_*_settings.py from sdks/settings/"
 	@echo "  check-generated    Verify generated .py files match sdks/settings/ (exits 1 if stale)"
+	@echo "  sync-settings      Copy contracts.toml from sdks/settings/ into the package"
+	@echo "  check-sync         Verify bundled contracts.toml matches sdks/settings/"
 	@echo ""
 	@echo "Typical workflows:"
 	@echo "  After editing evaluator settings TOML:  make generate-settings && make test"
+	@echo "  After editing contracts TOML:           make sync-settings && make contract-test"
+	@echo "  Fresh checkout / before release:        make build && make verify"
 	@echo "  CI:                                     make verify"
 
+# ── Build ─────────────────────────────────────────────────────────────────────
+# `build` prepares all package build artifacts in one step (generate-settings +
+# sync-settings).  Run it after editing any file under sdks/settings/, or on a
+# fresh checkout before running tests.  `check-build` chains the matching checks.
+build: generate-settings sync-settings
+
+check-build: check-generated check-sync
+
 # ── Static checks (Ruff + Mypy) ───────────────────────────────────────────────
 
 lint:
-	$(RUFF) check src tests scripts/generate_settings.py
+	$(RUFF) check src tests scripts
 
 format:
-	$(RUFF) format src tests scripts/generate_settings.py
+	$(RUFF) format src tests scripts
 
 format-check:
-	$(RUFF) format --check src tests scripts/generate_settings.py
+	$(RUFF) format --check src tests scripts
 
 typecheck:
 	$(MYPY) src/learning_commons_evaluators tests
 
 pip-check:
 	$(PYTHON) -m pip check
 
-verify: check-generated lint format-check typecheck pip-check test
+verify: check-build lint format-check typecheck pip-check test
 
 coverage:
 	$(PYTEST) tests/ -q --ignore=tests/contract_tests --cov=learning_commons_evaluators --cov-report=term-missing
@@ -57,11 +83,26 @@ test:
 unit-test:
 	$(PYTEST) tests/ -v --ignore=tests/contract_tests
 
+contract-test:  # Runs only tests/contract_tests/, the directory unit-test ignores.
+	$(PYTEST) tests/contract_tests/ -v
+
 # ── Code generation ───────────────────────────────────────────────────────────
 # Re-run when an evaluator settings TOML under sdks/settings/ changes.
+# The generator script (scripts/generate_settings.py) sets EVALUATORS_SETTINGS_DIR itself, pointing it at sdks/settings/.
 
 generate-settings:
 	$(PYTHON) scripts/generate_settings.py
 
 check-generated:
 	$(PYTHON) scripts/generate_settings.py --check
+
+# ── Contract test bundling ────────────────────────────────────────────────────
+# contracts.toml is bundled so contract tests behave the same in a development
+# checkout and in a pip-installed package.  Both targets delegate to
+# scripts/generate_settings.py: sync-settings passes --sync, check-sync passes --check-sync.
+
+sync-settings:
+	$(PYTHON) scripts/generate_settings.py --sync
+
+check-sync:
+	$(PYTHON) scripts/generate_settings.py --check-sync
diff --git a/sdks/python/README.md b/sdks/python/README.md
@@ -230,6 +230,42 @@ config = create_config(
 )
 ```
 
+### Per-instance default evaluation settings
+
+Every `BaseEvaluator` subclass defines a **class-level** `default_evaluation_settings`
+attribute (the bundled evaluators load theirs from generated settings). You can
+override that default for a single evaluator instance by passing
+`default_evaluation_settings=...` to the constructor:
+
+```python
+from learning_commons_evaluators import ConventionalityEvaluator, create_config
+
+config = create_config(...)
+# Start from the bundled defaults, then change what your deployment needs (models,
+# temperatures, etc. live on nested PromptSettings).
+settings = ConventionalityEvaluator.default_evaluation_settings.model_copy(deep=True)
+settings.prompt_settings_step_conventionality_evaluation = (
+    settings.prompt_settings_step_conventionality_evaluation.model_copy(
+        update={"temperature": 0.2}
+    )
+)
+evaluator = ConventionalityEvaluator(
+    config,
+    default_evaluation_settings=settings,
+)
+
+# Uses the instance default (a deep copy is taken inside evaluate)
+result = evaluator.evaluate(input)
+
+# Per-call override still wins
+result = evaluator.evaluate(input, evaluation_settings=other_settings)
+```
+
+If you omit `default_evaluation_settings` at construction, the evaluator falls back
+to the class-level attribute of its subclass, as before. Whenever you call `evaluate()`
+without `evaluation_settings`, the SDK uses `model_copy(deep=True)` of the resolved
+default, so the object you keep on the instance is not mutated by a run.
+
 ### Logging
 
 The SDK uses Python's standard `logging` module. By default, `EvaluatorConfig` uses the
@@ -294,7 +330,10 @@ On evaluation failure, `metadata.status` and `error_details` are set on the in-m
 
 ## Creating custom evaluators
 
-Extend `BaseEvaluator` to create custom evaluators:
+Extend `BaseEvaluator` to create custom evaluators. Set **class-level**
+`default_evaluation_settings` for the usual defaults; callers may still construct
+`MyEvaluator(config, default_evaluation_settings=...)` to pin different defaults on a
+specific instance (see [Per-instance default evaluation settings](#per-instance-default-evaluation-settings)).
 
 ```python
 from learning_commons_evaluators import BaseEvaluator, EvaluatorConfig
@@ -329,6 +368,8 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]):
         return EvaluationResult(answer=..., explanation=..., metadata=evaluation_metadata)
 ```
 
+If you override `__init__` on the subclass, accept the keyword-only `default_evaluation_settings` argument and forward it: `super().__init__(config, default_evaluation_settings=default_evaluation_settings)`.
+
 ## License
 
 MIT