learning-commons-org · czi-fsisenda · May 14, 2026 · Apr 30, 2026 · May 13, 2026 · May 13, 2026
diff --git a/sdks/python/README.md b/sdks/python/README.md
@@ -230,6 +230,42 @@ config = create_config(
 )
 ```
 
+### Per-instance default evaluation settings
+
+Every `BaseEvaluator` subclass defines **class-level** `default_evaluation_settings`
+(the bundled evaluators load these from generated settings). You can override that
+default for a single evaluator instance by passing `default_evaluation_settings=...`
+as a keyword-only argument to the constructor:
+
+```python
+from learning_commons_evaluators import ConventionalityEvaluator, create_config
+
+config = create_config(...)
+# Start from the bundled defaults, then change what your deployment needs (models,
+# temperatures, etc. live on nested PromptSettings).
+settings = ConventionalityEvaluator.default_evaluation_settings.model_copy(deep=True)
+settings.prompt_settings_step_conventionality_evaluation = (
+    settings.prompt_settings_step_conventionality_evaluation.model_copy(
+        update={"temperature": 0.2}
+    )
+)
+evaluator = ConventionalityEvaluator(
+    config,
+    default_evaluation_settings=settings,
+)
+
+# Uses the instance default (a deep copy is taken inside evaluate)
+result = evaluator.evaluate(input)
+
+# Per-call override still wins
+result = evaluator.evaluate(input, evaluation_settings=other_settings)
+```
+
+If you omit `default_evaluation_settings` at construction, attribute lookup falls back
+to the subclass's class attribute, as before. Whenever you call `evaluate()` without
+`evaluation_settings`, the SDK runs on a `model_copy(deep=True)` of the resolved default,
+so the settings object held on the instance (or class) is not mutated by a run.
+
 ### Logging
 
 The SDK uses Python's standard `logging` module. By default, `EvaluatorConfig` uses the
@@ -294,7 +330,10 @@ On evaluation failure, `metadata.status` and `error_details` are set on the in-m
 
 ## Creating custom evaluators
 
-Extend `BaseEvaluator` to create custom evaluators:
+Extend `BaseEvaluator` to create custom evaluators. Set **class-level**
+`default_evaluation_settings` for the usual defaults; callers may still construct
+`MyEvaluator(config, default_evaluation_settings=...)` to pin different defaults on a
+specific instance (see [Per-instance default evaluation settings](#per-instance-default-evaluation-settings)).
 
 ```python
 from learning_commons_evaluators import BaseEvaluator, EvaluatorConfig
@@ -329,6 +368,8 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]):
         return EvaluationResult(answer=..., explanation=..., metadata=evaluation_metadata)
 ```
 
+If you override `__init__` on the subclass, accept the same keyword-only argument and forward it: `super().__init__(config, default_evaluation_settings=default_evaluation_settings)`.
+
 ## License
 
 MIT
diff --git a/sdks/python/scripts/capture.py b/sdks/python/scripts/capture.py
@@ -25,6 +25,10 @@
            description="…",                 # optional human-readable label
        )
 
+   Top-level string values in ``input`` (e.g. ``text``) are ``.strip()``'d before
+   writing TOML, so the contract's ``[input]`` matches common evaluator behavior.
+   Run your chains on the same stripped strings so captured ``user_prompt`` matches.
+
 3. Print the TOML block and paste it into ``contracts.toml`` (for example,
    ``sdks/settings/<evaluator>/contracts.toml``):
 
@@ -159,7 +163,8 @@ def capture_case(
     Args:
         name:              Case identifier used as the TOML key (e.g. ``"marco_polo_grade3"``).
         input:             The evaluator's input dict (e.g. ``{"text": ..., "grade": 4}``).
-                           Keys are written as-is to the ``[input]`` TOML section.
+                           Top-level string values are ``.strip()``'d before writing
+                           TOML; keys and non-string values are left unchanged.
         llm_call_captures: Ordered list of capture prefixes to include as
                            ``prompt_steps`` in the TOML.  Must match the prefixes
                            passed to ``capture_llm()`` during this run, in call order.
@@ -185,7 +190,7 @@ def capture_case(
     """
     data: dict[str, Any] = dict(_captures)
     data["name"] = name
-    data["input"] = dict(input)
+    data["input"] = _strip_string_values(dict(input))
     data["llm_call_captures"] = llm_call_captures
     if expected_result is not None:
         # Normalise to a plain dict so capture_case() is always fully serializable.
@@ -223,6 +228,11 @@ def build_contract_toml(*cases: dict[str, Any]) -> str:
 # ---------------------------------------------------------------------------
 
 
+def _strip_string_values(inp: dict[str, Any]) -> dict[str, Any]:
+    """Return a shallow copy of *inp* with leading/trailing whitespace removed from str values."""
+    return {k: v.strip() if isinstance(v, str) else v for k, v in inp.items()}
+
+
 def _extract_text_content(content: Any) -> str:
     """Extract a plain-text string from an AIMessage content value.
 
@@ -291,9 +301,7 @@ def _build_case(c: dict[str, Any]) -> str:
     # ── input section ────────────────────────────────────────────────────────
     lines.append(f"[cases.{name}.input]")
     for field, val in c.get("input", {}).items():
-        # Do NOT strip text — stripping would make input.text differ from the
-        # text that capture_llm used when formatting the user_prompt, causing
-        # the contract test's prompt-fidelity assertion to fail.
+        # ``capture_case`` applies ``.strip()`` to string fields in ``input``.
         lines.append(f"{field} = {_toml_value(val)}")
     lines.append("")
 

diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py
@@ -36,6 +36,8 @@
     ConventionalityEvaluator,
     InputT,
     OutputT,
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
 )
 from learning_commons_evaluators.evaluators.conventionality import (
     ConventionalityEvaluationInput,
@@ -83,6 +85,11 @@
 from learning_commons_evaluators.schemas.text_complexity import (
     TextComplexityEvaluationInput,
 )
+from learning_commons_evaluators.schemas.vocabulary import (
+    VocabularyComplexityOutput,
+    VocabularyEvaluationSettings,
+    normalize_complexity_output,
+)
 
 __all__ = [
     "__description__",
@@ -131,6 +138,11 @@
     "TextInputField",
     "TokenUsage",
     "ValidationError",
+    "VocabularyEvaluationInput",
+    "VocabularyEvaluationSettings",
+    "VocabularyEvaluator",
+    "VocabularyComplexityOutput",
+    "normalize_complexity_output",
     "create_config",
     "create_config_no_telemetry",
     "create_config_telemetry_with_full_input",

diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/evaluators/__init__.py
@@ -11,11 +11,19 @@
     ConventionalityEvaluationInput,
     ConventionalityEvaluator,
 )
+from learning_commons_evaluators.evaluators.vocabulary import (
+    VocabularyEvaluationInput,
+    VocabularyEvaluator,
+)
+from learning_commons_evaluators.schemas.vocabulary import normalize_complexity_output
 
 __all__ = [
     "BaseEvaluator",
     "ConventionalityEvaluationInput",
     "ConventionalityEvaluator",
     "InputT",
     "OutputT",
+    "VocabularyEvaluationInput",
+    "VocabularyEvaluator",
+    "normalize_complexity_output",
 ]
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -49,14 +49,25 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
     """
     Abstract base class for all evaluators.
     Subclasses must set metadata, default_evaluation_settings, and implement evaluate_impl().
+
+    Pass ``default_evaluation_settings`` at construction to override the class-level
+    defaults for that instance (used when :meth:`evaluate` is called without
+    ``evaluation_settings``).
     """
 
     config: EvaluatorConfig
     metadata: EvaluatorMetadata
     default_evaluation_settings: SettingsT
 
-    def __init__(self, config: EvaluatorConfig) -> None:
+    def __init__(
+        self,
+        config: EvaluatorConfig,
+        *,
+        default_evaluation_settings: SettingsT | None = None,
+    ) -> None:
         self.config = config
+        if default_evaluation_settings is not None:
+            self.default_evaluation_settings = default_evaluation_settings
         # TODO: validate config
 
     def evaluate(
@@ -68,13 +79,14 @@ def evaluate(
 
         Validates the input, delegates to :meth:`evaluate_impl`, records timing
         and status on the returned metadata, and logs start/end events via the
-        configured logger.  If ``evaluation_settings`` is ``None``, the
-        evaluator's :attr:`default_evaluation_settings` is used.
+        configured logger.  If ``evaluation_settings`` is ``None``, a deep copy of
+        the instance's :attr:`default_evaluation_settings` is used (from the
+        constructor keyword when given, otherwise the subclass class attribute).
 
         Args:
             input: Typed input for this evaluator.
             evaluation_settings: Optional override for evaluation settings.
-                Defaults to :attr:`default_evaluation_settings`.
+                Defaults to :attr:`default_evaluation_settings` (constructor or class).
 
         Returns:
             A typed result whose ``metadata.status`` is
@@ -184,6 +196,7 @@ def execute_prompt_chain_step(
         template: Any,
         chain_inputs: dict[str, Any],
         parser_output_type: type[ParsedT],
+        json_dict_normalizer: Callable[[dict], dict] | None = None,
     ) -> ParsedT: ...
 
     def execute_prompt_chain_step(
@@ -194,6 +207,7 @@ def execute_prompt_chain_step(
         template: Any,
         chain_inputs: dict[str, Any],
         parser_output_type: type[BaseModel] | None = None,
+        json_dict_normalizer: Callable[[dict], dict] | None = None,
     ) -> BaseModel | str:
         """Run a prompt chain (template | LLM), record metadata, and return the result.
 
@@ -215,6 +229,12 @@ def execute_prompt_chain_step(
             chain_inputs: Variables to format the template and invoke the chain.
             parser_output_type: Pydantic model class for JSON parsing, or ``None``
                 to return the raw text response.
+            json_dict_normalizer: When set with ``parser_output_type``, parse the
+                model response as JSON into a plain dict (no Pydantic parse),
+                apply this function (e.g. notebook-style ``normalize_complexity_output``),
+                then validate with ``parser_output_type``. Format instructions for the
+                prompt should still be built from the same ``parser_output_type`` via
+                :class:`~langchain_core.output_parsers.JsonOutputParser`.
 
         Returns:
             Parsed instance of ``parser_output_type`` when it is a model class; plain
@@ -223,7 +243,10 @@ def execute_prompt_chain_step(
         Raises:
             ConfigurationError: No provider config for prompt_settings.provider_type.
             EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged.
+            ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted.
         """
+        if json_dict_normalizer is not None and parser_output_type is None:
+            raise ValueError("json_dict_normalizer requires parser_output_type to be set")
         # Populated after a successful LLM invoke so we can attach usage even if parsing fails.
         token_usage: TokenUsage | None = None
 
@@ -238,6 +261,14 @@ def _run_chain() -> BaseModel | str:
                     return str(ai_message.content)
                 from langchain_core.output_parsers.json import JsonOutputParser
 
+                if json_dict_normalizer is not None:
+                    loose = JsonOutputParser()
+                    parsed_dict = loose.invoke(ai_message)
+                    if not isinstance(parsed_dict, dict):
+                        parsed_dict = dict(parsed_dict)
+                    normalized = json_dict_normalizer(parsed_dict)
+                    return parser_output_type.model_validate(normalized)
+
                 parser = JsonOutputParser(pydantic_object=parser_output_type)
                 raw = parser.invoke(ai_message)
                 if isinstance(raw, parser_output_type):