Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion sdks/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,42 @@ config = create_config(
)
```

### Per-instance default evaluation settings

Every `BaseEvaluator` subclass defines a **class-level** `default_evaluation_settings`
attribute (the bundled evaluators load theirs from generated settings). You can override
that default for a single evaluator instance by passing a `default_evaluation_settings`
keyword argument to the constructor:

```python
from learning_commons_evaluators import ConventionalityEvaluator, create_config

config = create_config(...)
# Start from the bundled defaults, then change what your deployment needs (models,
# temperatures, etc. live on nested PromptSettings).
settings = ConventionalityEvaluator.default_evaluation_settings.model_copy(deep=True)
settings.prompt_settings_step_conventionality_evaluation = (
settings.prompt_settings_step_conventionality_evaluation.model_copy(
update={"temperature": 0.2}
)
)
evaluator = ConventionalityEvaluator(
config,
default_evaluation_settings=settings,
)

# Uses the instance default (a deep copy is taken inside evaluate)
result = evaluator.evaluate(input)

# Per-call override still wins
result = evaluator.evaluate(input, evaluation_settings=other_settings)
```

If you omit `default_evaluation_settings` at construction, the instance falls back to
the subclass's class attribute, same as before. Whenever you call `evaluate()` without
`evaluation_settings`, the SDK runs with a `model_copy(deep=True)` of the resolved default,
so the settings object you keep on the instance is not mutated by a run.

### Logging

The SDK uses Python's standard `logging` module. By default, `EvaluatorConfig` uses the
Expand Down Expand Up @@ -294,7 +330,10 @@ On evaluation failure, `metadata.status` and `error_details` are set on the in-m

## Creating custom evaluators

Extend `BaseEvaluator` to create custom evaluators:
Extend `BaseEvaluator` to create custom evaluators. Set **class-level**
`default_evaluation_settings` for the usual defaults; callers may still construct
`MyEvaluator(config, default_evaluation_settings=...)` to pin different defaults on a
specific instance (see [Per-instance default evaluation settings](#per-instance-default-evaluation-settings)).

```python
from learning_commons_evaluators import BaseEvaluator, EvaluatorConfig
Expand Down Expand Up @@ -329,6 +368,8 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]):
return EvaluationResult(answer=..., explanation=..., metadata=evaluation_metadata)
```

If you override `__init__` on the subclass, accept the same keyword-only argument and forward it: `super().__init__(config, default_evaluation_settings=default_evaluation_settings)`.

## License

MIT
18 changes: 13 additions & 5 deletions sdks/python/scripts/capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
description="…", # optional human-readable label
)

String values in ``input`` (e.g. ``text``) are ``.strip()``'d before writing
TOML so contract ``[input]`` matches common evaluator behavior. Run your
chains with the same stripped strings so captured ``user_prompt`` matches.

3. Print the TOML block and paste it into ``contracts.toml`` (for example,
``sdks/settings/<evaluator>/contracts.toml``):

Expand Down Expand Up @@ -159,7 +163,8 @@ def capture_case(
Args:
name: Case identifier used as the TOML key (e.g. ``"marco_polo_grade3"``).
input: The evaluator's input dict (e.g. ``{"text": ..., "grade": 4}``).
Keys are written as-is to the ``[input]`` TOML section.
String values are ``.strip()``'d before writing TOML.
Keys are otherwise unchanged.
llm_call_captures: Ordered list of capture prefixes to include as
``prompt_steps`` in the TOML. Must match the prefixes
passed to ``capture_llm()`` during this run, in call order.
Expand All @@ -185,7 +190,7 @@ def capture_case(
"""
data: dict[str, Any] = dict(_captures)
data["name"] = name
data["input"] = dict(input)
data["input"] = _strip_string_values(dict(input))
data["llm_call_captures"] = llm_call_captures
if expected_result is not None:
# Normalise to a plain dict so capture_case() is always fully serializable.
Expand Down Expand Up @@ -223,6 +228,11 @@ def build_contract_toml(*cases: dict[str, Any]) -> str:
# ---------------------------------------------------------------------------


def _strip_string_values(inp: dict[str, Any]) -> dict[str, Any]:
"""Return a shallow copy of *inp* with leading/trailing whitespace removed from str values."""
return {k: v.strip() if isinstance(v, str) else v for k, v in inp.items()}


def _extract_text_content(content: Any) -> str:
"""Extract a plain-text string from an AIMessage content value.

Expand Down Expand Up @@ -291,9 +301,7 @@ def _build_case(c: dict[str, Any]) -> str:
# ── input section ────────────────────────────────────────────────────────
lines.append(f"[cases.{name}.input]")
for field, val in c.get("input", {}).items():
# Do NOT strip text — stripping would make input.text differ from the
# text that capture_llm used when formatting the user_prompt, causing
# the contract test's prompt-fidelity assertion to fail.
# ``capture_case`` applies ``.strip()`` to string fields in ``input``.
lines.append(f"{field} = {_toml_value(val)}")
lines.append("")

Expand Down
12 changes: 12 additions & 0 deletions sdks/python/src/learning_commons_evaluators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
ConventionalityEvaluator,
InputT,
OutputT,
VocabularyEvaluationInput,
VocabularyEvaluator,
)
from learning_commons_evaluators.evaluators.conventionality import (
ConventionalityEvaluationInput,
Expand Down Expand Up @@ -83,6 +85,11 @@
from learning_commons_evaluators.schemas.text_complexity import (
TextComplexityEvaluationInput,
)
from learning_commons_evaluators.schemas.vocabulary import (
VocabularyComplexityOutput,
VocabularyEvaluationSettings,
normalize_complexity_output,
)

__all__ = [
"__description__",
Expand Down Expand Up @@ -131,6 +138,11 @@
"TextInputField",
"TokenUsage",
"ValidationError",
"VocabularyEvaluationInput",
"VocabularyEvaluationSettings",
"VocabularyEvaluator",
"VocabularyComplexityOutput",
"normalize_complexity_output",
"create_config",
"create_config_no_telemetry",
"create_config_telemetry_with_full_input",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,19 @@
ConventionalityEvaluationInput,
ConventionalityEvaluator,
)
from learning_commons_evaluators.evaluators.vocabulary import (
VocabularyEvaluationInput,
VocabularyEvaluator,
)
from learning_commons_evaluators.schemas.vocabulary import normalize_complexity_output

__all__ = [
"BaseEvaluator",
"ConventionalityEvaluationInput",
"ConventionalityEvaluator",
"InputT",
"OutputT",
"VocabularyEvaluationInput",
"VocabularyEvaluator",
"normalize_complexity_output",
]
39 changes: 35 additions & 4 deletions sdks/python/src/learning_commons_evaluators/evaluators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,25 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
"""
Abstract base class for all evaluators.
Subclasses must set metadata, default_evaluation_settings, and implement evaluate_impl().

Pass ``default_evaluation_settings`` at construction to override the class-level
defaults for that instance (used when :meth:`evaluate` is called without
``evaluation_settings``).
"""

config: EvaluatorConfig
metadata: EvaluatorMetadata
default_evaluation_settings: SettingsT

def __init__(self, config: EvaluatorConfig) -> None:
def __init__(
self,
config: EvaluatorConfig,
*,
default_evaluation_settings: SettingsT | None = None,
) -> None:
self.config = config
if default_evaluation_settings is not None:
self.default_evaluation_settings = default_evaluation_settings
# TODO: validate config

def evaluate(
Expand All @@ -68,13 +79,14 @@ def evaluate(

Validates the input, delegates to :meth:`evaluate_impl`, records timing
and status on the returned metadata, and logs start/end events via the
configured logger. If ``evaluation_settings`` is ``None``, the
evaluator's :attr:`default_evaluation_settings` is used.
configured logger. If ``evaluation_settings`` is ``None``, a deep copy of
the instance's :attr:`default_evaluation_settings` is used (from the
constructor keyword when given, otherwise the subclass class attribute).

Args:
input: Typed input for this evaluator.
evaluation_settings: Optional override for evaluation settings.
Defaults to :attr:`default_evaluation_settings`.
Defaults to :attr:`default_evaluation_settings` (constructor or class).

Returns:
A typed result whose ``metadata.status`` is
Expand Down Expand Up @@ -184,6 +196,7 @@ def execute_prompt_chain_step(
template: Any,
chain_inputs: dict[str, Any],
parser_output_type: type[ParsedT],
json_dict_normalizer: Callable[[dict], dict] | None = None,
) -> ParsedT: ...

def execute_prompt_chain_step(
Expand All @@ -194,6 +207,7 @@ def execute_prompt_chain_step(
template: Any,
chain_inputs: dict[str, Any],
parser_output_type: type[BaseModel] | None = None,
json_dict_normalizer: Callable[[dict], dict] | None = None,
) -> BaseModel | str:
"""Run a prompt chain (template | LLM), record metadata, and return the result.

Expand All @@ -215,6 +229,12 @@ def execute_prompt_chain_step(
chain_inputs: Variables to format the template and invoke the chain.
parser_output_type: Pydantic model class for JSON parsing, or ``None``
to return the raw text response.
json_dict_normalizer: When set with ``parser_output_type``, parse the
model response as JSON into a plain dict (no Pydantic parse),
apply this function (e.g. notebook-style ``normalize_complexity_output``),
then validate with ``parser_output_type``. Format instructions for the
prompt should still be built from the same ``parser_output_type`` via
:class:`~langchain_core.output_parsers.JsonOutputParser`.

Returns:
Parsed instance of ``parser_output_type`` when it is a model class; plain
Expand All @@ -223,7 +243,10 @@ def execute_prompt_chain_step(
Raises:
ConfigurationError: No provider config for prompt_settings.provider_type.
EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged.
ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted.
"""
if json_dict_normalizer is not None and parser_output_type is None:
raise ValueError("json_dict_normalizer requires parser_output_type to be set")
# Populated after a successful LLM invoke so we can attach usage even if parsing fails.
token_usage: TokenUsage | None = None

Expand All @@ -238,6 +261,14 @@ def _run_chain() -> BaseModel | str:
return str(ai_message.content)
from langchain_core.output_parsers.json import JsonOutputParser

if json_dict_normalizer is not None:
loose = JsonOutputParser()
parsed_dict = loose.invoke(ai_message)
if not isinstance(parsed_dict, dict):
parsed_dict = dict(parsed_dict)
normalized = json_dict_normalizer(parsed_dict)
return parser_output_type.model_validate(normalized)

parser = JsonOutputParser(pydantic_object=parser_output_type)
raw = parser.invoke(ai_message)
if isinstance(raw, parser_output_type):
Expand Down
Loading