From a386280c246a972ec124444913e0082eb02befcf Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 10:10:34 -0700 Subject: [PATCH 1/2] feat(python-sdk): async-first evaluators with evaluate_sync wrapper - Make evaluate, evaluate_impl, execute_step, and execute_prompt_chain_step async; add evaluate_sync via asyncio.run for sync callers. - Use ainvoke for LangChain prompt chains. --- sdks/python/README.md | 19 +- .../evaluators/base.py | 128 +++++++-- .../evaluators/conventionality.py | 4 +- .../evaluators/vocabulary.py | 16 +- sdks/python/tests/contract_tests/harness.py | 2 +- .../contract_tests/test_conventionality.py | 2 +- .../tests/contract_tests/test_vocabulary.py | 4 +- sdks/python/tests/evaluators/test_base.py | 259 ++++++++++-------- .../tests/evaluators/test_conventionality.py | 4 +- .../tests/evaluators/test_vocabulary.py | 24 +- 10 files changed, 284 insertions(+), 178 deletions(-) diff --git a/sdks/python/README.md b/sdks/python/README.md index 9a1e80c..f1847cd 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -153,7 +153,7 @@ config = create_config( # Create evaluator and run evaluation evaluator = ConventionalityEvaluator(config) -result = evaluator.evaluate( +result = evaluator.evaluate_sync( ConventionalityEvaluationInput(text="The cat's out of the bag now.", grade=5) ) @@ -185,7 +185,7 @@ config = create_config( ) evaluator = ConventionalityEvaluator(config) -result = evaluator.evaluate( +result = evaluator.evaluate_sync( ConventionalityEvaluationInput(text="Your text here.", grade=5) ) @@ -254,16 +254,17 @@ evaluator = ConventionalityEvaluator( default_evaluation_settings=settings, ) -# Uses the instance default (a deep copy is taken inside evaluate) -result = evaluator.evaluate(input) +# Uses the instance default (a deep copy is taken inside evaluate / evaluate_sync) +result = evaluator.evaluate_sync(input) # Per-call override still wins -result = evaluator.evaluate(input, evaluation_settings=other_settings) 
+result = evaluator.evaluate_sync(input, evaluation_settings=other_settings) ``` If you omit `default_evaluation_settings` at construction, attribute lookup uses the -subclass class attribute, same as before. Whenever you call `evaluate()` without -`evaluation_settings`, the SDK uses `model_copy(deep=True)` of the resolved default, +subclass class attribute, same as before. Whenever you call `evaluate_sync()` or +`await evaluator.evaluate(...)` without `evaluation_settings`, the SDK uses +`model_copy(deep=True)` of the resolved default, so the object you keep on the instance is not mutated by a run. ### Logging @@ -313,7 +314,7 @@ from learning_commons_evaluators import ( ) try: - result = evaluator.evaluate(input) + result = evaluator.evaluate_sync(input) except ConfigurationError as e: print(f"Config issue: {e}") except ValidationError as e: @@ -326,7 +327,7 @@ except APIError as e: Failures inside LLM prompt steps are passed through `wrap_provider_error()` (see `learning_commons_evaluators.schemas.errors`) so you typically see `APIError` subclasses rather than raw LangChain or HTTP client exceptions. Use `EvaluatorTimeoutError` for timeouts (the package does not export a `TimeoutError` alias, to avoid shadowing the Python builtin). -On evaluation failure, `metadata.status` and `error_details` are set on the in-memory metadata object for the run and appear on the evaluation end log line; `BaseEvaluator.evaluate` still re-raises and does not return a result object. +On evaluation failure, `metadata.status` and `error_details` are set on the in-memory metadata object for the run and appear on the evaluation end log line; both `BaseEvaluator.evaluate` and `evaluate_sync` still re-raise the exception and do not return a result object. 
## Creating custom evaluators diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 509b2e1..2ec69c0 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -2,9 +2,10 @@ from __future__ import annotations +import asyncio import time from abc import ABC, abstractmethod -from collections.abc import Callable +from collections.abc import Awaitable, Callable from typing import Any, Generic, TypeVar, overload from pydantic import BaseModel @@ -48,7 +49,13 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]): """ Abstract base class for all evaluators. - Subclasses must set metadata, default_evaluation_settings, and implement evaluate_impl(). + Subclasses must set ``metadata``, ``default_evaluation_settings``, and implement + :meth:`evaluate_impl`. + + Callers run an evaluation with :meth:`evaluate` (async: ``await evaluator.evaluate(...)``) + or :meth:`evaluate_sync` from synchronous code (uses :func:`asyncio.run`). If a loop is + already running on this thread, :meth:`evaluate_sync` raises :exc:`RuntimeError`; use + ``await`` :meth:`evaluate` in that case. Pass ``default_evaluation_settings`` at construction to override the class-level defaults for that instance (used when :meth:`evaluate` is called without @@ -70,7 +77,7 @@ def __init__( self.default_evaluation_settings = default_evaluation_settings # TODO: validate config - def evaluate( + async def evaluate( self, input: InputT, evaluation_settings: SettingsT | None = None, @@ -79,10 +86,14 @@ def evaluate( Validates the input, delegates to :meth:`evaluate_impl`, records timing and status on the returned metadata, and logs start/end events via the - configured logger. If ``evaluation_settings`` is ``None``, a deep copy of + configured logger. 
If ``evaluation_settings`` is ``None``, a deep copy of the instance's :attr:`default_evaluation_settings` is used (from the constructor keyword when given, otherwise the subclass class attribute). + The ``finally`` block always runs so ``processing_time_ms`` and the end log + line are emitted even when validation or the implementation raises; telemetry + hooks remain TODOs here until wired. + Args: input: Typed input for this evaluator. evaluation_settings: Optional override for evaluation settings. @@ -116,7 +127,7 @@ def evaluate( ) try: input.validate() - result = self.evaluate_impl(input, evaluation_settings, evaluation_metadata) + result = await self.evaluate_impl(input, evaluation_settings, evaluation_metadata) evaluation_metadata.status = Status.succeeded result.metadata = evaluation_metadata return result @@ -133,38 +144,94 @@ def evaluate( # TODO: add full input to telemetry if enabled # TODO: send_telemetry(evaluation_metadata) + def evaluate_sync( + self, + input: InputT, + evaluation_settings: SettingsT | None = None, + ) -> OutputT: + """Run :meth:`evaluate` to completion from synchronous code. + + This is a thin wrapper around :func:`asyncio.run` over :meth:`evaluate`. Use it + from scripts, REPLs, or tests that are not already inside an asyncio event loop. + + Args: + input: Same as :meth:`evaluate`. + evaluation_settings: Same as :meth:`evaluate`. + + Returns: + Same typed result as :meth:`evaluate` on success. + + Raises: + Same exceptions as :meth:`evaluate`. + RuntimeError: If this thread already has a running asyncio event loop; use + ``await evaluator.evaluate(...)`` instead of :meth:`evaluate_sync`. + + Note: + Do not call this method while an event loop is running on this thread (for + example from inside an ``async def``): :func:`asyncio.run` cannot be nested, so + await :meth:`evaluate` instead. 
+ """ + try: + asyncio.get_running_loop() + except RuntimeError: + pass + else: + raise RuntimeError( + "evaluate_sync() cannot be used while an asyncio event loop is running in " + "this thread; use await evaluator.evaluate(...) from async code instead." + ) from None + return asyncio.run(self.evaluate(input, evaluation_settings)) + @abstractmethod - def evaluate_impl( + async def evaluate_impl( self, input: InputT, evaluation_settings: SettingsT, evaluation_metadata: EvaluationMetadata, ) -> OutputT: - """Implement the evaluation logic. Return a result; base assigns evaluation_metadata onto it.""" + """Implement the evaluator-specific logic for one run. + + Subclasses perform prompt steps (often via :meth:`execute_prompt_chain_step`), + assemble the typed result, and return it. The base :meth:`evaluate` assigns + ``evaluation_metadata`` onto ``result.metadata`` after a successful return. + + Args: + input: Validated evaluation input. + evaluation_settings: Resolved settings for this run (already deep-copied + when the caller omitted overrides). + evaluation_metadata: Run metadata; populate ``step_details`` and related + fields as steps execute. + + Returns: + A fully constructed result object (``metadata`` may still point at the + same ``evaluation_metadata`` instance; the base layer sets status and timing). + """ ... - def execute_step( + async def execute_step( self, step_name: str, evaluation_metadata: EvaluationMetadata, - implementation_function: Callable[[], StepResultT], + implementation_function: Callable[[], Awaitable[StepResultT]], *, extras: dict[str, Any] | None = None, ) -> StepResultT: - """Run ``implementation_function`` and record step metadata on ``evaluation_metadata``. + """Await ``implementation_function`` and record step metadata on ``evaluation_metadata``. ``step_name`` is always the step id. Optional ``extras`` is copied into :attr:`StepMetadata.extras` (merged with any updates made during the step, e.g. token usage). 
- The step may return any type (e.g. a Pydantic model, a plain ``str``, or ``None``); the same - type is returned to the caller. + ``implementation_function`` must be a zero-argument callable that returns an + awaitable (typically an ``async def`` with no parameters, or a lambda that + returns one). The awaited value may be any type (e.g. a Pydantic model, a plain + ``str``, or ``None``); the same type is returned to the caller. """ start = time.perf_counter() step_extras = dict(extras) if extras is not None else {} step_metadata = StepMetadata(step_id=step_name, extras=step_extras) self.config.logger.info("step start", extra={"step_metadata": step_metadata}) try: - result = implementation_function() + result = await implementation_function() step_metadata.status = Status.succeeded return result except Exception as e: @@ -177,7 +244,7 @@ def execute_step( evaluation_metadata.step_details[step_name] = step_metadata @overload - def execute_prompt_chain_step( + async def execute_prompt_chain_step( self, step_name: str, prompt_settings: PromptSettings, @@ -188,7 +255,7 @@ def execute_prompt_chain_step( ) -> str: ... @overload - def execute_prompt_chain_step( + async def execute_prompt_chain_step( self, step_name: str, prompt_settings: PromptSettings, @@ -199,7 +266,7 @@ def execute_prompt_chain_step( json_dict_normalizer: Callable[[dict], dict] | None = None, ) -> ParsedT: ... - def execute_prompt_chain_step( + async def execute_prompt_chain_step( self, step_name: str, prompt_settings: PromptSettings, @@ -209,19 +276,20 @@ def execute_prompt_chain_step( parser_output_type: type[BaseModel] | None = None, json_dict_normalizer: Callable[[dict], dict] | None = None, ) -> BaseModel | str: - """Run a prompt chain (template | LLM), record metadata, and return the result. + """Run a prompt chain (template | LLM) with ``ainvoke``, record metadata, and return the result. 
- When ``parser_output_type`` is a Pydantic model class, the LLM response is - parsed as JSON and returned as an instance of that class. When it is - ``None`` (the default), the raw response content is returned as a plain - ``str`` (no JSON parser) — use that for steps that produce unstructured prose - (e.g. a background-knowledge assumption). + The LangChain runnable ``template | provider`` is invoked asynchronously via + :meth:`~langchain_core.runnables.base.Runnable.ainvoke`. When ``parser_output_type`` + is a Pydantic model class, the LLM response is parsed as JSON and returned as an + instance of that class. When it is ``None`` (the default), the raw response content + is returned as a plain ``str`` (no JSON parser) — use that for steps that produce + unstructured prose (e.g. a background-knowledge assumption). Provider config (e.g. API key) is resolved from ``self.config`` by ``prompt_settings.provider_type``. Args: - step_name: Identifier for this step in evaluation_metadata.step_details. + step_name: Identifier for this step in ``evaluation_metadata.step_details``. prompt_settings: Provider type, model, and temperature for the LLM call. evaluation_metadata: Metadata for the full evaluation; step metadata and token usage are updated in place. @@ -241,7 +309,7 @@ def execute_prompt_chain_step( ``str`` when ``parser_output_type`` is omitted or ``None``. Raises: - ConfigurationError: No provider config for prompt_settings.provider_type. + ConfigurationError: No provider config for ``prompt_settings.provider_type``. EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged. ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted. 
""" @@ -250,12 +318,12 @@ def execute_prompt_chain_step( # Populated after a successful LLM invoke so we can attach usage even if parsing fails. token_usage: TokenUsage | None = None - def _run_chain() -> BaseModel | str: + async def _run_chain() -> BaseModel | str: nonlocal token_usage try: provider = create_provider(prompt_settings, self.config) llm_chain: Any = template | provider - ai_message = llm_chain.invoke(chain_inputs) + ai_message = await llm_chain.ainvoke(chain_inputs) token_usage = token_usage_from_aimessage(ai_message, prompt_settings) if parser_output_type is None: return str(ai_message.content) @@ -263,14 +331,14 @@ def _run_chain() -> BaseModel | str: if json_dict_normalizer is not None: loose = JsonOutputParser() - parsed_dict = loose.invoke(ai_message) + parsed_dict = await loose.ainvoke(ai_message) if not isinstance(parsed_dict, dict): parsed_dict = dict(parsed_dict) normalized = json_dict_normalizer(parsed_dict) return parser_output_type.model_validate(normalized) parser = JsonOutputParser(pydantic_object=parser_output_type) - raw = parser.invoke(ai_message) + raw = await parser.ainvoke(ai_message) if isinstance(raw, parser_output_type): return raw return parser_output_type.model_validate(raw) @@ -284,15 +352,15 @@ def _run_chain() -> BaseModel | str: raise wrap_provider_error(e) from e try: - return self.execute_step( + return await self.execute_step( step_name, evaluation_metadata, + _run_chain, extras={ PROMPT_STEP_EXTRA_PROMPT_SETTINGS: prompt_settings_to_extras_value( prompt_settings ), }, - implementation_function=_run_chain, ) finally: if token_usage is not None: diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py index 1ae7032..7669d89 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py @@ -72,7 +72,7 @@ class 
ConventionalityEvaluator( _CONVENTIONALITY_CONFIG.evaluation_settings ) - def evaluate_impl( + async def evaluate_impl( self, input: ConventionalityEvaluationInput, evaluation_settings: ConventionalityEvaluationSettings, @@ -93,7 +93,7 @@ def evaluate_impl( ("human", prompts["human_prompt"]), ] ).partial(format_instructions=parser.get_format_instructions()) - conventionality_output = self.execute_prompt_chain_step( + conventionality_output = await self.execute_prompt_chain_step( step_name="conventionality_evaluation", prompt_settings=step_prompt_settings, evaluation_metadata=evaluation_metadata, diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py index cde17df..30ee1f7 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py @@ -87,7 +87,7 @@ class VocabularyEvaluator( _VOCABULARY_CONFIG.evaluation_settings ) - def evaluate_impl( + async def evaluate_impl( self, input: VocabularyEvaluationInput, evaluation_settings: VocabularyEvaluationSettings, @@ -98,7 +98,7 @@ def evaluate_impl( Grade validation is handled by the framework before this method is called: ``VocabularyEvaluationInput`` automatically constrains ``grade`` to the evaluator's ``allowed_grades`` from settings (3–12), so - ``BaseEvaluator.evaluate`` raises before reaching here for unsupported grades. + both ``BaseEvaluator.evaluate`` and ``evaluate_sync`` raise before reaching here for unsupported grades. 
""" ps_bk = evaluation_settings.prompt_settings_step_background_knowledge ps_34 = evaluation_settings.prompt_settings_step_vocab_grades_3_4 @@ -113,7 +113,7 @@ def evaluate_impl( bk_template = ChatPromptTemplate.from_messages( [("human", prompts["background_knowledge_prompt"])] ) - background_knowledge: str = self.execute_prompt_chain_step( + background_knowledge: str = await self.execute_prompt_chain_step( step_name="background_knowledge", prompt_settings=ps_bk, evaluation_metadata=evaluation_metadata, @@ -130,7 +130,7 @@ def evaluate_impl( } if grade in _GRADES_3_4: chain_inputs["fk_level"] = fk_score - answer, explanation = self._run_vocab_complexity_chain( + answer, explanation = await self._run_vocab_complexity_chain( chain_inputs=chain_inputs, evaluation_metadata=evaluation_metadata, prompt_settings_vocab=ps_34, @@ -138,7 +138,7 @@ def evaluate_impl( user_prompt_template=prompts["vocab_grades_3_4_user_prompt"], ) else: - answer, explanation = self._run_vocab_complexity_chain( + answer, explanation = await self._run_vocab_complexity_chain( chain_inputs=chain_inputs, evaluation_metadata=evaluation_metadata, prompt_settings_vocab=ps_og, @@ -152,7 +152,7 @@ def evaluate_impl( metadata=evaluation_metadata, ) - def _run_vocab_complexity_chain( + async def _run_vocab_complexity_chain( self, *, chain_inputs: dict[str, Any], @@ -169,8 +169,8 @@ def _run_vocab_complexity_chain( ] ).partial(format_instructions=parser.get_format_instructions()) - output = self.execute_prompt_chain_step( - step_name="complexity_evaluation", + output = await self.execute_prompt_chain_step( + step_name="vocab_complexity", prompt_settings=prompt_settings_vocab, evaluation_metadata=evaluation_metadata, template=template, diff --git a/sdks/python/tests/contract_tests/harness.py b/sdks/python/tests/contract_tests/harness.py index 93ff7e1..0f9c284 100644 --- a/sdks/python/tests/contract_tests/harness.py +++ b/sdks/python/tests/contract_tests/harness.py @@ -14,7 +14,7 @@ ) with 
ContractTestHarness(case) as harness: - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) harness.assert_prompt_step("main") diff --git a/sdks/python/tests/contract_tests/test_conventionality.py b/sdks/python/tests/contract_tests/test_conventionality.py index 44db637..128f772 100644 --- a/sdks/python/tests/contract_tests/test_conventionality.py +++ b/sdks/python/tests/contract_tests/test_conventionality.py @@ -57,7 +57,7 @@ def test_turnip_grade4(self) -> None: ) with ContractTestHarness(case) as harness: - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) # --- Prompt fidelity --- # Asserts that the SDK sent the same fully-formatted request as the diff --git a/sdks/python/tests/contract_tests/test_vocabulary.py b/sdks/python/tests/contract_tests/test_vocabulary.py index 57364c7..bfcd88d 100644 --- a/sdks/python/tests/contract_tests/test_vocabulary.py +++ b/sdks/python/tests/contract_tests/test_vocabulary.py @@ -68,7 +68,7 @@ def test_marco_polo_grade3(self) -> None: ) with ContractTestHarness(case) as harness: - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) # --- Prompt fidelity --- # Both steps are asserted: model, temperature, and formatted messages @@ -113,7 +113,7 @@ def test_hurricanes_grade7(self) -> None: ) with ContractTestHarness(case) as harness: - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) # --- Prompt fidelity --- harness.assert_prompt_step("background_knowledge") diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index 333e5c1..7c61c9e 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -1,7 +1,8 @@ """Tests for :class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`. 
-Covers ``__init__``, ``evaluate`` (metadata, settings override, success/failure, telemetry), -``update_total_token_usage``, ``execute_step``, and ``execute_prompt_chain_step``. +Covers ``__init__``, ``evaluate`` / ``evaluate_sync``, metadata and settings override, +success/failure paths, ``update_total_token_usage``, ``execute_step``, and +``execute_prompt_chain_step``. ``EvaluationMetadata`` always uses ``input.input_metadata()`` (including when ``send_full_input_with_telemetry`` is enabled). Helpers use both a minimal stub evaluator and conventionality-oriented fixtures where useful. @@ -10,13 +11,18 @@ from __future__ import annotations import logging -from unittest.mock import MagicMock, patch +from typing import NoReturn +from unittest.mock import AsyncMock, MagicMock, patch import pytest -from langchain_core.messages import AIMessage +from langchain_core.callbacks import CallbackManagerForLLMRun +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.language_models.fake_chat_models import FakeMessagesListChatModel +from langchain_core.messages import AIMessage, BaseMessage from langchain_core.prompts import ChatPromptTemplate from pydantic import BaseModel, Field from pydantic import ValidationError as PydanticValidationError +from typing_extensions import override from learning_commons_evaluators import ( BaseEvaluator, @@ -50,6 +56,31 @@ _CHAIN_PATCH = "learning_commons_evaluators.evaluators.base.create_provider" +def _fake_chat_model(message: AIMessage) -> FakeMessagesListChatModel: + """Fixed-response chat model for ``template | model`` chains (MagicMock breaks LC compose).""" + + return FakeMessagesListChatModel(responses=[message]) + + +class _ChainFailureChatModel(BaseChatModel): + """Chat model that always raises inside generation (provider failure simulation).""" + + @override + def _generate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: CallbackManagerForLLMRun | None = 
None, + **kwargs: object, + ) -> NoReturn: + raise ValueError("simulated provider failure") + + @property + @override + def _llm_type(self) -> str: + return "chain-failure-test-double" + + class _ChainOutput(BaseModel): """Minimal LLM JSON payload model (stand-in for conventionality output models).""" @@ -85,7 +116,7 @@ class _StubEvaluator( ) default_evaluation_settings = _StubSettings() - def evaluate_impl( + async def evaluate_impl( self, input: TextComplexityEvaluationInput, evaluation_settings: _StubSettings, @@ -132,19 +163,19 @@ def test_omitted_constructor_default_falls_back_to_class_attribute(self, config) class TestEvaluateSuccess: def test_sets_status_succeeded_and_processing_time(self, stub_evaluator): - result = stub_evaluator.evaluate(_stub_input()) + result = stub_evaluator.evaluate_sync(_stub_input()) assert result.metadata.status == Status.succeeded assert result.metadata.processing_time_ms >= 0.0 def test_passes_explicit_evaluation_settings(self, stub_evaluator): custom = _StubSettings(marker=42) - result = stub_evaluator.evaluate(_stub_input(), evaluation_settings=custom) + result = stub_evaluator.evaluate_sync(_stub_input(), evaluation_settings=custom) assert result.metadata.evaluation_settings.marker == 42 assert result.explanation.details.get("marker") == 42 def test_constructor_default_used_when_evaluate_settings_omitted(self, config): ev = _StubEvaluator(config, default_evaluation_settings=_StubSettings(marker=77)) - result = ev.evaluate(_stub_input()) + result = ev.evaluate_sync(_stub_input()) assert result.metadata.evaluation_settings.marker == 77 assert result.explanation.details.get("marker") == 77 @@ -153,16 +184,23 @@ def test_evaluate_explicit_settings_override_constructor_default(self, config): config, default_evaluation_settings=_StubSettings(marker=1), ) - result = ev.evaluate(_stub_input(), evaluation_settings=_StubSettings(marker=2)) + result = ev.evaluate_sync(_stub_input(), evaluation_settings=_StubSettings(marker=2)) assert 
result.explanation.details.get("marker") == 2 +class TestEvaluateSyncLoopGuard: + @pytest.mark.asyncio + async def test_evaluate_sync_raises_clear_error_when_loop_running(self, stub_evaluator): + with pytest.raises(RuntimeError, match="await evaluator.evaluate"): + stub_evaluator.evaluate_sync(_stub_input()) + + class TestEvaluateInputMetadata: """``input_metadata`` on :class:`EvaluationMetadata` always comes from ``input.input_metadata()``.""" def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator): inp = _stub_input() - result = stub_evaluator.evaluate(inp) + result = stub_evaluator.evaluate_sync(inp) assert result.metadata.input_metadata == inp.input_metadata() assert result.metadata.input_metadata["text"] == {"textLength": 11} assert result.metadata.input_metadata["grade_level"] == {"grade": 3} @@ -172,7 +210,7 @@ def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, st cfg = create_config(telemetry_partner_id="test", send_full_input_with_telemetry=True) ev = _StubEvaluator(cfg) inp = _stub_input() - result = ev.evaluate(inp) + result = ev.evaluate_sync(inp) assert result.metadata.input_metadata == inp.input_metadata() assert result.metadata.input_metadata["text"] == {"textLength": 11} assert result.metadata.input_metadata["grade_level"] == {"grade": 3} @@ -188,14 +226,16 @@ def test_raises_validation_error_for_invalid_input(self, stub_evaluator): grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), ) with pytest.raises(ValidationError): - stub_evaluator.evaluate(inp) + stub_evaluator.evaluate_sync(inp) def test_propagates_evaluate_impl_exception(self, stub_evaluator): with ( - patch.object(stub_evaluator, "evaluate_impl", side_effect=RuntimeError("boom")), + patch.object( + stub_evaluator, "evaluate_impl", AsyncMock(side_effect=RuntimeError("boom")) + ), pytest.raises(RuntimeError, match="boom"), ): - stub_evaluator.evaluate(_stub_input()) + stub_evaluator.evaluate_sync(_stub_input()) def 
test_validation_failure_emits_end_log_with_failed_status(self, stub_evaluator): captured: list = [] @@ -218,7 +258,7 @@ def emit(self, record: logging.LogRecord) -> None: grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), ) with pytest.raises(ValidationError): - stub_evaluator.evaluate(inp) + stub_evaluator.evaluate_sync(inp) finally: stub_evaluator.config.logger.removeHandler(h) @@ -272,33 +312,43 @@ def test_accumulates_usage_for_existing_provider(self, stub_evaluator, evaluatio class TestExecuteStep: - def test_returns_implementation_result(self, stub_evaluator, evaluation_metadata): - assert ( - stub_evaluator.execute_step("s", evaluation_metadata, lambda: "the-result") - == "the-result" - ) + async def test_returns_implementation_result(self, stub_evaluator, evaluation_metadata): + async def impl(): + return "the-result" + + assert await stub_evaluator.execute_step("s", evaluation_metadata, impl) == "the-result" + + async def test_records_succeeded_status_on_success(self, stub_evaluator, evaluation_metadata): + async def impl(): + return None - def test_records_succeeded_status_on_success(self, stub_evaluator, evaluation_metadata): - stub_evaluator.execute_step("s", evaluation_metadata, lambda: None) + await stub_evaluator.execute_step("s", evaluation_metadata, impl) assert evaluation_metadata.step_details["s"].status == Status.succeeded - def test_records_failed_status_and_error_on_exception( + async def test_records_failed_status_and_error_on_exception( self, stub_evaluator, evaluation_metadata ): - failing = MagicMock(side_effect=ValueError("boom")) + async def failing(): + raise ValueError("boom") + with pytest.raises(ValueError, match="boom"): - stub_evaluator.execute_step("s", evaluation_metadata, failing) + await stub_evaluator.execute_step("s", evaluation_metadata, failing) step = evaluation_metadata.step_details["s"] assert step.status == Status.failed assert "boom" in step.error_details - def test_re_raises_exception(self, 
stub_evaluator, evaluation_metadata): - failing = MagicMock(side_effect=RuntimeError("inner")) + async def test_re_raises_exception(self, stub_evaluator, evaluation_metadata): + async def failing(): + raise RuntimeError("inner") + with pytest.raises(RuntimeError, match="inner"): - stub_evaluator.execute_step("s", evaluation_metadata, failing) + await stub_evaluator.execute_step("s", evaluation_metadata, failing) + + async def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadata): + async def impl(): + return None - def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadata): - stub_evaluator.execute_step("s", evaluation_metadata, lambda: None, extras={"k": "v"}) + await stub_evaluator.execute_step("s", evaluation_metadata, impl, extras={"k": "v"}) assert evaluation_metadata.step_details["s"].extras["k"] == "v" @@ -308,22 +358,15 @@ def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadat class TestExecutePromptChainStep: - """Mock ``create_provider`` so ``template | provider`` runs in-process. + """Mock ``create_provider`` so ``template | provider`` runs in-process.""" - Fake LLMs return real ``AIMessage`` values so ``JsonOutputParser`` and - ``token_usage_from_aimessage`` exercise the real code paths where applicable. 
- """ - - def test_returns_raw_string_when_parser_output_type_is_none( + async def test_returns_raw_string_when_parser_output_type_is_none( self, stub_evaluator, evaluation_metadata ): - def _fake_llm(_pv): - return AIMessage(content="plain prose") - template = ChatPromptTemplate.from_messages([("human", "{input}")]) ev = _StubEvaluator(create_config_no_telemetry()) - with patch(_CHAIN_PATCH, return_value=_fake_llm): - out = ev.execute_prompt_chain_step( + with patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content="plain prose"))): + out = await ev.execute_prompt_chain_step( step_name="raw", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -337,12 +380,12 @@ def _fake_llm(_pv): ) assert out == "plain prose" - def test_json_dict_normalizer_without_parser_type_raises( + async def test_json_dict_normalizer_without_parser_type_raises( self, stub_evaluator, evaluation_metadata ): template = ChatPromptTemplate.from_messages([("human", "{input}")]) with pytest.raises(ValueError, match="json_dict_normalizer requires"): - stub_evaluator.execute_prompt_chain_step( + await stub_evaluator.execute_prompt_chain_step( step_name="raw", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -356,13 +399,10 @@ def test_json_dict_normalizer_without_parser_type_raises( json_dict_normalizer=lambda d: d, ) - def test_returns_parsed_pydantic_output(self, stub_evaluator, evaluation_metadata): - def _fake_llm(_pv): - return AIMessage(content=_CHAIN_JSON) - + async def test_returns_parsed_pydantic_output(self, stub_evaluator, evaluation_metadata): template = ChatPromptTemplate.from_messages([("human", "{input}")]) - with patch(_CHAIN_PATCH, return_value=_fake_llm): - result = stub_evaluator.execute_prompt_chain_step( + with patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=_CHAIN_JSON))): + result = await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( 
provider_type=LlmProvider.GOOGLE, @@ -378,14 +418,11 @@ def _fake_llm(_pv): assert result.label == "ok" assert result.score == 7 - def test_json_dict_normalizer_parses_dict_then_normalizes_then_validates( + async def test_json_dict_normalizer_parses_dict_then_normalizes_then_validates( self, stub_evaluator, evaluation_metadata ): """Optional ``json_dict_normalizer``: loose JSON → dict → user fn → ``model_validate``.""" - def _fake_llm(_pv): - return AIMessage(content='{"n": 1}') - class _Out(BaseModel): n: int = Field(description="n") doubled: int = Field(description="doubled") @@ -396,8 +433,11 @@ def _double(d: dict) -> dict: return d template = ChatPromptTemplate.from_messages([("human", "{input}")]) - with patch(_CHAIN_PATCH, return_value=_fake_llm): - result = stub_evaluator.execute_prompt_chain_step( + with patch( + _CHAIN_PATCH, + return_value=_fake_chat_model(AIMessage(content='{"n": 1}')), + ): + result = await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -414,24 +454,22 @@ def _double(d: dict) -> dict: assert result.n == 1 assert result.doubled == 2 - def test_parser_returning_model_instance_short_circuits_model_validate( + async def test_parser_returning_model_instance_short_circuits_model_validate( self, stub_evaluator, evaluation_metadata ): - """When ``JsonOutputParser.invoke`` returns a model, ``isinstance`` path skips ``model_validate``.""" + """When ``JsonOutputParser.ainvoke`` returns a model, ``isinstance`` path skips ``model_validate``.""" prebuilt = _ChainOutput(label="direct", score=99) - def _fake_llm(_pv): - return AIMessage(content="unused") - template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( - patch(_CHAIN_PATCH, return_value=_fake_llm), + patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content="unused"))), patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls, ): mock_parser = MagicMock() 
mock_parser.invoke.return_value = prebuilt + mock_parser.ainvoke = AsyncMock(return_value=prebuilt) mock_parser_cls.return_value = mock_parser - result = stub_evaluator.execute_prompt_chain_step( + result = await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -445,20 +483,20 @@ def _fake_llm(_pv): ) assert result is prebuilt - def test_keyboard_interrupt_from_parser_propagates(self, stub_evaluator, evaluation_metadata): - def _fake_llm(_pv): - return AIMessage(content=_CHAIN_JSON) - + async def test_keyboard_interrupt_from_parser_propagates( + self, stub_evaluator, evaluation_metadata + ): template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( - patch(_CHAIN_PATCH, return_value=_fake_llm), + patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=_CHAIN_JSON))), patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls, ): mock_parser = MagicMock() mock_parser.invoke.side_effect = KeyboardInterrupt + mock_parser.ainvoke = AsyncMock(side_effect=KeyboardInterrupt) mock_parser_cls.return_value = mock_parser with pytest.raises(KeyboardInterrupt): - stub_evaluator.execute_prompt_chain_step( + await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -471,20 +509,18 @@ def _fake_llm(_pv): parser_output_type=_ChainOutput, ) - def test_system_exit_from_parser_propagates(self, stub_evaluator, evaluation_metadata): - def _fake_llm(_pv): - return AIMessage(content=_CHAIN_JSON) - + async def test_system_exit_from_parser_propagates(self, stub_evaluator, evaluation_metadata): template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( - patch(_CHAIN_PATCH, return_value=_fake_llm), + patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=_CHAIN_JSON))), patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls, ): 
mock_parser = MagicMock() mock_parser.invoke.side_effect = SystemExit(3) + mock_parser.ainvoke = AsyncMock(side_effect=SystemExit(3)) mock_parser_cls.return_value = mock_parser with pytest.raises(SystemExit) as exc_info: - stub_evaluator.execute_prompt_chain_step( + await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -498,7 +534,9 @@ def _fake_llm(_pv): ) assert exc_info.value.code == 3 - def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluation_metadata): + async def test_prompt_settings_recorded_in_step_extras( + self, stub_evaluator, evaluation_metadata + ): settings = PromptSettings( provider_type=LlmProvider.GOOGLE, model="gemini-2.0-flash", @@ -506,8 +544,8 @@ def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluatio ) template = ChatPromptTemplate.from_messages([("human", "{input}")]) - with patch(_CHAIN_PATCH, return_value=lambda _pv: AIMessage(content=_CHAIN_JSON)): - stub_evaluator.execute_prompt_chain_step( + with patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=_CHAIN_JSON))): + await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=settings, evaluation_metadata=evaluation_metadata, @@ -520,16 +558,17 @@ def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluatio assert step.extras[PROMPT_STEP_EXTRA_PROMPT_SETTINGS]["model"] == "gemini-2.0-flash" assert PROMPT_STEP_EXTRA_TOKEN_USAGE in step.extras - def test_token_usage_recorded_when_llm_reports_usage(self, stub_evaluator, evaluation_metadata): - def _llm_with_usage(_pv): - return AIMessage( - content=_CHAIN_JSON, - usage_metadata={ - "input_tokens": 42, - "output_tokens": 17, - "total_tokens": 59, - }, - ) + async def test_token_usage_recorded_when_llm_reports_usage( + self, stub_evaluator, evaluation_metadata + ): + msg = AIMessage( + content=_CHAIN_JSON, + usage_metadata={ + "input_tokens": 42, + 
"output_tokens": 17, + "total_tokens": 59, + }, + ) settings = PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -538,8 +577,8 @@ def _llm_with_usage(_pv): ) template = ChatPromptTemplate.from_messages([("human", "{input}")]) - with patch(_CHAIN_PATCH, return_value=_llm_with_usage): - stub_evaluator.execute_prompt_chain_step( + with patch(_CHAIN_PATCH, return_value=_fake_chat_model(msg)): + await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=settings, evaluation_metadata=evaluation_metadata, @@ -553,7 +592,7 @@ def _llm_with_usage(_pv): assert step.extras[PROMPT_STEP_EXTRA_TOKEN_USAGE]["output_tokens"] == 17 assert evaluation_metadata.total_token_usage[LlmProvider.GOOGLE].input_tokens == 42 - def test_propagates_configuration_error_from_create_provider( + async def test_propagates_configuration_error_from_create_provider( self, stub_evaluator, evaluation_metadata ): template = ChatPromptTemplate.from_messages([("human", "{input}")]) @@ -564,7 +603,7 @@ def test_propagates_configuration_error_from_create_provider( ), pytest.raises(ConfigurationError, match="Google provider config is not set"), ): - stub_evaluator.execute_prompt_chain_step( + await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -577,14 +616,16 @@ def test_propagates_configuration_error_from_create_provider( parser_output_type=_ChainOutput, ) - def test_propagates_evaluator_error_without_wrapping(self, stub_evaluator, evaluation_metadata): + async def test_propagates_evaluator_error_without_wrapping( + self, stub_evaluator, evaluation_metadata + ): """``EvaluatorError`` subclasses raised inside the chain are re-raised unchanged.""" template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( patch(_CHAIN_PATCH, side_effect=EvaluatorError("bare evaluator error")), pytest.raises(EvaluatorError, match="bare evaluator error"), ): - stub_evaluator.execute_prompt_chain_step( + 
await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -597,16 +638,15 @@ def test_propagates_evaluator_error_without_wrapping(self, stub_evaluator, evalu parser_output_type=_ChainOutput, ) - def test_wraps_unexpected_chain_failure_as_api_error(self, stub_evaluator, evaluation_metadata): - def _boom(_pv): - raise ValueError("simulated provider failure") - + async def test_wraps_unexpected_chain_failure_as_api_error( + self, stub_evaluator, evaluation_metadata + ): template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( - patch(_CHAIN_PATCH, return_value=_boom), + patch(_CHAIN_PATCH, return_value=_ChainFailureChatModel()), pytest.raises(APIError, match="simulated provider failure"), ): - stub_evaluator.execute_prompt_chain_step( + await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -619,18 +659,15 @@ def _boom(_pv): parser_output_type=_ChainOutput, ) - def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluation_metadata): + async def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluation_metadata): """Invalid JSON from the LLM becomes :class:`APIError` via ``wrap_provider_error``.""" - def _bad(_pv): - return AIMessage(content="not-json") - template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( - patch(_CHAIN_PATCH, return_value=_bad), + patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content="not-json"))), pytest.raises(APIError, match="Invalid json output"), ): - stub_evaluator.execute_prompt_chain_step( + await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, @@ -643,20 +680,20 @@ def _bad(_pv): parser_output_type=_ChainOutput, ) - def test_schema_mismatch_raises_pydantic_validation_error( + async def 
test_schema_mismatch_raises_pydantic_validation_error( self, stub_evaluator, evaluation_metadata ): """Valid JSON that does not satisfy the output model raises Pydantic ``ValidationError``.""" - def _partial(_pv): - return AIMessage(content='{"label": "only"}') - template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( - patch(_CHAIN_PATCH, return_value=_partial), + patch( + _CHAIN_PATCH, + return_value=_fake_chat_model(AIMessage(content='{"label": "only"}')), + ), pytest.raises(PydanticValidationError), ): - stub_evaluator.execute_prompt_chain_step( + await stub_evaluator.execute_prompt_chain_step( step_name="main", prompt_settings=PromptSettings( provider_type=LlmProvider.GOOGLE, diff --git a/sdks/python/tests/evaluators/test_conventionality.py b/sdks/python/tests/evaluators/test_conventionality.py index e339da0..19b0557 100644 --- a/sdks/python/tests/evaluators/test_conventionality.py +++ b/sdks/python/tests/evaluators/test_conventionality.py @@ -38,7 +38,7 @@ def test_evaluate_returns_evaluation_result(self): evaluator = ConventionalityEvaluator(config) inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5) with patch.object(evaluator, "execute_prompt_chain_step", return_value=_make_mock_output()): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) assert result.answer.score == "moderately_complex" assert result.answer.label == "Moderately complex" assert result.explanation.summary is not None @@ -65,7 +65,7 @@ def test_evaluate_with_explicit_settings(self): ) inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=3) with patch.object(evaluator, "execute_prompt_chain_step", return_value=_make_mock_output()): - result = evaluator.evaluate(inp, evaluation_settings=settings) + result = evaluator.evaluate_sync(inp, evaluation_settings=settings) assert result.metadata.status == Status.succeeded def test_metadata_and_default_settings(self): diff --git a/sdks/python/tests/evaluators/test_vocabulary.py 
b/sdks/python/tests/evaluators/test_vocabulary.py index a6eacf0..9a7d5d9 100644 --- a/sdks/python/tests/evaluators/test_vocabulary.py +++ b/sdks/python/tests/evaluators/test_vocabulary.py @@ -79,7 +79,7 @@ def test_evaluate_grade_3_returns_result(self): evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3) with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) assert result.answer.score == "moderately_complex" assert result.answer.label == "Moderately complex" @@ -93,7 +93,7 @@ def test_evaluate_grade_4_returns_result(self): with _patch_steps( evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output("very_complex") ): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) assert result.answer.score == "very_complex" @@ -107,7 +107,7 @@ def test_grades34_score_with_spaces_is_normalised(self): output = _make_grades34_output("slightly complex") with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, output): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) assert result.answer.score == "slightly_complex" @@ -116,7 +116,7 @@ def test_evaluate_grades34_explanation_has_word_breakdown(self): evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3) with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) details = result.explanation.details assert "tier_2_words" in details @@ -148,7 +148,7 @@ def test_all_complexity_scores_map_correctly(self, score_label, expected_score): _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(score_label), ): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) assert result.answer.score == expected_score @@ -157,7 +157,7 @@ def test_evaluate_grade_12_returns_result(self): 
evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=12) with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(1)): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) assert result.metadata.status == Status.succeeded assert result.answer.score == "slightly_complex" @@ -168,7 +168,7 @@ def test_other_grades_explanation_includes_word_breakdown(self): evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=8) with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) details = result.explanation.details assert details["tier_2_words"] == "sat" @@ -211,7 +211,7 @@ def test_other_grades_unexpected_digit_answer_raises(self): _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, unexpected), pytest.raises(ValueError, match=r"Unknown text complexity score: '9'"), ): - evaluator.evaluate(inp) + evaluator.evaluate_sync(inp) class TestNormalizeComplexityOutput: @@ -254,13 +254,13 @@ def test_allowed_grades_set_from_toml(self): @pytest.mark.parametrize("unsupported_grade", [0, 1, 2]) def test_unsupported_grade_raises_via_framework(self, unsupported_grade): - """BaseEvaluator.evaluate() calls input.validate(), which catches the bad grade.""" + """BaseEvaluator.evaluate_sync() calls input.validate(), which catches the bad grade.""" config = create_config_no_telemetry() evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=unsupported_grade) # The base evaluator catches the ValidationError, sets status=failed, then re-raises. 
with pytest.raises(ValidationError): - evaluator.evaluate(inp) + evaluator.evaluate_sync(inp) def test_unsupported_grade_sets_status_failed(self): """Metadata status is set to failed when grade validation fails.""" @@ -268,7 +268,7 @@ def test_unsupported_grade_sets_status_failed(self): evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=2) with pytest.raises(ValidationError): - evaluator.evaluate(inp) + evaluator.evaluate_sync(inp) # ── Metadata and settings ───────────────────────────────────────────────────── @@ -292,7 +292,7 @@ def test_evaluate_succeeds_and_records_metadata(self): evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5) with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)): - result = evaluator.evaluate(inp) + result = evaluator.evaluate_sync(inp) assert result.metadata.status == Status.succeeded assert result.metadata.evaluator_metadata.id == "vocabulary" From 4733f9f2b5d7f4bf4a377eabd1b278743bad4ef8 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 15:24:59 -0700 Subject: [PATCH 2/2] chore: address PR comments --- sdks/python/README.md | 6 +++--- .../evaluators/vocabulary.py | 2 +- sdks/python/tests/evaluators/test_base.py | 10 ++++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/sdks/python/README.md b/sdks/python/README.md index f1847cd..da9fd40 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -351,14 +351,14 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]): ) default_evaluation_settings = MySettings(...) - def evaluate_impl( + async def evaluate_impl( self, input: MyInput, evaluation_settings: MySettings, evaluation_metadata: EvaluationMetadata, ) -> EvaluationResult: - # Use self.execute_prompt_chain_step() for LLM calls - output = self.execute_prompt_chain_step( + # Use await self.execute_prompt_chain_step(...) 
for LLM calls + output = await self.execute_prompt_chain_step( step_name="main", prompt_settings=evaluation_settings.prompt_settings, evaluation_metadata=evaluation_metadata, diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py index 30ee1f7..dc6d761 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py @@ -98,7 +98,7 @@ async def evaluate_impl( Grade validation is handled by the framework before this method is called: ``VocabularyEvaluationInput`` automatically constrains ``grade`` to the evaluator's ``allowed_grades`` from settings (3–12), so - ``BaseEvaluator.evaluate`` / ``evaluate_sync`` raises before reaching here for unsupported grades. + ``BaseEvaluator.evaluate`` / ``evaluate_sync`` raise before reaching here for unsupported grades. """ ps_bk = evaluation_settings.prompt_settings_step_background_knowledge ps_34 = evaluation_settings.prompt_settings_step_vocab_grades_3_4 diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index 7c61c9e..35d99e9 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -195,6 +195,16 @@ async def test_evaluate_sync_raises_clear_error_when_loop_running(self, stub_eva stub_evaluator.evaluate_sync(_stub_input()) +class TestEvaluateAsyncEntrypoint: + """``await evaluator.evaluate(...)`` is the primary API when an event loop is already running.""" + + @pytest.mark.asyncio + async def test_evaluate_returns_result_in_async_context(self, stub_evaluator): + result = await stub_evaluator.evaluate(_stub_input()) + assert result.metadata.status == Status.succeeded + assert result.metadata.processing_time_ms >= 0.0 + + class TestEvaluateInputMetadata: """``input_metadata`` on :class:`EvaluationMetadata` always comes from 
``input.input_metadata()``."""