From a386280c246a972ec124444913e0082eb02befcf Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 10:10:34 -0700
Subject: [PATCH 1/2] feat(python-sdk): async-first evaluators with
 evaluate_sync wrapper

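- Make evaluate, evaluate_impl, execute_step, and execute_prompt_chain_step
  async; add evaluate_sync (asyncio.run) for synchronous callers.
- Use ainvoke for LangChain prompt chains and JSON output parsers.
- Rename the vocabulary step id from "complexity_evaluation" to
  "vocab_complexity" (changes the key under evaluation_metadata.step_details).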
---
 sdks/python/README.md                         |  19 +-
 .../evaluators/base.py                        | 128 +++++++--
 .../evaluators/conventionality.py             |   4 +-
 .../evaluators/vocabulary.py                  |  16 +-
 sdks/python/tests/contract_tests/harness.py   |   2 +-
 .../contract_tests/test_conventionality.py    |   2 +-
 .../tests/contract_tests/test_vocabulary.py   |   4 +-
 sdks/python/tests/evaluators/test_base.py     | 259 ++++++++++--------
 .../tests/evaluators/test_conventionality.py  |   4 +-
 .../tests/evaluators/test_vocabulary.py       |  24 +-
 10 files changed, 284 insertions(+), 178 deletions(-)

diff --git a/sdks/python/README.md b/sdks/python/README.md
index 9a1e80c..f1847cd 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -153,7 +153,7 @@ config = create_config(
 
 # Create evaluator and run evaluation
 evaluator = ConventionalityEvaluator(config)
-result = evaluator.evaluate(
+result = evaluator.evaluate_sync(
     ConventionalityEvaluationInput(text="The cat's out of the bag now.", grade=5)
 )
 
@@ -185,7 +185,7 @@ config = create_config(
 )
 evaluator = ConventionalityEvaluator(config)
 
-result = evaluator.evaluate(
+result = evaluator.evaluate_sync(
     ConventionalityEvaluationInput(text="Your text here.", grade=5)
 )
 
@@ -254,16 +254,17 @@ evaluator = ConventionalityEvaluator(
     default_evaluation_settings=settings,
 )
 
-# Uses the instance default (a deep copy is taken inside evaluate)
-result = evaluator.evaluate(input)
+# Uses the instance default (a deep copy is taken inside evaluate / evaluate_sync)
+result = evaluator.evaluate_sync(input)
 
 # Per-call override still wins
-result = evaluator.evaluate(input, evaluation_settings=other_settings)
+result = evaluator.evaluate_sync(input, evaluation_settings=other_settings)
 ```
 
 If you omit `default_evaluation_settings` at construction, attribute lookup uses the
-subclass class attribute, same as before. Whenever you call `evaluate()` without
-`evaluation_settings`, the SDK uses `model_copy(deep=True)` of the resolved default,
+subclass class attribute, same as before. Whenever you call `evaluate_sync()` or
+`await evaluator.evaluate(...)` without `evaluation_settings`, the SDK uses
+`model_copy(deep=True)` of the resolved default,
 so the object you keep on the instance is not mutated by a run.
 
 ### Logging
@@ -313,7 +314,7 @@ from learning_commons_evaluators import (
 )
 
 try:
-    result = evaluator.evaluate(input)
+    result = evaluator.evaluate_sync(input)
 except ConfigurationError as e:
     print(f"Config issue: {e}")
 except ValidationError as e:
@@ -326,7 +327,7 @@ except APIError as e:
 
 Failures inside LLM prompt steps are passed through `wrap_provider_error()` (see `learning_commons_evaluators.schemas.errors`) so you typically see `APIError` subclasses rather than raw LangChain or HTTP client exceptions. Use `EvaluatorTimeoutError` for timeouts (the package does not export a `TimeoutError` alias, to avoid shadowing the Python builtin).
 
-On evaluation failure, `metadata.status` and `error_details` are set on the in-memory metadata object for the run and appear on the evaluation end log line; `BaseEvaluator.evaluate` still re-raises and does not return a result object.
+On evaluation failure, `metadata.status` and `error_details` are set on the in-memory metadata object for the run and appear on the evaluation end log line; both `BaseEvaluator.evaluate` and `evaluate_sync` still re-raise the exception and do not return a result object.
 
 ## Creating custom evaluators
 
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 509b2e1..2ec69c0 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -2,9 +2,10 @@
 
 from __future__ import annotations
 
+import asyncio
 import time
 from abc import ABC, abstractmethod
-from collections.abc import Callable
+from collections.abc import Awaitable, Callable
 from typing import Any, Generic, TypeVar, overload
 
 from pydantic import BaseModel
@@ -48,7 +49,13 @@
 class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
     """
     Abstract base class for all evaluators.
-    Subclasses must set metadata, default_evaluation_settings, and implement evaluate_impl().
+    Subclasses must set ``metadata``, ``default_evaluation_settings``, and implement
+    :meth:`evaluate_impl`.
+
+    Callers run an evaluation with :meth:`evaluate` (async: ``await evaluator.evaluate(...)``)
+    or :meth:`evaluate_sync` from synchronous code (uses :func:`asyncio.run`). If a loop is
+    already running on this thread, :meth:`evaluate_sync` raises :exc:`RuntimeError`; use
+    ``await`` :meth:`evaluate` in that case.
 
     Pass ``default_evaluation_settings`` at construction to override the class-level
     defaults for that instance (used when :meth:`evaluate` is called without
@@ -70,7 +77,7 @@ def __init__(
             self.default_evaluation_settings = default_evaluation_settings
         # TODO: validate config
 
-    def evaluate(
+    async def evaluate(
         self,
         input: InputT,
         evaluation_settings: SettingsT | None = None,
@@ -79,10 +86,14 @@ def evaluate(
 
         Validates the input, delegates to :meth:`evaluate_impl`, records timing
         and status on the returned metadata, and logs start/end events via the
-        configured logger.  If ``evaluation_settings`` is ``None``, a deep copy of
+        configured logger. If ``evaluation_settings`` is ``None``, a deep copy of
         the instance's :attr:`default_evaluation_settings` is used (from the
         constructor keyword when given, otherwise the subclass class attribute).
 
+        The ``finally`` block always runs, so ``processing_time_ms`` is recorded and the
+        end log line is emitted even when validation or the implementation raises.
+        Telemetry is not sent yet; the hooks in that block are still TODOs.
+
         Args:
             input: Typed input for this evaluator.
             evaluation_settings: Optional override for evaluation settings.
@@ -116,7 +127,7 @@ def evaluate(
         )
         try:
             input.validate()
-            result = self.evaluate_impl(input, evaluation_settings, evaluation_metadata)
+            result = await self.evaluate_impl(input, evaluation_settings, evaluation_metadata)
             evaluation_metadata.status = Status.succeeded
             result.metadata = evaluation_metadata
             return result
@@ -133,38 +144,94 @@ def evaluate(
             # TODO: add full input to telemetry if enabled
             # TODO: send_telemetry(evaluation_metadata)
 
+    def evaluate_sync(
+        self,
+        input: InputT,
+        evaluation_settings: SettingsT | None = None,
+    ) -> OutputT:
+        """Run :meth:`evaluate` to completion from synchronous code.
+
+        This is a thin wrapper around :func:`asyncio.run` over :meth:`evaluate`. Use it
+        from scripts, REPLs, or tests that are not already inside an asyncio event loop.
+
+        Args:
+            input: Same as :meth:`evaluate`.
+            evaluation_settings: Same as :meth:`evaluate`.
+
+        Returns:
+            Same typed result as :meth:`evaluate` on success.
+
+        Raises:
+            Same exceptions as :meth:`evaluate`.
+            RuntimeError: If this thread already has a running asyncio event loop; use
+                ``await evaluator.evaluate(...)`` instead of :meth:`evaluate_sync`.
+
+        Note:
+            Do not call this method while an event loop is running in the current
+            thread (for example, from inside an ``async def`` function); it raises
+            :exc:`RuntimeError` there. Await :meth:`evaluate` instead.
+        """
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            pass
+        else:
+            raise RuntimeError(
+                "evaluate_sync() cannot be used while an asyncio event loop is running in "
+                "this thread; use await evaluator.evaluate(...) from async code instead."
+            ) from None
+        return asyncio.run(self.evaluate(input, evaluation_settings))
+
     @abstractmethod
-    def evaluate_impl(
+    async def evaluate_impl(
         self,
         input: InputT,
         evaluation_settings: SettingsT,
         evaluation_metadata: EvaluationMetadata,
     ) -> OutputT:
-        """Implement the evaluation logic. Return a result; base assigns evaluation_metadata onto it."""
+        """Implement the evaluator-specific logic for one run.
+
+        Subclasses perform prompt steps (often via :meth:`execute_prompt_chain_step`),
+        assemble the typed result, and return it. The base :meth:`evaluate` assigns
+        ``evaluation_metadata`` onto ``result.metadata`` after a successful return.
+
+        Args:
+            input: Validated evaluation input.
+            evaluation_settings: Resolved settings for this run (already deep-copied
+                when the caller omitted overrides).
+            evaluation_metadata: Run metadata; populate ``step_details`` and related
+                fields as steps execute.
+
+        Returns:
+            A fully constructed result object (``metadata`` may still point at the
+            same ``evaluation_metadata`` instance; the base layer sets status and timing).
+        """
         ...
 
-    def execute_step(
+    async def execute_step(
         self,
         step_name: str,
         evaluation_metadata: EvaluationMetadata,
-        implementation_function: Callable[[], StepResultT],
+        implementation_function: Callable[[], Awaitable[StepResultT]],
         *,
         extras: dict[str, Any] | None = None,
     ) -> StepResultT:
-        """Run ``implementation_function`` and record step metadata on ``evaluation_metadata``.
+        """Await ``implementation_function`` and record step metadata on ``evaluation_metadata``.
 
         ``step_name`` is always the step id. Optional ``extras`` is copied into
         :attr:`StepMetadata.extras` (merged with any updates made during the step, e.g. token usage).
 
-        The step may return any type (e.g. a Pydantic model, a plain ``str``, or ``None``); the same
-        type is returned to the caller.
+        ``implementation_function`` must be a zero-argument callable that returns an
+        awaitable (typically an ``async def`` with no parameters, or a lambda that
+        returns one). The awaited value may be any type (e.g. a Pydantic model, a plain
+        ``str``, or ``None``); the same type is returned to the caller.
         """
         start = time.perf_counter()
         step_extras = dict(extras) if extras is not None else {}
         step_metadata = StepMetadata(step_id=step_name, extras=step_extras)
         self.config.logger.info("step start", extra={"step_metadata": step_metadata})
         try:
-            result = implementation_function()
+            result = await implementation_function()
             step_metadata.status = Status.succeeded
             return result
         except Exception as e:
@@ -177,7 +244,7 @@ def execute_step(
             evaluation_metadata.step_details[step_name] = step_metadata
 
     @overload
-    def execute_prompt_chain_step(
+    async def execute_prompt_chain_step(
         self,
         step_name: str,
         prompt_settings: PromptSettings,
@@ -188,7 +255,7 @@ def execute_prompt_chain_step(
     ) -> str: ...
 
     @overload
-    def execute_prompt_chain_step(
+    async def execute_prompt_chain_step(
         self,
         step_name: str,
         prompt_settings: PromptSettings,
@@ -199,7 +266,7 @@ def execute_prompt_chain_step(
         json_dict_normalizer: Callable[[dict], dict] | None = None,
     ) -> ParsedT: ...
 
-    def execute_prompt_chain_step(
+    async def execute_prompt_chain_step(
         self,
         step_name: str,
         prompt_settings: PromptSettings,
@@ -209,19 +276,20 @@ def execute_prompt_chain_step(
         parser_output_type: type[BaseModel] | None = None,
         json_dict_normalizer: Callable[[dict], dict] | None = None,
     ) -> BaseModel | str:
-        """Run a prompt chain (template | LLM), record metadata, and return the result.
+        """Run a prompt chain (template | LLM) with ``ainvoke``, record metadata, and return the result.
 
-        When ``parser_output_type`` is a Pydantic model class, the LLM response is
-        parsed as JSON and returned as an instance of that class.  When it is
-        ``None`` (the default), the raw response content is returned as a plain
-        ``str`` (no JSON parser) — use that for steps that produce unstructured prose
-        (e.g. a background-knowledge assumption).
+        The LangChain runnable ``template | provider`` is invoked asynchronously via
+        :meth:`~langchain_core.runnables.base.Runnable.ainvoke`. When ``parser_output_type``
+        is a Pydantic model class, the LLM response is parsed as JSON and returned as an
+        instance of that class. When it is ``None`` (the default), the raw response content
+        is returned as a plain ``str`` (no JSON parser) — use that for steps that produce
+        unstructured prose (e.g. a background-knowledge assumption).
 
         Provider config (e.g. API key) is resolved from ``self.config`` by
         ``prompt_settings.provider_type``.
 
         Args:
-            step_name: Identifier for this step in evaluation_metadata.step_details.
+            step_name: Identifier for this step in ``evaluation_metadata.step_details``.
             prompt_settings: Provider type, model, and temperature for the LLM call.
             evaluation_metadata: Metadata for the full evaluation; step metadata and
                 token usage are updated in place.
@@ -241,7 +309,7 @@ def execute_prompt_chain_step(
             ``str`` when ``parser_output_type`` is omitted or ``None``.
 
         Raises:
-            ConfigurationError: No provider config for prompt_settings.provider_type.
+            ConfigurationError: No provider config for ``prompt_settings.provider_type``.
             EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged.
             ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted.
         """
@@ -250,12 +318,12 @@ def execute_prompt_chain_step(
         # Populated after a successful LLM invoke so we can attach usage even if parsing fails.
         token_usage: TokenUsage | None = None
 
-        def _run_chain() -> BaseModel | str:
+        async def _run_chain() -> BaseModel | str:
             nonlocal token_usage
             try:
                 provider = create_provider(prompt_settings, self.config)
                 llm_chain: Any = template | provider
-                ai_message = llm_chain.invoke(chain_inputs)
+                ai_message = await llm_chain.ainvoke(chain_inputs)
                 token_usage = token_usage_from_aimessage(ai_message, prompt_settings)
                 if parser_output_type is None:
                     return str(ai_message.content)
@@ -263,14 +331,14 @@ def _run_chain() -> BaseModel | str:
 
                 if json_dict_normalizer is not None:
                     loose = JsonOutputParser()
-                    parsed_dict = loose.invoke(ai_message)
+                    parsed_dict = await loose.ainvoke(ai_message)
                     if not isinstance(parsed_dict, dict):
                         parsed_dict = dict(parsed_dict)
                     normalized = json_dict_normalizer(parsed_dict)
                     return parser_output_type.model_validate(normalized)
 
                 parser = JsonOutputParser(pydantic_object=parser_output_type)
-                raw = parser.invoke(ai_message)
+                raw = await parser.ainvoke(ai_message)
                 if isinstance(raw, parser_output_type):
                     return raw
                 return parser_output_type.model_validate(raw)
@@ -284,15 +352,15 @@ def _run_chain() -> BaseModel | str:
                 raise wrap_provider_error(e) from e
 
         try:
-            return self.execute_step(
+            return await self.execute_step(
                 step_name,
                 evaluation_metadata,
+                _run_chain,
                 extras={
                     PROMPT_STEP_EXTRA_PROMPT_SETTINGS: prompt_settings_to_extras_value(
                         prompt_settings
                     ),
                 },
-                implementation_function=_run_chain,
             )
         finally:
             if token_usage is not None:
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
index 1ae7032..7669d89 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/conventionality.py
@@ -72,7 +72,7 @@ class ConventionalityEvaluator(
         _CONVENTIONALITY_CONFIG.evaluation_settings
     )
 
-    def evaluate_impl(
+    async def evaluate_impl(
         self,
         input: ConventionalityEvaluationInput,
         evaluation_settings: ConventionalityEvaluationSettings,
@@ -93,7 +93,7 @@ def evaluate_impl(
                 ("human", prompts["human_prompt"]),
             ]
         ).partial(format_instructions=parser.get_format_instructions())
-        conventionality_output = self.execute_prompt_chain_step(
+        conventionality_output = await self.execute_prompt_chain_step(
             step_name="conventionality_evaluation",
             prompt_settings=step_prompt_settings,
             evaluation_metadata=evaluation_metadata,
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
index cde17df..30ee1f7 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -87,7 +87,7 @@ class VocabularyEvaluator(
         _VOCABULARY_CONFIG.evaluation_settings
     )
 
-    def evaluate_impl(
+    async def evaluate_impl(
         self,
         input: VocabularyEvaluationInput,
         evaluation_settings: VocabularyEvaluationSettings,
@@ -98,7 +98,7 @@ def evaluate_impl(
         Grade validation is handled by the framework before this method is called:
         ``VocabularyEvaluationInput`` automatically constrains ``grade`` to the
         evaluator's ``allowed_grades`` from settings (3–12), so
-        ``BaseEvaluator.evaluate`` raises before reaching here for unsupported grades.
+        ``BaseEvaluator.evaluate`` / ``evaluate_sync`` raises before reaching here for unsupported grades.
         """
         ps_bk = evaluation_settings.prompt_settings_step_background_knowledge
         ps_34 = evaluation_settings.prompt_settings_step_vocab_grades_3_4
@@ -113,7 +113,7 @@ def evaluate_impl(
         bk_template = ChatPromptTemplate.from_messages(
             [("human", prompts["background_knowledge_prompt"])]
         )
-        background_knowledge: str = self.execute_prompt_chain_step(
+        background_knowledge: str = await self.execute_prompt_chain_step(
             step_name="background_knowledge",
             prompt_settings=ps_bk,
             evaluation_metadata=evaluation_metadata,
@@ -130,7 +130,7 @@ def evaluate_impl(
         }
         if grade in _GRADES_3_4:
             chain_inputs["fk_level"] = fk_score
-            answer, explanation = self._run_vocab_complexity_chain(
+            answer, explanation = await self._run_vocab_complexity_chain(
                 chain_inputs=chain_inputs,
                 evaluation_metadata=evaluation_metadata,
                 prompt_settings_vocab=ps_34,
@@ -138,7 +138,7 @@ def evaluate_impl(
                 user_prompt_template=prompts["vocab_grades_3_4_user_prompt"],
             )
         else:
-            answer, explanation = self._run_vocab_complexity_chain(
+            answer, explanation = await self._run_vocab_complexity_chain(
                 chain_inputs=chain_inputs,
                 evaluation_metadata=evaluation_metadata,
                 prompt_settings_vocab=ps_og,
@@ -152,7 +152,7 @@ def evaluate_impl(
             metadata=evaluation_metadata,
         )
 
-    def _run_vocab_complexity_chain(
+    async def _run_vocab_complexity_chain(
         self,
         *,
         chain_inputs: dict[str, Any],
@@ -169,8 +169,8 @@ def _run_vocab_complexity_chain(
             ]
         ).partial(format_instructions=parser.get_format_instructions())
 
-        output = self.execute_prompt_chain_step(
-            step_name="complexity_evaluation",
+        output = await self.execute_prompt_chain_step(
+            step_name="vocab_complexity",
             prompt_settings=prompt_settings_vocab,
             evaluation_metadata=evaluation_metadata,
             template=template,
diff --git a/sdks/python/tests/contract_tests/harness.py b/sdks/python/tests/contract_tests/harness.py
index 93ff7e1..0f9c284 100644
--- a/sdks/python/tests/contract_tests/harness.py
+++ b/sdks/python/tests/contract_tests/harness.py
@@ -14,7 +14,7 @@
     )
 
     with ContractTestHarness(case) as harness:
-        result = evaluator.evaluate(inp)
+        result = evaluator.evaluate_sync(inp)
 
     harness.assert_prompt_step("main")
 
diff --git a/sdks/python/tests/contract_tests/test_conventionality.py b/sdks/python/tests/contract_tests/test_conventionality.py
index 44db637..128f772 100644
--- a/sdks/python/tests/contract_tests/test_conventionality.py
+++ b/sdks/python/tests/contract_tests/test_conventionality.py
@@ -57,7 +57,7 @@ def test_turnip_grade4(self) -> None:
         )
 
         with ContractTestHarness(case) as harness:
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         # --- Prompt fidelity ---
         # Asserts that the SDK sent the same fully-formatted request as the
diff --git a/sdks/python/tests/contract_tests/test_vocabulary.py b/sdks/python/tests/contract_tests/test_vocabulary.py
index 57364c7..bfcd88d 100644
--- a/sdks/python/tests/contract_tests/test_vocabulary.py
+++ b/sdks/python/tests/contract_tests/test_vocabulary.py
@@ -68,7 +68,7 @@ def test_marco_polo_grade3(self) -> None:
         )
 
         with ContractTestHarness(case) as harness:
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         # --- Prompt fidelity ---
         # Both steps are asserted: model, temperature, and formatted messages
@@ -113,7 +113,7 @@ def test_hurricanes_grade7(self) -> None:
         )
 
         with ContractTestHarness(case) as harness:
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         # --- Prompt fidelity ---
         harness.assert_prompt_step("background_knowledge")
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 333e5c1..7c61c9e 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -1,7 +1,8 @@
 """Tests for :class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`.
 
-Covers ``__init__``, ``evaluate`` (metadata, settings override, success/failure, telemetry),
-``update_total_token_usage``, ``execute_step``, and ``execute_prompt_chain_step``.
+Covers ``__init__``, ``evaluate`` / ``evaluate_sync``, metadata and settings override,
+success/failure paths, ``update_total_token_usage``, ``execute_step``, and
+``execute_prompt_chain_step``.
 ``EvaluationMetadata`` always uses ``input.input_metadata()`` (including when
 ``send_full_input_with_telemetry`` is enabled). Helpers use both a minimal stub evaluator
 and conventionality-oriented fixtures where useful.
@@ -10,13 +11,18 @@
 from __future__ import annotations
 
 import logging
-from unittest.mock import MagicMock, patch
+from typing import NoReturn
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
-from langchain_core.messages import AIMessage
+from langchain_core.callbacks import CallbackManagerForLLMRun
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.language_models.fake_chat_models import FakeMessagesListChatModel
+from langchain_core.messages import AIMessage, BaseMessage
 from langchain_core.prompts import ChatPromptTemplate
 from pydantic import BaseModel, Field
 from pydantic import ValidationError as PydanticValidationError
+from typing_extensions import override
 
 from learning_commons_evaluators import (
     BaseEvaluator,
@@ -50,6 +56,31 @@
 _CHAIN_PATCH = "learning_commons_evaluators.evaluators.base.create_provider"
 
 
+def _fake_chat_model(message: AIMessage) -> FakeMessagesListChatModel:
+    """Fixed-response chat model for ``template | model`` chains (MagicMock breaks LC compose)."""
+
+    return FakeMessagesListChatModel(responses=[message])
+
+
+class _ChainFailureChatModel(BaseChatModel):
+    """Chat model that always raises inside generation (provider failure simulation)."""
+
+    @override
+    def _generate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: CallbackManagerForLLMRun | None = None,
+        **kwargs: object,
+    ) -> NoReturn:
+        raise ValueError("simulated provider failure")
+
+    @property
+    @override
+    def _llm_type(self) -> str:
+        return "chain-failure-test-double"
+
+
 class _ChainOutput(BaseModel):
     """Minimal LLM JSON payload model (stand-in for conventionality output models)."""
 
@@ -85,7 +116,7 @@ class _StubEvaluator(
     )
     default_evaluation_settings = _StubSettings()
 
-    def evaluate_impl(
+    async def evaluate_impl(
         self,
         input: TextComplexityEvaluationInput,
         evaluation_settings: _StubSettings,
@@ -132,19 +163,19 @@ def test_omitted_constructor_default_falls_back_to_class_attribute(self, config)
 
 class TestEvaluateSuccess:
     def test_sets_status_succeeded_and_processing_time(self, stub_evaluator):
-        result = stub_evaluator.evaluate(_stub_input())
+        result = stub_evaluator.evaluate_sync(_stub_input())
         assert result.metadata.status == Status.succeeded
         assert result.metadata.processing_time_ms >= 0.0
 
     def test_passes_explicit_evaluation_settings(self, stub_evaluator):
         custom = _StubSettings(marker=42)
-        result = stub_evaluator.evaluate(_stub_input(), evaluation_settings=custom)
+        result = stub_evaluator.evaluate_sync(_stub_input(), evaluation_settings=custom)
         assert result.metadata.evaluation_settings.marker == 42
         assert result.explanation.details.get("marker") == 42
 
     def test_constructor_default_used_when_evaluate_settings_omitted(self, config):
         ev = _StubEvaluator(config, default_evaluation_settings=_StubSettings(marker=77))
-        result = ev.evaluate(_stub_input())
+        result = ev.evaluate_sync(_stub_input())
         assert result.metadata.evaluation_settings.marker == 77
         assert result.explanation.details.get("marker") == 77
 
@@ -153,16 +184,23 @@ def test_evaluate_explicit_settings_override_constructor_default(self, config):
             config,
             default_evaluation_settings=_StubSettings(marker=1),
         )
-        result = ev.evaluate(_stub_input(), evaluation_settings=_StubSettings(marker=2))
+        result = ev.evaluate_sync(_stub_input(), evaluation_settings=_StubSettings(marker=2))
         assert result.explanation.details.get("marker") == 2
 
 
+class TestEvaluateSyncLoopGuard:
+    @pytest.mark.asyncio
+    async def test_evaluate_sync_raises_clear_error_when_loop_running(self, stub_evaluator):
+        with pytest.raises(RuntimeError, match="await evaluator.evaluate"):
+            stub_evaluator.evaluate_sync(_stub_input())
+
+
 class TestEvaluateInputMetadata:
     """``input_metadata`` on :class:`EvaluationMetadata` always comes from ``input.input_metadata()``."""
 
     def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator):
         inp = _stub_input()
-        result = stub_evaluator.evaluate(inp)
+        result = stub_evaluator.evaluate_sync(inp)
         assert result.metadata.input_metadata == inp.input_metadata()
         assert result.metadata.input_metadata["text"] == {"textLength": 11}
         assert result.metadata.input_metadata["grade_level"] == {"grade": 3}
@@ -172,7 +210,7 @@ def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, st
         cfg = create_config(telemetry_partner_id="test", send_full_input_with_telemetry=True)
         ev = _StubEvaluator(cfg)
         inp = _stub_input()
-        result = ev.evaluate(inp)
+        result = ev.evaluate_sync(inp)
         assert result.metadata.input_metadata == inp.input_metadata()
         assert result.metadata.input_metadata["text"] == {"textLength": 11}
         assert result.metadata.input_metadata["grade_level"] == {"grade": 3}
@@ -188,14 +226,16 @@ def test_raises_validation_error_for_invalid_input(self, stub_evaluator):
             grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3),
         )
         with pytest.raises(ValidationError):
-            stub_evaluator.evaluate(inp)
+            stub_evaluator.evaluate_sync(inp)
 
     def test_propagates_evaluate_impl_exception(self, stub_evaluator):
         with (
-            patch.object(stub_evaluator, "evaluate_impl", side_effect=RuntimeError("boom")),
+            patch.object(
+                stub_evaluator, "evaluate_impl", AsyncMock(side_effect=RuntimeError("boom"))
+            ),
             pytest.raises(RuntimeError, match="boom"),
         ):
-            stub_evaluator.evaluate(_stub_input())
+            stub_evaluator.evaluate_sync(_stub_input())
 
     def test_validation_failure_emits_end_log_with_failed_status(self, stub_evaluator):
         captured: list = []
@@ -218,7 +258,7 @@ def emit(self, record: logging.LogRecord) -> None:
                 grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3),
             )
             with pytest.raises(ValidationError):
-                stub_evaluator.evaluate(inp)
+                stub_evaluator.evaluate_sync(inp)
         finally:
             stub_evaluator.config.logger.removeHandler(h)
 
@@ -272,33 +312,43 @@ def test_accumulates_usage_for_existing_provider(self, stub_evaluator, evaluatio
 
 
 class TestExecuteStep:
-    def test_returns_implementation_result(self, stub_evaluator, evaluation_metadata):
-        assert (
-            stub_evaluator.execute_step("s", evaluation_metadata, lambda: "the-result")
-            == "the-result"
-        )
+    async def test_returns_implementation_result(self, stub_evaluator, evaluation_metadata):
+        async def impl():
+            return "the-result"
+
+        assert await stub_evaluator.execute_step("s", evaluation_metadata, impl) == "the-result"
+
+    async def test_records_succeeded_status_on_success(self, stub_evaluator, evaluation_metadata):
+        async def impl():
+            return None
 
-    def test_records_succeeded_status_on_success(self, stub_evaluator, evaluation_metadata):
-        stub_evaluator.execute_step("s", evaluation_metadata, lambda: None)
+        await stub_evaluator.execute_step("s", evaluation_metadata, impl)
         assert evaluation_metadata.step_details["s"].status == Status.succeeded
 
-    def test_records_failed_status_and_error_on_exception(
+    async def test_records_failed_status_and_error_on_exception(
         self, stub_evaluator, evaluation_metadata
     ):
-        failing = MagicMock(side_effect=ValueError("boom"))
+        async def failing():
+            raise ValueError("boom")
+
         with pytest.raises(ValueError, match="boom"):
-            stub_evaluator.execute_step("s", evaluation_metadata, failing)
+            await stub_evaluator.execute_step("s", evaluation_metadata, failing)
         step = evaluation_metadata.step_details["s"]
         assert step.status == Status.failed
         assert "boom" in step.error_details
 
-    def test_re_raises_exception(self, stub_evaluator, evaluation_metadata):
-        failing = MagicMock(side_effect=RuntimeError("inner"))
+    async def test_re_raises_exception(self, stub_evaluator, evaluation_metadata):
+        async def failing():
+            raise RuntimeError("inner")
+
         with pytest.raises(RuntimeError, match="inner"):
-            stub_evaluator.execute_step("s", evaluation_metadata, failing)
+            await stub_evaluator.execute_step("s", evaluation_metadata, failing)
+
+    async def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadata):
+        async def impl():
+            return None
 
-    def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadata):
-        stub_evaluator.execute_step("s", evaluation_metadata, lambda: None, extras={"k": "v"})
+        await stub_evaluator.execute_step("s", evaluation_metadata, impl, extras={"k": "v"})
         assert evaluation_metadata.step_details["s"].extras["k"] == "v"
 
 
@@ -308,22 +358,15 @@ def test_extras_appear_in_step_metadata(self, stub_evaluator, evaluation_metadat
 
 
 class TestExecutePromptChainStep:
-    """Mock ``create_provider`` so ``template | provider`` runs in-process.
+    """Mock ``create_provider`` so ``template | provider`` runs in-process."""
 
-    Fake LLMs return real ``AIMessage`` values so ``JsonOutputParser`` and
-    ``token_usage_from_aimessage`` exercise the real code paths where applicable.
-    """
-
-    def test_returns_raw_string_when_parser_output_type_is_none(
+    async def test_returns_raw_string_when_parser_output_type_is_none(
         self, stub_evaluator, evaluation_metadata
     ):
-        def _fake_llm(_pv):
-            return AIMessage(content="plain prose")
-
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         ev = _StubEvaluator(create_config_no_telemetry())
-        with patch(_CHAIN_PATCH, return_value=_fake_llm):
-            out = ev.execute_prompt_chain_step(
+        with patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content="plain prose"))):
+            out = await ev.execute_prompt_chain_step(
                 step_name="raw",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -337,12 +380,12 @@ def _fake_llm(_pv):
             )
         assert out == "plain prose"
 
-    def test_json_dict_normalizer_without_parser_type_raises(
+    async def test_json_dict_normalizer_without_parser_type_raises(
         self, stub_evaluator, evaluation_metadata
     ):
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with pytest.raises(ValueError, match="json_dict_normalizer requires"):
-            stub_evaluator.execute_prompt_chain_step(
+            await stub_evaluator.execute_prompt_chain_step(
                 step_name="raw",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -356,13 +399,10 @@ def test_json_dict_normalizer_without_parser_type_raises(
                 json_dict_normalizer=lambda d: d,
             )
 
-    def test_returns_parsed_pydantic_output(self, stub_evaluator, evaluation_metadata):
-        def _fake_llm(_pv):
-            return AIMessage(content=_CHAIN_JSON)
-
+    async def test_returns_parsed_pydantic_output(self, stub_evaluator, evaluation_metadata):
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
-        with patch(_CHAIN_PATCH, return_value=_fake_llm):
-            result = stub_evaluator.execute_prompt_chain_step(
+        with patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=_CHAIN_JSON))):
+            result = await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -378,14 +418,11 @@ def _fake_llm(_pv):
         assert result.label == "ok"
         assert result.score == 7
 
-    def test_json_dict_normalizer_parses_dict_then_normalizes_then_validates(
+    async def test_json_dict_normalizer_parses_dict_then_normalizes_then_validates(
         self, stub_evaluator, evaluation_metadata
     ):
         """Optional ``json_dict_normalizer``: loose JSON → dict → user fn → ``model_validate``."""
 
-        def _fake_llm(_pv):
-            return AIMessage(content='{"n": 1}')
-
         class _Out(BaseModel):
             n: int = Field(description="n")
             doubled: int = Field(description="doubled")
@@ -396,8 +433,11 @@ def _double(d: dict) -> dict:
             return d
 
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
-        with patch(_CHAIN_PATCH, return_value=_fake_llm):
-            result = stub_evaluator.execute_prompt_chain_step(
+        with patch(
+            _CHAIN_PATCH,
+            return_value=_fake_chat_model(AIMessage(content='{"n": 1}')),
+        ):
+            result = await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -414,24 +454,22 @@ def _double(d: dict) -> dict:
         assert result.n == 1
         assert result.doubled == 2
 
-    def test_parser_returning_model_instance_short_circuits_model_validate(
+    async def test_parser_returning_model_instance_short_circuits_model_validate(
         self, stub_evaluator, evaluation_metadata
     ):
-        """When ``JsonOutputParser.invoke`` returns a model, ``isinstance`` path skips ``model_validate``."""
+        """When ``JsonOutputParser.ainvoke`` returns a model, ``isinstance`` path skips ``model_validate``."""
         prebuilt = _ChainOutput(label="direct", score=99)
 
-        def _fake_llm(_pv):
-            return AIMessage(content="unused")
-
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
-            patch(_CHAIN_PATCH, return_value=_fake_llm),
+            patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content="unused"))),
             patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls,
         ):
             mock_parser = MagicMock()
             mock_parser.invoke.return_value = prebuilt
+            mock_parser.ainvoke = AsyncMock(return_value=prebuilt)
             mock_parser_cls.return_value = mock_parser
-            result = stub_evaluator.execute_prompt_chain_step(
+            result = await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -445,20 +483,20 @@ def _fake_llm(_pv):
             )
         assert result is prebuilt
 
-    def test_keyboard_interrupt_from_parser_propagates(self, stub_evaluator, evaluation_metadata):
-        def _fake_llm(_pv):
-            return AIMessage(content=_CHAIN_JSON)
-
+    async def test_keyboard_interrupt_from_parser_propagates(
+        self, stub_evaluator, evaluation_metadata
+    ):
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
-            patch(_CHAIN_PATCH, return_value=_fake_llm),
+            patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=_CHAIN_JSON))),
             patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls,
         ):
             mock_parser = MagicMock()
             mock_parser.invoke.side_effect = KeyboardInterrupt
+            mock_parser.ainvoke = AsyncMock(side_effect=KeyboardInterrupt)
             mock_parser_cls.return_value = mock_parser
             with pytest.raises(KeyboardInterrupt):
-                stub_evaluator.execute_prompt_chain_step(
+                await stub_evaluator.execute_prompt_chain_step(
                     step_name="main",
                     prompt_settings=PromptSettings(
                         provider_type=LlmProvider.GOOGLE,
@@ -471,20 +509,18 @@ def _fake_llm(_pv):
                     parser_output_type=_ChainOutput,
                 )
 
-    def test_system_exit_from_parser_propagates(self, stub_evaluator, evaluation_metadata):
-        def _fake_llm(_pv):
-            return AIMessage(content=_CHAIN_JSON)
-
+    async def test_system_exit_from_parser_propagates(self, stub_evaluator, evaluation_metadata):
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
-            patch(_CHAIN_PATCH, return_value=_fake_llm),
+            patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=_CHAIN_JSON))),
             patch("langchain_core.output_parsers.json.JsonOutputParser") as mock_parser_cls,
         ):
             mock_parser = MagicMock()
             mock_parser.invoke.side_effect = SystemExit(3)
+            mock_parser.ainvoke = AsyncMock(side_effect=SystemExit(3))
             mock_parser_cls.return_value = mock_parser
             with pytest.raises(SystemExit) as exc_info:
-                stub_evaluator.execute_prompt_chain_step(
+                await stub_evaluator.execute_prompt_chain_step(
                     step_name="main",
                     prompt_settings=PromptSettings(
                         provider_type=LlmProvider.GOOGLE,
@@ -498,7 +534,9 @@ def _fake_llm(_pv):
                 )
             assert exc_info.value.code == 3
 
-    def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluation_metadata):
+    async def test_prompt_settings_recorded_in_step_extras(
+        self, stub_evaluator, evaluation_metadata
+    ):
         settings = PromptSettings(
             provider_type=LlmProvider.GOOGLE,
             model="gemini-2.0-flash",
@@ -506,8 +544,8 @@ def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluatio
         )
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
 
-        with patch(_CHAIN_PATCH, return_value=lambda _pv: AIMessage(content=_CHAIN_JSON)):
-            stub_evaluator.execute_prompt_chain_step(
+        with patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=_CHAIN_JSON))):
+            await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=settings,
                 evaluation_metadata=evaluation_metadata,
@@ -520,16 +558,17 @@ def test_prompt_settings_recorded_in_step_extras(self, stub_evaluator, evaluatio
         assert step.extras[PROMPT_STEP_EXTRA_PROMPT_SETTINGS]["model"] == "gemini-2.0-flash"
         assert PROMPT_STEP_EXTRA_TOKEN_USAGE in step.extras
 
-    def test_token_usage_recorded_when_llm_reports_usage(self, stub_evaluator, evaluation_metadata):
-        def _llm_with_usage(_pv):
-            return AIMessage(
-                content=_CHAIN_JSON,
-                usage_metadata={
-                    "input_tokens": 42,
-                    "output_tokens": 17,
-                    "total_tokens": 59,
-                },
-            )
+    async def test_token_usage_recorded_when_llm_reports_usage(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        msg = AIMessage(
+            content=_CHAIN_JSON,
+            usage_metadata={
+                "input_tokens": 42,
+                "output_tokens": 17,
+                "total_tokens": 59,
+            },
+        )
 
         settings = PromptSettings(
             provider_type=LlmProvider.GOOGLE,
@@ -538,8 +577,8 @@ def _llm_with_usage(_pv):
         )
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
 
-        with patch(_CHAIN_PATCH, return_value=_llm_with_usage):
-            stub_evaluator.execute_prompt_chain_step(
+        with patch(_CHAIN_PATCH, return_value=_fake_chat_model(msg)):
+            await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=settings,
                 evaluation_metadata=evaluation_metadata,
@@ -553,7 +592,7 @@ def _llm_with_usage(_pv):
         assert step.extras[PROMPT_STEP_EXTRA_TOKEN_USAGE]["output_tokens"] == 17
         assert evaluation_metadata.total_token_usage[LlmProvider.GOOGLE].input_tokens == 42
 
-    def test_propagates_configuration_error_from_create_provider(
+    async def test_propagates_configuration_error_from_create_provider(
         self, stub_evaluator, evaluation_metadata
     ):
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
@@ -564,7 +603,7 @@ def test_propagates_configuration_error_from_create_provider(
             ),
             pytest.raises(ConfigurationError, match="Google provider config is not set"),
         ):
-            stub_evaluator.execute_prompt_chain_step(
+            await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -577,14 +616,16 @@ def test_propagates_configuration_error_from_create_provider(
                 parser_output_type=_ChainOutput,
             )
 
-    def test_propagates_evaluator_error_without_wrapping(self, stub_evaluator, evaluation_metadata):
+    async def test_propagates_evaluator_error_without_wrapping(
+        self, stub_evaluator, evaluation_metadata
+    ):
         """``EvaluatorError`` subclasses raised inside the chain are re-raised unchanged."""
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
             patch(_CHAIN_PATCH, side_effect=EvaluatorError("bare evaluator error")),
             pytest.raises(EvaluatorError, match="bare evaluator error"),
         ):
-            stub_evaluator.execute_prompt_chain_step(
+            await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -597,16 +638,15 @@ def test_propagates_evaluator_error_without_wrapping(self, stub_evaluator, evalu
                 parser_output_type=_ChainOutput,
             )
 
-    def test_wraps_unexpected_chain_failure_as_api_error(self, stub_evaluator, evaluation_metadata):
-        def _boom(_pv):
-            raise ValueError("simulated provider failure")
-
+    async def test_wraps_unexpected_chain_failure_as_api_error(
+        self, stub_evaluator, evaluation_metadata
+    ):
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
-            patch(_CHAIN_PATCH, return_value=_boom),
+            patch(_CHAIN_PATCH, return_value=_ChainFailureChatModel()),
             pytest.raises(APIError, match="simulated provider failure"),
         ):
-            stub_evaluator.execute_prompt_chain_step(
+            await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -619,18 +659,15 @@ def _boom(_pv):
                 parser_output_type=_ChainOutput,
             )
 
-    def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluation_metadata):
+    async def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluation_metadata):
         """Invalid JSON from the LLM becomes :class:`APIError` via ``wrap_provider_error``."""
 
-        def _bad(_pv):
-            return AIMessage(content="not-json")
-
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
-            patch(_CHAIN_PATCH, return_value=_bad),
+            patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content="not-json"))),
             pytest.raises(APIError, match="Invalid json output"),
         ):
-            stub_evaluator.execute_prompt_chain_step(
+            await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
@@ -643,20 +680,20 @@ def _bad(_pv):
                 parser_output_type=_ChainOutput,
             )
 
-    def test_schema_mismatch_raises_pydantic_validation_error(
+    async def test_schema_mismatch_raises_pydantic_validation_error(
         self, stub_evaluator, evaluation_metadata
     ):
         """Valid JSON that does not satisfy the output model raises Pydantic ``ValidationError``."""
 
-        def _partial(_pv):
-            return AIMessage(content='{"label": "only"}')
-
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
-            patch(_CHAIN_PATCH, return_value=_partial),
+            patch(
+                _CHAIN_PATCH,
+                return_value=_fake_chat_model(AIMessage(content='{"label": "only"}')),
+            ),
             pytest.raises(PydanticValidationError),
         ):
-            stub_evaluator.execute_prompt_chain_step(
+            await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
                 prompt_settings=PromptSettings(
                     provider_type=LlmProvider.GOOGLE,
diff --git a/sdks/python/tests/evaluators/test_conventionality.py b/sdks/python/tests/evaluators/test_conventionality.py
index e339da0..19b0557 100644
--- a/sdks/python/tests/evaluators/test_conventionality.py
+++ b/sdks/python/tests/evaluators/test_conventionality.py
@@ -38,7 +38,7 @@ def test_evaluate_returns_evaluation_result(self):
         evaluator = ConventionalityEvaluator(config)
         inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=5)
         with patch.object(evaluator, "execute_prompt_chain_step", return_value=_make_mock_output()):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
         assert result.answer.score == "moderately_complex"
         assert result.answer.label == "Moderately complex"
         assert result.explanation.summary is not None
@@ -65,7 +65,7 @@ def test_evaluate_with_explicit_settings(self):
         )
         inp = ConventionalityEvaluationInput(text=_SAMPLE_TEXT, grade=3)
         with patch.object(evaluator, "execute_prompt_chain_step", return_value=_make_mock_output()):
-            result = evaluator.evaluate(inp, evaluation_settings=settings)
+            result = evaluator.evaluate_sync(inp, evaluation_settings=settings)
         assert result.metadata.status == Status.succeeded
 
     def test_metadata_and_default_settings(self):
diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py
index a6eacf0..9a7d5d9 100644
--- a/sdks/python/tests/evaluators/test_vocabulary.py
+++ b/sdks/python/tests/evaluators/test_vocabulary.py
@@ -79,7 +79,7 @@ def test_evaluate_grade_3_returns_result(self):
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         assert result.answer.score == "moderately_complex"
         assert result.answer.label == "Moderately complex"
@@ -93,7 +93,7 @@ def test_evaluate_grade_4_returns_result(self):
         with _patch_steps(
             evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output("very_complex")
         ):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         assert result.answer.score == "very_complex"
 
@@ -107,7 +107,7 @@ def test_grades34_score_with_spaces_is_normalised(self):
         output = _make_grades34_output("slightly complex")
 
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, output):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         assert result.answer.score == "slightly_complex"
 
@@ -116,7 +116,7 @@ def test_evaluate_grades34_explanation_has_word_breakdown(self):
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=3)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_grades34_output()):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         details = result.explanation.details
         assert "tier_2_words" in details
@@ -148,7 +148,7 @@ def test_all_complexity_scores_map_correctly(self, score_label, expected_score):
             _MOCK_BACKGROUND_KNOWLEDGE,
             _make_other_grades_output(score_label),
         ):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         assert result.answer.score == expected_score
 
@@ -157,7 +157,7 @@ def test_evaluate_grade_12_returns_result(self):
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=12)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(1)):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         assert result.metadata.status == Status.succeeded
         assert result.answer.score == "slightly_complex"
@@ -168,7 +168,7 @@ def test_other_grades_explanation_includes_word_breakdown(self):
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=8)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         details = result.explanation.details
         assert details["tier_2_words"] == "sat"
@@ -211,7 +211,7 @@ def test_other_grades_unexpected_digit_answer_raises(self):
             _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, unexpected),
             pytest.raises(ValueError, match=r"Unknown text complexity score: '9'"),
         ):
-            evaluator.evaluate(inp)
+            evaluator.evaluate_sync(inp)
 
 
 class TestNormalizeComplexityOutput:
@@ -254,13 +254,13 @@ def test_allowed_grades_set_from_toml(self):
 
     @pytest.mark.parametrize("unsupported_grade", [0, 1, 2])
     def test_unsupported_grade_raises_via_framework(self, unsupported_grade):
-        """BaseEvaluator.evaluate() calls input.validate(), which catches the bad grade."""
+        """BaseEvaluator.evaluate_sync() calls input.validate(), which catches the bad grade."""
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=unsupported_grade)
         # The base evaluator catches the ValidationError, sets status=failed, then re-raises.
         with pytest.raises(ValidationError):
-            evaluator.evaluate(inp)
+            evaluator.evaluate_sync(inp)
 
     def test_unsupported_grade_sets_status_failed(self):
         """Metadata status is set to failed when grade validation fails."""
@@ -268,7 +268,7 @@ def test_unsupported_grade_sets_status_failed(self):
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=2)
         with pytest.raises(ValidationError):
-            evaluator.evaluate(inp)
+            evaluator.evaluate_sync(inp)
 
 
 # ── Metadata and settings ─────────────────────────────────────────────────────
@@ -292,7 +292,7 @@ def test_evaluate_succeeds_and_records_metadata(self):
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=5)
         with _patch_steps(evaluator, _MOCK_BACKGROUND_KNOWLEDGE, _make_other_grades_output(2)):
-            result = evaluator.evaluate(inp)
+            result = evaluator.evaluate_sync(inp)
 
         assert result.metadata.status == Status.succeeded
         assert result.metadata.evaluator_metadata.id == "vocabulary"

From 4733f9f2b5d7f4bf4a377eabd1b278743bad4ef8 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 15:24:59 -0700
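Subject: [PATCH 2/2] chore: address PR comments

- Update the README custom-evaluator example to an async evaluate_impl that awaits execute_prompt_chain_step.
- Fix subject-verb agreement in the vocabulary evaluator's evaluate_impl docstring.
- Add a test that awaits BaseEvaluator.evaluate directly inside a running event loop.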

---
 sdks/python/README.md                                  |  6 +++---
 .../evaluators/vocabulary.py                           |  2 +-
 sdks/python/tests/evaluators/test_base.py              | 10 ++++++++++
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/sdks/python/README.md b/sdks/python/README.md
index f1847cd..da9fd40 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -351,14 +351,14 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]):
     )
     default_evaluation_settings = MySettings(...)
 
-    def evaluate_impl(
+    async def evaluate_impl(
         self,
         input: MyInput,
         evaluation_settings: MySettings,
         evaluation_metadata: EvaluationMetadata,
     ) -> EvaluationResult:
-        # Use self.execute_prompt_chain_step() for LLM calls
-        output = self.execute_prompt_chain_step(
+        # Use await self.execute_prompt_chain_step(...) for LLM calls
+        output = await self.execute_prompt_chain_step(
             step_name="main",
             prompt_settings=evaluation_settings.prompt_settings,
             evaluation_metadata=evaluation_metadata,
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
index 30ee1f7..dc6d761 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/vocabulary.py
@@ -98,7 +98,7 @@ async def evaluate_impl(
         Grade validation is handled by the framework before this method is called:
         ``VocabularyEvaluationInput`` automatically constrains ``grade`` to the
         evaluator's ``allowed_grades`` from settings (3–12), so
-        ``BaseEvaluator.evaluate`` / ``evaluate_sync`` raises before reaching here for unsupported grades.
+        ``BaseEvaluator.evaluate`` / ``evaluate_sync`` raise before reaching here for unsupported grades.
         """
         ps_bk = evaluation_settings.prompt_settings_step_background_knowledge
         ps_34 = evaluation_settings.prompt_settings_step_vocab_grades_3_4
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 7c61c9e..35d99e9 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -195,6 +195,16 @@ async def test_evaluate_sync_raises_clear_error_when_loop_running(self, stub_eva
             stub_evaluator.evaluate_sync(_stub_input())
 
 
+class TestEvaluateAsyncEntrypoint:
+    """``await evaluator.evaluate(...)`` is the primary API when an event loop is already running."""
+
+    @pytest.mark.asyncio
+    async def test_evaluate_returns_result_in_async_context(self, stub_evaluator):
+        result = await stub_evaluator.evaluate(_stub_input())
+        assert result.metadata.status == Status.succeeded
+        assert result.metadata.processing_time_ms >= 0.0
+
+
 class TestEvaluateInputMetadata:
     """``input_metadata`` on :class:`EvaluationMetadata` always comes from ``input.input_metadata()``."""