From 28437e8e298f3a393dbc679e463df7e3ab547b05 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 17:12:53 -0700 Subject: [PATCH 1/8] feat(python-sdk): error handling improvements --- .../learning_commons_evaluators/__init__.py | 12 +- .../src/learning_commons_evaluators/errors.py | 16 +- .../evaluators/base.py | 45 +- .../schemas/__init__.py | 4 +- .../schemas/common_inputs.py | 16 +- .../schemas/errors.py | 518 ++++++++++++++---- .../schemas/evaluator.py | 16 +- sdks/python/tests/evaluators/test_base.py | 91 ++- .../tests/evaluators/test_vocabulary.py | 8 +- .../tests/schemas/test_common_inputs.py | 14 +- sdks/python/tests/schemas/test_errors.py | 386 ++++++++++--- .../tests/schemas/test_evaluator_schemas.py | 10 +- 12 files changed, 902 insertions(+), 234 deletions(-) diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py index cccf241..5a0fbba 100644 --- a/sdks/python/src/learning_commons_evaluators/__init__.py +++ b/sdks/python/src/learning_commons_evaluators/__init__.py @@ -22,11 +22,11 @@ AuthenticationError, ConfigurationError, EvaluatorError, - EvaluatorRetryableError, - EvaluatorTimeoutError, + InputValidationError, NetworkError, + OutputValidationError, RateLimitError, - ValidationError, + RequestTimeoutError, wrap_provider_error, ) @@ -113,8 +113,6 @@ "EvaluatorError", "EvaluatorMaturity", "EvaluatorMetadata", - "EvaluatorRetryableError", - "EvaluatorTimeoutError", "GoogleLLMProviderConfig", "AnyInputSpec", "GradeInputField", @@ -122,6 +120,7 @@ "InputField", "InputSpec", "InputT", + "InputValidationError", "TextInputSpec", "LLMProvider", "LLMProviderConfig", @@ -129,15 +128,16 @@ "NetworkError", "OpenAILLMProviderConfig", "OutputT", + "OutputValidationError", "PromptSettings", "RateLimitError", + "RequestTimeoutError", "SDK_LOGGER_NAME", "Status", "TelemetryConfig", "TextComplexityEvaluationInput", "TextInputField", "TokenUsage", - "ValidationError", "VocabularyEvaluationInput", "VocabularyEvaluationSettings", "VocabularyEvaluator", diff --git a/sdks/python/src/learning_commons_evaluators/errors.py b/sdks/python/src/learning_commons_evaluators/errors.py index 648fe06..6d952e4 100644 --- a/sdks/python/src/learning_commons_evaluators/errors.py +++ b/sdks/python/src/learning_commons_evaluators/errors.py @@ -5,11 +5,13 @@ AuthenticationError, ConfigurationError, EvaluatorError, - EvaluatorRetryableError, - EvaluatorTimeoutError, + InputValidationError, NetworkError, + OutputValidationError, RateLimitError, - ValidationError, + RequestTimeoutError, + format_error_for_metadata, + sanitize_pydantic_errors, wrap_provider_error, ) @@ -18,10 +20,12 @@ "AuthenticationError", "ConfigurationError", "EvaluatorError", - "EvaluatorRetryableError", - "EvaluatorTimeoutError", + "InputValidationError", "NetworkError", + "OutputValidationError", "RateLimitError", - "ValidationError", + "RequestTimeoutError", + "format_error_for_metadata", + "sanitize_pydantic_errors", "wrap_provider_error", ] diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 2ec69c0..a28dbf5 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -8,6 +8,7 @@ from collections.abc import Awaitable, Callable from typing import Any, Generic, TypeVar, overload +from langchain_core.exceptions import OutputParserException from pydantic import BaseModel from pydantic import ValidationError as PydanticValidationError @@ -22,6 +23,9 @@ ) from learning_commons_evaluators.schemas.errors import ( EvaluatorError, + OutputValidationError, + format_error_for_metadata, + sanitize_pydantic_errors, wrap_provider_error, ) from learning_commons_evaluators.schemas.evaluator import ( @@ -105,7 +109,7 @@ async def evaluate( on success. Raises: - ValidationError: Input fails validation. + InputValidationError: Input fails validation. ConfigurationError: No provider config for the required LLM provider. APIError (or subclasses): The LLM API call failed. @@ -133,7 +137,7 @@ async def evaluate( return result except Exception as e: evaluation_metadata.status = Status.failed - evaluation_metadata.error_details = str(e) + evaluation_metadata.error_details = format_error_for_metadata(e) raise finally: evaluation_metadata.processing_time_ms = (time.perf_counter() - start) * 1000 @@ -236,7 +240,7 @@ async def execute_step( return result except Exception as e: step_metadata.status = Status.failed - step_metadata.error_details = str(e) + step_metadata.error_details = format_error_for_metadata(e) raise finally: step_metadata.processing_time_ms = (time.perf_counter() - start) * 1000 @@ -310,7 +314,14 @@ async def execute_prompt_chain_step( Raises: ConfigurationError: No provider config for ``prompt_settings.provider_type``. - EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged. + OutputValidationError: The LLM response didn't satisfy the expected + output schema — either because it wasn't valid JSON (LangChain + ``OutputParserException``) or because the JSON didn't match the + Pydantic model (``pydantic.ValidationError``). The original + exception is reachable via ``__cause__``. + APIError (or other ``EvaluatorError`` subclasses): + :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` + output for LangChain or HTTP failures from the LLM provider. ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted. """ if json_dict_normalizer is not None and parser_output_type is None: @@ -344,12 +355,32 @@ async def _run_chain() -> BaseModel | str: return parser_output_type.model_validate(raw) except EvaluatorError: raise - except PydanticValidationError: - raise + except (PydanticValidationError, OutputParserException) as e: + # The provider returned a response that didn't match the expected schema. + # This covers both Pydantic schema-validation failures and LangChain + # JSON-parse failures (``OutputParserException``). Wrap so callers can + # discriminate output-parse failures from other API errors and so the + # message that lands in ``EvaluationMetadata.error_details`` stays + # sanitized — the original error (which may include LLM output snippets) + # is reachable via ``__cause__`` for debugging. + validation_errors = ( + sanitize_pydantic_errors(e.errors()) + if isinstance(e, PydanticValidationError) + else None + ) + raise OutputValidationError( + provider=prompt_settings.provider_type, + model=prompt_settings.model, + validation_errors=validation_errors, + ) from e except (KeyboardInterrupt, SystemExit): raise except Exception as e: - raise wrap_provider_error(e) from e + raise wrap_provider_error( + e, + provider=prompt_settings.provider_type, + model=prompt_settings.model, + ) from e try: return await self.execute_step( diff --git a/sdks/python/src/learning_commons_evaluators/schemas/__init__.py b/sdks/python/src/learning_commons_evaluators/schemas/__init__.py index b168e59..f28b750 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/__init__.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/__init__.py @@ -13,7 +13,7 @@ ConventionalityEvaluationSettings, ConventionalityOutput, ) -from learning_commons_evaluators.schemas.errors import ValidationError +from learning_commons_evaluators.schemas.errors import InputValidationError from learning_commons_evaluators.schemas.evaluator import ( EvaluationAnswer, EvaluationExplanation, @@ -70,6 +70,6 @@ "TextComplexityEvaluationInput", "TextInputField", "TokenUsage", - "ValidationError", + "InputValidationError", "prompt_settings_to_extras_value", ] diff --git a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py index d916289..0cf32b0 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py @@ -8,14 +8,14 @@ # spec loaded from TOML via EvaluatorMetadata.inputs text_spec: TextInputSpec = config.evaluator_metadata.inputs["text"] field = TextInputField(spec=text_spec, value="The quick brown fox...") - field.validate() # raises ValidationError if constraints are violated + field.validate() # raises InputValidationError if constraints are violated """ from typing import Any from pydantic import model_validator -from learning_commons_evaluators.schemas.errors import ValidationError +from learning_commons_evaluators.schemas.errors import InputValidationError from learning_commons_evaluators.schemas.evaluator import InputField from learning_commons_evaluators.schemas.input_specs import ( GradeInputSpec, @@ -59,13 +59,13 @@ def _strip_value_if_spec_requests(cls, data: Any) -> Any: return data def validate(self) -> None: - """Raise :class:`~.errors.ValidationError` if the value violates the spec constraints.""" + """Raise :class:`~.errors.InputValidationError` if the value violates the spec constraints.""" text_length = len(self.value) min_len = self.spec.min_text_length or 0 if text_length < min_len: - raise ValidationError(f"Text length {text_length} is below minimum {min_len}.") + raise InputValidationError(f"Text length {text_length} is below minimum {min_len}.") if self.spec.max_text_length is not None and text_length > self.spec.max_text_length: - raise ValidationError( + raise InputValidationError( f"Text length {text_length} exceeds maximum {self.spec.max_text_length}." ) @@ -87,12 +87,12 @@ class GradeInputField(InputField[int]): spec: GradeInputSpec def validate(self) -> None: - """Raise :class:`~.errors.ValidationError` if the grade is out of range or not in the allowed set.""" + """Raise :class:`~.errors.InputValidationError` if the grade is out of range or not in the allowed set.""" grade = self.value if grade < 0 or grade > 12: - raise ValidationError(f"Grade {grade} is not in allowed range 0-12.") + raise InputValidationError(f"Grade {grade} is not in allowed range 0-12.") if self.spec.allowed_grades is not None and grade not in self.spec.allowed_grades: - raise ValidationError( + raise InputValidationError( f"Grade {grade} is not in allowed set: {sorted(self.spec.allowed_grades)}." ) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py index 47e5a22..545c1b9 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py @@ -1,143 +1,471 @@ -"""Custom error types for the Evaluators SDK.""" +"""Custom error types for the Evaluators SDK. +Design notes for maintainers +---------------------------- + +This module is a Python-idiomatic port of the TypeScript SDK's error hierarchy: + +* The class is the identity. We do *not* maintain ``self.name`` / ``self.code`` + / ``self.message`` string mirrors of the class — those are JS-isms. Callers + discriminate with ``isinstance``. ``str(err)`` returns the message. + +* Retryability is a single, attribute-based signal. ``err.retryable`` is the + source of truth. There is no parallel ``RetryableError`` marker class to + inherit from. Subclasses set a class-level default; callers may override per + instance for the cases where it varies (e.g. a DNS-failed ``NetworkError``). + +* Messages on SDK errors are short and controlled. We deliberately do **not** + echo raw provider error strings into ``str(err)`` because they may contain + prompt content, user text, or fragments of API keys. The original provider + exception is preserved via ``raise … from`` (``__cause__``) and structured + details (``status_code``, ``response_body``, ``request_id``, ``provider``, + ``model``) are exposed as attributes for callers that want to introspect. + +* :func:`wrap_provider_error` prefers structured attributes (``.status_code``, + ``.response``) from provider SDK exceptions over regex on message strings. + String parsing is the fallback when nothing structured is available. +""" + +from __future__ import annotations + +import asyncio +import builtins import re +from typing import TYPE_CHECKING, Any +if TYPE_CHECKING: + from learning_commons_evaluators.schemas.config import LLMProvider -# TODO: rename name and message, and remove Evaluator prefix where appropriate -class EvaluatorError(Exception): - """Base error class for all evaluator errors.""" - def __init__(self, message: str, code: str | None = None): - super().__init__(message) - self.message = message - self.code = code - self.name = "EvaluatorError" +class EvaluatorError(Exception): + """Base class for every error raised by this SDK. + All SDK errors carry a ``retryable`` boolean. The default is ``False`` + (subclasses for transient API failures override it). Callers that wrap an + evaluator call in retry logic can check ``isinstance(err, EvaluatorError)`` + and then consult ``err.retryable`` rather than learning the full hierarchy. + """ -class EvaluatorRetryableError(EvaluatorError): - """Base for errors that may be retried (possibly with backoff). All other evaluator errors are non-retryable.""" + #: Whether the error condition is plausibly transient. Subclasses override. + retryable: bool = False - def __init__(self, message: str, code: str | None = None): - super().__init__(message, code) - self.name = "EvaluatorRetryableError" + def __init__(self, message: str) -> None: + super().__init__(message) class ConfigurationError(EvaluatorError): - """Configuration error - e.g. missing API keys. Non-retryable.""" + """The SDK is misconfigured. + + Examples: missing provider config for the requested LLM provider, an + unknown model ID, a malformed settings file. These are developer errors + and are not retryable. + """ - def __init__(self, message: str): - super().__init__(message, "CONFIGURATION_ERROR") - self.name = "ConfigurationError" + retryable = False -class ValidationError(EvaluatorError): - """Validation error - invalid input. Non-retryable.""" +class InputValidationError(EvaluatorError): + """Evaluator input failed validation. - def __init__(self, message: str): - super().__init__(message, "VALIDATION_ERROR") - self.name = "ValidationError" + Raised before any LLM call when an :class:`InputField` rejects its value. + Renamed from ``ValidationError`` to avoid the collision with + :class:`pydantic.ValidationError`. Not retryable. + """ + + retryable = False class APIError(EvaluatorError): - """Base API error - LLM API calls failed. Use subclasses or set retryable explicitly.""" + """Base class for failures originating in the LLM provider call. + + Carries structured debug attributes alongside the sanitized message: + + Attributes: + status_code: HTTP status code from the provider, when one was returned. + retryable: ``True`` for transient failures (5xx, rate-limit, timeout, + network). Subclasses set sensible defaults; the constructor accepts + an override for cases like a non-retryable DNS failure. + provider: Which configured LLM provider raised the error, when known. + model: Model ID requested when the error occurred, when known. + response_body: Decoded response body from the provider, when available. + May contain echoed prompt text — treat as sensitive in logs. + request_id: Provider request ID for support escalation. + + The original provider exception is preserved as ``self.__cause__`` when + callers use ``raise wrap_provider_error(e) from e`` (which is what the SDK + base evaluator does). + """ def __init__( self, message: str, + *, status_code: int | None = None, - retryable: bool = False, - code: str | None = None, - ): - super().__init__(message, code) + retryable: bool | None = None, + provider: LLMProvider | None = None, + model: str | None = None, + response_body: Any | None = None, + request_id: str | None = None, + ) -> None: + super().__init__(message) self.status_code = status_code - self.retryable = retryable - self.name = "APIError" + # Honour subclass class-level default unless caller explicitly overrides. + if retryable is not None: + self.retryable = retryable + self.provider = provider + self.model = model + self.response_body = response_body + self.request_id = request_id class AuthenticationError(APIError): - """Invalid or missing API keys (401/403). Non-retryable.""" + """Provider rejected the request because of credentials (HTTP 401/403). + + Not retryable. The error message is deliberately generic ("Authentication + failed") rather than the provider's raw message, which may contain a + fragment of the API key. + """ + + retryable = False + + def __init__( + self, + message: str = "Authentication failed", + *, + status_code: int | None = None, + **kwargs: Any, + ) -> None: + super().__init__(message, status_code=status_code, **kwargs) + - def __init__(self, message: str, status_code: int | None = None): - super().__init__(message, status_code, False, "AUTHENTICATION_ERROR") - self.name = "AuthenticationError" +class RateLimitError(APIError): + """Provider returned HTTP 429. + Always retryable; honour ``retry_after`` (seconds) when present. + """ -class RateLimitError(APIError, EvaluatorRetryableError): - """Rate limit exceeded (429). Should be retried with backoff.""" + retryable = True + + def __init__( + self, + message: str = "Rate limit exceeded", + *, + retry_after: float | None = None, + status_code: int | None = 429, + **kwargs: Any, + ) -> None: + super().__init__(message, status_code=status_code, **kwargs) + #: Suggested delay before retry, in **seconds**. ``None`` if the + #: provider did not return a Retry-After header. + self.retry_after = retry_after - def __init__(self, message: str, retry_after: int | None = None): - super().__init__(message, 429, True, "RATE_LIMIT_ERROR") - self.retry_after = retry_after # milliseconds - self.name = "RateLimitError" +class NetworkError(APIError): + """A network-level failure prevented the request from completing. -class NetworkError(APIError, EvaluatorRetryableError): - """Network failure. May be retryable.""" + Connection refused, DNS resolution failure, broken TLS, etc. Retryable by + default but pass ``retryable=False`` for cases that won't improve on retry + (e.g. a permanently bad hostname). + """ - def __init__(self, message: str, retryable: bool = True): - super().__init__(message, None, retryable, "NETWORK_ERROR") - self.name = "NetworkError" + retryable = True -class EvaluatorTimeoutError(APIError, EvaluatorRetryableError): - """Request timed out. Should be retried with caution.""" +class RequestTimeoutError(APIError): + """The request to the provider exceeded the configured timeout. - def __init__(self, message: str = "Request timed out"): - super().__init__(message, 408, True, "TIMEOUT_ERROR") - self.name = "EvaluatorTimeoutError" + Retryable. Renamed from ``EvaluatorTimeoutError`` — the ``Evaluator`` + prefix was a TypeScript-port leftover and the Python builtin + ``TimeoutError`` makes the unqualified name a poor choice. + """ + retryable = True -# TODO: OpenAI & Anthropic may return a status_code in the response. -def _parse_provider_error(error: BaseException) -> tuple[str, int | None, str | None]: + def __init__( + self, + message: str = "Request timed out", + *, + status_code: int | None = 408, + **kwargs: Any, + ) -> None: + super().__init__(message, status_code=status_code, **kwargs) + + +class OutputValidationError(APIError): + """The LLM response did not satisfy the expected output schema. + + Sits under :class:`APIError` because the failure originated in a provider + response (not in user-supplied input), but the request itself completed — + so ``status_code`` is typically ``None``. + + Retryable by default: LLMs are nondeterministic, and a parse failure on + one call may resolve on the next without changing the prompt. Override + ``retryable=False`` if your prompt/schema combination is systematically + incompatible. + + The original :class:`pydantic.ValidationError` is preserved on + ``__cause__`` when raised via ``raise … from e``. The :attr:`validation_errors` + attribute exposes the field-level details from Pydantic's ``errors()`` API + with raw input values stripped — safe for inclusion in logs and telemetry. + """ + + retryable = True + + def __init__( + self, + message: str = "Model output failed schema validation", + *, + validation_errors: list[dict[str, Any]] | None = None, + **kwargs: Any, + ) -> None: + super().__init__(message, **kwargs) + #: Per-field validation failures. Each entry has ``loc``, ``type``, + #: and ``msg`` keys (from :meth:`pydantic.ValidationError.errors`). + #: Raw input values are deliberately omitted to avoid leaking LLM + #: output (which may echo user prompts) into telemetry. + self.validation_errors = validation_errors + + +def sanitize_pydantic_errors(errors: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Strip raw input values from a list of :meth:`pydantic.ValidationError.errors` entries. + + Pydantic's per-error dicts include an ``input`` key containing the raw + value that failed validation. For LLM outputs that value may echo prompt + content, so we drop it before the error reaches logs or telemetry. The + remaining keys (``loc``, ``type``, ``msg``, ``ctx`` if present) describe + the schema and the failure mode but not the offending content. + """ + return [ + {key: value for key, value in error.items() if key != "input"} + for error in errors + ] + + +def format_error_for_metadata(error: BaseException) -> str: + """Render an exception as a string safe for ``EvaluationMetadata.error_details``. + + SDK errors (:class:`EvaluatorError` subclasses) already carry sanitized + messages, so include both the class name and the message for debuggability. + Any other exception is rendered as just its class name — the message may + contain raw values (e.g. an :class:`AttributeError` echoing a field name + from user input) and isn't safe to put in telemetry. + """ + if isinstance(error, EvaluatorError): + return f"{type(error).__name__}: {error}" + return f"Unexpected error: {type(error).__name__}" + + +# --- Provider-error wrapping -------------------------------------------------- + +# Status codes we consider transient even without a more specific subclass match. +_RETRYABLE_STATUS_MIN = 500 + +# Exception class-name suffixes used as a soft signal when isinstance checks +# against provider SDKs aren't possible (e.g. those SDKs aren't installed in +# the caller's environment). The provider SDKs follow consistent naming. +_TIMEOUT_NAME_HINTS = ("TimeoutError", "APITimeoutError", "ReadTimeout", "ConnectTimeout") +_NETWORK_NAME_HINTS = ("ConnectionError", "APIConnectionError", "ConnectError", "NetworkError") + + +def _provider_status_code(error: Exception) -> int | None: + """Best-effort extraction of an HTTP status code from a provider exception. + + Prefers the structured attribute that openai-python and anthropic-python + both expose (``.status_code``). Falls back to ``.status`` (used by some + httpx-based clients) and finally to a regex on the message — narrowed to + HTTP-shaped 4xx/5xx codes near the start of the message to reduce false + positives like model names or token counts. + """ + for attr in ("status_code", "status"): + value = getattr(error, attr, None) + if isinstance(value, int): + return value + match = re.search(r"\b(4\d{2}|5\d{2})\b", str(error)[:80]) + if match: + return int(match.group(1)) + return None + + +def _provider_response_body(error: Exception) -> Any | None: + """Pull the decoded response body off the provider exception when available.""" + body = getattr(error, "body", None) + if body is not None: + return body + response = getattr(error, "response", None) + if response is None: + return None + # httpx.Response and similar objects: try .json() then fall back to .text. + json_fn = getattr(response, "json", None) + if callable(json_fn): + try: + return json_fn() + except Exception: # noqa: BLE001 — body is best-effort debug data + pass + return getattr(response, "text", None) + + +def _provider_request_id(error: Exception) -> str | None: + """Pull a request ID off the provider exception when available.""" + request_id = getattr(error, "request_id", None) + if request_id: + return str(request_id) + response = getattr(error, "response", None) + headers = getattr(response, "headers", None) + if headers is None: + return None + try: + value = headers.get("x-request-id") or headers.get("request-id") + except Exception: # noqa: BLE001 — non-mapping headers shouldn't crash wrapping + return None + return str(value) if value else None + + +def _extract_retry_after(error: Exception) -> float | None: + """Return Retry-After in **seconds** from a provider exception, if present. + + Reads ``response.headers['retry-after']`` when present (per RFC 7231 this + may be an integer number of seconds or an HTTP-date; only the integer form + is currently handled, which matches every provider we use). Falls back to + a ``retry-after: N`` substring in the message. + """ + response = getattr(error, "response", None) + headers = getattr(response, "headers", None) + if headers is not None: + try: + raw = headers.get("retry-after") or headers.get("Retry-After") + except Exception: # noqa: BLE001 + raw = None + if raw is not None: + try: + return float(raw) + except (TypeError, ValueError): + # HTTP-date form is not currently parsed — providers we + # integrate with always return seconds. + pass message = str(error) - status_code = None - code = getattr(error, "name", None) or type(error).__name__ - if code == "Error": - code = None - match = re.search(r"\b(4\d{2}|5\d{2})\b", message) + match = re.search(r"retry[- ]after[:\s]+(\d+)", message, re.I) if match: - status_code = int(match.group(1)) - return message, status_code, code + return float(match.group(1)) + return None + + +def _is_model_not_found(error: Exception, status_code: int | None) -> bool: + """Detect "model not found" errors from a 404 or model-shaped 400.""" + if status_code == 404: + return True + if status_code == 400: + return bool(re.search(r"\bmodel\b.*(not found|does not exist|invalid)", str(error), re.I)) + return False + + +def _is_timeout(error: Exception) -> bool: + """Detect timeout errors by exception class regardless of which SDK raised.""" + if isinstance(error, (asyncio.TimeoutError, builtins.TimeoutError)): + return True + name = type(error).__name__ + return any(hint in name for hint in _TIMEOUT_NAME_HINTS) + + +def _is_network(error: Exception) -> bool: + """Detect network-level failures by exception class.""" + if isinstance(error, ConnectionError): + return True + name = type(error).__name__ + return any(hint in name for hint in _NETWORK_NAME_HINTS) def wrap_provider_error( - error: BaseException, default_message: str = "API request failed" -) -> APIError: - """Wrap a provider error into the appropriate SDK error type.""" - message, status_code, code = _parse_provider_error(error) - msg = message or default_message + error: Exception, + *, + default_message: str = "API request failed", + provider: LLMProvider | None = None, + model: str | None = None, +) -> EvaluatorError: + """Translate a provider exception into the appropriate SDK error subclass. + + Routing precedence (most specific first): + + 1. Asyncio / builtin / SDK timeout exception types → :class:`RequestTimeoutError` + 2. Built-in / SDK connection-error exception types → :class:`NetworkError` + 3. HTTP 404 (and model-shaped 400) → :class:`ConfigurationError` (model not found) + 4. HTTP 401 / 403 → :class:`AuthenticationError` + 5. HTTP 429 → :class:`RateLimitError` + 6. HTTP 5xx → retryable :class:`APIError` + 7. Anything else → non-retryable :class:`APIError` + + SDK error messages are sanitized — we do not interpolate the provider's + raw message into ``str(err)``. Structured detail is available on the + exception's attributes (``status_code``, ``response_body``, ``request_id``, + ``provider``, ``model``) and the original exception is reachable via + ``__cause__`` when callers use ``raise wrap_provider_error(e) from e``. + + Args: + error: The provider exception to wrap. Must be a regular ``Exception`` + (callers should not pass ``KeyboardInterrupt``/``SystemExit``). + default_message: Used only when no more specific message can be + constructed (e.g. an unidentified, status-less exception). + provider: The configured LLM provider that raised, when the caller + knows it. Surfaced as ``err.provider`` to disambiguate multi- + provider runs. + model: The model ID that was being invoked, when known. Surfaced as + ``err.model`` and folded into the message for + :class:`ConfigurationError` results. + + Returns: + An :class:`EvaluatorError` subclass. Always a new instance — never the + original ``error``. + """ + status_code = _provider_status_code(error) + response_body = _provider_response_body(error) + request_id = _provider_request_id(error) + common: dict[str, Any] = { + "provider": provider, + "model": model, + "response_body": response_body, + "request_id": request_id, + } + + if _is_timeout(error): + return RequestTimeoutError(status_code=status_code, **common) + + if _is_network(error): + return NetworkError(**common) + + if _is_model_not_found(error, status_code): + model_suffix = f" (model: {model!r})" if model else "" + return ConfigurationError( + f"Model not found or invalid{model_suffix}. " + "Check the model ID configured for this provider." + ) if status_code in (401, 403): - return AuthenticationError( - msg if "API key" in msg or "api key" in msg.lower() else "Invalid API key", - status_code, - ) + return AuthenticationError(status_code=status_code, **common) + if status_code == 429: - retry_match = re.search(r"retry[- ]after[:\s]+(\d+)", msg, re.I) - retry_after = int(retry_match.group(1)) * 1000 if retry_match else None - return RateLimitError( - msg if "rate limit" in msg.lower() else "Rate limit exceeded", - retry_after, + return RateLimitError(retry_after=_extract_retry_after(error), **common) + + if status_code is not None: + return APIError( + f"API request failed (HTTP {status_code})", + status_code=status_code, + retryable=status_code >= _RETRYABLE_STATUS_MIN, + **common, ) - # Timeouts before generic "Connection" — many stacks use "Connection timed out" - if "timeout" in msg.lower() or "timed out" in msg.lower(): - return EvaluatorTimeoutError(msg) - # TODO: confirm if these apply to Python too. Based on TypeScript SDK implementation. - if any( - x in msg - for x in ( - "ECONNREFUSED", - "ENOTFOUND", - "ETIMEDOUT", - "network", - "Network", - "Connection", - ) - ): - return NetworkError(msg) - return APIError( - msg, - status_code, - bool(status_code and status_code >= 500), - code, - ) + + return APIError(default_message, retryable=False, **common) + + +__all__ = [ + "APIError", + "AuthenticationError", + "ConfigurationError", + "EvaluatorError", + "InputValidationError", + "NetworkError", + "OutputValidationError", + "RateLimitError", + "RequestTimeoutError", + "format_error_for_metadata", + "sanitize_pydantic_errors", + "wrap_provider_error", +] diff --git a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py index 4a7d78f..ea5586f 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py @@ -13,7 +13,7 @@ from pydantic import BaseModel, Field, model_validator -from .errors import ConfigurationError, ValidationError +from .errors import ConfigurationError, InputValidationError from .input_specs import InputSpec from .metadata import EvaluationMetadata @@ -43,7 +43,7 @@ class TextInputField(InputField[str]): def validate(self) -> None: if len(self.value) < self.spec.min_text_length: - raise ValidationError(...) + raise InputValidationError(...) def input_metadata(self) -> dict[str, Any]: return {"textLength": len(self.value)} @@ -56,7 +56,7 @@ def input_metadata(self) -> dict[str, Any]: def validate(self) -> None: """Validate *value* against *spec* constraints. - Raise :class:`~.errors.ValidationError` if the value is invalid. + Raise :class:`~.errors.InputValidationError` if the value is invalid. """ @abstractmethod @@ -150,19 +150,19 @@ def _coerce_raw_to_input_fields(cls, data: Any) -> Any: def validate(self) -> None: """Validate all :class:`InputField` members, collecting every error before raising. - Raises :class:`~.errors.ValidationError` if any field is invalid. + Raises :class:`~.errors.InputValidationError` if any field is invalid. """ - errors: list[tuple[str, ValidationError]] = [] + errors: list[tuple[str, InputValidationError]] = [] for name in type(self).model_fields: field_val = getattr(self, name) if isinstance(field_val, InputField): try: field_val.validate() - except ValidationError as e: + except InputValidationError as e: errors.append((name, e)) if errors: - parts = [f"{field}: {err.message}" for field, err in errors] - raise ValidationError("Validation errors: " + "; ".join(parts)) + parts = [f"{field}: {err}" for field, err in errors] + raise InputValidationError("Validation errors: " + "; ".join(parts)) def input_metadata(self) -> dict[str, Any]: """Return a mapping of field name → :meth:`InputField.input_metadata` for each field. diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index 43d123b..cca0c93 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -38,7 +38,12 @@ LLMProvider, PromptSettings, ) -from learning_commons_evaluators.schemas.errors import APIError, EvaluatorError, ValidationError +from learning_commons_evaluators.schemas.errors import ( + APIError, + EvaluatorError, + InputValidationError, + OutputValidationError, +) from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec from learning_commons_evaluators.schemas.metadata import ( PROMPT_STEP_EXTRA_PROMPT_SETTINGS, @@ -235,7 +240,7 @@ def test_raises_validation_error_for_invalid_input(self, stub_evaluator): ), grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), ) - with pytest.raises(ValidationError): + with pytest.raises(InputValidationError): stub_evaluator.evaluate_sync(inp) def test_propagates_evaluate_impl_exception(self, stub_evaluator): @@ -267,7 +272,7 @@ def emit(self, record: logging.LogRecord) -> None: ), grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), ) - with pytest.raises(ValidationError): + with pytest.raises(InputValidationError): stub_evaluator.evaluate_sync(inp) finally: stub_evaluator.config.logger.removeHandler(h) @@ -335,17 +340,42 @@ async def impl(): await stub_evaluator.execute_step("s", evaluation_metadata, impl) assert evaluation_metadata.step_details["s"].status == Status.succeeded - async def test_records_failed_status_and_error_on_exception( + async def test_records_failed_status_and_sanitized_error_on_unexpected_exception( self, stub_evaluator, evaluation_metadata ): + """Non-SDK exceptions record only the class name in step metadata. + + The raw message (``"boom"`` here) is intentionally stripped — for arbitrary + exceptions the message may contain user data, prompt content, or field values + that aren't safe for logs/telemetry. The exception itself still propagates + unchanged for callers that catch it. + """ + async def failing(): - raise ValueError("boom") + raise ValueError("boom with possibly-sensitive context") with pytest.raises(ValueError, match="boom"): await stub_evaluator.execute_step("s", evaluation_metadata, failing) step = evaluation_metadata.step_details["s"] assert step.status == Status.failed - assert "boom" in step.error_details + # Class name is included; raw message is not. + assert "ValueError" in step.error_details + assert "boom" not in step.error_details + + async def test_records_failed_status_and_sanitized_error_on_sdk_exception( + self, stub_evaluator, evaluation_metadata + ): + """SDK exceptions land in step metadata as ``ClassName: ``.""" + + async def failing(): + raise InputValidationError("text length 3 below minimum 10") + + with pytest.raises(InputValidationError): + await stub_evaluator.execute_step("s", evaluation_metadata, failing) + step = evaluation_metadata.step_details["s"] + assert step.status == Status.failed + # SDK error messages are already controlled by us, so they're included verbatim. + assert step.error_details == "InputValidationError: text length 3 below minimum 10" async def test_re_raises_exception(self, stub_evaluator, evaluation_metadata): async def failing(): @@ -669,13 +699,21 @@ async def test_wraps_unexpected_chain_failure_as_api_error( parser_output_type=_ChainOutput, ) - async def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluation_metadata): - """Invalid JSON from the LLM becomes :class:`APIError` via ``wrap_provider_error``.""" + async def test_malformed_llm_json_raises_output_validation_error( + self, stub_evaluator, evaluation_metadata + ): + """Invalid JSON from the LLM becomes :class:`OutputValidationError` (a subclass of APIError). + + LangChain raises ``OutputParserException`` for unparseable JSON; we wrap it so the + SDK exposes a single, sanitized parse-failure type instead of leaking the raw + ``"Invalid json output: "`` string (which would echo LLM content). + """ template = ChatPromptTemplate.from_messages([("human", "{input}")]) + bad_output = "not-json with prompt echo: API_KEY=sk-...REDACTED" with ( - patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content="not-json"))), - pytest.raises(APIError, match="Invalid json output"), + patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=bad_output))), + pytest.raises(OutputValidationError) as exc_info, ): await stub_evaluator.execute_prompt_chain_step( step_name="main", @@ -689,11 +727,27 @@ async def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluat chain_inputs={"input": "text"}, parser_output_type=_ChainOutput, ) - - async def test_schema_mismatch_raises_pydantic_validation_error( + # Sanitized — the raw model output must not appear in str() of the wrapped error. + assert "API_KEY" not in str(exc_info.value) + assert "sk-" not in str(exc_info.value) + # Original exception is preserved for debugging. + assert exc_info.value.__cause__ is not None + # Subclass relationship: callers can still catch APIError. + assert isinstance(exc_info.value, APIError) + # Provider/model context is threaded through. + assert exc_info.value.provider is LLMProvider.GOOGLE + assert exc_info.value.model == "gemini-2.0-flash" + + async def test_schema_mismatch_raises_output_validation_error( self, stub_evaluator, evaluation_metadata ): - """Valid JSON that does not satisfy the output model raises Pydantic ``ValidationError``.""" + """Valid JSON that doesn't satisfy the output model becomes :class:`OutputValidationError`. + + Pydantic's :class:`ValidationError` is wrapped so the original (which may + include input value snippets via Pydantic's default formatting) stays on + ``__cause__`` rather than the SDK error's ``str()``. The structured + per-field details land on ``validation_errors`` with input values stripped. + """ template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( @@ -701,7 +755,7 @@ async def test_schema_mismatch_raises_pydantic_validation_error( _CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content='{"label": "only"}')), ), - pytest.raises(PydanticValidationError), + pytest.raises(OutputValidationError) as exc_info, ): await stub_evaluator.execute_prompt_chain_step( step_name="main", @@ -715,3 +769,12 @@ async def test_schema_mismatch_raises_pydantic_validation_error( chain_inputs={"input": "text"}, parser_output_type=_ChainOutput, ) + # Pydantic error preserved for callers that want full detail. + assert isinstance(exc_info.value.__cause__, PydanticValidationError) + # Structured per-field errors are populated; raw 'input' is stripped. + assert exc_info.value.validation_errors is not None + assert len(exc_info.value.validation_errors) > 0 + for entry in exc_info.value.validation_errors: + assert "input" not in entry + assert "loc" in entry + assert "type" in entry diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py index 9a7d5d9..e3d9270 100644 --- a/sdks/python/tests/evaluators/test_vocabulary.py +++ b/sdks/python/tests/evaluators/test_vocabulary.py @@ -9,7 +9,7 @@ VocabularyEvaluator, create_config_no_telemetry, ) -from learning_commons_evaluators.schemas.errors import ConfigurationError, ValidationError +from learning_commons_evaluators.schemas.errors import ConfigurationError, InputValidationError from learning_commons_evaluators.schemas.metadata import Status from learning_commons_evaluators.schemas.vocabulary import ( VocabularyComplexityOutput, @@ -258,8 +258,8 @@ def test_unsupported_grade_raises_via_framework(self, unsupported_grade): config = create_config_no_telemetry() evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=unsupported_grade) - # The base evaluator catches the ValidationError, sets status=failed, then re-raises. - with pytest.raises(ValidationError): + # The base evaluator catches the InputValidationError, sets status=failed, then re-raises. + with pytest.raises(InputValidationError): evaluator.evaluate_sync(inp) def test_unsupported_grade_sets_status_failed(self): @@ -267,7 +267,7 @@ def test_unsupported_grade_sets_status_failed(self): config = create_config_no_telemetry() evaluator = VocabularyEvaluator(config) inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=2) - with pytest.raises(ValidationError): + with pytest.raises(InputValidationError): evaluator.evaluate_sync(inp) diff --git a/sdks/python/tests/schemas/test_common_inputs.py b/sdks/python/tests/schemas/test_common_inputs.py index 2cc7d3e..8bbb7e3 100644 --- a/sdks/python/tests/schemas/test_common_inputs.py +++ b/sdks/python/tests/schemas/test_common_inputs.py @@ -6,7 +6,7 @@ GradeInputField, TextInputField, ) -from learning_commons_evaluators.schemas.errors import ValidationError +from learning_commons_evaluators.schemas.errors import InputValidationError from learning_commons_evaluators.schemas.input_specs import ( GradeInputSpec, TextInputSpec, @@ -43,14 +43,14 @@ def test_validate_passes_at_exact_min(self): TextInputField(spec=_text_spec(min_text_length=2), value="ab").validate() def test_validate_raises_below_min(self): - with pytest.raises(ValidationError, match="below minimum"): + with pytest.raises(InputValidationError, match="below minimum"): TextInputField(spec=_text_spec(min_text_length=2), value="a").validate() def test_validate_passes_at_exact_max(self): TextInputField(spec=_text_spec(max_text_length=10), value="a" * 10).validate() def test_validate_raises_above_max(self): - with pytest.raises(ValidationError, match="exceeds maximum"): + with pytest.raises(InputValidationError, match="exceeds maximum"): TextInputField(spec=_text_spec(max_text_length=10), value="a" * 11).validate() def test_validate_no_max_by_default(self): @@ -85,7 +85,7 @@ def test_strip_whitespace_explicit_true_trims_value(self): def test_validate_raises_when_strip_shortens_below_min(self): """Padding does not count toward ``min_text_length`` when stripping is on.""" - with pytest.raises(ValidationError, match="below minimum"): + with pytest.raises(InputValidationError, match="below minimum"): TextInputField( spec=_text_spec(min_text_length=5, strip_whitespace=True), value=" ab ", @@ -103,11 +103,11 @@ def test_validate_passes_at_boundaries(self): GradeInputField(spec=_grade_spec(), value=12).validate() # upper boundary def test_validate_raises_below_0(self): - with pytest.raises(ValidationError, match="0-12"): + with pytest.raises(InputValidationError, match="0-12"): GradeInputField(spec=_grade_spec(), value=-1).validate() def test_validate_raises_above_12(self): - with pytest.raises(ValidationError, match="0-12"): + with pytest.raises(InputValidationError, match="0-12"): GradeInputField(spec=_grade_spec(), value=13).validate() def test_validate_passes_when_in_allowed_grades(self): @@ -117,7 +117,7 @@ def test_validate_passes_when_in_allowed_grades(self): ).validate() def test_validate_raises_when_not_in_allowed_grades(self): - with pytest.raises(ValidationError, match="not in allowed set"): + with pytest.raises(InputValidationError, match="not in allowed set"): GradeInputField( spec=_grade_spec(allowed_grades=[5, 6, 7]), value=8, diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py index d25cefd..ce25d18 100644 --- a/sdks/python/tests/schemas/test_errors.py +++ b/sdks/python/tests/schemas/test_errors.py @@ -1,134 +1,376 @@ """Tests for the error hierarchy and wrap_provider_error(). -Each wrap_provider_error test targets a specific branch in the function so -that every routing decision is covered independently. +Covers two surfaces: + +1. The hierarchy itself — every subclass advertises the right ``retryable`` + default, message defaults are sanitized, and structured attributes (status + code, retry-after, response body, request id, provider, model) round-trip. + +2. ``wrap_provider_error`` — one test per routing branch, plus tests that the + function prefers structured attributes (``.status_code``, ``.response``) over + regex on the message and that raw provider messages do not leak into the + wrapped error's ``str()``. """ +from __future__ import annotations + +import asyncio +from typing import Any + import pytest +from learning_commons_evaluators.schemas.config import LLMProvider from learning_commons_evaluators.schemas.errors import ( APIError, AuthenticationError, ConfigurationError, EvaluatorError, - EvaluatorTimeoutError, + InputValidationError, NetworkError, + OutputValidationError, RateLimitError, - ValidationError, + RequestTimeoutError, + format_error_for_metadata, + sanitize_pydantic_errors, wrap_provider_error, ) +# --- Hierarchy --------------------------------------------------------------- + + class TestErrorHierarchy: - """Verify that every error type carries the right code, name, and retryable flag.""" + """The class is the identity. Each subclass exposes the right retryable default.""" @pytest.mark.parametrize( - "exc,expected_code,expected_retryable", + "exc,expected_retryable", [ - (ValidationError("bad"), "VALIDATION_ERROR", None), - (ConfigurationError("missing"), "CONFIGURATION_ERROR", None), - (AuthenticationError("denied", 401), "AUTHENTICATION_ERROR", False), - (RateLimitError("slow down"), "RATE_LIMIT_ERROR", True), - (NetworkError("no route"), "NETWORK_ERROR", True), - (EvaluatorTimeoutError(), "TIMEOUT_ERROR", True), + (InputValidationError("bad"), False), + (ConfigurationError("missing"), False), + (AuthenticationError(status_code=401), False), + (RateLimitError(), True), + (NetworkError(), True), + (RequestTimeoutError(), True), + (OutputValidationError(), True), ], ) - def test_error_code_and_retryable(self, exc, expected_code, expected_retryable): - assert exc.code == expected_code - if expected_retryable is not None: - assert exc.retryable is expected_retryable + def test_retryable_default(self, exc: EvaluatorError, expected_retryable: bool) -> None: + assert exc.retryable is expected_retryable + + def test_every_subclass_is_an_evaluator_error(self) -> None: + for exc in ( + InputValidationError("x"), + ConfigurationError("x"), + APIError("x"), + AuthenticationError(), + RateLimitError(), + NetworkError(), + RequestTimeoutError(), + OutputValidationError(), + ): + assert isinstance(exc, EvaluatorError) + + def test_api_error_subclasses_are_api_errors(self) -> None: + for exc in ( + AuthenticationError(), + RateLimitError(), + NetworkError(), + RequestTimeoutError(), + OutputValidationError(), + ): + assert isinstance(exc, APIError) + + def test_output_validation_error_carries_validation_errors(self) -> None: + details = [ + {"loc": ("vocabulary_complexity",), "type": "missing", "msg": "Field required"} + ] + err = OutputValidationError(validation_errors=details) + assert err.validation_errors == details + # No input snippets leak into the SDK error's string form. + assert str(err) == "Model output failed schema validation" - def test_evaluator_error_stores_message(self): - err = EvaluatorError("something broke", code="ERR") + def test_evaluator_error_str_returns_message(self) -> None: + err = EvaluatorError("something broke") assert str(err) == "something broke" - assert err.message == "something broke" - assert err.code == "ERR" - def test_api_error_status_code_and_retryable(self): - err = APIError("server error", status_code=500, retryable=True) + def test_api_error_structured_attributes(self) -> None: + err = APIError( + "server error", + status_code=500, + retryable=True, + provider=LLMProvider.OPENAI, + model="gpt-4o", + response_body={"error": "explode"}, + request_id="req_abc", + ) assert err.status_code == 500 assert err.retryable is True + assert err.provider is LLMProvider.OPENAI + assert err.model == "gpt-4o" + assert err.response_body == {"error": "explode"} + assert err.request_id == "req_abc" - def test_rate_limit_error_carries_retry_after(self): - err = RateLimitError("too fast", retry_after=60_000) + def test_rate_limit_error_retry_after_is_seconds(self) -> None: + """Retry-After is exposed in **seconds** (not milliseconds like the TS SDK).""" + err = RateLimitError(retry_after=60.0) assert err.status_code == 429 - assert err.retry_after == 60_000 + assert err.retry_after == 60.0 + + def test_network_error_retryable_overridable(self) -> None: + """Permanently bad hostnames should be flaggable as non-retryable.""" + err = NetworkError("DNS resolution failed permanently", retryable=False) + assert err.retryable is False + + def test_authentication_error_default_message_is_generic(self) -> None: + """Default message must not contain anything from the original provider error.""" + err = AuthenticationError(status_code=401) + assert str(err) == "Authentication failed" + + +# --- Helpers for fake provider exceptions ------------------------------------ + + +class _FakeHeaders: + def __init__(self, headers: dict[str, str]) -> None: + self._headers = headers + + def get(self, key: str, default: Any = None) -> Any: + return self._headers.get(key.lower(), self._headers.get(key, default)) + + +class _FakeResponse: + def __init__(self, headers: dict[str, str] | None = None, body: Any = None) -> None: + self.headers = _FakeHeaders(headers or {}) + self._body = body + + def json(self) -> Any: + return self._body + + +class _FakeProviderError(Exception): + """Stand-in for openai/anthropic API errors: exposes ``status_code`` / ``response``.""" + + def __init__( + self, + message: str, + *, + status_code: int | None = None, + response: _FakeResponse | None = None, + body: Any | None = None, + ) -> None: + super().__init__(message) + self.status_code = status_code + self.response = response + self.body = body + + +# --- wrap_provider_error: routing branches ---------------------------------- class TestWrapProviderError: - """One test per routing branch in wrap_provider_error().""" + """One test per routing branch, plus attribute-vs-regex preferences.""" - def test_401_with_api_key_text_returns_authentication_error(self): - wrapped = wrap_provider_error(Exception("401 Invalid API key")) - assert isinstance(wrapped, AuthenticationError) - assert wrapped.status_code == 401 + def test_status_code_read_from_attribute_not_message(self) -> None: + """Should prefer the structured .status_code attribute over message regex.""" + err = _FakeProviderError("something opaque", status_code=429) + wrapped = wrap_provider_error(err) + assert isinstance(wrapped, RateLimitError) + assert wrapped.status_code == 429 + + def test_404_routes_to_configuration_error_with_model_in_message(self) -> None: + """Model-not-found is a developer mistake, not an API error.""" + err = _FakeProviderError("not found", status_code=404) + wrapped = wrap_provider_error(err, provider=LLMProvider.OPENAI, model="gpt-bogus") + assert isinstance(wrapped, ConfigurationError) + assert "gpt-bogus" in str(wrapped) + + def test_400_model_message_routes_to_configuration_error(self) -> None: + err = _FakeProviderError("model 'gpt-bogus' does not exist", status_code=400) + wrapped = wrap_provider_error(err) + assert isinstance(wrapped, ConfigurationError) - def test_401_without_api_key_text_uses_fallback_message(self): - """When the 401 message doesn't mention 'api key', wrap should still use AuthenticationError - with the generic 'Invalid API key' fallback rather than the raw message.""" - wrapped = wrap_provider_error(Exception("401 Unauthorized")) + def test_401_routes_to_authentication_error_with_sanitized_message(self) -> None: + err = _FakeProviderError("401: api key sk-...REDACTED is invalid", status_code=401) + wrapped = wrap_provider_error(err) assert isinstance(wrapped, AuthenticationError) assert wrapped.status_code == 401 - assert "Invalid API key" in str(wrapped) + # Original raw message must not leak into the SDK error's string form. + assert "sk-" not in str(wrapped) + assert str(wrapped) == "Authentication failed" - def test_403_returns_authentication_error(self): - wrapped = wrap_provider_error(Exception("403 Forbidden")) + def test_403_routes_to_authentication_error(self) -> None: + err = _FakeProviderError("forbidden", status_code=403) + wrapped = wrap_provider_error(err) assert isinstance(wrapped, AuthenticationError) assert wrapped.status_code == 403 - def test_429_returns_rate_limit_error(self): - wrapped = wrap_provider_error(Exception("429 rate limit exceeded")) + def test_429_with_retry_after_header_is_in_seconds(self) -> None: + """Retry-After header takes precedence over message regex and is in seconds.""" + err = _FakeProviderError( + "429 rate limit", + status_code=429, + response=_FakeResponse(headers={"retry-after": "12"}), + ) + wrapped = wrap_provider_error(err) assert isinstance(wrapped, RateLimitError) - assert wrapped.status_code == 429 + assert wrapped.retry_after == 12.0 - def test_429_with_retry_after_header_parses_delay(self): - """Retry-After value in the message should be extracted and converted to ms.""" - wrapped = wrap_provider_error(Exception("429 rate limit exceeded. Retry-After: 5")) + def test_429_falls_back_to_message_regex_for_retry_after(self) -> None: + err = _FakeProviderError("429 rate limit. Retry-After: 5", status_code=429) + wrapped = wrap_provider_error(err) assert isinstance(wrapped, RateLimitError) - assert wrapped.retry_after == 5000 # 5 seconds → 5000 ms + assert wrapped.retry_after == 5.0 - def test_429_without_retry_after_sets_none(self): - wrapped = wrap_provider_error(Exception("429 rate limit exceeded")) + def test_429_without_retry_after_sets_none(self) -> None: + err = _FakeProviderError("429 rate limit", status_code=429) + wrapped = wrap_provider_error(err) assert isinstance(wrapped, RateLimitError) assert wrapped.retry_after is None - @pytest.mark.parametrize( - "message", - [ - "ECONNREFUSED 127.0.0.1:443", - "ENOTFOUND api.example.com", - "ETIMEDOUT after 30s", - "Connection failed", - "Network error", - ], - ) - def test_network_keywords_return_network_error(self, message): - wrapped = wrap_provider_error(Exception(message)) + def test_builtin_connection_error_routes_to_network_error(self) -> None: + wrapped = wrap_provider_error(ConnectionError("connection refused")) assert isinstance(wrapped, NetworkError) - @pytest.mark.parametrize( - "message", - [ - "request timed out", - "Connection timed out after 10 seconds", - ], - ) - def test_timeout_phrases_return_timeout_error(self, message): - wrapped = wrap_provider_error(Exception(message)) - assert isinstance(wrapped, EvaluatorTimeoutError) + def test_provider_named_connection_error_routes_to_network_error(self) -> None: + """Detect by class name when isinstance against the SDK class isn't possible.""" + + class APIConnectionError(Exception): + pass + + wrapped = wrap_provider_error(APIConnectionError("connect failed")) + assert isinstance(wrapped, NetworkError) + + def test_builtin_timeout_error_routes_to_timeout(self) -> None: + wrapped = wrap_provider_error(TimeoutError("timed out")) + assert isinstance(wrapped, RequestTimeoutError) - def test_500_returns_retryable_api_error(self): - wrapped = wrap_provider_error(Exception("500 Internal Server Error")) + def test_asyncio_timeout_error_routes_to_timeout(self) -> None: + wrapped = wrap_provider_error(asyncio.TimeoutError()) + assert isinstance(wrapped, RequestTimeoutError) + + def test_provider_named_timeout_error_routes_to_timeout(self) -> None: + class APITimeoutError(Exception): + pass + + wrapped = wrap_provider_error(APITimeoutError("timed out")) + assert isinstance(wrapped, RequestTimeoutError) + + def test_500_returns_retryable_api_error(self) -> None: + err = _FakeProviderError("internal server error", status_code=500) + wrapped = wrap_provider_error(err) assert isinstance(wrapped, APIError) assert wrapped.status_code == 500 assert wrapped.retryable is True - def test_unknown_exception_returns_non_retryable_api_error(self): + def test_502_returns_retryable_api_error(self) -> None: + err = _FakeProviderError("bad gateway", status_code=502) + wrapped = wrap_provider_error(err) + assert isinstance(wrapped, APIError) + assert wrapped.retryable is True + + def test_unknown_exception_returns_non_retryable_api_error(self) -> None: wrapped = wrap_provider_error(Exception("something completely unexpected")) assert isinstance(wrapped, APIError) assert wrapped.status_code is None assert wrapped.retryable is False - def test_empty_message_uses_default_message(self): + def test_default_message_is_used_when_no_signal(self) -> None: wrapped = wrap_provider_error(Exception(""), default_message="fallback message") assert "fallback message" in str(wrapped) + + def test_provider_and_model_threaded_through(self) -> None: + err = _FakeProviderError("429", status_code=429) + wrapped = wrap_provider_error(err, provider=LLMProvider.ANTHROPIC, model="claude-x") + assert wrapped.provider is LLMProvider.ANTHROPIC + assert wrapped.model == "claude-x" + + def test_response_body_and_request_id_extracted(self) -> None: + err = _FakeProviderError( + "429", + status_code=429, + response=_FakeResponse( + headers={"x-request-id": "req_xyz"}, + body={"error": "rate_limited"}, + ), + ) + wrapped = wrap_provider_error(err) + assert wrapped.request_id == "req_xyz" + assert wrapped.response_body == {"error": "rate_limited"} + + def test_body_attribute_preferred_over_response_json(self) -> None: + """When the SDK already has a decoded body attribute, use it directly.""" + err = _FakeProviderError( + "429", + status_code=429, + body={"error": "from_body_attr"}, + response=_FakeResponse(body={"error": "from_response_json"}), + ) + wrapped = wrap_provider_error(err) + assert wrapped.response_body == {"error": "from_body_attr"} + + def test_original_exception_preserved_via_cause(self) -> None: + """`raise wrap_provider_error(e) from e` keeps __cause__ for debugging.""" + original = _FakeProviderError("opaque inner error", status_code=500) + try: + try: + raise original + except Exception as e: + raise wrap_provider_error(e) from e + except APIError as wrapped: + assert wrapped.__cause__ is original + + def test_message_regex_fallback_when_no_status_code_attr(self) -> None: + """When the exception has no .status_code, fall back to bounded regex on message.""" + wrapped = wrap_provider_error(Exception("503 Service Unavailable")) + assert isinstance(wrapped, APIError) + assert wrapped.status_code == 503 + assert wrapped.retryable is True + + +# --- sanitize_pydantic_errors ------------------------------------------------ + + +class TestSanitizePydanticErrors: + def test_strips_input_key_only(self) -> None: + """Drop the raw ``input`` value but keep loc/type/msg/ctx for debugging.""" + raw = [ + { + "loc": ("vocabulary_complexity",), + "type": "missing", + "msg": "Field required", + "input": {"label": "only"}, # potentially echoes LLM output + "ctx": {"some": "context"}, + } + ] + sanitized = sanitize_pydantic_errors(raw) + assert sanitized == [ + { + "loc": ("vocabulary_complexity",), + "type": "missing", + "msg": "Field required", + "ctx": {"some": "context"}, + } + ] + + def test_handles_empty_list(self) -> None: + assert sanitize_pydantic_errors([]) == [] + + +# --- format_error_for_metadata ----------------------------------------------- + + +class TestFormatErrorForMetadata: + def test_sdk_error_includes_class_name_and_sanitized_message(self) -> None: + err = RateLimitError() + assert format_error_for_metadata(err) == "RateLimitError: Rate limit exceeded" + + def test_non_sdk_error_drops_message(self) -> None: + """Arbitrary exception messages may contain user data — only the class name is recorded.""" + formatted = format_error_for_metadata(ValueError("something sensitive: sk-XYZ")) + assert "sk-XYZ" not in formatted + assert "something sensitive" not in formatted + assert "ValueError" in formatted + + def test_returns_string(self) -> None: + assert isinstance(format_error_for_metadata(EvaluatorError("x")), str) + assert isinstance(format_error_for_metadata(Exception("y")), str) diff --git a/sdks/python/tests/schemas/test_evaluator_schemas.py b/sdks/python/tests/schemas/test_evaluator_schemas.py index 433fa66..9837fcf 100644 --- a/sdks/python/tests/schemas/test_evaluator_schemas.py +++ b/sdks/python/tests/schemas/test_evaluator_schemas.py @@ -18,7 +18,7 @@ ) from learning_commons_evaluators.schemas.errors import ( ConfigurationError, - ValidationError, + InputValidationError, ) from learning_commons_evaluators.schemas.evaluator import ( EvaluationAnswer, @@ -103,18 +103,18 @@ def test_input_values_returns_primitive_values(self): def test_validate_raises_on_invalid_grade(self): inp = _ExampleEvaluationInput(text=_LONG_TEXT, grade=99) - with pytest.raises(ValidationError): + with pytest.raises(InputValidationError): inp.validate() def test_validate_raises_on_invalid_text_length(self): inp = _ExampleEvaluationInput(text="x", grade=5) - with pytest.raises(ValidationError): + with pytest.raises(InputValidationError): inp.validate() def test_validate_collects_all_errors_before_raising(self): - """All field errors are collected; a single ValidationError is raised at the end.""" + """All field errors are collected; a single InputValidationError is raised at the end.""" inp = _ExampleEvaluationInput(text="x", grade=99) - with pytest.raises(ValidationError) as exc_info: + with pytest.raises(InputValidationError) as exc_info: inp.validate() msg = str(exc_info.value) assert "below minimum" in msg From e442885b19cfdffb1785096977aadec68792dee8 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 17:24:36 -0700 Subject: [PATCH 2/8] chore: update error handling section in README --- sdks/python/README.md | 98 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 17 deletions(-) diff --git a/sdks/python/README.md b/sdks/python/README.md index 0b26e37..13a894f 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -302,32 +302,96 @@ config = create_config_no_telemetry(logger=create_silent_logger()) ## Error handling +Every exception the SDK raises inherits from `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions. + +### Hierarchy + +``` +EvaluatorError +├── ConfigurationError — bad config (missing provider, unknown model, malformed settings) +├── InputValidationError — caller-supplied input failed validation +└── APIError — failures originating in the LLM provider call + ├── AuthenticationError — 401 / 403 + ├── RateLimitError — 429; carries `retry_after` (seconds) + ├── NetworkError — connection refused, DNS failure, broken TLS + ├── RequestTimeoutError — request exceeded the configured timeout + └── OutputValidationError — LLM response failed to parse or didn't match the expected schema +``` + +`InputValidationError` is named that way deliberately to avoid collision with `pydantic.ValidationError`. `RequestTimeoutError` is named that way to avoid shadowing the builtin `TimeoutError`. There is **no** `ValidationError` or `EvaluatorTimeoutError` in the public API. + +### Knowing when to retry + +Every `EvaluatorError` exposes a boolean `retryable` attribute. This is the single signal callers should consult when wrapping `evaluate()` in retry logic — there is no separate marker class to check. Subclasses set sensible defaults: + +- Retryable by default: `RateLimitError`, `NetworkError`, `RequestTimeoutError`, `OutputValidationError`, and any `APIError` with a 5xx status code. +- Not retryable: `ConfigurationError`, `InputValidationError`, `AuthenticationError`, and `APIError` with a 4xx status code. + +`retryable` is also accepted as an `__init__` kwarg on `APIError` and `NetworkError` if you need to flag a specific instance differently (e.g. a permanently-bad hostname). + ```python -from learning_commons_evaluators import ( - ConfigurationError, # Missing/invalid config - ValidationError, # Invalid input - AuthenticationError, # Invalid API keys (401/403) - RateLimitError, # Rate limit exceeded (429) - has retry_after - NetworkError, # Network failures - EvaluatorTimeoutError, # Request timeout - APIError, # Other API errors -) +import time +from learning_commons_evaluators import EvaluatorError, RateLimitError + +for attempt in range(3): + try: + result = evaluator.evaluate_sync(input) + break + except EvaluatorError as e: + if not e.retryable or attempt == 2: + raise + delay = e.retry_after if isinstance(e, RateLimitError) and e.retry_after else 2 ** attempt + time.sleep(delay) # retry_after is in seconds +``` + +### Sanitization and debugging context + +Error **messages** (the value returned by `str(err)`) are short and controlled. Raw provider strings — which may contain prompt echoes, user text, or fragments of API keys — are **not** interpolated into the SDK exception's message. Structured detail lives on attributes instead: +- `status_code` on `APIError` — HTTP status from the provider, when one was returned. Populated from the provider exception's `.status_code` attribute when present (preferred over message regex). +- `retry_after` on `RateLimitError` — suggested delay before retry, **in seconds**, or `None` if the provider didn't return a `Retry-After` header. +- `provider` on `APIError` — the `LLMProvider` being called when the failure occurred. +- `model` on `APIError` — the model ID requested. +- `response_body` on `APIError` — decoded response body. Opt-in for debugging; may contain echoed prompt content, so treat as sensitive. +- `request_id` on `APIError` — provider request ID, useful for support escalation. +- `validation_errors` on `OutputValidationError` — per-field details from Pydantic's `errors()` API with raw input values stripped (safe for logs). + +The original provider exception is preserved on `__cause__` (via `raise … from e`), so debuggers, tracebacks, and `logging.exception()` retain full detail even though `str(err)` is sanitized. + +```python +import logging +from learning_commons_evaluators import APIError, OutputValidationError, RateLimitError + +log = logging.getLogger(__name__) try: result = evaluator.evaluate_sync(input) -except ConfigurationError as e: - print(f"Config issue: {e}") -except ValidationError as e: - print(f"Invalid input: {e}") except RateLimitError as e: - print(f"Rate limited, retry after {e.retry_after}ms") + time.sleep(e.retry_after or 30) # seconds +except OutputValidationError as e: + # Field-level errors are safe to log; raw LLM output is not in str(e). + log.warning("Bad LLM output: %s", e.validation_errors) + # Original pydantic.ValidationError / OutputParserException available as e.__cause__ except APIError as e: - print(f"API error (retryable={e.retryable}): {e}") + log.error( + "Provider call failed", + extra={ + "provider": e.provider, + "model": e.model, + "status": e.status_code, + "request_id": e.request_id, + }, + ) + raise ``` -Failures inside LLM prompt steps are passed through `wrap_provider_error()` (see `learning_commons_evaluators.schemas.errors`) so you typically see `APIError` subclasses rather than raw LangChain or HTTP client exceptions. Use `EvaluatorTimeoutError` for timeouts (the package does not export a `TimeoutError` alias, to avoid shadowing the Python builtin). +### Metadata and telemetry + +On evaluation failure, `evaluation_metadata.status` is set to `failed` and `evaluation_metadata.error_details` is populated before `evaluate()` / `evaluate_sync()` re-raises (no result object is returned). `error_details` is itself sanitized: + +- SDK errors record `"ClassName: "`. +- Any other exception that escapes records only `"Unexpected error: ClassName"` — the message is omitted because arbitrary `ValueError`/`AttributeError`/etc. messages may contain user data or field values that aren't safe for telemetry. -On evaluation failure, `metadata.status` and `error_details` are set on the in-memory metadata object for the run and appear on the evaluation end log line; `BaseEvaluator.evaluate` / `evaluate_sync` still re-raises and does not return a result object. +The same policy applies to per-step `StepMetadata.error_details`. Both fields are emitted on the evaluation end log line. ## Creating custom evaluators From 029b7913d5499f6696187a1c9af66619fa7ab6f2 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 17:25:50 -0700 Subject: [PATCH 3/8] style: linting fixes --- .../python/src/learning_commons_evaluators/schemas/errors.py | 5 +---- sdks/python/tests/schemas/test_errors.py | 4 +--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py index 545c1b9..cb87e09 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py @@ -235,10 +235,7 @@ def sanitize_pydantic_errors(errors: list[dict[str, Any]]) -> list[dict[str, Any remaining keys (``loc``, ``type``, ``msg``, ``ctx`` if present) describe the schema and the failure mode but not the offending content. """ - return [ - {key: value for key, value in error.items() if key != "input"} - for error in errors - ] + return [{key: value for key, value in error.items() if key != "input"} for error in errors] def format_error_for_metadata(error: BaseException) -> str: diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py index ce25d18..9f26c08 100644 --- a/sdks/python/tests/schemas/test_errors.py +++ b/sdks/python/tests/schemas/test_errors.py @@ -81,9 +81,7 @@ def test_api_error_subclasses_are_api_errors(self) -> None: assert isinstance(exc, APIError) def test_output_validation_error_carries_validation_errors(self) -> None: - details = [ - {"loc": ("vocabulary_complexity",), "type": "missing", "msg": "Field required"} - ] + details = [{"loc": ("vocabulary_complexity",), "type": "missing", "msg": "Field required"}] err = OutputValidationError(validation_errors=details) assert err.validation_errors == details # No input snippets leak into the SDK error's string form. From d255b284948dd642473bf75b2ee1e281d47173d0 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 17:37:07 -0700 Subject: [PATCH 4/8] feat: lint-fix make command --- sdks/python/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdks/python/Makefile b/sdks/python/Makefile index 72d8b4b..a29aaa9 100644 --- a/sdks/python/Makefile +++ b/sdks/python/Makefile @@ -14,7 +14,7 @@ SETTINGS_DST := src/learning_commons_evaluators/settings .PHONY: help build check-build test unit-test contract-test \ generate-settings check-generated sync-settings check-sync \ - lint format format-check typecheck pip-check verify coverage + lint lint-fix format format-check typecheck pip-check verify coverage help: @echo "Usage: make " @@ -23,6 +23,7 @@ help: @echo " check-build Verify build artifacts are up to date (use in CI)" @echo "" @echo " lint Ruff linter (src/, tests/, scripts/)" + @echo " lint-fix Ruff linter with --fix (safe auto-fixes)" @echo " format Apply Ruff formatter" @echo " format-check Fail if Ruff would reformat any file" @echo " typecheck Mypy on src package + tests" @@ -58,6 +59,9 @@ check-build: check-generated check-sync lint: $(RUFF) check src tests scripts +lint-fix: + $(RUFF) check --fix src tests scripts + format: $(RUFF) format src tests scripts From 4b963a05a71cdb9a2ffbbf3dc3c11807ab600e77 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 17:37:42 -0700 Subject: [PATCH 5/8] style: more linting --- sdks/python/tests/schemas/test_errors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py index 9f26c08..4445dbf 100644 --- a/sdks/python/tests/schemas/test_errors.py +++ b/sdks/python/tests/schemas/test_errors.py @@ -35,7 +35,6 @@ wrap_provider_error, ) - # --- Hierarchy --------------------------------------------------------------- From 59b0416f5655968f72d70f9c81763f1d61808c10 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 17:49:16 -0700 Subject: [PATCH 6/8] build: lint and type issues addressed --- .../learning_commons_evaluators/schemas/errors.py | 14 +++++++++++++- sdks/python/tests/evaluators/test_base.py | 5 ++++- sdks/python/tests/schemas/test_errors.py | 3 +++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py index cb87e09..5d60cc6 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py @@ -31,6 +31,7 @@ import asyncio import builtins import re +from collections.abc import Mapping, Sequence from typing import TYPE_CHECKING, Any if TYPE_CHECKING: @@ -170,6 +171,15 @@ class NetworkError(APIError): retryable = True + def __init__( + self, + message: str = "Network request failed", + *, + status_code: int | None = None, + **kwargs: Any, + ) -> None: + super().__init__(message, status_code=status_code, **kwargs) + class RequestTimeoutError(APIError): """The request to the provider exceeded the configured timeout. @@ -226,7 +236,9 @@ def __init__( self.validation_errors = validation_errors -def sanitize_pydantic_errors(errors: list[dict[str, Any]]) -> list[dict[str, Any]]: +def sanitize_pydantic_errors( + errors: Sequence[Mapping[str, Any]], +) -> list[dict[str, Any]]: """Strip raw input values from a list of :meth:`pydantic.ValidationError.errors` entries. Pydantic's per-error dicts include an ``input`` key containing the raw diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index cca0c93..4d07dea 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -684,7 +684,7 @@ async def test_wraps_unexpected_chain_failure_as_api_error( template = ChatPromptTemplate.from_messages([("human", "{input}")]) with ( patch(_CHAIN_PATCH, return_value=_ChainFailureChatModel()), - pytest.raises(APIError, match="simulated provider failure"), + pytest.raises(APIError, match="^API request failed$") as exc_info, ): await stub_evaluator.execute_prompt_chain_step( step_name="main", @@ -698,6 +698,9 @@ async def test_wraps_unexpected_chain_failure_as_api_error( chain_inputs={"input": "text"}, parser_output_type=_ChainOutput, ) + cause = exc_info.value.__cause__ + assert isinstance(cause, ValueError) + assert str(cause) == "simulated provider failure" async def test_malformed_llm_json_raises_output_validation_error( self, stub_evaluator, evaluation_metadata diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py index 4445dbf..6829206 100644 --- a/sdks/python/tests/schemas/test_errors.py +++ b/sdks/python/tests/schemas/test_errors.py @@ -278,6 +278,7 @@ def test_default_message_is_used_when_no_signal(self) -> None: def test_provider_and_model_threaded_through(self) -> None: err = _FakeProviderError("429", status_code=429) wrapped = wrap_provider_error(err, provider=LLMProvider.ANTHROPIC, model="claude-x") + assert isinstance(wrapped, APIError) assert wrapped.provider is LLMProvider.ANTHROPIC assert wrapped.model == "claude-x" @@ -291,6 +292,7 @@ def test_response_body_and_request_id_extracted(self) -> None: ), ) wrapped = wrap_provider_error(err) + assert isinstance(wrapped, APIError) assert wrapped.request_id == "req_xyz" assert wrapped.response_body == {"error": "rate_limited"} @@ -303,6 +305,7 @@ def test_body_attribute_preferred_over_response_json(self) -> None: response=_FakeResponse(body={"error": "from_response_json"}), ) wrapped = wrap_provider_error(err) + assert isinstance(wrapped, APIError) assert wrapped.response_body == {"error": "from_body_attr"} def test_original_exception_preserved_via_cause(self) -> None: From 86ac788136c1da66c60ccfe51940ec0b70068678 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Fri, 15 May 2026 18:37:31 -0700 Subject: [PATCH 7/8] chore: addressing PR comments --- sdks/python/README.md | 4 +- .../evaluators/base.py | 12 +- .../schemas/errors.py | 118 ++++++++++++--- sdks/python/tests/evaluators/test_base.py | 40 +++++ sdks/python/tests/schemas/test_errors.py | 140 ++++++++++++++++-- 5 files changed, 282 insertions(+), 32 deletions(-) diff --git a/sdks/python/README.md b/sdks/python/README.md index 13a894f..1565d6b 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -302,7 +302,7 @@ config = create_config_no_telemetry(logger=create_silent_logger()) ## Error handling -Every exception the SDK raises inherits from `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions. +Exceptions for evaluator failures, provider failures, and input validation inherit from `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions. A few documented misuse paths still raise standard Python exceptions — for example `ValueError` from `execute_prompt_chain_step` when `json_dict_normalizer` is set without `parser_output_type`, and `RuntimeError` from `evaluate_sync()` when an asyncio event loop is already running on the current thread. Those are programmer errors, not evaluation failures. ### Hierarchy @@ -360,6 +360,8 @@ The original provider exception is preserved on `__cause__` (via `raise … from ```python import logging +import time + from learning_commons_evaluators import APIError, OutputValidationError, RateLimitError log = logging.getLogger(__name__) diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index a28dbf5..04ddff6 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -344,7 +344,17 @@ async def _run_chain() -> BaseModel | str: loose = JsonOutputParser() parsed_dict = await loose.ainvoke(ai_message) if not isinstance(parsed_dict, dict): - parsed_dict = dict(parsed_dict) + # JSON parsed cleanly but the top-level value isn't an object + # (e.g. the LLM returned a JSON array or scalar). That's an + # output-shape failure, not a parse failure — surface it as + # OutputValidationError so callers can treat it consistently + # with schema-mismatch errors, and avoid the TypeError that + # ``dict(parsed_dict)`` would raise on a non-dict. + raise OutputValidationError( + "Model output is not a JSON object", + provider=prompt_settings.provider_type, + model=prompt_settings.model, + ) normalized = json_dict_normalizer(parsed_dict) return parser_output_type.model_validate(normalized) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py index 5d60cc6..cdcd751 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py @@ -38,6 +38,12 @@ from learning_commons_evaluators.schemas.config import LLMProvider +# Status codes at or above this threshold are treated as transient. Used both by +# the :class:`APIError` constructor's retryable auto-derivation and by +# :func:`wrap_provider_error` when classifying status-only responses. +_RETRYABLE_STATUS_MIN = 500 + + class EvaluatorError(Exception): """Base class for every error raised by this SDK. @@ -110,9 +116,18 @@ def __init__( ) -> None: super().__init__(message) self.status_code = status_code - # Honour subclass class-level default unless caller explicitly overrides. + # Resolve ``retryable`` in three layers, most specific first: + # 1. Explicit caller override wins. + # 2. 5xx status codes are transient by definition — flip to True so a + # direct ``APIError("...", status_code=500)`` doesn't silently end up + # non-retryable. + # 3. Otherwise inherit the subclass class-level default + # (False for APIError/AuthenticationError, True for RateLimitError / + # NetworkError / RequestTimeoutError / OutputValidationError). if retryable is not None: self.retryable = retryable + elif status_code is not None and status_code >= _RETRYABLE_STATUS_MIN: + self.retryable = True self.provider = provider self.model = model self.response_body = response_body @@ -236,18 +251,48 @@ def __init__( self.validation_errors = validation_errors +def _is_safe_ctx_value(value: Any) -> bool: + """True if a Pydantic ``ctx`` value is safe to expose in logs/telemetry. + + Pydantic puts constraint metadata in ``ctx`` (e.g. ``{"min_length": 3}``) + but custom validators can attach *anything* there, including exception + objects whose ``str()`` may echo the offending input. We restrict to + JSON-primitive values and tuples/lists of the same. + """ + if value is None or isinstance(value, (bool, int, float, str)): + return True + if isinstance(value, (tuple, list)): + return all(_is_safe_ctx_value(item) for item in value) + return False + + def sanitize_pydantic_errors( errors: Sequence[Mapping[str, Any]], ) -> list[dict[str, Any]]: - """Strip raw input values from a list of :meth:`pydantic.ValidationError.errors` entries. - - Pydantic's per-error dicts include an ``input`` key containing the raw - value that failed validation. For LLM outputs that value may echo prompt - content, so we drop it before the error reaches logs or telemetry. The - remaining keys (``loc``, ``type``, ``msg``, ``ctx`` if present) describe - the schema and the failure mode but not the offending content. + """Strip unsafe values from :meth:`pydantic.ValidationError.errors` entries. + + Pydantic's per-error dicts include an ``input`` key holding the raw value + that failed validation; for LLM outputs that value may echo prompt + content, so we always drop it. We also re-shape ``ctx`` because custom + validators can attach arbitrary objects there — even raising + ``ValueError(unsafe_user_text)`` inside a validator puts the resulting + exception (and its message) into ``ctx``. We keep only known-safe + primitive ``ctx`` values and omit ``ctx`` entirely if nothing safe + remains. The other keys (``loc``, ``type``, ``msg``, ``url``) describe + the schema and the failure mode and are passed through. """ - return [{key: value for key, value in error.items() if key != "input"} for error in errors] + sanitized: list[dict[str, Any]] = [] + for error in errors: + clean: dict[str, Any] = { + key: value for key, value in error.items() if key not in {"input", "ctx"} + } + ctx = error.get("ctx") + if isinstance(ctx, Mapping): + safe_ctx = {k: v for k, v in ctx.items() if _is_safe_ctx_value(v)} + if safe_ctx: + clean["ctx"] = safe_ctx + sanitized.append(clean) + return sanitized def format_error_for_metadata(error: BaseException) -> str: @@ -266,9 +311,6 @@ def format_error_for_metadata(error: BaseException) -> str: # --- Provider-error wrapping -------------------------------------------------- -# Status codes we consider transient even without a more specific subclass match. -_RETRYABLE_STATUS_MIN = 500 - # Exception class-name suffixes used as a soft signal when isinstance checks # against provider SDKs aren't possible (e.g. those SDKs aren't installed in # the caller's environment). The provider SDKs follow consistent naming. @@ -367,20 +409,52 @@ def _is_model_not_found(error: Exception, status_code: int | None) -> bool: return False -def _is_timeout(error: Exception) -> bool: - """Detect timeout errors by exception class regardless of which SDK raised.""" +# Narrow, Python-flavoured phrases used as a last-resort fallback when the +# exception surfaces no typed signal at all (e.g. a third party caught a +# socket/httpx error and re-raised as a generic Exception). Kept deliberately +# specific — the previous TS-port keyword list included bare "Connection" / +# "network", which false-positives on unrelated messages. +_TIMEOUT_MESSAGE_PATTERN = re.compile(r"\b(timed out|timeout)\b", re.IGNORECASE) +_NETWORK_MESSAGE_PATTERN = re.compile( + r"connection refused|connection reset|name resolution|could not resolve", + re.IGNORECASE, +) + + +def _is_timeout(error: Exception, status_code: int | None = None) -> bool: + """Detect timeout errors from status code, exception class, or message. + + HTTP 408 takes precedence — a provider that surfaces 408 should be treated + as a timeout regardless of the exception's class name. After that we check + typed signals (``asyncio.TimeoutError`` / builtin ``TimeoutError`` and + common SDK timeout class names), then fall back to a narrow message scan + for status-less exceptions wrapped by upstream code. + """ + if status_code == 408: + return True if isinstance(error, (asyncio.TimeoutError, builtins.TimeoutError)): return True name = type(error).__name__ - return any(hint in name for hint in _TIMEOUT_NAME_HINTS) + if any(hint in name for hint in _TIMEOUT_NAME_HINTS): + return True + return bool(_TIMEOUT_MESSAGE_PATTERN.search(str(error))) def _is_network(error: Exception) -> bool: - """Detect network-level failures by exception class.""" + """Detect network-level failures from exception class or message. + + The class check covers builtins (``ConnectionError`` and its subclasses + ``ConnectionRefusedError`` / ``ConnectionResetError`` / etc.) and the + common SDK class names (``APIConnectionError``, ``ConnectError``, …). + The message fallback handles cases where some third party caught a + socket/httpx failure and re-raised it as a plain ``Exception``. + """ if isinstance(error, ConnectionError): return True name = type(error).__name__ - return any(hint in name for hint in _NETWORK_NAME_HINTS) + if any(hint in name for hint in _NETWORK_NAME_HINTS): + return True + return bool(_NETWORK_MESSAGE_PATTERN.search(str(error))) def wrap_provider_error( @@ -394,8 +468,12 @@ def wrap_provider_error( Routing precedence (most specific first): - 1. Asyncio / builtin / SDK timeout exception types → :class:`RequestTimeoutError` - 2. Built-in / SDK connection-error exception types → :class:`NetworkError` + 1. Timeouts — HTTP 408, asyncio / builtin / SDK timeout exception types, + or a narrow "timed out / timeout" message pattern → :class:`RequestTimeoutError` + 2. Network failures — builtin ``ConnectionError`` (and subclasses), + SDK connection-error class names, or a narrow message pattern + ("connection refused / reset", "name resolution", "could not resolve") + → :class:`NetworkError` 3. HTTP 404 (and model-shaped 400) → :class:`ConfigurationError` (model not found) 4. HTTP 401 / 403 → :class:`AuthenticationError` 5. HTTP 429 → :class:`RateLimitError` @@ -434,7 +512,7 @@ def wrap_provider_error( "request_id": request_id, } - if _is_timeout(error): + if _is_timeout(error, status_code): return RequestTimeoutError(status_code=status_code, **common) if _is_network(error): diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index 4d07dea..87eb91e 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -781,3 +781,43 @@ async def test_schema_mismatch_raises_output_validation_error( assert "input" not in entry assert "loc" in entry assert "type" in entry + + async def test_non_dict_json_in_normalizer_path_raises_output_validation_error( + self, stub_evaluator, evaluation_metadata + ): + """A JSON array (or any non-object) on the ``json_dict_normalizer`` path becomes OutputValidationError. + + Previously the code did ``dict(parsed_dict)`` after the JsonOutputParser, + which raises ``TypeError`` on a list and falls through to ``wrap_provider_error`` + as a generic ``APIError`` — making the failure indistinguishable from a + provider HTTP error. We now surface it as ``OutputValidationError`` so + callers can catch malformed model output consistently. + """ + + def passthrough(d: dict) -> dict: + return d + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch( + _CHAIN_PATCH, + return_value=_fake_chat_model(AIMessage(content='["not", "an", "object"]')), + ), + pytest.raises(OutputValidationError) as exc_info, + ): + await stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LLMProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "text"}, + parser_output_type=_ChainOutput, + json_dict_normalizer=passthrough, + ) + assert "JSON object" in str(exc_info.value) + assert exc_info.value.provider is LLMProvider.GOOGLE + assert exc_info.value.model == "gemini-2.0-flash" diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py index 6829206..0f92d53 100644 --- a/sdks/python/tests/schemas/test_errors.py +++ b/sdks/python/tests/schemas/test_errors.py @@ -123,6 +123,36 @@ def test_authentication_error_default_message_is_generic(self) -> None: err = AuthenticationError(status_code=401) assert str(err) == "Authentication failed" + def test_api_error_auto_retryable_for_5xx_when_unspecified(self) -> None: + """5xx status codes flip ``retryable`` to True without an explicit argument. + + Direct ``APIError(..., status_code=500)`` construction (outside of + ``wrap_provider_error``) should agree with the documented behavior that + transient server errors are retryable. Explicit ``retryable=False`` + always wins. + """ + assert APIError("server error", status_code=500).retryable is True + assert APIError("bad gateway", status_code=502).retryable is True + assert APIError("gateway timeout", status_code=504).retryable is True + # 4xx still inherits the class default (False). + assert APIError("not found", status_code=404).retryable is False + # No status code at all — fall back to class default. + assert APIError("opaque failure").retryable is False + # Explicit override beats auto-derive. + assert APIError("server", status_code=500, retryable=False).retryable is False + + def test_network_error_constructs_without_args(self) -> None: + """Regression: NetworkError must accept zero positional args. + + ``wrap_provider_error`` calls ``NetworkError(**common)`` with only kwargs, + so the class needs its own default message — otherwise the previously-broken + version would raise ``TypeError`` instead of returning a NetworkError. + """ + err = NetworkError() + assert isinstance(err, APIError) + assert err.retryable is True + assert str(err) # non-empty default message + # --- Helpers for fake provider exceptions ------------------------------------ @@ -252,6 +282,51 @@ class APITimeoutError(Exception): wrapped = wrap_provider_error(APITimeoutError("timed out")) assert isinstance(wrapped, RequestTimeoutError) + def test_408_routes_to_timeout_not_generic_api_error(self) -> None: + """HTTP 408 is a timeout per RFC 9110 and should route to RequestTimeoutError. + + Without explicit handling it would fall through to a non-retryable APIError + (4xx status, below the 5xx-retry threshold), which is wrong — 408 is the + canonical "Request Timeout" status and is documented as retryable. + """ + err = _FakeProviderError("Request Timeout", status_code=408) + wrapped = wrap_provider_error(err) + assert isinstance(wrapped, RequestTimeoutError) + assert wrapped.status_code == 408 + assert wrapped.retryable is True + + def test_message_only_timeout_routes_to_timeout(self) -> None: + """A status-less, untyped exception with 'timed out' in the message still routes correctly. + + Covers the case where an upstream library wraps a socket/httpx timeout + into a plain Exception. The narrow message-pattern fallback catches it. + """ + wrapped = wrap_provider_error(Exception("request to provider timed out after 30s")) + assert isinstance(wrapped, RequestTimeoutError) + assert wrapped.retryable is True + + def test_message_only_connection_refused_routes_to_network(self) -> None: + wrapped = wrap_provider_error(Exception("[Errno 111] Connection refused")) + assert isinstance(wrapped, NetworkError) + assert wrapped.retryable is True + + def test_message_only_dns_failure_routes_to_network(self) -> None: + wrapped = wrap_provider_error(Exception("Could not resolve host: api.example.com")) + assert isinstance(wrapped, NetworkError) + + def test_message_fallback_does_not_false_positive_on_bare_network_word(self) -> None: + """The narrow pattern intentionally rejects the bare word 'network'. + + The old TS-port keyword list matched any occurrence of 'network', which + caused false positives on messages like 'check your network adapter + settings'. The bounded pattern only accepts specific phrases. + """ + wrapped = wrap_provider_error(Exception("please configure your network adapter")) + # Falls through to generic, non-retryable APIError instead. + assert isinstance(wrapped, APIError) + assert not isinstance(wrapped, NetworkError) + assert wrapped.retryable is False + def test_500_returns_retryable_api_error(self) -> None: err = _FakeProviderError("internal server error", status_code=500) wrapped = wrap_provider_error(err) @@ -331,27 +406,72 @@ def test_message_regex_fallback_when_no_status_code_attr(self) -> None: class TestSanitizePydanticErrors: - def test_strips_input_key_only(self) -> None: - """Drop the raw ``input`` value but keep loc/type/msg/ctx for debugging.""" + def test_strips_input_key(self) -> None: + """The raw ``input`` value is always dropped; safe primitives in ``ctx`` pass through.""" raw = [ { "loc": ("vocabulary_complexity",), - "type": "missing", - "msg": "Field required", + "type": "string_too_short", + "msg": "String should have at least 3 characters", "input": {"label": "only"}, # potentially echoes LLM output - "ctx": {"some": "context"}, + "ctx": {"min_length": 3}, } ] - sanitized = sanitize_pydantic_errors(raw) - assert sanitized == [ + assert sanitize_pydantic_errors(raw) == [ { "loc": ("vocabulary_complexity",), - "type": "missing", - "msg": "Field required", - "ctx": {"some": "context"}, + "type": "string_too_short", + "msg": "String should have at least 3 characters", + "ctx": {"min_length": 3}, } ] + def test_drops_unsafe_ctx_values(self) -> None: + """Custom validators can put arbitrary objects into ``ctx`` — drop them. + + Pydantic propagates a custom ``ValueError`` into ``ctx["error"]`` when a + ``model_validator`` raises one, and the exception's ``str()`` may include + the offending input. Anything that isn't a JSON primitive (or a tuple/list + thereof) is filtered out, and ``ctx`` is omitted entirely if nothing safe + remains. + """ + unsafe_exception = ValueError("rejected text: ") + raw: list[dict[str, Any]] = [ + { + "loc": ("answer",), + "type": "value_error", + "msg": "Value error, rejected text: ", + "input": "anything", + "ctx": {"error": unsafe_exception}, # only key — must be dropped entirely + }, + { + "loc": ("score",), + "type": "less_than_equal", + "msg": "Input should be less than or equal to 10", + "input": 42, + "ctx": { + "le": 10, # primitive — keep + "details": unsafe_exception, # unsafe — drop + "tags": ["one", "two"], # list of primitives — keep + "nested": {"deep": 1}, # mapping — drop (not in allowlist) + }, + }, + ] + sanitized = sanitize_pydantic_errors(raw) + # First entry: ctx had only an unsafe value, so the whole ctx key is gone. + assert sanitized[0] == { + "loc": ("answer",), + "type": "value_error", + "msg": "Value error, rejected text: ", + } + # Second entry: kept primitives and a tuple/list of primitives, dropped the rest. + assert sanitized[1] == { + "loc": ("score",), + "type": "less_than_equal", + "msg": "Input should be less than or equal to 10", + "ctx": {"le": 10, "tags": ["one", "two"]}, + } + def test_handles_empty_list(self) -> None: assert sanitize_pydantic_errors([]) == [] From 14500716cf436f46790ac2e9a3f804c5c8ecc493 Mon Sep 17 00:00:00 2001 From: Fredrick Sisenda Date: Mon, 18 May 2026 19:26:00 -0600 Subject: [PATCH 8/8] chore: more PR comment fixes --- sdks/python/README.md | 6 +- .../evaluators/base.py | 18 +++-- .../schemas/errors.py | 73 +++++++++++++------ sdks/python/tests/evaluators/test_base.py | 36 +++++++++ sdks/python/tests/schemas/test_errors.py | 61 ++++++++++++++-- 5 files changed, 156 insertions(+), 38 deletions(-) diff --git a/sdks/python/README.md b/sdks/python/README.md index 1565d6b..f39a012 100644 --- a/sdks/python/README.md +++ b/sdks/python/README.md @@ -302,7 +302,7 @@ config = create_config_no_telemetry(logger=create_silent_logger()) ## Error handling -Exceptions for evaluator failures, provider failures, and input validation inherit from `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions. A few documented misuse paths still raise standard Python exceptions — for example `ValueError` from `execute_prompt_chain_step` when `json_dict_normalizer` is set without `parser_output_type`, and `RuntimeError` from `evaluate_sync()` when an asyncio event loop is already running on the current thread. Those are programmer errors, not evaluation failures. +During a normal `evaluate()` / `evaluate_sync()` run, failures from evaluator input checks, configuration, LLM prompt steps, and output validation typically surface as subclasses of `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions. A few documented paths still raise standard Python exceptions — for example `ValueError` from `execute_prompt_chain_step` when `json_dict_normalizer` is set without `parser_output_type`, and `RuntimeError` from `evaluate_sync()` when an asyncio event loop is already running on the current thread. Those are programmer errors, not evaluation failures. ### Hierarchy @@ -354,7 +354,7 @@ Error **messages** (the value returned by `str(err)`) are short and controlled. - `model` on `APIError` — the model ID requested. - `response_body` on `APIError` — decoded response body. Opt-in for debugging; may contain echoed prompt content, so treat as sensitive. - `request_id` on `APIError` — provider request ID, useful for support escalation. -- `validation_errors` on `OutputValidationError` — per-field details from Pydantic's `errors()` API with raw input values stripped (safe for logs). +- `validation_errors` on `OutputValidationError` — per-field entries from Pydantic's `errors()` API after `sanitize_pydantic_errors` (`loc`, `type`, optional `url`, and safe primitive `ctx` only — no `input` or `msg`, which can echo model output). The original provider exception is preserved on `__cause__` (via `raise … from e`), so debuggers, tracebacks, and `logging.exception()` retain full detail even though `str(err)` is sanitized. @@ -370,7 +370,7 @@ try: except RateLimitError as e: time.sleep(e.retry_after or 30) # seconds except OutputValidationError as e: - # Field-level errors are safe to log; raw LLM output is not in str(e). + # Structured entries omit Pydantic msg/input (may echo LLM text); use __cause__ for full detail. log.warning("Bad LLM output: %s", e.validation_errors) # Original pydantic.ValidationError / OutputParserException available as e.__cause__ except APIError as e: diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 04ddff6..9954c98 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -315,10 +315,11 @@ async def execute_prompt_chain_step( Raises: ConfigurationError: No provider config for ``prompt_settings.provider_type``. OutputValidationError: The LLM response didn't satisfy the expected - output schema — either because it wasn't valid JSON (LangChain - ``OutputParserException``) or because the JSON didn't match the - Pydantic model (``pydantic.ValidationError``). The original - exception is reachable via ``__cause__``. + output schema — invalid JSON (LangChain ``OutputParserException``), + JSON that didn't match the Pydantic model (``pydantic.ValidationError``), + a non-object JSON value when using ``json_dict_normalizer``, or + ``TypeError`` / ``ValueError`` from ``json_dict_normalizer`` itself. + The original exception is reachable via ``__cause__``. APIError (or other ``EvaluatorError`` subclasses): :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures from the LLM provider. @@ -355,7 +356,14 @@ async def _run_chain() -> BaseModel | str: provider=prompt_settings.provider_type, model=prompt_settings.model, ) - normalized = json_dict_normalizer(parsed_dict) + try: + normalized = json_dict_normalizer(parsed_dict) + except (TypeError, ValueError) as norm_err: + raise OutputValidationError( + "Model output could not be normalized before validation", + provider=prompt_settings.provider_type, + model=prompt_settings.model, + ) from norm_err return parser_output_type.model_validate(normalized) parser = JsonOutputParser(pydantic_object=parser_output_type) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py index cdcd751..bfef160 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py @@ -22,8 +22,9 @@ ``model``) are exposed as attributes for callers that want to introspect. * :func:`wrap_provider_error` prefers structured attributes (``.status_code``, - ``.response``) from provider SDK exceptions over regex on message strings. - String parsing is the fallback when nothing structured is available. + ``.response``) from provider SDK exceptions. Bounded message patterns are used + only when no HTTP status was extracted — so a 4xx/5xx response is not + reclassified from message wording alone. """ from __future__ import annotations @@ -128,6 +129,8 @@ def __init__( self.retryable = retryable elif status_code is not None and status_code >= _RETRYABLE_STATUS_MIN: self.retryable = True + else: + self.retryable = type(self).retryable self.provider = provider self.model = model self.response_body = response_body @@ -230,8 +233,9 @@ class OutputValidationError(APIError): The original :class:`pydantic.ValidationError` is preserved on ``__cause__`` when raised via ``raise … from e``. The :attr:`validation_errors` - attribute exposes the field-level details from Pydantic's ``errors()`` API - with raw input values stripped — safe for inclusion in logs and telemetry. + list is produced by :func:`sanitize_pydantic_errors` — ``loc`` / ``type`` / + ``url`` only, so entries are safe for logs and telemetry without echoing raw + model text (see that helper for what is stripped). """ retryable = True @@ -244,10 +248,10 @@ def __init__( **kwargs: Any, ) -> None: super().__init__(message, **kwargs) - #: Per-field validation failures. Each entry has ``loc``, ``type``, - #: and ``msg`` keys (from :meth:`pydantic.ValidationError.errors`). - #: Raw input values are deliberately omitted to avoid leaking LLM - #: output (which may echo user prompts) into telemetry. + #: Per-field validation failures. Each entry carries at least ``loc`` + #: and ``type`` (from :meth:`pydantic.ValidationError.errors`). The + #: ``msg`` string is omitted because custom validators often embed raw + #: model output there; use :attr:`__cause__` for full Pydantic detail. self.validation_errors = validation_errors @@ -278,13 +282,15 @@ def sanitize_pydantic_errors( ``ValueError(unsafe_user_text)`` inside a validator puts the resulting exception (and its message) into ``ctx``. We keep only known-safe primitive ``ctx`` values and omit ``ctx`` entirely if nothing safe - remains. The other keys (``loc``, ``type``, ``msg``, ``url``) describe - the schema and the failure mode and are passed through. + remains. The ``msg`` field is dropped as well: custom validators often + interpolate rejected values into the message. ``loc``, ``type``, and + ``url`` (when present) are retained as schema-oriented, non-echoing hints; + full detail stays on the original :exc:`pydantic.ValidationError`. """ sanitized: list[dict[str, Any]] = [] for error in errors: clean: dict[str, Any] = { - key: value for key, value in error.items() if key not in {"input", "ctx"} + key: value for key, value in error.items() if key not in {"input", "ctx", "msg"} } ctx = error.get("ctx") if isinstance(ctx, Mapping): @@ -322,15 +328,21 @@ def _provider_status_code(error: Exception) -> int | None: """Best-effort extraction of an HTTP status code from a provider exception. Prefers the structured attribute that openai-python and anthropic-python - both expose (``.status_code``). Falls back to ``.status`` (used by some - httpx-based clients) and finally to a regex on the message — narrowed to - HTTP-shaped 4xx/5xx codes near the start of the message to reduce false - positives like model names or token counts. + both expose (``.status_code`` on the exception), then ``.response.status_code`` + (e.g. ``httpx.HTTPStatusError``), then ``.status`` on either object, and + finally a regex on the message — narrowed to HTTP-shaped 4xx/5xx codes near + the start of the message to reduce false positives like model names or token counts. """ for attr in ("status_code", "status"): value = getattr(error, attr, None) if isinstance(value, int): return value + response = getattr(error, "response", None) + if response is not None: + for attr in ("status_code", "status"): + value = getattr(response, attr, None) + if isinstance(value, int): + return value match = re.search(r"\b(4\d{2}|5\d{2})\b", str(error)[:80]) if match: return int(match.group(1)) @@ -352,7 +364,10 @@ def _provider_response_body(error: Exception) -> Any | None: return json_fn() except Exception: # noqa: BLE001 — body is best-effort debug data pass - return getattr(response, "text", None) + try: + return getattr(response, "text", None) + except Exception: # noqa: BLE001 — unread / streaming bodies can raise on read + return None def _provider_request_id(error: Exception) -> str | None: @@ -428,7 +443,8 @@ def _is_timeout(error: Exception, status_code: int | None = None) -> bool: as a timeout regardless of the exception's class name. After that we check typed signals (``asyncio.TimeoutError`` / builtin ``TimeoutError`` and common SDK timeout class names), then fall back to a narrow message scan - for status-less exceptions wrapped by upstream code. + only when **no** HTTP status was extracted — so a 400/401 whose body + mentions "timeout" is not misclassified as a transport timeout. """ if status_code == 408: return True @@ -437,24 +453,31 @@ def _is_timeout(error: Exception, status_code: int | None = None) -> bool: name = type(error).__name__ if any(hint in name for hint in _TIMEOUT_NAME_HINTS): return True - return bool(_TIMEOUT_MESSAGE_PATTERN.search(str(error))) + if status_code is None: + return bool(_TIMEOUT_MESSAGE_PATTERN.search(str(error))) + return False -def _is_network(error: Exception) -> bool: +def _is_network(error: Exception, status_code: int | None = None) -> bool: """Detect network-level failures from exception class or message. The class check covers builtins (``ConnectionError`` and its subclasses ``ConnectionRefusedError`` / ``ConnectionResetError`` / etc.) and the common SDK class names (``APIConnectionError``, ``ConnectError``, …). The message fallback handles cases where some third party caught a - socket/httpx failure and re-raised it as a plain ``Exception``. + socket/httpx failure and re-raised it as a plain ``Exception``. The message + scan runs only when no HTTP status was extracted (same rule as + ``_is_timeout``), so a 404 whose JSON mentions "could not resolve" does not + become a retryable :class:`NetworkError`. """ if isinstance(error, ConnectionError): return True name = type(error).__name__ if any(hint in name for hint in _NETWORK_NAME_HINTS): return True - return bool(_NETWORK_MESSAGE_PATTERN.search(str(error))) + if status_code is None: + return bool(_NETWORK_MESSAGE_PATTERN.search(str(error))) + return False def wrap_provider_error( @@ -469,9 +492,11 @@ def wrap_provider_error( Routing precedence (most specific first): 1. Timeouts — HTTP 408, asyncio / builtin / SDK timeout exception types, - or a narrow "timed out / timeout" message pattern → :class:`RequestTimeoutError` + or (only when no HTTP status was extracted) a narrow "timed out / timeout" + message pattern → :class:`RequestTimeoutError` 2. Network failures — builtin ``ConnectionError`` (and subclasses), - SDK connection-error class names, or a narrow message pattern + SDK connection-error class names, or (only when no HTTP status was + extracted) a narrow message pattern ("connection refused / reset", "name resolution", "could not resolve") → :class:`NetworkError` 3. HTTP 404 (and model-shaped 400) → :class:`ConfigurationError` (model not found) @@ -515,7 +540,7 @@ def wrap_provider_error( if _is_timeout(error, status_code): return RequestTimeoutError(status_code=status_code, **common) - if _is_network(error): + if _is_network(error, status_code): return NetworkError(**common) if _is_model_not_found(error, status_code): diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index 87eb91e..6c0e762 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -494,6 +494,41 @@ def _double(d: dict) -> dict: assert result.n == 1 assert result.doubled == 2 + async def test_json_dict_normalizer_raises_valueerror_wraps_as_output_validation( + self, stub_evaluator, evaluation_metadata + ) -> None: + """User ``json_dict_normalizer`` failures become ``OutputValidationError``, not ``APIError``.""" + + class _Out(BaseModel): + n: int = Field(description="n") + + def _bad(_d: dict) -> dict: + raise ValueError("cannot normalize") + + template = ChatPromptTemplate.from_messages([("human", "{input}")]) + with ( + patch( + _CHAIN_PATCH, + return_value=_fake_chat_model(AIMessage(content='{"n": 1}')), + ), + pytest.raises(OutputValidationError) as exc_info, + ): + await stub_evaluator.execute_prompt_chain_step( + step_name="main", + prompt_settings=PromptSettings( + provider_type=LLMProvider.GOOGLE, + model="gemini-2.0-flash", + temperature=0.0, + ), + evaluation_metadata=evaluation_metadata, + template=template, + chain_inputs={"input": "Hello"}, + parser_output_type=_Out, + json_dict_normalizer=_bad, + ) + assert str(exc_info.value) == "Model output could not be normalized before validation" + assert isinstance(exc_info.value.__cause__, ValueError) + async def test_parser_returning_model_instance_short_circuits_model_validate( self, stub_evaluator, evaluation_metadata ): @@ -779,6 +814,7 @@ async def test_schema_mismatch_raises_output_validation_error( assert len(exc_info.value.validation_errors) > 0 for entry in exc_info.value.validation_errors: assert "input" not in entry + assert "msg" not in entry assert "loc" in entry assert "type" in entry diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py index 0f92d53..2cf7480 100644 --- a/sdks/python/tests/schemas/test_errors.py +++ b/sdks/python/tests/schemas/test_errors.py @@ -182,7 +182,7 @@ def __init__( message: str, *, status_code: int | None = None, - response: _FakeResponse | None = None, + response: Any = None, body: Any | None = None, ) -> None: super().__init__(message) @@ -204,6 +204,38 @@ def test_status_code_read_from_attribute_not_message(self) -> None: assert isinstance(wrapped, RateLimitError) assert wrapped.status_code == 429 + def test_status_code_read_from_response_when_missing_on_exception(self) -> None: + """``httpx.HTTPStatusError``-style objects keep the code on ``response``.""" + + class _Resp: + status_code = 401 + + class _HttpxLike(Exception): + def __init__(self, message: str) -> None: + super().__init__(message) + self.response = _Resp() + + wrapped = wrap_provider_error(_HttpxLike("Request failed")) + assert isinstance(wrapped, AuthenticationError) + assert wrapped.status_code == 401 + + def test_wrap_succeeds_when_response_text_read_raises(self) -> None: + """Best-effort body extraction must not mask the original provider error.""" + + class _UnreadableTextResponse: + headers = _FakeHeaders({}) + + @property + def text(self) -> str: + raise OSError("body not available") + + err = _FakeProviderError("failed", status_code=500, response=_UnreadableTextResponse()) + wrapped = wrap_provider_error(err) + assert isinstance(wrapped, APIError) + assert wrapped.status_code == 500 + assert wrapped.retryable is True + assert wrapped.response_body is None + def test_404_routes_to_configuration_error_with_model_in_message(self) -> None: """Model-not-found is a developer mistake, not an API error.""" err = _FakeProviderError("not found", status_code=404) @@ -305,6 +337,26 @@ def test_message_only_timeout_routes_to_timeout(self) -> None: assert isinstance(wrapped, RequestTimeoutError) assert wrapped.retryable is True + def test_structured_4xx_wins_over_timeout_message_pattern(self) -> None: + """When ``status_code`` is present, message wording must not re-route to timeout.""" + err = _FakeProviderError("invalid request timeout parameter", status_code=400) + wrapped = wrap_provider_error(err) + assert isinstance(wrapped, APIError) + assert wrapped.status_code == 400 + assert wrapped.retryable is False + assert not isinstance(wrapped, RequestTimeoutError) + + def test_structured_status_wins_over_network_message_pattern(self) -> None: + """When ``status_code`` is present, DNS-ish wording must not yield ``NetworkError``.""" + err = _FakeProviderError( + "Unauthorized: could not resolve upstream identity provider", + status_code=401, + ) + wrapped = wrap_provider_error(err) + assert isinstance(wrapped, AuthenticationError) + assert wrapped.status_code == 401 + assert not isinstance(wrapped, NetworkError) + def test_message_only_connection_refused_routes_to_network(self) -> None: wrapped = wrap_provider_error(Exception("[Errno 111] Connection refused")) assert isinstance(wrapped, NetworkError) @@ -407,7 +459,7 @@ def test_message_regex_fallback_when_no_status_code_attr(self) -> None: class TestSanitizePydanticErrors: def test_strips_input_key(self) -> None: - """The raw ``input`` value is always dropped; safe primitives in ``ctx`` pass through.""" + """The raw ``input`` value is always dropped; ``msg`` is dropped; safe ``ctx`` passes through.""" raw = [ { "loc": ("vocabulary_complexity",), @@ -421,7 +473,6 @@ def test_strips_input_key(self) -> None: { "loc": ("vocabulary_complexity",), "type": "string_too_short", - "msg": "String should have at least 3 characters", "ctx": {"min_length": 3}, } ] @@ -433,7 +484,7 @@ def test_drops_unsafe_ctx_values(self) -> None: ``model_validator`` raises one, and the exception's ``str()`` may include the offending input. Anything that isn't a JSON primitive (or a tuple/list thereof) is filtered out, and ``ctx`` is omitted entirely if nothing safe - remains. + remains. ``msg`` is always stripped because it can echo the same content. """ unsafe_exception = ValueError("rejected text: ") raw: list[dict[str, Any]] = [ @@ -462,13 +513,11 @@ def test_drops_unsafe_ctx_values(self) -> None: assert sanitized[0] == { "loc": ("answer",), "type": "value_error", - "msg": "Value error, rejected text: ", } # Second entry: kept primitives and a tuple/list of primitives, dropped the rest. assert sanitized[1] == { "loc": ("score",), "type": "less_than_equal", - "msg": "Input should be less than or equal to 10", "ctx": {"le": 10, "tags": ["one", "two"]}, }