From 28437e8e298f3a393dbc679e463df7e3ab547b05 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 17:12:53 -0700
Subject: [PATCH 1/8] feat(python-sdk): error handling improvements

---
 .../learning_commons_evaluators/__init__.py   |  12 +-
 .../src/learning_commons_evaluators/errors.py |  16 +-
 .../evaluators/base.py                        |  45 +-
 .../schemas/__init__.py                       |   4 +-
 .../schemas/common_inputs.py                  |  16 +-
 .../schemas/errors.py                         | 518 ++++++++++++++----
 .../schemas/evaluator.py                      |  16 +-
 sdks/python/tests/evaluators/test_base.py     |  91 ++-
 .../tests/evaluators/test_vocabulary.py       |   8 +-
 .../tests/schemas/test_common_inputs.py       |  14 +-
 sdks/python/tests/schemas/test_errors.py      | 386 ++++++++++---
 .../tests/schemas/test_evaluator_schemas.py   |  10 +-
 12 files changed, 902 insertions(+), 234 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/__init__.py b/sdks/python/src/learning_commons_evaluators/__init__.py
index cccf241..5a0fbba 100644
--- a/sdks/python/src/learning_commons_evaluators/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/__init__.py
@@ -22,11 +22,11 @@
     AuthenticationError,
     ConfigurationError,
     EvaluatorError,
-    EvaluatorRetryableError,
-    EvaluatorTimeoutError,
+    InputValidationError,
     NetworkError,
+    OutputValidationError,
     RateLimitError,
-    ValidationError,
+    RequestTimeoutError,
     wrap_provider_error,
 )
 
@@ -113,8 +113,6 @@
     "EvaluatorError",
     "EvaluatorMaturity",
     "EvaluatorMetadata",
-    "EvaluatorRetryableError",
-    "EvaluatorTimeoutError",
     "GoogleLLMProviderConfig",
     "AnyInputSpec",
     "GradeInputField",
@@ -122,6 +120,7 @@
     "InputField",
     "InputSpec",
     "InputT",
+    "InputValidationError",
     "TextInputSpec",
     "LLMProvider",
     "LLMProviderConfig",
@@ -129,15 +128,16 @@
     "NetworkError",
     "OpenAILLMProviderConfig",
     "OutputT",
+    "OutputValidationError",
     "PromptSettings",
     "RateLimitError",
+    "RequestTimeoutError",
     "SDK_LOGGER_NAME",
     "Status",
     "TelemetryConfig",
     "TextComplexityEvaluationInput",
     "TextInputField",
     "TokenUsage",
-    "ValidationError",
     "VocabularyEvaluationInput",
     "VocabularyEvaluationSettings",
     "VocabularyEvaluator",
diff --git a/sdks/python/src/learning_commons_evaluators/errors.py b/sdks/python/src/learning_commons_evaluators/errors.py
index 648fe06..6d952e4 100644
--- a/sdks/python/src/learning_commons_evaluators/errors.py
+++ b/sdks/python/src/learning_commons_evaluators/errors.py
@@ -5,11 +5,13 @@
     AuthenticationError,
     ConfigurationError,
     EvaluatorError,
-    EvaluatorRetryableError,
-    EvaluatorTimeoutError,
+    InputValidationError,
     NetworkError,
+    OutputValidationError,
     RateLimitError,
-    ValidationError,
+    RequestTimeoutError,
+    format_error_for_metadata,
+    sanitize_pydantic_errors,
     wrap_provider_error,
 )
 
@@ -18,10 +20,12 @@
     "AuthenticationError",
     "ConfigurationError",
     "EvaluatorError",
-    "EvaluatorRetryableError",
-    "EvaluatorTimeoutError",
+    "InputValidationError",
     "NetworkError",
+    "OutputValidationError",
     "RateLimitError",
-    "ValidationError",
+    "RequestTimeoutError",
+    "format_error_for_metadata",
+    "sanitize_pydantic_errors",
     "wrap_provider_error",
 ]
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 2ec69c0..a28dbf5 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -8,6 +8,7 @@
 from collections.abc import Awaitable, Callable
 from typing import Any, Generic, TypeVar, overload
 
+from langchain_core.exceptions import OutputParserException
 from pydantic import BaseModel
 from pydantic import ValidationError as PydanticValidationError
 
@@ -22,6 +23,9 @@
 )
 from learning_commons_evaluators.schemas.errors import (
     EvaluatorError,
+    OutputValidationError,
+    format_error_for_metadata,
+    sanitize_pydantic_errors,
     wrap_provider_error,
 )
 from learning_commons_evaluators.schemas.evaluator import (
@@ -105,7 +109,7 @@ async def evaluate(
             on success.
 
         Raises:
-            ValidationError: Input fails validation.
+            InputValidationError: Input fails validation.
             ConfigurationError: No provider config for the required LLM provider.
             APIError (or subclasses): The LLM API call failed.
 
@@ -133,7 +137,7 @@ async def evaluate(
             return result
         except Exception as e:
             evaluation_metadata.status = Status.failed
-            evaluation_metadata.error_details = str(e)
+            evaluation_metadata.error_details = format_error_for_metadata(e)
             raise
         finally:
             evaluation_metadata.processing_time_ms = (time.perf_counter() - start) * 1000
@@ -236,7 +240,7 @@ async def execute_step(
             return result
         except Exception as e:
             step_metadata.status = Status.failed
-            step_metadata.error_details = str(e)
+            step_metadata.error_details = format_error_for_metadata(e)
             raise
         finally:
             step_metadata.processing_time_ms = (time.perf_counter() - start) * 1000
@@ -310,7 +314,14 @@ async def execute_prompt_chain_step(
 
         Raises:
             ConfigurationError: No provider config for ``prompt_settings.provider_type``.
-            EvaluatorError: SDK errors, including :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error` output for LangChain or HTTP failures (typically :class:`~learning_commons_evaluators.schemas.errors.APIError` subclasses). Pydantic :exc:`pydantic.ValidationError` from output parsing is re-raised unchanged.
+            OutputValidationError: The LLM response didn't satisfy the expected
+                output schema — either because it wasn't valid JSON (LangChain
+                ``OutputParserException``) or because the JSON didn't match the
+                Pydantic model (``pydantic.ValidationError``). The original
+                exception is reachable via ``__cause__``.
+            APIError (or other ``EvaluatorError`` subclasses):
+                :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error`
+                output for LangChain or HTTP failures from the LLM provider.
             ValueError: If ``json_dict_normalizer`` is set but ``parser_output_type`` is omitted.
         """
         if json_dict_normalizer is not None and parser_output_type is None:
@@ -344,12 +355,32 @@ async def _run_chain() -> BaseModel | str:
                 return parser_output_type.model_validate(raw)
             except EvaluatorError:
                 raise
-            except PydanticValidationError:
-                raise
+            except (PydanticValidationError, OutputParserException) as e:
+                # The provider returned a response that didn't match the expected schema.
+                # This covers both Pydantic schema-validation failures and LangChain
+                # JSON-parse failures (``OutputParserException``). Wrap so callers can
+                # discriminate output-parse failures from other API errors and so the
+                # message that lands in ``EvaluationMetadata.error_details`` stays
+                # sanitized — the original error (which may include LLM output snippets)
+                # is reachable via ``__cause__`` for debugging.
+                validation_errors = (
+                    sanitize_pydantic_errors(e.errors())
+                    if isinstance(e, PydanticValidationError)
+                    else None
+                )
+                raise OutputValidationError(
+                    provider=prompt_settings.provider_type,
+                    model=prompt_settings.model,
+                    validation_errors=validation_errors,
+                ) from e
             except (KeyboardInterrupt, SystemExit):
                 raise
             except Exception as e:
-                raise wrap_provider_error(e) from e
+                raise wrap_provider_error(
+                    e,
+                    provider=prompt_settings.provider_type,
+                    model=prompt_settings.model,
+                ) from e
 
         try:
             return await self.execute_step(
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/__init__.py b/sdks/python/src/learning_commons_evaluators/schemas/__init__.py
index b168e59..f28b750 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/__init__.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/__init__.py
@@ -13,7 +13,7 @@
     ConventionalityEvaluationSettings,
     ConventionalityOutput,
 )
-from learning_commons_evaluators.schemas.errors import ValidationError
+from learning_commons_evaluators.schemas.errors import InputValidationError
 from learning_commons_evaluators.schemas.evaluator import (
     EvaluationAnswer,
     EvaluationExplanation,
@@ -70,6 +70,6 @@
     "TextComplexityEvaluationInput",
     "TextInputField",
     "TokenUsage",
-    "ValidationError",
+    "InputValidationError",
     "prompt_settings_to_extras_value",
 ]
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
index d916289..0cf32b0 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/common_inputs.py
@@ -8,14 +8,14 @@
     # spec loaded from TOML via EvaluatorMetadata.inputs
     text_spec: TextInputSpec = config.evaluator_metadata.inputs["text"]
     field = TextInputField(spec=text_spec, value="The quick brown fox...")
-    field.validate()  # raises ValidationError if constraints are violated
+    field.validate()  # raises InputValidationError if constraints are violated
 """
 
 from typing import Any
 
 from pydantic import model_validator
 
-from learning_commons_evaluators.schemas.errors import ValidationError
+from learning_commons_evaluators.schemas.errors import InputValidationError
 from learning_commons_evaluators.schemas.evaluator import InputField
 from learning_commons_evaluators.schemas.input_specs import (
     GradeInputSpec,
@@ -59,13 +59,13 @@ def _strip_value_if_spec_requests(cls, data: Any) -> Any:
         return data
 
     def validate(self) -> None:
-        """Raise :class:`~.errors.ValidationError` if the value violates the spec constraints."""
+        """Raise :class:`~.errors.InputValidationError` if the value violates the spec constraints."""
         text_length = len(self.value)
         min_len = self.spec.min_text_length or 0
         if text_length < min_len:
-            raise ValidationError(f"Text length {text_length} is below minimum {min_len}.")
+            raise InputValidationError(f"Text length {text_length} is below minimum {min_len}.")
         if self.spec.max_text_length is not None and text_length > self.spec.max_text_length:
-            raise ValidationError(
+            raise InputValidationError(
                 f"Text length {text_length} exceeds maximum {self.spec.max_text_length}."
             )
 
@@ -87,12 +87,12 @@ class GradeInputField(InputField[int]):
     spec: GradeInputSpec
 
     def validate(self) -> None:
-        """Raise :class:`~.errors.ValidationError` if the grade is out of range or not in the allowed set."""
+        """Raise :class:`~.errors.InputValidationError` if the grade is out of range or not in the allowed set."""
         grade = self.value
         if grade < 0 or grade > 12:
-            raise ValidationError(f"Grade {grade} is not in allowed range 0-12.")
+            raise InputValidationError(f"Grade {grade} is not in allowed range 0-12.")
         if self.spec.allowed_grades is not None and grade not in self.spec.allowed_grades:
-            raise ValidationError(
+            raise InputValidationError(
                 f"Grade {grade} is not in allowed set: {sorted(self.spec.allowed_grades)}."
             )
 
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
index 47e5a22..545c1b9 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
@@ -1,143 +1,471 @@
-"""Custom error types for the Evaluators SDK."""
+"""Custom error types for the Evaluators SDK.
 
+Design notes for maintainers
+----------------------------
+
+This module is a Python-idiomatic port of the TypeScript SDK's error hierarchy:
+
+* The class is the identity. We do *not* maintain ``self.name`` / ``self.code``
+  / ``self.message`` string mirrors of the class — those are JS-isms. Callers
+  discriminate with ``isinstance``. ``str(err)`` returns the message.
+
+* Retryability is a single, attribute-based signal. ``err.retryable`` is the
+  source of truth. There is no parallel ``RetryableError`` marker class to
+  inherit from. Subclasses set a class-level default; callers may override per
+  instance for the cases where it varies (e.g. a DNS-failed ``NetworkError``).
+
+* Messages on SDK errors are short and controlled. We deliberately do **not**
+  echo raw provider error strings into ``str(err)`` because they may contain
+  prompt content, user text, or fragments of API keys. The original provider
+  exception is preserved via ``raise … from`` (``__cause__``) and structured
+  details (``status_code``, ``response_body``, ``request_id``, ``provider``,
+  ``model``) are exposed as attributes for callers that want to introspect.
+
+* :func:`wrap_provider_error` prefers structured attributes (``.status_code``,
+  ``.response``) from provider SDK exceptions over regex on message strings.
+  String parsing is the fallback when nothing structured is available.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import builtins
 import re
+from typing import TYPE_CHECKING, Any
 
+if TYPE_CHECKING:
+    from learning_commons_evaluators.schemas.config import LLMProvider
 
-# TODO: rename name and message, and remove Evaluator prefix where appropriate
-class EvaluatorError(Exception):
-    """Base error class for all evaluator errors."""
 
-    def __init__(self, message: str, code: str | None = None):
-        super().__init__(message)
-        self.message = message
-        self.code = code
-        self.name = "EvaluatorError"
+class EvaluatorError(Exception):
+    """Base class for every error raised by this SDK.
 
+    All SDK errors carry a ``retryable`` boolean. The default is ``False``
+    (subclasses for transient API failures override it). Callers that wrap an
+    evaluator call in retry logic can check ``isinstance(err, EvaluatorError)``
+    and then consult ``err.retryable`` rather than learning the full hierarchy.
+    """
 
-class EvaluatorRetryableError(EvaluatorError):
-    """Base for errors that may be retried (possibly with backoff). All other evaluator errors are non-retryable."""
+    #: Whether the error condition is plausibly transient. Subclasses override.
+    retryable: bool = False
 
-    def __init__(self, message: str, code: str | None = None):
-        super().__init__(message, code)
-        self.name = "EvaluatorRetryableError"
+    def __init__(self, message: str) -> None:
+        super().__init__(message)
 
 
 class ConfigurationError(EvaluatorError):
-    """Configuration error - e.g. missing API keys. Non-retryable."""
+    """The SDK is misconfigured.
+
+    Examples: missing provider config for the requested LLM provider, an
+    unknown model ID, a malformed settings file. These are developer errors
+    and are not retryable.
+    """
 
-    def __init__(self, message: str):
-        super().__init__(message, "CONFIGURATION_ERROR")
-        self.name = "ConfigurationError"
+    retryable = False
 
 
-class ValidationError(EvaluatorError):
-    """Validation error - invalid input. Non-retryable."""
+class InputValidationError(EvaluatorError):
+    """Evaluator input failed validation.
 
-    def __init__(self, message: str):
-        super().__init__(message, "VALIDATION_ERROR")
-        self.name = "ValidationError"
+    Raised before any LLM call when an :class:`InputField` rejects its value.
+    Renamed from ``ValidationError`` to avoid the collision with
+    :class:`pydantic.ValidationError`. Not retryable.
+    """
+
+    retryable = False
 
 
 class APIError(EvaluatorError):
-    """Base API error - LLM API calls failed. Use subclasses or set retryable explicitly."""
+    """Base class for failures originating in the LLM provider call.
+
+    Carries structured debug attributes alongside the sanitized message:
+
+    Attributes:
+        status_code: HTTP status code from the provider, when one was returned.
+        retryable: ``True`` for transient failures (5xx, rate-limit, timeout,
+            network). Subclasses set sensible defaults; the constructor accepts
+            an override for cases like a non-retryable DNS failure.
+        provider: Which configured LLM provider raised the error, when known.
+        model: Model ID requested when the error occurred, when known.
+        response_body: Decoded response body from the provider, when available.
+            May contain echoed prompt text — treat as sensitive in logs.
+        request_id: Provider request ID for support escalation.
+
+    The original provider exception is preserved as ``self.__cause__`` when
+    callers use ``raise wrap_provider_error(e) from e`` (which is what the SDK
+    base evaluator does).
+    """
 
     def __init__(
         self,
         message: str,
+        *,
         status_code: int | None = None,
-        retryable: bool = False,
-        code: str | None = None,
-    ):
-        super().__init__(message, code)
+        retryable: bool | None = None,
+        provider: LLMProvider | None = None,
+        model: str | None = None,
+        response_body: Any | None = None,
+        request_id: str | None = None,
+    ) -> None:
+        super().__init__(message)
         self.status_code = status_code
-        self.retryable = retryable
-        self.name = "APIError"
+        # Honour subclass class-level default unless caller explicitly overrides.
+        if retryable is not None:
+            self.retryable = retryable
+        self.provider = provider
+        self.model = model
+        self.response_body = response_body
+        self.request_id = request_id
 
 
 class AuthenticationError(APIError):
-    """Invalid or missing API keys (401/403). Non-retryable."""
+    """Provider rejected the request because of credentials (HTTP 401/403).
+
+    Not retryable. The error message is deliberately generic ("Authentication
+    failed") rather than the provider's raw message, which may contain a
+    fragment of the API key.
+    """
+
+    retryable = False
+
+    def __init__(
+        self,
+        message: str = "Authentication failed",
+        *,
+        status_code: int | None = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(message, status_code=status_code, **kwargs)
+
 
-    def __init__(self, message: str, status_code: int | None = None):
-        super().__init__(message, status_code, False, "AUTHENTICATION_ERROR")
-        self.name = "AuthenticationError"
+class RateLimitError(APIError):
+    """Provider returned HTTP 429.
 
+    Always retryable; honour ``retry_after`` (seconds) when present.
+    """
 
-class RateLimitError(APIError, EvaluatorRetryableError):
-    """Rate limit exceeded (429). Should be retried with backoff."""
+    retryable = True
+
+    def __init__(
+        self,
+        message: str = "Rate limit exceeded",
+        *,
+        retry_after: float | None = None,
+        status_code: int | None = 429,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(message, status_code=status_code, **kwargs)
+        #: Suggested delay before retry, in **seconds**. ``None`` if the
+        #: provider did not return a Retry-After header.
+        self.retry_after = retry_after
 
-    def __init__(self, message: str, retry_after: int | None = None):
-        super().__init__(message, 429, True, "RATE_LIMIT_ERROR")
-        self.retry_after = retry_after  # milliseconds
-        self.name = "RateLimitError"
 
+class NetworkError(APIError):
+    """A network-level failure prevented the request from completing.
 
-class NetworkError(APIError, EvaluatorRetryableError):
-    """Network failure. May be retryable."""
+    Connection refused, DNS resolution failure, broken TLS, etc. Retryable by
+    default but pass ``retryable=False`` for cases that won't improve on retry
+    (e.g. a permanently bad hostname).
+    """
 
-    def __init__(self, message: str, retryable: bool = True):
-        super().__init__(message, None, retryable, "NETWORK_ERROR")
-        self.name = "NetworkError"
+    retryable = True
 
 
-class EvaluatorTimeoutError(APIError, EvaluatorRetryableError):
-    """Request timed out. Should be retried with caution."""
+class RequestTimeoutError(APIError):
+    """The request to the provider exceeded the configured timeout.
 
-    def __init__(self, message: str = "Request timed out"):
-        super().__init__(message, 408, True, "TIMEOUT_ERROR")
-        self.name = "EvaluatorTimeoutError"
+    Retryable. Renamed from ``EvaluatorTimeoutError`` — the ``Evaluator``
+    prefix was a TypeScript-port leftover and the Python builtin
+    ``TimeoutError`` makes the unqualified name a poor choice.
+    """
 
+    retryable = True
 
-# TODO: OpenAI & Anthropic may return a status_code in the response.
-def _parse_provider_error(error: BaseException) -> tuple[str, int | None, str | None]:
+    def __init__(
+        self,
+        message: str = "Request timed out",
+        *,
+        status_code: int | None = 408,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(message, status_code=status_code, **kwargs)
+
+
+class OutputValidationError(APIError):
+    """The LLM response did not satisfy the expected output schema.
+
+    Sits under :class:`APIError` because the failure originated in a provider
+    response (not in user-supplied input), but the request itself completed —
+    so ``status_code`` is typically ``None``.
+
+    Retryable by default: LLMs are nondeterministic, and a parse failure on
+    one call may resolve on the next without changing the prompt. Override
+    ``retryable=False`` if your prompt/schema combination is systematically
+    incompatible.
+
+    The original :class:`pydantic.ValidationError` is preserved on
+    ``__cause__`` when raised via ``raise … from e``. The :attr:`validation_errors`
+    attribute exposes the field-level details from Pydantic's ``errors()`` API
+    with raw input values stripped — safe for inclusion in logs and telemetry.
+    """
+
+    retryable = True
+
+    def __init__(
+        self,
+        message: str = "Model output failed schema validation",
+        *,
+        validation_errors: list[dict[str, Any]] | None = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(message, **kwargs)
+        #: Per-field validation failures. Each entry has ``loc``, ``type``,
+        #: and ``msg`` keys (from :meth:`pydantic.ValidationError.errors`).
+        #: Raw input values are deliberately omitted to avoid leaking LLM
+        #: output (which may echo user prompts) into telemetry.
+        self.validation_errors = validation_errors
+
+
+def sanitize_pydantic_errors(errors: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Strip raw input values from a list of :meth:`pydantic.ValidationError.errors` entries.
+
+    Pydantic's per-error dicts include an ``input`` key containing the raw
+    value that failed validation. For LLM outputs that value may echo prompt
+    content, so we drop it before the error reaches logs or telemetry. The
+    remaining keys (``loc``, ``type``, ``msg``, ``ctx`` if present) describe
+    the schema and the failure mode but not the offending content.
+    """
+    return [
+        {key: value for key, value in error.items() if key != "input"}
+        for error in errors
+    ]
+
+
+def format_error_for_metadata(error: BaseException) -> str:
+    """Render an exception as a string safe for ``EvaluationMetadata.error_details``.
+
+    SDK errors (:class:`EvaluatorError` subclasses) already carry sanitized
+    messages, so include both the class name and the message for debuggability.
+    Any other exception is rendered as just its class name — the message may
+    contain raw values (e.g. an :class:`AttributeError` echoing a field name
+    from user input) and isn't safe to put in telemetry.
+    """
+    if isinstance(error, EvaluatorError):
+        return f"{type(error).__name__}: {error}"
+    return f"Unexpected error: {type(error).__name__}"
+
+
+# --- Provider-error wrapping --------------------------------------------------
+
+# Status codes we consider transient even without a more specific subclass match.
+_RETRYABLE_STATUS_MIN = 500
+
+# Exception class-name suffixes used as a soft signal when isinstance checks
+# against provider SDKs aren't possible (e.g. those SDKs aren't installed in
+# the caller's environment). The provider SDKs follow consistent naming.
+_TIMEOUT_NAME_HINTS = ("TimeoutError", "APITimeoutError", "ReadTimeout", "ConnectTimeout")
+_NETWORK_NAME_HINTS = ("ConnectionError", "APIConnectionError", "ConnectError", "NetworkError")
+
+
+def _provider_status_code(error: Exception) -> int | None:
+    """Best-effort extraction of an HTTP status code from a provider exception.
+
+    Prefers the structured attribute that openai-python and anthropic-python
+    both expose (``.status_code``). Falls back to ``.status`` (used by some
+    httpx-based clients) and finally to a regex on the message — narrowed to
+    HTTP-shaped 4xx/5xx codes near the start of the message to reduce false
+    positives like model names or token counts.
+    """
+    for attr in ("status_code", "status"):
+        value = getattr(error, attr, None)
+        if isinstance(value, int):
+            return value
+    match = re.search(r"\b(4\d{2}|5\d{2})\b", str(error)[:80])
+    if match:
+        return int(match.group(1))
+    return None
+
+
+def _provider_response_body(error: Exception) -> Any | None:
+    """Pull the decoded response body off the provider exception when available."""
+    body = getattr(error, "body", None)
+    if body is not None:
+        return body
+    response = getattr(error, "response", None)
+    if response is None:
+        return None
+    # httpx.Response and similar objects: try .json() then fall back to .text.
+    json_fn = getattr(response, "json", None)
+    if callable(json_fn):
+        try:
+            return json_fn()
+        except Exception:  # noqa: BLE001 — body is best-effort debug data
+            pass
+    return getattr(response, "text", None)
+
+
+def _provider_request_id(error: Exception) -> str | None:
+    """Pull a request ID off the provider exception when available."""
+    request_id = getattr(error, "request_id", None)
+    if request_id:
+        return str(request_id)
+    response = getattr(error, "response", None)
+    headers = getattr(response, "headers", None)
+    if headers is None:
+        return None
+    try:
+        value = headers.get("x-request-id") or headers.get("request-id")
+    except Exception:  # noqa: BLE001 — non-mapping headers shouldn't crash wrapping
+        return None
+    return str(value) if value else None
+
+
+def _extract_retry_after(error: Exception) -> float | None:
+    """Return Retry-After in **seconds** from a provider exception, if present.
+
+    Reads ``response.headers['retry-after']`` when present (per RFC 7231 this
+    may be an integer number of seconds or an HTTP-date; only the integer form
+    is currently handled, which matches every provider we use). Falls back to
+    a ``retry-after: N`` substring in the message.
+    """
+    response = getattr(error, "response", None)
+    headers = getattr(response, "headers", None)
+    if headers is not None:
+        try:
+            raw = headers.get("retry-after") or headers.get("Retry-After")
+        except Exception:  # noqa: BLE001
+            raw = None
+        if raw is not None:
+            try:
+                return float(raw)
+            except (TypeError, ValueError):
+                # HTTP-date form is not currently parsed — providers we
+                # integrate with always return seconds.
+                pass
     message = str(error)
-    status_code = None
-    code = getattr(error, "name", None) or type(error).__name__
-    if code == "Error":
-        code = None
-    match = re.search(r"\b(4\d{2}|5\d{2})\b", message)
+    match = re.search(r"retry[- ]after[:\s]+(\d+)", message, re.I)
     if match:
-        status_code = int(match.group(1))
-    return message, status_code, code
+        return float(match.group(1))
+    return None
+
+
+def _is_model_not_found(error: Exception, status_code: int | None) -> bool:
+    """Detect "model not found" errors from a 404 or model-shaped 400."""
+    if status_code == 404:
+        return True
+    if status_code == 400:
+        return bool(re.search(r"\bmodel\b.*(not found|does not exist|invalid)", str(error), re.I))
+    return False
+
+
+def _is_timeout(error: Exception) -> bool:
+    """Detect timeout errors by exception class regardless of which SDK raised."""
+    if isinstance(error, (asyncio.TimeoutError, builtins.TimeoutError)):
+        return True
+    name = type(error).__name__
+    return any(hint in name for hint in _TIMEOUT_NAME_HINTS)
+
+
+def _is_network(error: Exception) -> bool:
+    """Detect network-level failures by exception class."""
+    if isinstance(error, ConnectionError):
+        return True
+    name = type(error).__name__
+    return any(hint in name for hint in _NETWORK_NAME_HINTS)
 
 
 def wrap_provider_error(
-    error: BaseException, default_message: str = "API request failed"
-) -> APIError:
-    """Wrap a provider error into the appropriate SDK error type."""
-    message, status_code, code = _parse_provider_error(error)
-    msg = message or default_message
+    error: Exception,
+    *,
+    default_message: str = "API request failed",
+    provider: LLMProvider | None = None,
+    model: str | None = None,
+) -> EvaluatorError:
+    """Translate a provider exception into the appropriate SDK error subclass.
+
+    Routing precedence (most specific first):
+
+    1. Asyncio / builtin / SDK timeout exception types → :class:`RequestTimeoutError`
+    2. Built-in / SDK connection-error exception types → :class:`NetworkError`
+    3. HTTP 404 (and model-shaped 400) → :class:`ConfigurationError` (model not found)
+    4. HTTP 401 / 403 → :class:`AuthenticationError`
+    5. HTTP 429 → :class:`RateLimitError`
+    6. HTTP 5xx → retryable :class:`APIError`
+    7. Anything else → non-retryable :class:`APIError`
+
+    SDK error messages are sanitized — we do not interpolate the provider's
+    raw message into ``str(err)``. Structured detail is available on the
+    exception's attributes (``status_code``, ``response_body``, ``request_id``,
+    ``provider``, ``model``) and the original exception is reachable via
+    ``__cause__`` when callers use ``raise wrap_provider_error(e) from e``.
+
+    Args:
+        error: The provider exception to wrap. Must be a regular ``Exception``
+            (callers should not pass ``KeyboardInterrupt``/``SystemExit``).
+        default_message: Used only when no more specific message can be
+            constructed (e.g. an unidentified, status-less exception).
+        provider: The configured LLM provider that raised, when the caller
+            knows it. Surfaced as ``err.provider`` to disambiguate multi-
+            provider runs.
+        model: The model ID that was being invoked, when known. Surfaced as
+            ``err.model`` and folded into the message for
+            :class:`ConfigurationError` results.
+
+    Returns:
+        An :class:`EvaluatorError` subclass. Always a new instance — never the
+        original ``error``.
+    """
+    status_code = _provider_status_code(error)
+    response_body = _provider_response_body(error)
+    request_id = _provider_request_id(error)
+    common: dict[str, Any] = {
+        "provider": provider,
+        "model": model,
+        "response_body": response_body,
+        "request_id": request_id,
+    }
+
+    if _is_timeout(error):
+        return RequestTimeoutError(status_code=status_code, **common)
+
+    if _is_network(error):
+        return NetworkError(**common)
+
+    if _is_model_not_found(error, status_code):
+        model_suffix = f" (model: {model!r})" if model else ""
+        return ConfigurationError(
+            f"Model not found or invalid{model_suffix}. "
+            "Check the model ID configured for this provider."
+        )
 
     if status_code in (401, 403):
-        return AuthenticationError(
-            msg if "API key" in msg or "api key" in msg.lower() else "Invalid API key",
-            status_code,
-        )
+        return AuthenticationError(status_code=status_code, **common)
+
     if status_code == 429:
-        retry_match = re.search(r"retry[- ]after[:\s]+(\d+)", msg, re.I)
-        retry_after = int(retry_match.group(1)) * 1000 if retry_match else None
-        return RateLimitError(
-            msg if "rate limit" in msg.lower() else "Rate limit exceeded",
-            retry_after,
+        return RateLimitError(retry_after=_extract_retry_after(error), **common)
+
+    if status_code is not None:
+        return APIError(
+            f"API request failed (HTTP {status_code})",
+            status_code=status_code,
+            retryable=status_code >= _RETRYABLE_STATUS_MIN,
+            **common,
         )
-    # Timeouts before generic "Connection" — many stacks use "Connection timed out"
-    if "timeout" in msg.lower() or "timed out" in msg.lower():
-        return EvaluatorTimeoutError(msg)
-    # TODO: confirm if these apply to Python too. Based on TypeScript SDK implementation.
-    if any(
-        x in msg
-        for x in (
-            "ECONNREFUSED",
-            "ENOTFOUND",
-            "ETIMEDOUT",
-            "network",
-            "Network",
-            "Connection",
-        )
-    ):
-        return NetworkError(msg)
-    return APIError(
-        msg,
-        status_code,
-        bool(status_code and status_code >= 500),
-        code,
-    )
+
+    return APIError(default_message, retryable=False, **common)
+
+
+__all__ = [
+    "APIError",
+    "AuthenticationError",
+    "ConfigurationError",
+    "EvaluatorError",
+    "InputValidationError",
+    "NetworkError",
+    "OutputValidationError",
+    "RateLimitError",
+    "RequestTimeoutError",
+    "format_error_for_metadata",
+    "sanitize_pydantic_errors",
+    "wrap_provider_error",
+]
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py
index 4a7d78f..ea5586f 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/evaluator.py
@@ -13,7 +13,7 @@
 
 from pydantic import BaseModel, Field, model_validator
 
-from .errors import ConfigurationError, ValidationError
+from .errors import ConfigurationError, InputValidationError
 from .input_specs import InputSpec
 from .metadata import EvaluationMetadata
 
@@ -43,7 +43,7 @@ class TextInputField(InputField[str]):
 
             def validate(self) -> None:
                 if len(self.value) < self.spec.min_text_length:
-                    raise ValidationError(...)
+                    raise InputValidationError(...)
 
             def input_metadata(self) -> dict[str, Any]:
                 return {"textLength": len(self.value)}
@@ -56,7 +56,7 @@ def input_metadata(self) -> dict[str, Any]:
     def validate(self) -> None:
         """Validate *value* against *spec* constraints.
 
-        Raise :class:`~.errors.ValidationError` if the value is invalid.
+        Raise :class:`~.errors.InputValidationError` if the value is invalid.
         """
 
     @abstractmethod
@@ -150,19 +150,19 @@ def _coerce_raw_to_input_fields(cls, data: Any) -> Any:
     def validate(self) -> None:
         """Validate all :class:`InputField` members, collecting every error before raising.
 
-        Raises :class:`~.errors.ValidationError` if any field is invalid.
+        Raises :class:`~.errors.InputValidationError` if any field is invalid.
         """
-        errors: list[tuple[str, ValidationError]] = []
+        errors: list[tuple[str, InputValidationError]] = []
         for name in type(self).model_fields:
             field_val = getattr(self, name)
             if isinstance(field_val, InputField):
                 try:
                     field_val.validate()
-                except ValidationError as e:
+                except InputValidationError as e:
                     errors.append((name, e))
         if errors:
-            parts = [f"{field}: {err.message}" for field, err in errors]
-            raise ValidationError("Validation errors: " + "; ".join(parts))
+            parts = [f"{field}: {err}" for field, err in errors]
+            raise InputValidationError("Validation errors: " + "; ".join(parts))
 
     def input_metadata(self) -> dict[str, Any]:
         """Return a mapping of field name → :meth:`InputField.input_metadata` for each field.
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 43d123b..cca0c93 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -38,7 +38,12 @@
     LLMProvider,
     PromptSettings,
 )
-from learning_commons_evaluators.schemas.errors import APIError, EvaluatorError, ValidationError
+from learning_commons_evaluators.schemas.errors import (
+    APIError,
+    EvaluatorError,
+    InputValidationError,
+    OutputValidationError,
+)
 from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec
 from learning_commons_evaluators.schemas.metadata import (
     PROMPT_STEP_EXTRA_PROMPT_SETTINGS,
@@ -235,7 +240,7 @@ def test_raises_validation_error_for_invalid_input(self, stub_evaluator):
             ),
             grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3),
         )
-        with pytest.raises(ValidationError):
+        with pytest.raises(InputValidationError):
             stub_evaluator.evaluate_sync(inp)
 
     def test_propagates_evaluate_impl_exception(self, stub_evaluator):
@@ -267,7 +272,7 @@ def emit(self, record: logging.LogRecord) -> None:
                 ),
                 grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3),
             )
-            with pytest.raises(ValidationError):
+            with pytest.raises(InputValidationError):
                 stub_evaluator.evaluate_sync(inp)
         finally:
             stub_evaluator.config.logger.removeHandler(h)
@@ -335,17 +340,42 @@ async def impl():
         await stub_evaluator.execute_step("s", evaluation_metadata, impl)
         assert evaluation_metadata.step_details["s"].status == Status.succeeded
 
-    async def test_records_failed_status_and_error_on_exception(
+    async def test_records_failed_status_and_sanitized_error_on_unexpected_exception(
         self, stub_evaluator, evaluation_metadata
     ):
+        """Non-SDK exceptions record only the class name in step metadata.
+
+        The raw message (``"boom"`` here) is intentionally stripped — for arbitrary
+        exceptions the message may contain user data, prompt content, or field values
+        that aren't safe for logs/telemetry. The exception itself still propagates
+        unchanged for callers that catch it.
+        """
+
         async def failing():
-            raise ValueError("boom")
+            raise ValueError("boom with possibly-sensitive context")
 
         with pytest.raises(ValueError, match="boom"):
             await stub_evaluator.execute_step("s", evaluation_metadata, failing)
         step = evaluation_metadata.step_details["s"]
         assert step.status == Status.failed
-        assert "boom" in step.error_details
+        # Class name is included; raw message is not.
+        assert "ValueError" in step.error_details
+        assert "boom" not in step.error_details
+
+    async def test_records_failed_status_and_sanitized_error_on_sdk_exception(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        """SDK exceptions land in step metadata as ``ClassName: <sanitized message>``."""
+
+        async def failing():
+            raise InputValidationError("text length 3 below minimum 10")
+
+        with pytest.raises(InputValidationError):
+            await stub_evaluator.execute_step("s", evaluation_metadata, failing)
+        step = evaluation_metadata.step_details["s"]
+        assert step.status == Status.failed
+        # SDK error messages are already controlled by us, so they're included verbatim.
+        assert step.error_details == "InputValidationError: text length 3 below minimum 10"
 
     async def test_re_raises_exception(self, stub_evaluator, evaluation_metadata):
         async def failing():
@@ -669,13 +699,21 @@ async def test_wraps_unexpected_chain_failure_as_api_error(
                 parser_output_type=_ChainOutput,
             )
 
-    async def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluation_metadata):
-        """Invalid JSON from the LLM becomes :class:`APIError` via ``wrap_provider_error``."""
+    async def test_malformed_llm_json_raises_output_validation_error(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        """Invalid JSON from the LLM becomes :class:`OutputValidationError` (a subclass of APIError).
+
+        LangChain raises ``OutputParserException`` for unparseable JSON; we wrap it so the
+        SDK exposes a single, sanitized parse-failure type instead of leaking the raw
+        ``"Invalid json output: <model text>"`` string (which would echo LLM content).
+        """
 
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        bad_output = "not-json with prompt echo: API_KEY=sk-...REDACTED"
         with (
-            patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content="not-json"))),
-            pytest.raises(APIError, match="Invalid json output"),
+            patch(_CHAIN_PATCH, return_value=_fake_chat_model(AIMessage(content=bad_output))),
+            pytest.raises(OutputValidationError) as exc_info,
         ):
             await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
@@ -689,11 +727,27 @@ async def test_malformed_llm_json_raises_api_error(self, stub_evaluator, evaluat
                 chain_inputs={"input": "text"},
                 parser_output_type=_ChainOutput,
             )
-
-    async def test_schema_mismatch_raises_pydantic_validation_error(
+        # Sanitized — the raw model output must not appear in str() of the wrapped error.
+        assert "API_KEY" not in str(exc_info.value)
+        assert "sk-" not in str(exc_info.value)
+        # Original exception is preserved for debugging.
+        assert exc_info.value.__cause__ is not None
+        # Subclass relationship: callers can still catch APIError.
+        assert isinstance(exc_info.value, APIError)
+        # Provider/model context is threaded through.
+        assert exc_info.value.provider is LLMProvider.GOOGLE
+        assert exc_info.value.model == "gemini-2.0-flash"
+
+    async def test_schema_mismatch_raises_output_validation_error(
         self, stub_evaluator, evaluation_metadata
     ):
-        """Valid JSON that does not satisfy the output model raises Pydantic ``ValidationError``."""
+        """Valid JSON that doesn't satisfy the output model becomes :class:`OutputValidationError`.
+
+        Pydantic's :class:`ValidationError` is wrapped so the original (which may
+        include input value snippets via Pydantic's default formatting) stays on
+        ``__cause__`` rather than the SDK error's ``str()``. The structured
+        per-field details land on ``validation_errors`` with input values stripped.
+        """
 
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
@@ -701,7 +755,7 @@ async def test_schema_mismatch_raises_pydantic_validation_error(
                 _CHAIN_PATCH,
                 return_value=_fake_chat_model(AIMessage(content='{"label": "only"}')),
             ),
-            pytest.raises(PydanticValidationError),
+            pytest.raises(OutputValidationError) as exc_info,
         ):
             await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
@@ -715,3 +769,12 @@ async def test_schema_mismatch_raises_pydantic_validation_error(
                 chain_inputs={"input": "text"},
                 parser_output_type=_ChainOutput,
             )
+        # Pydantic error preserved for callers that want full detail.
+        assert isinstance(exc_info.value.__cause__, PydanticValidationError)
+        # Structured per-field errors are populated; raw 'input' is stripped.
+        assert exc_info.value.validation_errors is not None
+        assert len(exc_info.value.validation_errors) > 0
+        for entry in exc_info.value.validation_errors:
+            assert "input" not in entry
+            assert "loc" in entry
+            assert "type" in entry
diff --git a/sdks/python/tests/evaluators/test_vocabulary.py b/sdks/python/tests/evaluators/test_vocabulary.py
index 9a7d5d9..e3d9270 100644
--- a/sdks/python/tests/evaluators/test_vocabulary.py
+++ b/sdks/python/tests/evaluators/test_vocabulary.py
@@ -9,7 +9,7 @@
     VocabularyEvaluator,
     create_config_no_telemetry,
 )
-from learning_commons_evaluators.schemas.errors import ConfigurationError, ValidationError
+from learning_commons_evaluators.schemas.errors import ConfigurationError, InputValidationError
 from learning_commons_evaluators.schemas.metadata import Status
 from learning_commons_evaluators.schemas.vocabulary import (
     VocabularyComplexityOutput,
@@ -258,8 +258,8 @@ def test_unsupported_grade_raises_via_framework(self, unsupported_grade):
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=unsupported_grade)
-        # The base evaluator catches the ValidationError, sets status=failed, then re-raises.
-        with pytest.raises(ValidationError):
+        # The base evaluator catches the InputValidationError, sets status=failed, then re-raises.
+        with pytest.raises(InputValidationError):
             evaluator.evaluate_sync(inp)
 
     def test_unsupported_grade_sets_status_failed(self):
@@ -267,7 +267,7 @@ def test_unsupported_grade_sets_status_failed(self):
         config = create_config_no_telemetry()
         evaluator = VocabularyEvaluator(config)
         inp = VocabularyEvaluationInput(text=_SAMPLE_TEXT, grade=2)
-        with pytest.raises(ValidationError):
+        with pytest.raises(InputValidationError):
             evaluator.evaluate_sync(inp)
 
 
diff --git a/sdks/python/tests/schemas/test_common_inputs.py b/sdks/python/tests/schemas/test_common_inputs.py
index 2cc7d3e..8bbb7e3 100644
--- a/sdks/python/tests/schemas/test_common_inputs.py
+++ b/sdks/python/tests/schemas/test_common_inputs.py
@@ -6,7 +6,7 @@
     GradeInputField,
     TextInputField,
 )
-from learning_commons_evaluators.schemas.errors import ValidationError
+from learning_commons_evaluators.schemas.errors import InputValidationError
 from learning_commons_evaluators.schemas.input_specs import (
     GradeInputSpec,
     TextInputSpec,
@@ -43,14 +43,14 @@ def test_validate_passes_at_exact_min(self):
         TextInputField(spec=_text_spec(min_text_length=2), value="ab").validate()
 
     def test_validate_raises_below_min(self):
-        with pytest.raises(ValidationError, match="below minimum"):
+        with pytest.raises(InputValidationError, match="below minimum"):
             TextInputField(spec=_text_spec(min_text_length=2), value="a").validate()
 
     def test_validate_passes_at_exact_max(self):
         TextInputField(spec=_text_spec(max_text_length=10), value="a" * 10).validate()
 
     def test_validate_raises_above_max(self):
-        with pytest.raises(ValidationError, match="exceeds maximum"):
+        with pytest.raises(InputValidationError, match="exceeds maximum"):
             TextInputField(spec=_text_spec(max_text_length=10), value="a" * 11).validate()
 
     def test_validate_no_max_by_default(self):
@@ -85,7 +85,7 @@ def test_strip_whitespace_explicit_true_trims_value(self):
 
     def test_validate_raises_when_strip_shortens_below_min(self):
         """Padding does not count toward ``min_text_length`` when stripping is on."""
-        with pytest.raises(ValidationError, match="below minimum"):
+        with pytest.raises(InputValidationError, match="below minimum"):
             TextInputField(
                 spec=_text_spec(min_text_length=5, strip_whitespace=True),
                 value="  ab  ",
@@ -103,11 +103,11 @@ def test_validate_passes_at_boundaries(self):
         GradeInputField(spec=_grade_spec(), value=12).validate()  # upper boundary
 
     def test_validate_raises_below_0(self):
-        with pytest.raises(ValidationError, match="0-12"):
+        with pytest.raises(InputValidationError, match="0-12"):
             GradeInputField(spec=_grade_spec(), value=-1).validate()
 
     def test_validate_raises_above_12(self):
-        with pytest.raises(ValidationError, match="0-12"):
+        with pytest.raises(InputValidationError, match="0-12"):
             GradeInputField(spec=_grade_spec(), value=13).validate()
 
     def test_validate_passes_when_in_allowed_grades(self):
@@ -117,7 +117,7 @@ def test_validate_passes_when_in_allowed_grades(self):
         ).validate()
 
     def test_validate_raises_when_not_in_allowed_grades(self):
-        with pytest.raises(ValidationError, match="not in allowed set"):
+        with pytest.raises(InputValidationError, match="not in allowed set"):
             GradeInputField(
                 spec=_grade_spec(allowed_grades=[5, 6, 7]),
                 value=8,
diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py
index d25cefd..ce25d18 100644
--- a/sdks/python/tests/schemas/test_errors.py
+++ b/sdks/python/tests/schemas/test_errors.py
@@ -1,134 +1,376 @@
 """Tests for the error hierarchy and wrap_provider_error().
 
-Each wrap_provider_error test targets a specific branch in the function so
-that every routing decision is covered independently.
+Covers two surfaces:
+
+1. The hierarchy itself — every subclass advertises the right ``retryable``
+   default, message defaults are sanitized, and structured attributes (status
+   code, retry-after, response body, request id, provider, model) round-trip.
+
+2. ``wrap_provider_error`` — one test per routing branch, plus tests that the
+   function prefers structured attributes (``.status_code``, ``.response``) over
+   regex on the message and that raw provider messages do not leak into the
+   wrapped error's ``str()``.
 """
 
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+
 import pytest
 
+from learning_commons_evaluators.schemas.config import LLMProvider
 from learning_commons_evaluators.schemas.errors import (
     APIError,
     AuthenticationError,
     ConfigurationError,
     EvaluatorError,
-    EvaluatorTimeoutError,
+    InputValidationError,
     NetworkError,
+    OutputValidationError,
     RateLimitError,
-    ValidationError,
+    RequestTimeoutError,
+    format_error_for_metadata,
+    sanitize_pydantic_errors,
     wrap_provider_error,
 )
 
 
+# --- Hierarchy ---------------------------------------------------------------
+
+
 class TestErrorHierarchy:
-    """Verify that every error type carries the right code, name, and retryable flag."""
+    """The class is the identity. Each subclass exposes the right retryable default."""
 
     @pytest.mark.parametrize(
-        "exc,expected_code,expected_retryable",
+        "exc,expected_retryable",
         [
-            (ValidationError("bad"), "VALIDATION_ERROR", None),
-            (ConfigurationError("missing"), "CONFIGURATION_ERROR", None),
-            (AuthenticationError("denied", 401), "AUTHENTICATION_ERROR", False),
-            (RateLimitError("slow down"), "RATE_LIMIT_ERROR", True),
-            (NetworkError("no route"), "NETWORK_ERROR", True),
-            (EvaluatorTimeoutError(), "TIMEOUT_ERROR", True),
+            (InputValidationError("bad"), False),
+            (ConfigurationError("missing"), False),
+            (AuthenticationError(status_code=401), False),
+            (RateLimitError(), True),
+            (NetworkError(), True),
+            (RequestTimeoutError(), True),
+            (OutputValidationError(), True),
         ],
     )
-    def test_error_code_and_retryable(self, exc, expected_code, expected_retryable):
-        assert exc.code == expected_code
-        if expected_retryable is not None:
-            assert exc.retryable is expected_retryable
+    def test_retryable_default(self, exc: EvaluatorError, expected_retryable: bool) -> None:
+        assert exc.retryable is expected_retryable
+
+    def test_every_subclass_is_an_evaluator_error(self) -> None:
+        for exc in (
+            InputValidationError("x"),
+            ConfigurationError("x"),
+            APIError("x"),
+            AuthenticationError(),
+            RateLimitError(),
+            NetworkError(),
+            RequestTimeoutError(),
+            OutputValidationError(),
+        ):
+            assert isinstance(exc, EvaluatorError)
+
+    def test_api_error_subclasses_are_api_errors(self) -> None:
+        for exc in (
+            AuthenticationError(),
+            RateLimitError(),
+            NetworkError(),
+            RequestTimeoutError(),
+            OutputValidationError(),
+        ):
+            assert isinstance(exc, APIError)
+
+    def test_output_validation_error_carries_validation_errors(self) -> None:
+        details = [
+            {"loc": ("vocabulary_complexity",), "type": "missing", "msg": "Field required"}
+        ]
+        err = OutputValidationError(validation_errors=details)
+        assert err.validation_errors == details
+        # No input snippets leak into the SDK error's string form.
+        assert str(err) == "Model output failed schema validation"
 
-    def test_evaluator_error_stores_message(self):
-        err = EvaluatorError("something broke", code="ERR")
+    def test_evaluator_error_str_returns_message(self) -> None:
+        err = EvaluatorError("something broke")
         assert str(err) == "something broke"
-        assert err.message == "something broke"
-        assert err.code == "ERR"
 
-    def test_api_error_status_code_and_retryable(self):
-        err = APIError("server error", status_code=500, retryable=True)
+    def test_api_error_structured_attributes(self) -> None:
+        err = APIError(
+            "server error",
+            status_code=500,
+            retryable=True,
+            provider=LLMProvider.OPENAI,
+            model="gpt-4o",
+            response_body={"error": "explode"},
+            request_id="req_abc",
+        )
         assert err.status_code == 500
         assert err.retryable is True
+        assert err.provider is LLMProvider.OPENAI
+        assert err.model == "gpt-4o"
+        assert err.response_body == {"error": "explode"}
+        assert err.request_id == "req_abc"
 
-    def test_rate_limit_error_carries_retry_after(self):
-        err = RateLimitError("too fast", retry_after=60_000)
+    def test_rate_limit_error_retry_after_is_seconds(self) -> None:
+        """Retry-After is exposed in **seconds** (not milliseconds like the TS SDK)."""
+        err = RateLimitError(retry_after=60.0)
         assert err.status_code == 429
-        assert err.retry_after == 60_000
+        assert err.retry_after == 60.0
+
+    def test_network_error_retryable_overridable(self) -> None:
+        """Permanently bad hostnames should be flaggable as non-retryable."""
+        err = NetworkError("DNS resolution failed permanently", retryable=False)
+        assert err.retryable is False
+
+    def test_authentication_error_default_message_is_generic(self) -> None:
+        """Default message must not contain anything from the original provider error."""
+        err = AuthenticationError(status_code=401)
+        assert str(err) == "Authentication failed"
+
+
+# --- Helpers for fake provider exceptions ------------------------------------
+
+
+class _FakeHeaders:
+    def __init__(self, headers: dict[str, str]) -> None:
+        self._headers = headers
+
+    def get(self, key: str, default: Any = None) -> Any:
+        return self._headers.get(key.lower(), self._headers.get(key, default))
+
+
+class _FakeResponse:
+    def __init__(self, headers: dict[str, str] | None = None, body: Any = None) -> None:
+        self.headers = _FakeHeaders(headers or {})
+        self._body = body
+
+    def json(self) -> Any:
+        return self._body
+
+
+class _FakeProviderError(Exception):
+    """Stand-in for openai/anthropic API errors: exposes ``status_code`` / ``response``."""
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        status_code: int | None = None,
+        response: _FakeResponse | None = None,
+        body: Any | None = None,
+    ) -> None:
+        super().__init__(message)
+        self.status_code = status_code
+        self.response = response
+        self.body = body
+
+
+# --- wrap_provider_error: routing branches ----------------------------------
 
 
 class TestWrapProviderError:
-    """One test per routing branch in wrap_provider_error()."""
+    """One test per routing branch, plus attribute-vs-regex preferences."""
 
-    def test_401_with_api_key_text_returns_authentication_error(self):
-        wrapped = wrap_provider_error(Exception("401 Invalid API key"))
-        assert isinstance(wrapped, AuthenticationError)
-        assert wrapped.status_code == 401
+    def test_status_code_read_from_attribute_not_message(self) -> None:
+        """Should prefer the structured .status_code attribute over message regex."""
+        err = _FakeProviderError("something opaque", status_code=429)
+        wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, RateLimitError)
+        assert wrapped.status_code == 429
+
+    def test_404_routes_to_configuration_error_with_model_in_message(self) -> None:
+        """Model-not-found is a developer mistake, not an API error."""
+        err = _FakeProviderError("not found", status_code=404)
+        wrapped = wrap_provider_error(err, provider=LLMProvider.OPENAI, model="gpt-bogus")
+        assert isinstance(wrapped, ConfigurationError)
+        assert "gpt-bogus" in str(wrapped)
+
+    def test_400_model_message_routes_to_configuration_error(self) -> None:
+        err = _FakeProviderError("model 'gpt-bogus' does not exist", status_code=400)
+        wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, ConfigurationError)
 
-    def test_401_without_api_key_text_uses_fallback_message(self):
-        """When the 401 message doesn't mention 'api key', wrap should still use AuthenticationError
-        with the generic 'Invalid API key' fallback rather than the raw message."""
-        wrapped = wrap_provider_error(Exception("401 Unauthorized"))
+    def test_401_routes_to_authentication_error_with_sanitized_message(self) -> None:
+        err = _FakeProviderError("401: api key sk-...REDACTED is invalid", status_code=401)
+        wrapped = wrap_provider_error(err)
         assert isinstance(wrapped, AuthenticationError)
         assert wrapped.status_code == 401
-        assert "Invalid API key" in str(wrapped)
+        # Original raw message must not leak into the SDK error's string form.
+        assert "sk-" not in str(wrapped)
+        assert str(wrapped) == "Authentication failed"
 
-    def test_403_returns_authentication_error(self):
-        wrapped = wrap_provider_error(Exception("403 Forbidden"))
+    def test_403_routes_to_authentication_error(self) -> None:
+        err = _FakeProviderError("forbidden", status_code=403)
+        wrapped = wrap_provider_error(err)
         assert isinstance(wrapped, AuthenticationError)
         assert wrapped.status_code == 403
 
-    def test_429_returns_rate_limit_error(self):
-        wrapped = wrap_provider_error(Exception("429 rate limit exceeded"))
+    def test_429_with_retry_after_header_is_in_seconds(self) -> None:
+        """Retry-After header takes precedence over message regex and is in seconds."""
+        err = _FakeProviderError(
+            "429 rate limit",
+            status_code=429,
+            response=_FakeResponse(headers={"retry-after": "12"}),
+        )
+        wrapped = wrap_provider_error(err)
         assert isinstance(wrapped, RateLimitError)
-        assert wrapped.status_code == 429
+        assert wrapped.retry_after == 12.0
 
-    def test_429_with_retry_after_header_parses_delay(self):
-        """Retry-After value in the message should be extracted and converted to ms."""
-        wrapped = wrap_provider_error(Exception("429 rate limit exceeded. Retry-After: 5"))
+    def test_429_falls_back_to_message_regex_for_retry_after(self) -> None:
+        err = _FakeProviderError("429 rate limit. Retry-After: 5", status_code=429)
+        wrapped = wrap_provider_error(err)
         assert isinstance(wrapped, RateLimitError)
-        assert wrapped.retry_after == 5000  # 5 seconds → 5000 ms
+        assert wrapped.retry_after == 5.0
 
-    def test_429_without_retry_after_sets_none(self):
-        wrapped = wrap_provider_error(Exception("429 rate limit exceeded"))
+    def test_429_without_retry_after_sets_none(self) -> None:
+        err = _FakeProviderError("429 rate limit", status_code=429)
+        wrapped = wrap_provider_error(err)
         assert isinstance(wrapped, RateLimitError)
         assert wrapped.retry_after is None
 
-    @pytest.mark.parametrize(
-        "message",
-        [
-            "ECONNREFUSED 127.0.0.1:443",
-            "ENOTFOUND api.example.com",
-            "ETIMEDOUT after 30s",
-            "Connection failed",
-            "Network error",
-        ],
-    )
-    def test_network_keywords_return_network_error(self, message):
-        wrapped = wrap_provider_error(Exception(message))
+    def test_builtin_connection_error_routes_to_network_error(self) -> None:
+        wrapped = wrap_provider_error(ConnectionError("connection refused"))
         assert isinstance(wrapped, NetworkError)
 
-    @pytest.mark.parametrize(
-        "message",
-        [
-            "request timed out",
-            "Connection timed out after 10 seconds",
-        ],
-    )
-    def test_timeout_phrases_return_timeout_error(self, message):
-        wrapped = wrap_provider_error(Exception(message))
-        assert isinstance(wrapped, EvaluatorTimeoutError)
+    def test_provider_named_connection_error_routes_to_network_error(self) -> None:
+        """Detect by class name when isinstance against the SDK class isn't possible."""
+
+        class APIConnectionError(Exception):
+            pass
+
+        wrapped = wrap_provider_error(APIConnectionError("connect failed"))
+        assert isinstance(wrapped, NetworkError)
+
+    def test_builtin_timeout_error_routes_to_timeout(self) -> None:
+        wrapped = wrap_provider_error(TimeoutError("timed out"))
+        assert isinstance(wrapped, RequestTimeoutError)
 
-    def test_500_returns_retryable_api_error(self):
-        wrapped = wrap_provider_error(Exception("500 Internal Server Error"))
+    def test_asyncio_timeout_error_routes_to_timeout(self) -> None:
+        wrapped = wrap_provider_error(asyncio.TimeoutError())
+        assert isinstance(wrapped, RequestTimeoutError)
+
+    def test_provider_named_timeout_error_routes_to_timeout(self) -> None:
+        class APITimeoutError(Exception):
+            pass
+
+        wrapped = wrap_provider_error(APITimeoutError("timed out"))
+        assert isinstance(wrapped, RequestTimeoutError)
+
+    def test_500_returns_retryable_api_error(self) -> None:
+        err = _FakeProviderError("internal server error", status_code=500)
+        wrapped = wrap_provider_error(err)
         assert isinstance(wrapped, APIError)
         assert wrapped.status_code == 500
         assert wrapped.retryable is True
 
-    def test_unknown_exception_returns_non_retryable_api_error(self):
+    def test_502_returns_retryable_api_error(self) -> None:
+        err = _FakeProviderError("bad gateway", status_code=502)
+        wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, APIError)
+        assert wrapped.retryable is True
+
+    def test_unknown_exception_returns_non_retryable_api_error(self) -> None:
         wrapped = wrap_provider_error(Exception("something completely unexpected"))
         assert isinstance(wrapped, APIError)
         assert wrapped.status_code is None
         assert wrapped.retryable is False
 
-    def test_empty_message_uses_default_message(self):
+    def test_default_message_is_used_when_no_signal(self) -> None:
         wrapped = wrap_provider_error(Exception(""), default_message="fallback message")
         assert "fallback message" in str(wrapped)
+
+    def test_provider_and_model_threaded_through(self) -> None:
+        err = _FakeProviderError("429", status_code=429)
+        wrapped = wrap_provider_error(err, provider=LLMProvider.ANTHROPIC, model="claude-x")
+        assert wrapped.provider is LLMProvider.ANTHROPIC
+        assert wrapped.model == "claude-x"
+
+    def test_response_body_and_request_id_extracted(self) -> None:
+        err = _FakeProviderError(
+            "429",
+            status_code=429,
+            response=_FakeResponse(
+                headers={"x-request-id": "req_xyz"},
+                body={"error": "rate_limited"},
+            ),
+        )
+        wrapped = wrap_provider_error(err)
+        assert wrapped.request_id == "req_xyz"
+        assert wrapped.response_body == {"error": "rate_limited"}
+
+    def test_body_attribute_preferred_over_response_json(self) -> None:
+        """When the SDK already has a decoded body attribute, use it directly."""
+        err = _FakeProviderError(
+            "429",
+            status_code=429,
+            body={"error": "from_body_attr"},
+            response=_FakeResponse(body={"error": "from_response_json"}),
+        )
+        wrapped = wrap_provider_error(err)
+        assert wrapped.response_body == {"error": "from_body_attr"}
+
+    def test_original_exception_preserved_via_cause(self) -> None:
+        """`raise wrap_provider_error(e) from e` keeps __cause__ for debugging."""
+        original = _FakeProviderError("opaque inner error", status_code=500)
+        try:
+            try:
+                raise original
+            except Exception as e:
+                raise wrap_provider_error(e) from e
+        except APIError as wrapped:
+            assert wrapped.__cause__ is original
+
+    def test_message_regex_fallback_when_no_status_code_attr(self) -> None:
+        """When the exception has no .status_code, fall back to bounded regex on message."""
+        wrapped = wrap_provider_error(Exception("503 Service Unavailable"))
+        assert isinstance(wrapped, APIError)
+        assert wrapped.status_code == 503
+        assert wrapped.retryable is True
+
+
+# --- sanitize_pydantic_errors ------------------------------------------------
+
+
+class TestSanitizePydanticErrors:
+    def test_strips_input_key_only(self) -> None:
+        """Drop the raw ``input`` value but keep loc/type/msg/ctx for debugging."""
+        raw = [
+            {
+                "loc": ("vocabulary_complexity",),
+                "type": "missing",
+                "msg": "Field required",
+                "input": {"label": "only"},  # potentially echoes LLM output
+                "ctx": {"some": "context"},
+            }
+        ]
+        sanitized = sanitize_pydantic_errors(raw)
+        assert sanitized == [
+            {
+                "loc": ("vocabulary_complexity",),
+                "type": "missing",
+                "msg": "Field required",
+                "ctx": {"some": "context"},
+            }
+        ]
+
+    def test_handles_empty_list(self) -> None:
+        assert sanitize_pydantic_errors([]) == []
+
+
+# --- format_error_for_metadata -----------------------------------------------
+
+
+class TestFormatErrorForMetadata:
+    def test_sdk_error_includes_class_name_and_sanitized_message(self) -> None:
+        err = RateLimitError()
+        assert format_error_for_metadata(err) == "RateLimitError: Rate limit exceeded"
+
+    def test_non_sdk_error_drops_message(self) -> None:
+        """Arbitrary exception messages may contain user data — only the class name is recorded."""
+        formatted = format_error_for_metadata(ValueError("something sensitive: sk-XYZ"))
+        assert "sk-XYZ" not in formatted
+        assert "something sensitive" not in formatted
+        assert "ValueError" in formatted
+
+    def test_returns_string(self) -> None:
+        assert isinstance(format_error_for_metadata(EvaluatorError("x")), str)
+        assert isinstance(format_error_for_metadata(Exception("y")), str)
diff --git a/sdks/python/tests/schemas/test_evaluator_schemas.py b/sdks/python/tests/schemas/test_evaluator_schemas.py
index 433fa66..9837fcf 100644
--- a/sdks/python/tests/schemas/test_evaluator_schemas.py
+++ b/sdks/python/tests/schemas/test_evaluator_schemas.py
@@ -18,7 +18,7 @@
 )
 from learning_commons_evaluators.schemas.errors import (
     ConfigurationError,
-    ValidationError,
+    InputValidationError,
 )
 from learning_commons_evaluators.schemas.evaluator import (
     EvaluationAnswer,
@@ -103,18 +103,18 @@ def test_input_values_returns_primitive_values(self):
 
     def test_validate_raises_on_invalid_grade(self):
         inp = _ExampleEvaluationInput(text=_LONG_TEXT, grade=99)
-        with pytest.raises(ValidationError):
+        with pytest.raises(InputValidationError):
             inp.validate()
 
     def test_validate_raises_on_invalid_text_length(self):
         inp = _ExampleEvaluationInput(text="x", grade=5)
-        with pytest.raises(ValidationError):
+        with pytest.raises(InputValidationError):
             inp.validate()
 
     def test_validate_collects_all_errors_before_raising(self):
-        """All field errors are collected; a single ValidationError is raised at the end."""
+        """All field errors are collected; a single InputValidationError is raised at the end."""
         inp = _ExampleEvaluationInput(text="x", grade=99)
-        with pytest.raises(ValidationError) as exc_info:
+        with pytest.raises(InputValidationError) as exc_info:
             inp.validate()
         msg = str(exc_info.value)
         assert "below minimum" in msg

From e442885b19cfdffb1785096977aadec68792dee8 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 17:24:36 -0700
Subject: [PATCH 2/8] chore: update error handling section in README

---
 sdks/python/README.md | 98 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 81 insertions(+), 17 deletions(-)

diff --git a/sdks/python/README.md b/sdks/python/README.md
index 0b26e37..13a894f 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -302,32 +302,96 @@ config = create_config_no_telemetry(logger=create_silent_logger())
 
 ## Error handling
 
+Every exception the SDK raises inherits from `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions.
+
+### Hierarchy
+
+```
+EvaluatorError
+├── ConfigurationError       — bad config (missing provider, unknown model, malformed settings)
+├── InputValidationError     — caller-supplied input failed validation
+└── APIError                 — failures originating in the LLM provider call
+    ├── AuthenticationError  — 401 / 403
+    ├── RateLimitError       — 429; carries `retry_after` (seconds)
+    ├── NetworkError         — connection refused, DNS failure, broken TLS
+    ├── RequestTimeoutError  — request exceeded the configured timeout
+    └── OutputValidationError — LLM response failed to parse or didn't match the expected schema
+```
+
+`InputValidationError` is named that way deliberately to avoid collision with `pydantic.ValidationError`. `RequestTimeoutError` is named that way to avoid shadowing the builtin `TimeoutError`. There is **no** `ValidationError` or `EvaluatorTimeoutError` in the public API.
+
+### Knowing when to retry
+
+Every `EvaluatorError` exposes a boolean `retryable` attribute. This is the single signal callers should consult when wrapping `evaluate()` in retry logic — there is no separate marker class to check. Subclasses set sensible defaults:
+
+- Retryable by default: `RateLimitError`, `NetworkError`, `RequestTimeoutError`, `OutputValidationError`, and any `APIError` with a 5xx status code.
+- Not retryable: `ConfigurationError`, `InputValidationError`, `AuthenticationError`, and `APIError` with a 4xx status code.
+
+`retryable` is also accepted as an `__init__` kwarg on `APIError` and `NetworkError` if you need to flag a specific instance differently (e.g. a permanently-bad hostname).
+
 ```python
-from learning_commons_evaluators import (
-    ConfigurationError,  # Missing/invalid config
-    ValidationError,     # Invalid input
-    AuthenticationError, # Invalid API keys (401/403)
-    RateLimitError,      # Rate limit exceeded (429) - has retry_after
-    NetworkError,        # Network failures
-    EvaluatorTimeoutError,  # Request timeout
-    APIError,            # Other API errors
-)
+import time
+from learning_commons_evaluators import EvaluatorError, RateLimitError
+
+for attempt in range(3):
+    try:
+        result = evaluator.evaluate_sync(input)
+        break
+    except EvaluatorError as e:
+        if not e.retryable or attempt == 2:
+            raise
+        delay = e.retry_after if isinstance(e, RateLimitError) and e.retry_after else 2 ** attempt
+        time.sleep(delay)  # retry_after is in seconds
+```
+
+### Sanitization and debugging context
+
+Error **messages** (the value returned by `str(err)`) are short and controlled. Raw provider strings — which may contain prompt echoes, user text, or fragments of API keys — are **not** interpolated into the SDK exception's message. Structured detail lives on attributes instead:
 
+- `status_code` on `APIError` — HTTP status from the provider, when one was returned. Populated from the provider exception's `.status_code` attribute when present (preferred over message regex).
+- `retry_after` on `RateLimitError` — suggested delay before retry, **in seconds**, or `None` if the provider didn't return a `Retry-After` header.
+- `provider` on `APIError` — the `LLMProvider` being called when the failure occurred.
+- `model` on `APIError` — the model ID requested.
+- `response_body` on `APIError` — decoded response body. Opt-in for debugging; may contain echoed prompt content, so treat as sensitive.
+- `request_id` on `APIError` — provider request ID, useful for support escalation.
+- `validation_errors` on `OutputValidationError` — per-field details from Pydantic's `errors()` API with raw input values stripped (safe for logs).
+
+The original provider exception is preserved on `__cause__` (via `raise … from e`), so debuggers, tracebacks, and `logging.exception()` retain full detail even though `str(err)` is sanitized.
+
+```python
+import logging
+from learning_commons_evaluators import APIError, OutputValidationError, RateLimitError
+
+log = logging.getLogger(__name__)
 try:
     result = evaluator.evaluate_sync(input)
-except ConfigurationError as e:
-    print(f"Config issue: {e}")
-except ValidationError as e:
-    print(f"Invalid input: {e}")
 except RateLimitError as e:
-    print(f"Rate limited, retry after {e.retry_after}ms")
+    time.sleep(e.retry_after or 30)  # seconds
+except OutputValidationError as e:
+    # Field-level errors are safe to log; raw LLM output is not in str(e).
+    log.warning("Bad LLM output: %s", e.validation_errors)
+    # Original pydantic.ValidationError / OutputParserException available as e.__cause__
 except APIError as e:
-    print(f"API error (retryable={e.retryable}): {e}")
+    log.error(
+        "Provider call failed",
+        extra={
+            "provider": e.provider,
+            "model": e.model,
+            "status": e.status_code,
+            "request_id": e.request_id,
+        },
+    )
+    raise
 ```
 
-Failures inside LLM prompt steps are passed through `wrap_provider_error()` (see `learning_commons_evaluators.schemas.errors`) so you typically see `APIError` subclasses rather than raw LangChain or HTTP client exceptions. Use `EvaluatorTimeoutError` for timeouts (the package does not export a `TimeoutError` alias, to avoid shadowing the Python builtin).
+### Metadata and telemetry
+
+On evaluation failure, `evaluation_metadata.status` is set to `failed` and `evaluation_metadata.error_details` is populated before `evaluate()` / `evaluate_sync()` re-raises (no result object is returned). `error_details` is itself sanitized:
+
+- SDK errors record `"ClassName: <sanitized message>"`.
+- Any other exception that escapes records only `"Unexpected error: ClassName"` — the message is omitted because arbitrary `ValueError`/`AttributeError`/etc. messages may contain user data or field values that aren't safe for telemetry.
 
-On evaluation failure, `metadata.status` and `error_details` are set on the in-memory metadata object for the run and appear on the evaluation end log line; `BaseEvaluator.evaluate` / `evaluate_sync` still re-raises and does not return a result object.
+The same policy applies to per-step `StepMetadata.error_details`. Both fields are emitted on the evaluation end log line.
 
 ## Creating custom evaluators
 

From 029b7913d5499f6696187a1c9af66619fa7ab6f2 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 17:25:50 -0700
Subject: [PATCH 3/8] style: linting fixes

---
 .../python/src/learning_commons_evaluators/schemas/errors.py | 5 +----
 sdks/python/tests/schemas/test_errors.py                     | 4 +---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
index 545c1b9..cb87e09 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
@@ -235,10 +235,7 @@ def sanitize_pydantic_errors(errors: list[dict[str, Any]]) -> list[dict[str, Any
     remaining keys (``loc``, ``type``, ``msg``, ``ctx`` if present) describe
     the schema and the failure mode but not the offending content.
     """
-    return [
-        {key: value for key, value in error.items() if key != "input"}
-        for error in errors
-    ]
+    return [{key: value for key, value in error.items() if key != "input"} for error in errors]
 
 
 def format_error_for_metadata(error: BaseException) -> str:
diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py
index ce25d18..9f26c08 100644
--- a/sdks/python/tests/schemas/test_errors.py
+++ b/sdks/python/tests/schemas/test_errors.py
@@ -81,9 +81,7 @@ def test_api_error_subclasses_are_api_errors(self) -> None:
             assert isinstance(exc, APIError)
 
     def test_output_validation_error_carries_validation_errors(self) -> None:
-        details = [
-            {"loc": ("vocabulary_complexity",), "type": "missing", "msg": "Field required"}
-        ]
+        details = [{"loc": ("vocabulary_complexity",), "type": "missing", "msg": "Field required"}]
         err = OutputValidationError(validation_errors=details)
         assert err.validation_errors == details
         # No input snippets leak into the SDK error's string form.

From d255b284948dd642473bf75b2ee1e281d47173d0 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 17:37:07 -0700
Subject: [PATCH 4/8] feat: lint-fix make command

---
 sdks/python/Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/sdks/python/Makefile b/sdks/python/Makefile
index 72d8b4b..a29aaa9 100644
--- a/sdks/python/Makefile
+++ b/sdks/python/Makefile
@@ -14,7 +14,7 @@ SETTINGS_DST := src/learning_commons_evaluators/settings
 
 .PHONY: help build check-build test unit-test contract-test \
         generate-settings check-generated sync-settings check-sync \
-        lint format format-check typecheck pip-check verify coverage
+        lint lint-fix format format-check typecheck pip-check verify coverage
 
 help:
 	@echo "Usage: make <target>"
@@ -23,6 +23,7 @@ help:
 	@echo "  check-build        Verify build artifacts are up to date (use in CI)"
 	@echo ""
 	@echo "  lint               Ruff linter (src/, tests/, scripts/)"
+	@echo "  lint-fix           Ruff linter with --fix (safe auto-fixes)"
 	@echo "  format             Apply Ruff formatter"
 	@echo "  format-check       Fail if Ruff would reformat any file"
 	@echo "  typecheck          Mypy on src package + tests"
@@ -58,6 +59,9 @@ check-build: check-generated check-sync
 lint:
 	$(RUFF) check src tests scripts
 
+lint-fix:
+	$(RUFF) check --fix src tests scripts
+
 format:
 	$(RUFF) format src tests scripts
 

From 4b963a05a71cdb9a2ffbbf3dc3c11807ab600e77 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 17:37:42 -0700
Subject: [PATCH 5/8] style: more linting

---
 sdks/python/tests/schemas/test_errors.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py
index 9f26c08..4445dbf 100644
--- a/sdks/python/tests/schemas/test_errors.py
+++ b/sdks/python/tests/schemas/test_errors.py
@@ -35,7 +35,6 @@
     wrap_provider_error,
 )
 
-
 # --- Hierarchy ---------------------------------------------------------------
 
 

From 59b0416f5655968f72d70f9c81763f1d61808c10 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 17:49:16 -0700
Subject: [PATCH 6/8] build: lint and type issues addressed

---
 .../learning_commons_evaluators/schemas/errors.py  | 14 +++++++++++++-
 sdks/python/tests/evaluators/test_base.py          |  5 ++++-
 sdks/python/tests/schemas/test_errors.py           |  3 +++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
index cb87e09..5d60cc6 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
@@ -31,6 +31,7 @@
 import asyncio
 import builtins
 import re
+from collections.abc import Mapping, Sequence
 from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
@@ -170,6 +171,15 @@ class NetworkError(APIError):
 
     retryable = True
 
+    def __init__(
+        self,
+        message: str = "Network request failed",
+        *,
+        status_code: int | None = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(message, status_code=status_code, **kwargs)
+
 
 class RequestTimeoutError(APIError):
     """The request to the provider exceeded the configured timeout.
@@ -226,7 +236,9 @@ def __init__(
         self.validation_errors = validation_errors
 
 
-def sanitize_pydantic_errors(errors: list[dict[str, Any]]) -> list[dict[str, Any]]:
+def sanitize_pydantic_errors(
+    errors: Sequence[Mapping[str, Any]],
+) -> list[dict[str, Any]]:
     """Strip raw input values from a list of :meth:`pydantic.ValidationError.errors` entries.
 
     Pydantic's per-error dicts include an ``input`` key containing the raw
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index cca0c93..4d07dea 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -684,7 +684,7 @@ async def test_wraps_unexpected_chain_failure_as_api_error(
         template = ChatPromptTemplate.from_messages([("human", "{input}")])
         with (
             patch(_CHAIN_PATCH, return_value=_ChainFailureChatModel()),
-            pytest.raises(APIError, match="simulated provider failure"),
+            pytest.raises(APIError, match="^API request failed$") as exc_info,
         ):
             await stub_evaluator.execute_prompt_chain_step(
                 step_name="main",
@@ -698,6 +698,9 @@ async def test_wraps_unexpected_chain_failure_as_api_error(
                 chain_inputs={"input": "text"},
                 parser_output_type=_ChainOutput,
             )
+        cause = exc_info.value.__cause__
+        assert isinstance(cause, ValueError)
+        assert str(cause) == "simulated provider failure"
 
     async def test_malformed_llm_json_raises_output_validation_error(
         self, stub_evaluator, evaluation_metadata
diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py
index 4445dbf..6829206 100644
--- a/sdks/python/tests/schemas/test_errors.py
+++ b/sdks/python/tests/schemas/test_errors.py
@@ -278,6 +278,7 @@ def test_default_message_is_used_when_no_signal(self) -> None:
     def test_provider_and_model_threaded_through(self) -> None:
         err = _FakeProviderError("429", status_code=429)
         wrapped = wrap_provider_error(err, provider=LLMProvider.ANTHROPIC, model="claude-x")
+        assert isinstance(wrapped, APIError)
         assert wrapped.provider is LLMProvider.ANTHROPIC
         assert wrapped.model == "claude-x"
 
@@ -291,6 +292,7 @@ def test_response_body_and_request_id_extracted(self) -> None:
             ),
         )
         wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, APIError)
         assert wrapped.request_id == "req_xyz"
         assert wrapped.response_body == {"error": "rate_limited"}
 
@@ -303,6 +305,7 @@ def test_body_attribute_preferred_over_response_json(self) -> None:
             response=_FakeResponse(body={"error": "from_response_json"}),
         )
         wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, APIError)
         assert wrapped.response_body == {"error": "from_body_attr"}
 
     def test_original_exception_preserved_via_cause(self) -> None:

From 86ac788136c1da66c60ccfe51940ec0b70068678 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Fri, 15 May 2026 18:37:31 -0700
Subject: [PATCH 7/8] chore: addressing PR comments

---
 sdks/python/README.md                         |   4 +-
 .../evaluators/base.py                        |  12 +-
 .../schemas/errors.py                         | 118 ++++++++++++---
 sdks/python/tests/evaluators/test_base.py     |  40 +++++
 sdks/python/tests/schemas/test_errors.py      | 140 ++++++++++++++++--
 5 files changed, 282 insertions(+), 32 deletions(-)

diff --git a/sdks/python/README.md b/sdks/python/README.md
index 13a894f..1565d6b 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -302,7 +302,7 @@ config = create_config_no_telemetry(logger=create_silent_logger())
 
 ## Error handling
 
-Every exception the SDK raises inherits from `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions.
+Exceptions for evaluator failures, provider failures, and input validation inherit from `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions. A few documented misuse paths still raise standard Python exceptions — for example `ValueError` from `execute_prompt_chain_step` when `json_dict_normalizer` is set without `parser_output_type`, and `RuntimeError` from `evaluate_sync()` when an asyncio event loop is already running on the current thread. Those are programmer errors, not evaluation failures.
 
 ### Hierarchy
 
@@ -360,6 +360,8 @@ The original provider exception is preserved on `__cause__` (via `raise … from
 
 ```python
 import logging
+import time
+
 from learning_commons_evaluators import APIError, OutputValidationError, RateLimitError
 
 log = logging.getLogger(__name__)
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index a28dbf5..04ddff6 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -344,7 +344,17 @@ async def _run_chain() -> BaseModel | str:
                     loose = JsonOutputParser()
                     parsed_dict = await loose.ainvoke(ai_message)
                     if not isinstance(parsed_dict, dict):
-                        parsed_dict = dict(parsed_dict)
+                        # JSON parsed cleanly but the top-level value isn't an object
+                        # (e.g. the LLM returned a JSON array or scalar). That's an
+                        # output-shape failure, not a parse failure — surface it as
+                        # OutputValidationError so callers can treat it consistently
+                        # with schema-mismatch errors, and avoid the TypeError that
+                        # ``dict(parsed_dict)`` would raise on a non-dict.
+                        raise OutputValidationError(
+                            "Model output is not a JSON object",
+                            provider=prompt_settings.provider_type,
+                            model=prompt_settings.model,
+                        )
                     normalized = json_dict_normalizer(parsed_dict)
                     return parser_output_type.model_validate(normalized)
 
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
index 5d60cc6..cdcd751 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
@@ -38,6 +38,12 @@
     from learning_commons_evaluators.schemas.config import LLMProvider
 
 
+# Status codes at or above this threshold are treated as transient. Used both by
+# the :class:`APIError` constructor's retryable auto-derivation and by
+# :func:`wrap_provider_error` when classifying status-only responses.
+_RETRYABLE_STATUS_MIN = 500
+
+
 class EvaluatorError(Exception):
     """Base class for every error raised by this SDK.
 
@@ -110,9 +116,18 @@ def __init__(
     ) -> None:
         super().__init__(message)
         self.status_code = status_code
-        # Honour subclass class-level default unless caller explicitly overrides.
+        # Resolve ``retryable`` in three layers, most specific first:
+        #   1. Explicit caller override wins.
+        #   2. 5xx status codes are transient by definition — flip to True so a
+        #      direct ``APIError("...", status_code=500)`` doesn't silently end up
+        #      non-retryable.
+        #   3. Otherwise inherit the subclass class-level default
+        #      (False for APIError/AuthenticationError, True for RateLimitError /
+        #      NetworkError / RequestTimeoutError / OutputValidationError).
         if retryable is not None:
             self.retryable = retryable
+        elif status_code is not None and status_code >= _RETRYABLE_STATUS_MIN:
+            self.retryable = True
         self.provider = provider
         self.model = model
         self.response_body = response_body
@@ -236,18 +251,48 @@ def __init__(
         self.validation_errors = validation_errors
 
 
+def _is_safe_ctx_value(value: Any) -> bool:
+    """True if a Pydantic ``ctx`` value is safe to expose in logs/telemetry.
+
+    Pydantic puts constraint metadata in ``ctx`` (e.g. ``{"min_length": 3}``)
+    but custom validators can attach *anything* there, including exception
+    objects whose ``str()`` may echo the offending input. We restrict to
+    JSON-primitive values and tuples/lists of the same.
+    """
+    if value is None or isinstance(value, (bool, int, float, str)):
+        return True
+    if isinstance(value, (tuple, list)):
+        return all(_is_safe_ctx_value(item) for item in value)
+    return False
+
+
 def sanitize_pydantic_errors(
     errors: Sequence[Mapping[str, Any]],
 ) -> list[dict[str, Any]]:
-    """Strip raw input values from a list of :meth:`pydantic.ValidationError.errors` entries.
-
-    Pydantic's per-error dicts include an ``input`` key containing the raw
-    value that failed validation. For LLM outputs that value may echo prompt
-    content, so we drop it before the error reaches logs or telemetry. The
-    remaining keys (``loc``, ``type``, ``msg``, ``ctx`` if present) describe
-    the schema and the failure mode but not the offending content.
+    """Strip unsafe values from :meth:`pydantic.ValidationError.errors` entries.
+
+    Pydantic's per-error dicts include an ``input`` key holding the raw value
+    that failed validation; for LLM outputs that value may echo prompt
+    content, so we always drop it. We also re-shape ``ctx`` because custom
+    validators can attach arbitrary objects there — even raising
+    ``ValueError(unsafe_user_text)`` inside a validator puts the resulting
+    exception (and its message) into ``ctx``. We keep only known-safe
+    primitive ``ctx`` values and omit ``ctx`` entirely if nothing safe
+    remains. The other keys (``loc``, ``type``, ``msg``, ``url``) describe
+    the schema and the failure mode and are passed through.
     """
-    return [{key: value for key, value in error.items() if key != "input"} for error in errors]
+    sanitized: list[dict[str, Any]] = []
+    for error in errors:
+        clean: dict[str, Any] = {
+            key: value for key, value in error.items() if key not in {"input", "ctx"}
+        }
+        ctx = error.get("ctx")
+        if isinstance(ctx, Mapping):
+            safe_ctx = {k: v for k, v in ctx.items() if _is_safe_ctx_value(v)}
+            if safe_ctx:
+                clean["ctx"] = safe_ctx
+        sanitized.append(clean)
+    return sanitized
 
 
 def format_error_for_metadata(error: BaseException) -> str:
@@ -266,9 +311,6 @@ def format_error_for_metadata(error: BaseException) -> str:
 
 # --- Provider-error wrapping --------------------------------------------------
 
-# Status codes we consider transient even without a more specific subclass match.
-_RETRYABLE_STATUS_MIN = 500
-
 # Exception class-name suffixes used as a soft signal when isinstance checks
 # against provider SDKs aren't possible (e.g. those SDKs aren't installed in
 # the caller's environment). The provider SDKs follow consistent naming.
@@ -367,20 +409,52 @@ def _is_model_not_found(error: Exception, status_code: int | None) -> bool:
     return False
 
 
-def _is_timeout(error: Exception) -> bool:
-    """Detect timeout errors by exception class regardless of which SDK raised."""
+# Narrow, Python-flavoured phrases used as a last-resort fallback when the
+# exception surfaces no typed signal at all (e.g. a third party caught a
+# socket/httpx error and re-raised as a generic Exception). Kept deliberately
+# specific — the previous TS-port keyword list included bare "Connection" /
+# "network", which false-positives on unrelated messages.
+_TIMEOUT_MESSAGE_PATTERN = re.compile(r"\b(timed out|timeout)\b", re.IGNORECASE)
+_NETWORK_MESSAGE_PATTERN = re.compile(
+    r"connection refused|connection reset|name resolution|could not resolve",
+    re.IGNORECASE,
+)
+
+
+def _is_timeout(error: Exception, status_code: int | None = None) -> bool:
+    """Detect timeout errors from status code, exception class, or message.
+
+    HTTP 408 takes precedence — a provider that surfaces 408 should be treated
+    as a timeout regardless of the exception's class name. After that we check
+    typed signals (``asyncio.TimeoutError`` / builtin ``TimeoutError`` and
+    common SDK timeout class names), then fall back to a narrow message scan
+    for status-less exceptions wrapped by upstream code.
+    """
+    if status_code == 408:
+        return True
     if isinstance(error, (asyncio.TimeoutError, builtins.TimeoutError)):
         return True
     name = type(error).__name__
-    return any(hint in name for hint in _TIMEOUT_NAME_HINTS)
+    if any(hint in name for hint in _TIMEOUT_NAME_HINTS):
+        return True
+    return bool(_TIMEOUT_MESSAGE_PATTERN.search(str(error)))
 
 
 def _is_network(error: Exception) -> bool:
-    """Detect network-level failures by exception class."""
+    """Detect network-level failures from exception class or message.
+
+    The class check covers builtins (``ConnectionError`` and its subclasses
+    ``ConnectionRefusedError`` / ``ConnectionResetError`` / etc.) and the
+    common SDK class names (``APIConnectionError``, ``ConnectError``, …).
+    The message fallback handles cases where some third party caught a
+    socket/httpx failure and re-raised it as a plain ``Exception``.
+    """
     if isinstance(error, ConnectionError):
         return True
     name = type(error).__name__
-    return any(hint in name for hint in _NETWORK_NAME_HINTS)
+    if any(hint in name for hint in _NETWORK_NAME_HINTS):
+        return True
+    return bool(_NETWORK_MESSAGE_PATTERN.search(str(error)))
 
 
 def wrap_provider_error(
@@ -394,8 +468,12 @@ def wrap_provider_error(
 
     Routing precedence (most specific first):
 
-    1. Asyncio / builtin / SDK timeout exception types → :class:`RequestTimeoutError`
-    2. Built-in / SDK connection-error exception types → :class:`NetworkError`
+    1. Timeouts — HTTP 408, asyncio / builtin / SDK timeout exception types,
+       or a narrow "timed out / timeout" message pattern → :class:`RequestTimeoutError`
+    2. Network failures — builtin ``ConnectionError`` (and subclasses),
+       SDK connection-error class names, or a narrow message pattern
+       ("connection refused / reset", "name resolution", "could not resolve")
+       → :class:`NetworkError`
     3. HTTP 404 (and model-shaped 400) → :class:`ConfigurationError` (model not found)
     4. HTTP 401 / 403 → :class:`AuthenticationError`
     5. HTTP 429 → :class:`RateLimitError`
@@ -434,7 +512,7 @@ def wrap_provider_error(
         "request_id": request_id,
     }
 
-    if _is_timeout(error):
+    if _is_timeout(error, status_code):
         return RequestTimeoutError(status_code=status_code, **common)
 
     if _is_network(error):
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 4d07dea..87eb91e 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -781,3 +781,43 @@ async def test_schema_mismatch_raises_output_validation_error(
             assert "input" not in entry
             assert "loc" in entry
             assert "type" in entry
+
+    async def test_non_dict_json_in_normalizer_path_raises_output_validation_error(
+        self, stub_evaluator, evaluation_metadata
+    ):
+        """A JSON array (or any non-object) on the ``json_dict_normalizer`` path becomes OutputValidationError.
+
+        Previously the code did ``dict(parsed_dict)`` after the JsonOutputParser,
+        which raises ``TypeError`` on a list and falls through to ``wrap_provider_error``
+        as a generic ``APIError`` — making the failure indistinguishable from a
+        provider HTTP error. We now surface it as ``OutputValidationError`` so
+        callers can catch malformed model output consistently.
+        """
+
+        def passthrough(d: dict) -> dict:
+            return d
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with (
+            patch(
+                _CHAIN_PATCH,
+                return_value=_fake_chat_model(AIMessage(content='["not", "an", "object"]')),
+            ),
+            pytest.raises(OutputValidationError) as exc_info,
+        ):
+            await stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=PromptSettings(
+                    provider_type=LLMProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "text"},
+                parser_output_type=_ChainOutput,
+                json_dict_normalizer=passthrough,
+            )
+        assert "JSON object" in str(exc_info.value)
+        assert exc_info.value.provider is LLMProvider.GOOGLE
+        assert exc_info.value.model == "gemini-2.0-flash"
diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py
index 6829206..0f92d53 100644
--- a/sdks/python/tests/schemas/test_errors.py
+++ b/sdks/python/tests/schemas/test_errors.py
@@ -123,6 +123,36 @@ def test_authentication_error_default_message_is_generic(self) -> None:
         err = AuthenticationError(status_code=401)
         assert str(err) == "Authentication failed"
 
+    def test_api_error_auto_retryable_for_5xx_when_unspecified(self) -> None:
+        """5xx status codes flip ``retryable`` to True without an explicit argument.
+
+        Direct ``APIError(..., status_code=500)`` construction (outside of
+        ``wrap_provider_error``) should agree with the documented behavior that
+        transient server errors are retryable. Explicit ``retryable=False``
+        always wins.
+        """
+        assert APIError("server error", status_code=500).retryable is True
+        assert APIError("bad gateway", status_code=502).retryable is True
+        assert APIError("gateway timeout", status_code=504).retryable is True
+        # 4xx still inherits the class default (False).
+        assert APIError("not found", status_code=404).retryable is False
+        # No status code at all — fall back to class default.
+        assert APIError("opaque failure").retryable is False
+        # Explicit override beats auto-derive.
+        assert APIError("server", status_code=500, retryable=False).retryable is False
+
+    def test_network_error_constructs_without_args(self) -> None:
+        """Regression: NetworkError must accept zero positional args.
+
+        ``wrap_provider_error`` calls ``NetworkError(**common)`` with only kwargs,
+        so the class needs its own default message — otherwise the previously-broken
+        version would raise ``TypeError`` instead of returning a NetworkError.
+        """
+        err = NetworkError()
+        assert isinstance(err, APIError)
+        assert err.retryable is True
+        assert str(err)  # non-empty default message
+
 
 # --- Helpers for fake provider exceptions ------------------------------------
 
@@ -252,6 +282,51 @@ class APITimeoutError(Exception):
         wrapped = wrap_provider_error(APITimeoutError("timed out"))
         assert isinstance(wrapped, RequestTimeoutError)
 
+    def test_408_routes_to_timeout_not_generic_api_error(self) -> None:
+        """HTTP 408 is a timeout per RFC 9110 and should route to RequestTimeoutError.
+
+        Without explicit handling it would fall through to a non-retryable APIError
+        (4xx status, below the 5xx-retry threshold), which is wrong — 408 is the
+        canonical "Request Timeout" status and is documented as retryable.
+        """
+        err = _FakeProviderError("Request Timeout", status_code=408)
+        wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, RequestTimeoutError)
+        assert wrapped.status_code == 408
+        assert wrapped.retryable is True
+
+    def test_message_only_timeout_routes_to_timeout(self) -> None:
+        """A status-less, untyped exception with 'timed out' in the message still routes correctly.
+
+        Covers the case where an upstream library wraps a socket/httpx timeout
+        into a plain Exception. The narrow message-pattern fallback catches it.
+        """
+        wrapped = wrap_provider_error(Exception("request to provider timed out after 30s"))
+        assert isinstance(wrapped, RequestTimeoutError)
+        assert wrapped.retryable is True
+
+    def test_message_only_connection_refused_routes_to_network(self) -> None:
+        wrapped = wrap_provider_error(Exception("[Errno 111] Connection refused"))
+        assert isinstance(wrapped, NetworkError)
+        assert wrapped.retryable is True
+
+    def test_message_only_dns_failure_routes_to_network(self) -> None:
+        wrapped = wrap_provider_error(Exception("Could not resolve host: api.example.com"))
+        assert isinstance(wrapped, NetworkError)
+
+    def test_message_fallback_does_not_false_positive_on_bare_network_word(self) -> None:
+        """The narrow pattern intentionally rejects the bare word 'network'.
+
+        The old TS-port keyword list matched any occurrence of 'network', which
+        caused false positives on messages like 'check your network adapter
+        settings'. The bounded pattern only accepts specific phrases.
+        """
+        wrapped = wrap_provider_error(Exception("please configure your network adapter"))
+        # Falls through to generic, non-retryable APIError instead.
+        assert isinstance(wrapped, APIError)
+        assert not isinstance(wrapped, NetworkError)
+        assert wrapped.retryable is False
+
     def test_500_returns_retryable_api_error(self) -> None:
         err = _FakeProviderError("internal server error", status_code=500)
         wrapped = wrap_provider_error(err)
@@ -331,27 +406,72 @@ def test_message_regex_fallback_when_no_status_code_attr(self) -> None:
 
 
 class TestSanitizePydanticErrors:
-    def test_strips_input_key_only(self) -> None:
-        """Drop the raw ``input`` value but keep loc/type/msg/ctx for debugging."""
+    def test_strips_input_key(self) -> None:
+        """The raw ``input`` value is always dropped; safe primitives in ``ctx`` pass through."""
         raw = [
             {
                 "loc": ("vocabulary_complexity",),
-                "type": "missing",
-                "msg": "Field required",
+                "type": "string_too_short",
+                "msg": "String should have at least 3 characters",
                 "input": {"label": "only"},  # potentially echoes LLM output
-                "ctx": {"some": "context"},
+                "ctx": {"min_length": 3},
             }
         ]
-        sanitized = sanitize_pydantic_errors(raw)
-        assert sanitized == [
+        assert sanitize_pydantic_errors(raw) == [
             {
                 "loc": ("vocabulary_complexity",),
-                "type": "missing",
-                "msg": "Field required",
-                "ctx": {"some": "context"},
+                "type": "string_too_short",
+                "msg": "String should have at least 3 characters",
+                "ctx": {"min_length": 3},
             }
         ]
 
+    def test_drops_unsafe_ctx_values(self) -> None:
+        """Custom validators can put arbitrary objects into ``ctx`` — drop them.
+
+        Pydantic propagates a custom ``ValueError`` into ``ctx["error"]`` when a
+        ``model_validator`` raises one, and the exception's ``str()`` may include
+        the offending input. Anything that isn't a JSON primitive (or a tuple/list
+        thereof) is filtered out, and ``ctx`` is omitted entirely if nothing safe
+        remains.
+        """
+        unsafe_exception = ValueError("rejected text: <sensitive prompt echo>")
+        raw: list[dict[str, Any]] = [
+            {
+                "loc": ("answer",),
+                "type": "value_error",
+                "msg": "Value error, rejected text: <sensitive prompt echo>",
+                "input": "anything",
+                "ctx": {"error": unsafe_exception},  # only key — must be dropped entirely
+            },
+            {
+                "loc": ("score",),
+                "type": "less_than_equal",
+                "msg": "Input should be less than or equal to 10",
+                "input": 42,
+                "ctx": {
+                    "le": 10,  # primitive — keep
+                    "details": unsafe_exception,  # unsafe — drop
+                    "tags": ["one", "two"],  # list of primitives — keep
+                    "nested": {"deep": 1},  # mapping — drop (not in allowlist)
+                },
+            },
+        ]
+        sanitized = sanitize_pydantic_errors(raw)
+        # First entry: ctx had only an unsafe value, so the whole ctx key is gone.
+        assert sanitized[0] == {
+            "loc": ("answer",),
+            "type": "value_error",
+            "msg": "Value error, rejected text: <sensitive prompt echo>",
+        }
+        # Second entry: kept primitives and a tuple/list of primitives, dropped the rest.
+        assert sanitized[1] == {
+            "loc": ("score",),
+            "type": "less_than_equal",
+            "msg": "Input should be less than or equal to 10",
+            "ctx": {"le": 10, "tags": ["one", "two"]},
+        }
+
     def test_handles_empty_list(self) -> None:
         assert sanitize_pydantic_errors([]) == []
 

From 14500716cf436f46790ac2e9a3f804c5c8ecc493 Mon Sep 17 00:00:00 2001
From: Fredrick Sisenda <fsisenda@chanzuckerberg.com>
Date: Mon, 18 May 2026 19:26:00 -0600
Subject: [PATCH 8/8] chore: more PR comment fixes

---
 sdks/python/README.md                         |  6 +-
 .../evaluators/base.py                        | 18 +++--
 .../schemas/errors.py                         | 73 +++++++++++++------
 sdks/python/tests/evaluators/test_base.py     | 36 +++++++++
 sdks/python/tests/schemas/test_errors.py      | 61 ++++++++++++++--
 5 files changed, 156 insertions(+), 38 deletions(-)

diff --git a/sdks/python/README.md b/sdks/python/README.md
index 1565d6b..f39a012 100644
--- a/sdks/python/README.md
+++ b/sdks/python/README.md
@@ -302,7 +302,7 @@ config = create_config_no_telemetry(logger=create_silent_logger())
 
 ## Error handling
 
-Exceptions for evaluator failures, provider failures, and input validation inherit from `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions. A few documented misuse paths still raise standard Python exceptions — for example `ValueError` from `execute_prompt_chain_step` when `json_dict_normalizer` is set without `parser_output_type`, and `RuntimeError` from `evaluate_sync()` when an asyncio event loop is already running on the current thread. Those are programmer errors, not evaluation failures.
+During a normal `evaluate()` / `evaluate_sync()` run, failures from evaluator input checks, configuration, LLM prompt steps, and output validation typically surface as subclasses of `EvaluatorError`. Failures inside LLM prompt steps are wrapped at the boundary so callers see a predictable, sanitized hierarchy instead of raw LangChain, OpenAI, Anthropic, or HTTP-client exceptions. A few documented paths still raise standard Python exceptions — for example `ValueError` from `execute_prompt_chain_step` when `json_dict_normalizer` is set without `parser_output_type`, and `RuntimeError` from `evaluate_sync()` when an asyncio event loop is already running on the current thread. Those are programmer errors, not evaluation failures.
 
 ### Hierarchy
 
@@ -354,7 +354,7 @@ Error **messages** (the value returned by `str(err)`) are short and controlled.
 - `model` on `APIError` — the model ID requested.
 - `response_body` on `APIError` — decoded response body. Opt-in for debugging; may contain echoed prompt content, so treat as sensitive.
 - `request_id` on `APIError` — provider request ID, useful for support escalation.
-- `validation_errors` on `OutputValidationError` — per-field details from Pydantic's `errors()` API with raw input values stripped (safe for logs).
+- `validation_errors` on `OutputValidationError` — per-field entries from Pydantic's `errors()` API after `sanitize_pydantic_errors` (`loc`, `type`, optional `url`, and safe primitive `ctx` only — no `input` or `msg`, which can echo model output).
 
 The original provider exception is preserved on `__cause__` (via `raise … from e`), so debuggers, tracebacks, and `logging.exception()` retain full detail even though `str(err)` is sanitized.
 
@@ -370,7 +370,7 @@ try:
 except RateLimitError as e:
     time.sleep(e.retry_after or 30)  # seconds
 except OutputValidationError as e:
-    # Field-level errors are safe to log; raw LLM output is not in str(e).
+    # Structured entries omit Pydantic msg/input (may echo LLM text); use __cause__ for full detail.
     log.warning("Bad LLM output: %s", e.validation_errors)
     # Original pydantic.ValidationError / OutputParserException available as e.__cause__
 except APIError as e:
diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
index 04ddff6..9954c98 100644
--- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py
+++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py
@@ -315,10 +315,11 @@ async def execute_prompt_chain_step(
         Raises:
             ConfigurationError: No provider config for ``prompt_settings.provider_type``.
             OutputValidationError: The LLM response didn't satisfy the expected
-                output schema — either because it wasn't valid JSON (LangChain
-                ``OutputParserException``) or because the JSON didn't match the
-                Pydantic model (``pydantic.ValidationError``). The original
-                exception is reachable via ``__cause__``.
+                output schema — invalid JSON (LangChain ``OutputParserException``),
+                JSON that didn't match the Pydantic model (``pydantic.ValidationError``),
+                a non-object JSON value when using ``json_dict_normalizer``, or
+                ``TypeError`` / ``ValueError`` from ``json_dict_normalizer`` itself.
+                The original exception is reachable via ``__cause__``.
             APIError (or other ``EvaluatorError`` subclasses):
                 :func:`~learning_commons_evaluators.schemas.errors.wrap_provider_error`
                 output for LangChain or HTTP failures from the LLM provider.
@@ -355,7 +356,14 @@ async def _run_chain() -> BaseModel | str:
                             provider=prompt_settings.provider_type,
                             model=prompt_settings.model,
                         )
-                    normalized = json_dict_normalizer(parsed_dict)
+                    try:
+                        normalized = json_dict_normalizer(parsed_dict)
+                    except (TypeError, ValueError) as norm_err:
+                        raise OutputValidationError(
+                            "Model output could not be normalized before validation",
+                            provider=prompt_settings.provider_type,
+                            model=prompt_settings.model,
+                        ) from norm_err
                     return parser_output_type.model_validate(normalized)
 
                 parser = JsonOutputParser(pydantic_object=parser_output_type)
diff --git a/sdks/python/src/learning_commons_evaluators/schemas/errors.py b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
index cdcd751..bfef160 100644
--- a/sdks/python/src/learning_commons_evaluators/schemas/errors.py
+++ b/sdks/python/src/learning_commons_evaluators/schemas/errors.py
@@ -22,8 +22,9 @@
   ``model``) are exposed as attributes for callers that want to introspect.
 
 * :func:`wrap_provider_error` prefers structured attributes (``.status_code``,
-  ``.response``) from provider SDK exceptions over regex on message strings.
-  String parsing is the fallback when nothing structured is available.
+  ``.response``) from provider SDK exceptions. Bounded message patterns are used
+  only when no HTTP status was extracted — so a 4xx/5xx response is not
+  reclassified from message wording alone.
 """
 
 from __future__ import annotations
@@ -128,6 +129,8 @@ def __init__(
             self.retryable = retryable
         elif status_code is not None and status_code >= _RETRYABLE_STATUS_MIN:
             self.retryable = True
+        else:
+            self.retryable = type(self).retryable
         self.provider = provider
         self.model = model
         self.response_body = response_body
@@ -230,8 +233,9 @@ class OutputValidationError(APIError):
 
     The original :class:`pydantic.ValidationError` is preserved on
     ``__cause__`` when raised via ``raise … from e``. The :attr:`validation_errors`
-    attribute exposes the field-level details from Pydantic's ``errors()`` API
-    with raw input values stripped — safe for inclusion in logs and telemetry.
+    list is produced by :func:`sanitize_pydantic_errors` — ``loc`` / ``type`` /
+    ``url`` only, so entries are safe for logs and telemetry without echoing raw
+    model text (see that helper for what is stripped).
     """
 
     retryable = True
@@ -244,10 +248,10 @@ def __init__(
         **kwargs: Any,
     ) -> None:
         super().__init__(message, **kwargs)
-        #: Per-field validation failures. Each entry has ``loc``, ``type``,
-        #: and ``msg`` keys (from :meth:`pydantic.ValidationError.errors`).
-        #: Raw input values are deliberately omitted to avoid leaking LLM
-        #: output (which may echo user prompts) into telemetry.
+        #: Per-field validation failures. Each entry carries at least ``loc``
+        #: and ``type`` (from :meth:`pydantic.ValidationError.errors`). The
+        #: ``msg`` string is omitted because custom validators often embed raw
+        #: model output there; use :attr:`__cause__` for full Pydantic detail.
         self.validation_errors = validation_errors
 
 
@@ -278,13 +282,15 @@ def sanitize_pydantic_errors(
     ``ValueError(unsafe_user_text)`` inside a validator puts the resulting
     exception (and its message) into ``ctx``. We keep only known-safe
     primitive ``ctx`` values and omit ``ctx`` entirely if nothing safe
-    remains. The other keys (``loc``, ``type``, ``msg``, ``url``) describe
-    the schema and the failure mode and are passed through.
+    remains. The ``msg`` field is dropped as well: custom validators often
+    interpolate rejected values into the message. ``loc``, ``type``, and
+    ``url`` (when present) are retained as schema-oriented, non-echoing hints;
+    full detail stays on the original :exc:`pydantic.ValidationError`.
     """
     sanitized: list[dict[str, Any]] = []
     for error in errors:
         clean: dict[str, Any] = {
-            key: value for key, value in error.items() if key not in {"input", "ctx"}
+            key: value for key, value in error.items() if key not in {"input", "ctx", "msg"}
         }
         ctx = error.get("ctx")
         if isinstance(ctx, Mapping):
@@ -322,15 +328,21 @@ def _provider_status_code(error: Exception) -> int | None:
     """Best-effort extraction of an HTTP status code from a provider exception.
 
     Prefers the structured attribute that openai-python and anthropic-python
-    both expose (``.status_code``). Falls back to ``.status`` (used by some
-    httpx-based clients) and finally to a regex on the message — narrowed to
-    HTTP-shaped 4xx/5xx codes near the start of the message to reduce false
-    positives like model names or token counts.
+    both expose (``.status_code`` on the exception), then ``.response.status_code``
+    (e.g. ``httpx.HTTPStatusError``), then ``.status`` on either object, and
+    finally a regex on the message — narrowed to HTTP-shaped 4xx/5xx codes near
+    the start of the message to reduce false positives like model names or token counts.
     """
     for attr in ("status_code", "status"):
         value = getattr(error, attr, None)
         if isinstance(value, int):
             return value
+    response = getattr(error, "response", None)
+    if response is not None:
+        for attr in ("status_code", "status"):
+            value = getattr(response, attr, None)
+            if isinstance(value, int):
+                return value
     match = re.search(r"\b(4\d{2}|5\d{2})\b", str(error)[:80])
     if match:
         return int(match.group(1))
@@ -352,7 +364,10 @@ def _provider_response_body(error: Exception) -> Any | None:
             return json_fn()
         except Exception:  # noqa: BLE001 — body is best-effort debug data
             pass
-    return getattr(response, "text", None)
+    try:
+        return getattr(response, "text", None)
+    except Exception:  # noqa: BLE001 — unread / streaming bodies can raise on read
+        return None
 
 
 def _provider_request_id(error: Exception) -> str | None:
@@ -428,7 +443,8 @@ def _is_timeout(error: Exception, status_code: int | None = None) -> bool:
     as a timeout regardless of the exception's class name. After that we check
     typed signals (``asyncio.TimeoutError`` / builtin ``TimeoutError`` and
     common SDK timeout class names), then fall back to a narrow message scan
-    for status-less exceptions wrapped by upstream code.
+    only when **no** HTTP status was extracted — so a 400/401 whose body
+    mentions "timeout" is not misclassified as a transport timeout.
     """
     if status_code == 408:
         return True
@@ -437,24 +453,31 @@ def _is_timeout(error: Exception, status_code: int | None = None) -> bool:
     name = type(error).__name__
     if any(hint in name for hint in _TIMEOUT_NAME_HINTS):
         return True
-    return bool(_TIMEOUT_MESSAGE_PATTERN.search(str(error)))
+    if status_code is None:
+        return bool(_TIMEOUT_MESSAGE_PATTERN.search(str(error)))
+    return False
 
 
-def _is_network(error: Exception) -> bool:
+def _is_network(error: Exception, status_code: int | None = None) -> bool:
     """Detect network-level failures from exception class or message.
 
     The class check covers builtins (``ConnectionError`` and its subclasses
     ``ConnectionRefusedError`` / ``ConnectionResetError`` / etc.) and the
     common SDK class names (``APIConnectionError``, ``ConnectError``, …).
     The message fallback handles cases where some third party caught a
-    socket/httpx failure and re-raised it as a plain ``Exception``.
+    socket/httpx failure and re-raised it as a plain ``Exception``. The message
+    scan runs only when no HTTP status was extracted (same rule as
+    ``_is_timeout``), so a 404 whose JSON mentions "could not resolve" does not
+    become a retryable :class:`NetworkError`.
     """
     if isinstance(error, ConnectionError):
         return True
     name = type(error).__name__
     if any(hint in name for hint in _NETWORK_NAME_HINTS):
         return True
-    return bool(_NETWORK_MESSAGE_PATTERN.search(str(error)))
+    if status_code is None:
+        return bool(_NETWORK_MESSAGE_PATTERN.search(str(error)))
+    return False
 
 
 def wrap_provider_error(
@@ -469,9 +492,11 @@ def wrap_provider_error(
     Routing precedence (most specific first):
 
     1. Timeouts — HTTP 408, asyncio / builtin / SDK timeout exception types,
-       or a narrow "timed out / timeout" message pattern → :class:`RequestTimeoutError`
+       or (only when no HTTP status was extracted) a narrow "timed out / timeout"
+       message pattern → :class:`RequestTimeoutError`
     2. Network failures — builtin ``ConnectionError`` (and subclasses),
-       SDK connection-error class names, or a narrow message pattern
+       SDK connection-error class names, or (only when no HTTP status was
+       extracted) a narrow message pattern
        ("connection refused / reset", "name resolution", "could not resolve")
        → :class:`NetworkError`
     3. HTTP 404 (and model-shaped 400) → :class:`ConfigurationError` (model not found)
@@ -515,7 +540,7 @@ def wrap_provider_error(
     if _is_timeout(error, status_code):
         return RequestTimeoutError(status_code=status_code, **common)
 
-    if _is_network(error):
+    if _is_network(error, status_code):
         return NetworkError(**common)
 
     if _is_model_not_found(error, status_code):
diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py
index 87eb91e..6c0e762 100644
--- a/sdks/python/tests/evaluators/test_base.py
+++ b/sdks/python/tests/evaluators/test_base.py
@@ -494,6 +494,41 @@ def _double(d: dict) -> dict:
         assert result.n == 1
         assert result.doubled == 2
 
+    async def test_json_dict_normalizer_raises_valueerror_wraps_as_output_validation(
+        self, stub_evaluator, evaluation_metadata
+    ) -> None:
+        """User ``json_dict_normalizer`` failures become ``OutputValidationError``, not ``APIError``."""
+
+        class _Out(BaseModel):
+            n: int = Field(description="n")
+
+        def _bad(_d: dict) -> dict:
+            raise ValueError("cannot normalize")
+
+        template = ChatPromptTemplate.from_messages([("human", "{input}")])
+        with (
+            patch(
+                _CHAIN_PATCH,
+                return_value=_fake_chat_model(AIMessage(content='{"n": 1}')),
+            ),
+            pytest.raises(OutputValidationError) as exc_info,
+        ):
+            await stub_evaluator.execute_prompt_chain_step(
+                step_name="main",
+                prompt_settings=PromptSettings(
+                    provider_type=LLMProvider.GOOGLE,
+                    model="gemini-2.0-flash",
+                    temperature=0.0,
+                ),
+                evaluation_metadata=evaluation_metadata,
+                template=template,
+                chain_inputs={"input": "Hello"},
+                parser_output_type=_Out,
+                json_dict_normalizer=_bad,
+            )
+        assert str(exc_info.value) == "Model output could not be normalized before validation"
+        assert isinstance(exc_info.value.__cause__, ValueError)
+
     async def test_parser_returning_model_instance_short_circuits_model_validate(
         self, stub_evaluator, evaluation_metadata
     ):
@@ -779,6 +814,7 @@ async def test_schema_mismatch_raises_output_validation_error(
         assert len(exc_info.value.validation_errors) > 0
         for entry in exc_info.value.validation_errors:
             assert "input" not in entry
+            assert "msg" not in entry
             assert "loc" in entry
             assert "type" in entry
 
diff --git a/sdks/python/tests/schemas/test_errors.py b/sdks/python/tests/schemas/test_errors.py
index 0f92d53..2cf7480 100644
--- a/sdks/python/tests/schemas/test_errors.py
+++ b/sdks/python/tests/schemas/test_errors.py
@@ -182,7 +182,7 @@ def __init__(
         message: str,
         *,
         status_code: int | None = None,
-        response: _FakeResponse | None = None,
+        response: Any = None,
         body: Any | None = None,
     ) -> None:
         super().__init__(message)
@@ -204,6 +204,38 @@ def test_status_code_read_from_attribute_not_message(self) -> None:
         assert isinstance(wrapped, RateLimitError)
         assert wrapped.status_code == 429
 
+    def test_status_code_read_from_response_when_missing_on_exception(self) -> None:
+        """``httpx.HTTPStatusError``-style objects keep the code on ``response``."""
+
+        class _Resp:
+            status_code = 401
+
+        class _HttpxLike(Exception):
+            def __init__(self, message: str) -> None:
+                super().__init__(message)
+                self.response = _Resp()
+
+        wrapped = wrap_provider_error(_HttpxLike("Request failed"))
+        assert isinstance(wrapped, AuthenticationError)
+        assert wrapped.status_code == 401
+
+    def test_wrap_succeeds_when_response_text_read_raises(self) -> None:
+        """Best-effort body extraction must not mask the original provider error."""
+
+        class _UnreadableTextResponse:
+            headers = _FakeHeaders({})
+
+            @property
+            def text(self) -> str:
+                raise OSError("body not available")
+
+        err = _FakeProviderError("failed", status_code=500, response=_UnreadableTextResponse())
+        wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, APIError)
+        assert wrapped.status_code == 500
+        assert wrapped.retryable is True
+        assert wrapped.response_body is None
+
     def test_404_routes_to_configuration_error_with_model_in_message(self) -> None:
         """Model-not-found is a developer mistake, not an API error."""
         err = _FakeProviderError("not found", status_code=404)
@@ -305,6 +337,26 @@ def test_message_only_timeout_routes_to_timeout(self) -> None:
         assert isinstance(wrapped, RequestTimeoutError)
         assert wrapped.retryable is True
 
+    def test_structured_4xx_wins_over_timeout_message_pattern(self) -> None:
+        """When ``status_code`` is present, message wording must not re-route to timeout."""
+        err = _FakeProviderError("invalid request timeout parameter", status_code=400)
+        wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, APIError)
+        assert wrapped.status_code == 400
+        assert wrapped.retryable is False
+        assert not isinstance(wrapped, RequestTimeoutError)
+
+    def test_structured_status_wins_over_network_message_pattern(self) -> None:
+        """When ``status_code`` is present, DNS-ish wording must not yield ``NetworkError``."""
+        err = _FakeProviderError(
+            "Unauthorized: could not resolve upstream identity provider",
+            status_code=401,
+        )
+        wrapped = wrap_provider_error(err)
+        assert isinstance(wrapped, AuthenticationError)
+        assert wrapped.status_code == 401
+        assert not isinstance(wrapped, NetworkError)
+
     def test_message_only_connection_refused_routes_to_network(self) -> None:
         wrapped = wrap_provider_error(Exception("[Errno 111] Connection refused"))
         assert isinstance(wrapped, NetworkError)
@@ -407,7 +459,7 @@ def test_message_regex_fallback_when_no_status_code_attr(self) -> None:
 
 class TestSanitizePydanticErrors:
     def test_strips_input_key(self) -> None:
-        """The raw ``input`` value is always dropped; safe primitives in ``ctx`` pass through."""
+        """The raw ``input`` value is always dropped; ``msg`` is dropped; safe ``ctx`` passes through."""
         raw = [
             {
                 "loc": ("vocabulary_complexity",),
@@ -421,7 +473,6 @@ def test_strips_input_key(self) -> None:
             {
                 "loc": ("vocabulary_complexity",),
                 "type": "string_too_short",
-                "msg": "String should have at least 3 characters",
                 "ctx": {"min_length": 3},
             }
         ]
@@ -433,7 +484,7 @@ def test_drops_unsafe_ctx_values(self) -> None:
         ``model_validator`` raises one, and the exception's ``str()`` may include
         the offending input. Anything that isn't a JSON primitive (or a tuple/list
         thereof) is filtered out, and ``ctx`` is omitted entirely if nothing safe
-        remains.
+        remains. ``msg`` is always stripped because it can echo the same content.
         """
         unsafe_exception = ValueError("rejected text: <sensitive prompt echo>")
         raw: list[dict[str, Any]] = [
@@ -462,13 +513,11 @@ def test_drops_unsafe_ctx_values(self) -> None:
         assert sanitized[0] == {
             "loc": ("answer",),
             "type": "value_error",
-            "msg": "Value error, rejected text: <sensitive prompt echo>",
         }
         # Second entry: kept primitives and a tuple/list of primitives, dropped the rest.
         assert sanitized[1] == {
             "loc": ("score",),
             "type": "less_than_equal",
-            "msg": "Input should be less than or equal to 10",
             "ctx": {"le": 10, "tags": ["one", "two"]},
         }