From fc90b047ee554ca94eafc0b604fed94446d8e795 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 14:38:26 +0000 Subject: [PATCH 01/26] refactor: drop jsonl_export result echo and server rewriter The inference mixin stamped a `jsonl_export` dict into the executor result and the server's results router rewrote its `path` / `url` / `relative_path` on subsequent file uploads. Both are redundant: the JSONL file lives at `artifacts/{spec.postprocess.jsonl_export.path}`, the spec already declares that relative path, and the existing `_artifacts` context plus the generic ref resolvers already turn any `{path: rel}` into a URL or local path on demand. Removing the echo also drops the only executor-specific field the server was reading out of an opaque result payload. Signed-off-by: Noppanat Wadlom --- src/server/routers/v1/results.py | 52 ------------------- src/worker/executors/mixins/inference.py | 6 --- .../test_transformers_chat_inference.py | 1 - 3 files changed, 59 deletions(-) diff --git a/src/server/routers/v1/results.py b/src/server/routers/v1/results.py index 9edceb13..00086507 100644 --- a/src/server/routers/v1/results.py +++ b/src/server/routers/v1/results.py @@ -25,7 +25,6 @@ result_file_path, write_result, ) -from shared.utils.atomic import atomic_write_text from shared.utils.manifest import ARTIFACTS_DIR, LOGS_DIR, RESULTS_NAME, sync_manifest from ...app_state import ( @@ -186,8 +185,6 @@ async def upload_result_file( detail=f"Failed to store artifact: {exc}", ) from exc - _rewrite_jsonl_export_paths(task_id, base_dir, target_path) - record = runtime.get_record(task_id) expected_artifacts: list[str] = [] if record: @@ -333,55 +330,6 @@ async def download_task_logs( return FileResponse(target_path) -def _rewrite_jsonl_export_paths( - task_id: str, base_dir: Path, artifact_path: Path -) -> None: - results_path = base_dir / RESULTS_NAME - if not results_path.exists(): - return - try: - payload = json.loads(results_path.read_text(encoding="utf-8")) - except Exception: - return - - filename = artifact_path.name - new_abs = str(artifact_path) - new_relative = f"{ARTIFACTS_DIR}/{filename}" - updated = False - - def _update(entry: dict[str, Any]) -> None: - nonlocal updated - if not isinstance(entry, dict): - return - block = entry.get("jsonl_export") - if isinstance(block, dict): - path_value = str(block.get("path") or "") - if path_value and Path(path_value).name == filename: - if "worker_path" not in block and path_value != new_abs: - block["worker_path"] = path_value - block["path"] = new_abs - block["relative_path"] = new_relative - block.setdefault("url", f"/api/v1/results/{task_id}/files/{filename}") - updated = True - children = entry.get("children") - if isinstance(children, dict): - for child_entry in children.values(): - if isinstance(child_entry, dict): - _update(child_entry) - - result_entry = payload.get("result") if isinstance(payload, dict) else None - if isinstance(result_entry, dict): - _update(result_entry) - - if updated: - try: - atomic_write_text( - results_path, json.dumps(payload, ensure_ascii=False, indent=2) - ) - except Exception: - pass - - def _resolve_bundle_sections(include: list[str]) -> tuple[str, ...]: if not include: return _BUNDLE_SECTIONS_DEFAULT diff --git a/src/worker/executors/mixins/inference.py b/src/worker/executors/mixins/inference.py index fb7ae7a2..b2c9d7e3 100644 --- a/src/worker/executors/mixins/inference.py +++ b/src/worker/executors/mixins/inference.py @@ -15,7 +15,6 @@ from shared.utils.json import to_json_serializable from ..base_executor import ExecutionError -from ..utils.checkpoints import artifact_ref from .data import DataMixin, InferenceEntry, PromptInput logger = logging.getLogger(__name__) @@ -292,11 +291,6 @@ def _maybe_export_jsonl( fh.write("\n") rel_path = target_path.relative_to(artifacts_dir).as_posix() - result["jsonl_export"] = { - **artifact_ref(rel_path), - "record_count": len(records), - "fields": list(fields_cfg.keys()), - } logger.info( "Task %s exported %d records to artifacts/%s", task_id, diff --git a/tests/worker/test_transformers_chat_inference.py b/tests/worker/test_transformers_chat_inference.py index 12fa0f4a..ef05f502 100644 --- a/tests/worker/test_transformers_chat_inference.py +++ b/tests/worker/test_transformers_chat_inference.py @@ -127,7 +127,6 @@ def test_transformers_executor_supports_chat_prompts_and_jsonl_export( assert [item["metadata"]["row_id"] for item in result["items"]] == ["a", "b"] assert result["usage"]["num_requests"] == 2 assert "latency_sec" in result["usage"] - assert result["jsonl_export"]["record_count"] == 2 exported = (tmp_path / "artifacts" / "rows.jsonl").read_text(encoding="utf-8") assert '"row_id": "a"' in exported From 69edc346c8767423d8cb0aa266904d6f29b5d1d7 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 14:40:17 +0000 Subject: [PATCH 02/26] feat(shared): add typed executor result and artifact schemas MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce ``BaseExecutorResult`` as the shared shape every executor's result conforms to, plus ``ArtifactRef`` and ``ArtifactContext`` for the existing relative-path resolution model. ``extra="allow"`` on the base class lets the server deserialize executor-specific subclass payloads without losing fields, so the wire format stays decoupled from the executor registry. No callers yet — wiring lands in follow-up commits. Signed-off-by: Noppanat Wadlom --- src/shared/schemas/artifact.py | 12 +++++++++++ src/shared/schemas/executor_result.py | 30 +++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 src/shared/schemas/artifact.py create mode 100644 src/shared/schemas/executor_result.py diff --git a/src/shared/schemas/artifact.py b/src/shared/schemas/artifact.py new file mode 100644 index 00000000..d9a232cc --- /dev/null +++ b/src/shared/schemas/artifact.py @@ -0,0 +1,12 @@ +from pydantic import BaseModel, Field + + +class ArtifactRef(BaseModel): + path: str = Field(description="Path relative to the task's artifacts/ dir.") + + +class ArtifactContext(BaseModel): + base_dir: str = Field(description="Producing task's output directory.") + base_url: str | None = Field( + default=None, description="HTTP origin (scheme://host[:port]) for upload." + ) diff --git a/src/shared/schemas/executor_result.py b/src/shared/schemas/executor_result.py new file mode 100644 index 00000000..5ee39721 --- /dev/null +++ b/src/shared/schemas/executor_result.py @@ -0,0 +1,30 @@ +from pydantic import BaseModel, ConfigDict, Field + +from .artifact import ArtifactContext + + +class BaseExecutorResult(BaseModel): + """Common shape for every executor's result payload. + + ``extra="allow"`` lets the server round-trip subclass payloads through + this base class without losing executor-specific fields. + """ + + model_config = ConfigDict( + extra="allow", + populate_by_name=True, + serialize_by_alias=True, + ) + + children: dict[str, "BaseExecutorResult"] = Field( + default_factory=dict, + description="Per-child result payloads for task merging.", + ) + artifacts: ArtifactContext | None = Field( + default=None, + alias="_artifacts", + description="Resolution context for relative artifact refs.", + ) + + +BaseExecutorResult.model_rebuild() From def4085e193b0f606616721d41f77bde96b2ccab Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 14:46:18 +0000 Subject: [PATCH 03/26] refactor(shared): type ResultEnvelope.result and artifact helpers ``ResultEnvelope.result`` is now ``BaseExecutorResult`` instead of ``dict[str, Any]``. ``artifact_ref`` returns ``ArtifactRef`` and ``build_artifact_context`` returns ``ArtifactContext``. ``write_executor_result`` accepts either form and validates dict input into the base model before stamping the artifact context. The wire format is preserved by serializing under the ``_artifacts`` alias. Executors continue to return ``dict[str, Any]`` and are migrated to the typed subclasses in follow-up commits; the server flattens the typed ``result`` back to a dict at ``_load_stage_result`` so the existing ref-resolution path is unaffected. Signed-off-by: Noppanat Wadlom --- src/server/dispatcher/base.py | 2 +- src/shared/schemas/result.py | 8 ++++++-- src/worker/executors/utils/checkpoints.py | 24 +++++++++++++---------- tests/worker/test_checkpoint_utils.py | 15 +++++++++----- 4 files changed, 31 insertions(+), 18 deletions(-) diff --git a/src/server/dispatcher/base.py b/src/server/dispatcher/base.py index d3c15bc3..2f7c405b 100644 --- a/src/server/dispatcher/base.py +++ b/src/server/dispatcher/base.py @@ -922,7 +922,7 @@ def _load_stage_result(self, stage_task_id: str) -> dict[str, Any]: f"Result for task {stage_task_id} not found at {path}" ) content = json.loads(path.read_text(encoding="utf-8")) - return ResultEnvelope.model_validate(content).result + return ResultEnvelope.model_validate(content).result.model_dump() def _dig_path(self, data: Any, parts: list[str]) -> Any: current = data diff --git a/src/shared/schemas/result.py b/src/shared/schemas/result.py index 0eea3e5c..953e7fa7 100644 --- a/src/shared/schemas/result.py +++ b/src/shared/schemas/result.py @@ -9,10 +9,12 @@ from shared.utils.manifest import prepare_output_dir from shared.utils.time import now_iso +from .executor_result import BaseExecutorResult + class ResultEnvelope(BaseModel): task_id: str = Field(description="Task identifier.") - result: dict[str, Any] = Field(description="Result payload data.") + result: BaseExecutorResult = Field(description="Result payload data.") worker_id: str | None = Field( default=None, description="Worker identifier submitting the result." ) @@ -40,7 +42,9 @@ def write_result(base_dir: Path, envelope: ResultEnvelope) -> Path: return path -def write_result_in_envelope(path: Path, task_id: str, result: dict[str, Any]) -> None: +def write_result_in_envelope( + path: Path, task_id: str, result: BaseExecutorResult | dict[str, Any] +) -> None: """Wrap ``result`` in a ``ResultEnvelope`` and persist it at ``path``.""" path.parent.mkdir(parents=True, exist_ok=True) envelope = ResultEnvelope(task_id=task_id, result=result) diff --git a/src/worker/executors/utils/checkpoints.py b/src/worker/executors/utils/checkpoints.py index 60ba6053..c5c6e59f 100644 --- a/src/worker/executors/utils/checkpoints.py +++ b/src/worker/executors/utils/checkpoints.py @@ -11,6 +11,8 @@ import requests +from shared.schemas.artifact import ArtifactContext, ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult from shared.schemas.result import write_result_in_envelope from shared.tasks.specs import TaskSpecStrictBase from shared.utils.http import add_auth_headers @@ -394,15 +396,15 @@ def is_cleanup_enabled() -> bool: return normalized not in {"0", "false", "no", "off"} -def artifact_ref(rel_path: str) -> dict[str, str]: - """Build an artifact reference dict. `rel_path` is the path relative to - `out_dir/artifacts/`""" - return {"path": rel_path} +def artifact_ref(rel_path: str) -> ArtifactRef: + """Build an artifact reference. ``rel_path`` is relative to + ``out_dir/artifacts/``.""" + return ArtifactRef(path=rel_path) -def build_artifact_context(spec: TaskSpecStrictBase, out_dir: Path) -> dict[str, Any]: - """Top-level `_artifacts` context: {base_dir, base_url}. base_url is the - destination origin (scheme://host[:port]) for HTTP, else None.""" +def build_artifact_context(spec: TaskSpecStrictBase, out_dir: Path) -> ArtifactContext: + """Top-level ``_artifacts`` context. ``base_url`` is the destination + origin (scheme://host[:port]) for HTTP, else ``None``.""" base_dir = Path(out_dir).resolve().as_posix() base_url: str | None = None destination = get_http_destination(spec) @@ -410,17 +412,19 @@ def build_artifact_context(spec: TaskSpecStrictBase, out_dir: Path) -> dict[str, parsed = urlparse(destination.url) if parsed.scheme and parsed.netloc: base_url = f"{parsed.scheme}://{parsed.netloc}" - return {"base_dir": base_dir, "base_url": base_url} + return ArtifactContext(base_dir=base_dir, base_url=base_url) def write_executor_result( path: Path, task_id: str, spec: TaskSpecStrictBase, - result: dict[str, Any], + result: BaseExecutorResult | dict[str, Any], ) -> None: """Stamp ``_artifacts`` onto ``result`` and persist the envelope.""" - result["_artifacts"] = build_artifact_context(spec, path.parent) + if isinstance(result, dict): + result = BaseExecutorResult.model_validate(result) + result.artifacts = build_artifact_context(spec, path.parent) write_result_in_envelope(path, task_id, result) diff --git a/tests/worker/test_checkpoint_utils.py b/tests/worker/test_checkpoint_utils.py index 27816fbe..171a0c1a 100644 --- a/tests/worker/test_checkpoint_utils.py +++ b/tests/worker/test_checkpoint_utils.py @@ -6,6 +6,7 @@ import pytest +from shared.schemas.artifact import ArtifactContext, ArtifactRef from worker.executors.base_executor import TaskReference from worker.executors.utils import checkpoints @@ -31,7 +32,9 @@ def _task( class TestArtifactRef: def test_returns_path_only(self) -> None: - assert checkpoints.artifact_ref("images/foo.png") == {"path": "images/foo.png"} + assert checkpoints.artifact_ref("images/foo.png") == ArtifactRef( + path="images/foo.png" + ) class TestBuildArtifactContext: @@ -39,15 +42,17 @@ def test_http_destination_strips_api_suffix(self, tmp_path: Path) -> None: out_dir = tmp_path / "task-1" out_dir.mkdir() ctx = checkpoints.build_artifact_context(_task().spec, out_dir) - assert ctx["base_dir"] == out_dir.resolve().as_posix() - assert ctx["base_url"] == "http://host:8010" + assert ctx == ArtifactContext( + base_dir=out_dir.resolve().as_posix(), base_url="http://host:8010" + ) def test_local_destination_leaves_base_url_none(self, tmp_path: Path) -> None: out_dir = tmp_path / "task-1" out_dir.mkdir() ctx = checkpoints.build_artifact_context(_task("local").spec, out_dir) - assert ctx["base_dir"] == out_dir.resolve().as_posix() - assert ctx["base_url"] is None + assert ctx == ArtifactContext( + base_dir=out_dir.resolve().as_posix(), base_url=None + ) class TestMaybeUploadArtifacts: From 33785b6540c2aa2ef400ea3779de44e2db5e6abe Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 14:56:49 +0000 Subject: [PATCH 04/26] refactor(worker): type non-inference, non-training executor results Each migrated executor now declares its own ``BaseExecutorResult`` subclass (``EchoResult``, ``APIResult``, ``SSHResult``, ``DataProfilingResult``, ``DataRetrievalResult``) and ``run()`` returns the typed model in place of an opaque ``dict[str, Any]``. The runner and the governance mixin accept either form during the transition (``BaseExecutorResult | dict[str, Any]``); inference and training executors migrate in follow-up commits. The base ``Executor.run`` return type is widened correspondingly. ``APIResult.response_json`` carries the ``json`` wire-format alias to avoid shadowing ``BaseModel.json``. Signed-off-by: Noppanat Wadlom --- src/worker/executors/api_executor.py | 60 ++++++++++++------- src/worker/executors/base_executor.py | 8 ++- .../executors/data_profiling_executor.py | 32 +++++----- .../executors/data_retrieval_executor.py | 52 +++++++++------- src/worker/executors/echo_executor.py | 21 ++++--- src/worker/executors/mixins/governance.py | 10 +++- src/worker/executors/ssh_executor.py | 19 ++++-- src/worker/runner.py | 22 ++++--- tests/worker/test_agent_connector.py | 6 +- 9 files changed, 142 insertions(+), 88 deletions(-) diff --git a/src/worker/executors/api_executor.py b/src/worker/executors/api_executor.py index 4129ce0f..45be3269 100644 --- a/src/worker/executors/api_executor.py +++ b/src/worker/executors/api_executor.py @@ -5,7 +5,9 @@ from typing import Any, ClassVar import httpx +from pydantic import Field +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import ApiSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask @@ -16,6 +18,19 @@ _ClientKey = tuple[str, float, bool, bool] +class APIResult(BaseExecutorResult): + ok: bool + executor: str + method: str + url: str + status_code: int + truncated: bool = False + headers: dict[str, str] | None = None + response_json: Any = Field(default=None, alias="json") + usage: dict[str, Any] | None = None + text: str | None = None + + class APIExecutor(Executor): """Executor that performs a single HTTP request defined by task YAML. @@ -89,7 +104,7 @@ def cleanup_after_run(self) -> None: """Close the connection pool when the runner deactivates this executor.""" self.close_all_clients() - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> APIResult: spec = self.require_spec(task, ApiSpecStrict) api_cfg = spec.api or {} if not isinstance(api_cfg, dict): @@ -173,17 +188,17 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: body_bytes = body_bytes[:max_body_bytes] truncated = True - result: dict[str, Any] = { - "ok": resp.is_success, - "executor": self.name, - "method": method, - "url": str(resp.url), - "status_code": resp.status_code, - "truncated": truncated, - } + result = APIResult( + ok=resp.is_success, + executor=self.name, + method=method, + url=str(resp.url), + status_code=resp.status_code, + truncated=truncated, + ) if include_headers: - result["headers"] = dict(resp.headers) + result.headers = dict(resp.headers) body_text: str | None = None if return_body: @@ -191,26 +206,25 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: body_text = body_bytes.decode(encoding, errors="replace") if parse_json: - result["json"] = resp.json() - if not isinstance(result["json"], dict): + result.response_json = resp.json() + if not isinstance(result.response_json, dict): raise ExecutionError("Response is not a valid JSON mapping") - if isinstance(result["json"], dict): - usage = result["json"].get("usage") - if not isinstance(usage, dict): - raise ExecutionError( - "spec.api.response.parse_json is true but response JSON " - f"does not contain usage info: {result['json']}" - ) - result["usage"] = usage + usage = result.response_json.get("usage") + if not isinstance(usage, dict): + raise ExecutionError( + "spec.api.response.parse_json is true but response JSON " + f"does not contain usage info: {result.response_json}" + ) + result.usage = usage try: - result["text"] = result["json"]["choices"][0]["message"]["content"] + result.text = result.response_json["choices"][0]["message"]["content"] except Exception as exc: raise ExecutionError( "spec.api.response.parse_json is true but response JSON " - f"does not contain message.content: {result['json']}" + f"does not contain message.content: {result.response_json}" ) from exc elif return_body: - result["text"] = body_text + result.text = body_text if raise_for_status and resp.is_error: if body_text: diff --git a/src/worker/executors/base_executor.py b/src/worker/executors/base_executor.py index f4f43f90..ebb58ed4 100644 --- a/src/worker/executors/base_executor.py +++ b/src/worker/executors/base_executor.py @@ -27,6 +27,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict: from pathlib import Path from typing import Any, TypeVar +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks import MergedChildTaskStrict from shared.tasks.specs import TaskSpecStrictBase from shared.tasks.worker_message import WorkerHardware, WorkerTaskMessage @@ -84,7 +85,9 @@ def prepare(self) -> None: return None @abstractmethod - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run( + self, task: ExecutorTask, out_dir: Path + ) -> BaseExecutorResult | dict[str, Any]: """Execute a single task. Args: @@ -93,7 +96,8 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: if needed. Returns: - A JSON-serializable dictionary summarizing the result. + A ``BaseExecutorResult`` subclass instance (preferred) or a + JSON-serializable dictionary summarizing the result. Raises: ExecutionError: for expected, user-facing failures. diff --git a/src/worker/executors/data_profiling_executor.py b/src/worker/executors/data_profiling_executor.py index bfb7678f..20b94645 100644 --- a/src/worker/executors/data_profiling_executor.py +++ b/src/worker/executors/data_profiling_executor.py @@ -8,6 +8,7 @@ from pathlib import Path from typing import Any +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import DataProfilingSpecStrict from shared.utils.json import to_json_serializable, validate_keys @@ -19,19 +20,25 @@ logger = logging.getLogger(__name__) +class DataProfilingResult(BaseExecutorResult): + ok: bool = True + type: str = "sql" + template: str | None = None + cost_estimates: dict[str, Any] | None = None + + class DataProfilingExecutor(DataMixin, Executor): """Executor that estimates SQL query costs by sampling SQL template params.""" name = "data_profiling" - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> DataProfilingResult: spec = self.require_spec(task, DataProfilingSpecStrict) task_id = task.task_id merge_children = task.merged_children or [] result = self._run_single_profile(spec, task_id) - child_results: dict[str, dict[str, Any]] = {} for child in merge_children: child_id = child.task_id child_spec = child.spec @@ -39,16 +46,13 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: raise ExecutionError( "Merged child spec must be data_profiling for merged profiling" ) - child_results[child_id] = self._run_single_profile(child_spec, child_id) - - if child_results: - result["children"] = child_results + result.children[child_id] = self._run_single_profile(child_spec, child_id) return result def _run_single_profile( self, spec: DataProfilingSpecStrict, task_id: str - ) -> dict[str, Any]: + ) -> DataProfilingResult: data_cfg = spec.data if not isinstance(data_cfg, dict): raise ExecutionError( @@ -100,14 +104,12 @@ def _run_single_profile( connection_string, queries, params_rows ) - result: dict[str, Any] = { - "ok": True, - "type": "sql", - "template": template_str, - "cost_estimates": cost_estimates, - } - - return result + return DataProfilingResult( + ok=True, + type="sql", + template=template_str, + cost_estimates=cost_estimates, + ) def _sample_template_queries( self, diff --git a/src/worker/executors/data_retrieval_executor.py b/src/worker/executors/data_retrieval_executor.py index 0321f147..3775284b 100644 --- a/src/worker/executors/data_retrieval_executor.py +++ b/src/worker/executors/data_retrieval_executor.py @@ -10,6 +10,7 @@ import pandas as pd from PIL import Image +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import DataRetrievalSpecStrict from shared.utils.json import validate_keys @@ -27,10 +28,18 @@ logger = logging.getLogger(__name__) +class DataRetrievalResult(BaseExecutorResult): + ok: bool = True + type: str | None = None + items: list[dict[str, Any]] = [] + count: int | None = None + metadata: dict[str, Any] | None = None + + class DataRetrievalExecutor(DataMixin, Executor): name = "data_retrieval" - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> DataRetrievalResult: spec = self.require_spec(task, DataRetrievalSpecStrict) task_id = task.task_id with self._task_span( @@ -72,7 +81,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: def _run_sql( self, data_cfg: dict[str, Any], context: dict[str, Any] - ) -> dict[str, Any]: + ) -> DataRetrievalResult: """ Execute SQL queries based on the provided data configuration and context. @@ -144,18 +153,18 @@ def _run_sql( } ) - return { - "ok": True, - "items": items, - "count": len(items), - } + return DataRetrievalResult( + ok=True, + items=items, + count=len(items), + ) def _run_s3( self, data_cfg: dict[str, Any], context: dict[str, Any], out_dir: Path, - ) -> dict[str, Any]: + ) -> DataRetrievalResult: validate_keys( data_cfg, "DataRetrievalExecutor.spec.data", @@ -207,20 +216,19 @@ def _run_s3( item["params"] = params_rows items.append(item) - result = { - "ok": True, - "type": "s3", - "items": items, - "metadata": s3_result["metadata"], # type: ignore - } - return result + return DataRetrievalResult( + ok=True, + type="s3", + items=items, + metadata=s3_result["metadata"], # type: ignore + ) def _run_agent( self, data_cfg: dict[str, Any], context: dict[str, Any], out_dir: Path, - ) -> dict[str, Any]: + ) -> DataRetrievalResult: """Drive lumid.data's data agent for NL-driven retrieval.""" validate_keys( data_cfg, @@ -314,12 +322,12 @@ def _run_agent( } ) - return { - "ok": True, - "type": "agent", - "items": items, - "count": len(items), - } + return DataRetrievalResult( + ok=True, + type="agent", + items=items, + count=len(items), + ) def _load_table(self, path: Path, output_format: str) -> pd.DataFrame: """Load the materialized retrieval file into a DataFrame.""" diff --git a/src/worker/executors/echo_executor.py b/src/worker/executors/echo_executor.py index 480938a4..884cdcf9 100644 --- a/src/worker/executors/echo_executor.py +++ b/src/worker/executors/echo_executor.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Any +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import EchoSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask @@ -14,6 +15,12 @@ type EchoItem = str | dict[str, str] +class EchoResult(BaseExecutorResult): + ok: bool = True + items: list[dict[str, Any]] = [] + count: int = 0 + + class EchoExecutor(DataMixin, Executor): name = "echo" @@ -55,7 +62,7 @@ def _resolve_item(self, item: EchoItem, context: dict[str, Any]) -> Any: "a string literal or a mapping" ) - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> EchoResult: spec = self.require_spec(task, EchoSpecStrict) task_id = task.task_id.strip() with self._task_span( @@ -81,18 +88,16 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: resolved = self._resolve_item(item, context) self._append_outputs(merged_items, resolved) - payload: dict[str, Any] = { - "ok": True, - "items": merged_items, - "count": len(merged_items), - } + result = EchoResult( + ok=True, items=merged_items, count=len(merged_items) + ) deps = self._extract_source_data_ids(spec) dependencies_by_task = {task_id: deps} self._dump_to_governance( task_id=task_id, - result=payload, + result=result, dependencies_by_task=dependencies_by_task, ) maybe_upload_traces(task, out_dir, logger=logger) - return payload + return result diff --git a/src/worker/executors/mixins/governance.py b/src/worker/executors/mixins/governance.py index 30adfcc9..7a7650ec 100644 --- a/src/worker/executors/mixins/governance.py +++ b/src/worker/executors/mixins/governance.py @@ -13,6 +13,7 @@ from opentelemetry.trace import Span as OTelSpan +from shared.schemas.executor_result import BaseExecutorResult from shared.schemas.governance import ( READY_SPAN_NAME, TASK_SPAN_NAME, @@ -236,17 +237,20 @@ def _extract_source_data_ids(self, spec: TaskSpecStrictBase) -> list[str]: def _dump_to_governance( self, task_id: str, - result: dict[str, Any], + result: BaseExecutorResult | dict[str, Any], dependencies_by_task: dict[str, list[str]], ) -> None: """Write parent + merged-child results and emit asset/lineage rows.""" parent_deps = dependencies_by_task.get(task_id, []) - children_payload = result.get("children", {}) + payload = ( + result.model_dump() if isinstance(result, BaseExecutorResult) else result + ) + children_payload = payload.get("children", {}) or {} collection_jobs: list[dict[str, Any]] = [ { "task_id": task_id, - "result": result, + "result": payload, "deps": parent_deps, "is_parent": True, } diff --git a/src/worker/executors/ssh_executor.py b/src/worker/executors/ssh_executor.py index ef6d7958..ac695e36 100644 --- a/src/worker/executors/ssh_executor.py +++ b/src/worker/executors/ssh_executor.py @@ -29,6 +29,7 @@ import requests +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.components.resources import GPURequirements from shared.tasks.specs.ssh import ( SSHInputSpec, @@ -55,6 +56,13 @@ TaskCancelledError, ) + +class SSHResult(BaseExecutorResult): + session_id: str + exit_code: int + command: str | None = None + entrypoint: str | None = None + try: import docker from docker import DockerClient @@ -425,7 +433,7 @@ def teardown(self) -> None: # Main execution # ------------------------------------------------------------------ # - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> SSHResult: spec = self.require_spec(task, SSHSpecStrict) cfg = SSHConfig.from_spec(spec, self._config, self._hardware) access_mode = cfg.access_mode @@ -539,16 +547,17 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: cfg.output, mount_plan, ) - result: dict[str, Any] = {"session_id": session_id, "exit_code": exit_code} + result = SSHResult(session_id=session_id, exit_code=exit_code) if interactive: - result.update(session_info) + for key, value in session_info.items(): + setattr(result, key, value) else: # Keep as fallback — captures any output the streaming thread missed. self._save_container_logs(container, out_dir) if cfg.command is not None: - result["command"] = cfg.command + result.command = cfg.command if cfg.entrypoint is not None: - result["entrypoint"] = cfg.entrypoint + result.entrypoint = cfg.entrypoint if mount_plan.copy_output_path: self._copy_output_directory( container, diff --git a/src/worker/runner.py b/src/worker/runner.py index eef7e292..f009532c 100644 --- a/src/worker/runner.py +++ b/src/worker/runner.py @@ -10,6 +10,7 @@ import requests +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks import MergedChildTaskStrict from shared.tasks.specs import InferenceSpecStrict, TaskSpecStrictBase from shared.tasks.worker_message import ( @@ -127,16 +128,17 @@ def _write_results( spec: TaskSpecStrictBase, merged_children: list[MergedChildTaskStrict], out_dir: Path, - result: dict[str, Any] | None, + result: BaseExecutorResult | dict[str, Any] | None, ): if result is None: return self._write_single_result(task_id, spec, out_dir, result) child_lookup = {entry.task_id: entry for entry in merged_children} - children_payload = ( - result.get("children") if isinstance(result, dict) else {} - ) or {} + if isinstance(result, BaseExecutorResult): + children_payload: dict[str, Any] = dict(result.children) + else: + children_payload = result.get("children") or {} for child_id, child_result in children_payload.items(): child_info = child_lookup.get(child_id) if child_info is None: @@ -151,7 +153,7 @@ def _write_single_result( task_id: str, spec: TaskSpecStrictBase, out_dir: Path, - payload: dict[str, Any] | None, + payload: BaseExecutorResult | dict[str, Any] | None, ): if payload is None: return @@ -179,7 +181,10 @@ def _simulate_bandwidth_delay(self, payload_bytes: int, destination: str) -> Non time.sleep(delay) def _maybe_emit_http( - self, task_id: str, spec: TaskSpecStrictBase, result: dict[str, Any] + self, + task_id: str, + spec: TaskSpecStrictBase, + result: BaseExecutorResult | dict[str, Any], ) -> None: """Send task results to an HTTP endpoint when requested by the spec.""" destination = get_http_destination(spec) @@ -188,9 +193,12 @@ def _maybe_emit_http( url = destination.url ignore_error = destination.ignore_error + result_dict = ( + result.model_dump() if isinstance(result, BaseExecutorResult) else result + ) payload = { "task_id": task_id, - "result": result, + "result": result_dict, "worker_id": self.lifecycle.worker_id, } payload_size = len(json.dumps(payload, ensure_ascii=False).encode("utf-8")) diff --git a/tests/worker/test_agent_connector.py b/tests/worker/test_agent_connector.py index b4fd1eb0..f3b07f32 100644 --- a/tests/worker/test_agent_connector.py +++ b/tests/worker/test_agent_connector.py @@ -182,9 +182,9 @@ def test_renders_description_from_params_and_yields_table_item( ) result = executor._run_agent(data_cfg, context, out_dir) - assert result["ok"] is True - assert result["count"] == 1 - item = result["items"][0] + assert result.ok is True + assert result.count == 1 + item = result.items[0] assert "fetch NVDA quotes for 2024-01-01" in item["description"] assert item["rows"] == 1 assert item["run_id"] == "run-fake" From 14b8aaea4455c26b7bbfa5f71ac543d85f5af25d Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 15:06:05 +0000 Subject: [PATCH 05/26] refactor(worker): type inference-family executor results VLLM, transformers, diffusers, omni, RAG, agent, and the MP wrapper now return typed ``BaseExecutorResult`` subclasses (``VLLMResult``, ``TransformersResult``, ``DiffusersResult``, ``OmniResult``, ``RAGResult``, ``AgentResult``). Internals keep building the result as a dict and validate into the model at the return boundary, preserving the wire shape. ``MPExecutor`` widens its return signature to pass through either form from the inner executor. The transformers inference test switches to attribute access on the typed result. Signed-off-by: Noppanat Wadlom --- src/worker/executors/agent_executor.py | 16 +++++++- src/worker/executors/diffusers_executor.py | 12 +++++- src/worker/executors/mp_executor.py | 5 ++- src/worker/executors/omni_executor_base.py | 10 ++++- src/worker/executors/rag_executor.py | 38 +++++++++++-------- src/worker/executors/transformers_executor.py | 16 +++++++- src/worker/executors/vllm_executor.py | 14 +++++-- .../test_transformers_chat_inference.py | 11 +++--- 8 files changed, 90 insertions(+), 32 deletions(-) diff --git a/src/worker/executors/agent_executor.py b/src/worker/executors/agent_executor.py index c12d4003..d0262dd7 100644 --- a/src/worker/executors/agent_executor.py +++ b/src/worker/executors/agent_executor.py @@ -16,6 +16,8 @@ from datasets import load_dataset +from shared.schemas.artifact import ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import AgentSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask @@ -54,6 +56,16 @@ def _resolve_task_timeout(agent: dict[str, Any] | None) -> int: logger = logging.getLogger("worker.agent") +class AgentResult(BaseExecutorResult): + ok: bool + model: str + items: list[dict[str, Any]] = [] + usage: dict[str, Any] | None = None + metadata: dict[str, Any] | None = None + agent_output: ArtifactRef | None = None + batch_summary_file: ArtifactRef | None = None + + class AgentExecutor(Executor): """Agent executor using youtu-agent (utu) framework""" @@ -223,7 +235,7 @@ def prepare_data(self, spec: AgentSpecStrict) -> None: else: raise ExecutionError(f"Unsupported spec.data.type: {dtype!r}") - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> AgentResult: """Execute agent tasks using youtu-agent (utu) framework""" self.ensure_dir(out_dir) @@ -321,7 +333,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: maybe_upload_artifacts(task, out_dir, logger=logger) logger.info(f"Agent execution completed: {len(self._tasks)} task(s)") - return output + return AgentResult.model_validate(output) except ExecutionError: raise diff --git a/src/worker/executors/diffusers_executor.py b/src/worker/executors/diffusers_executor.py index f74fb8e4..8855b1ac 100644 --- a/src/worker/executors/diffusers_executor.py +++ b/src/worker/executors/diffusers_executor.py @@ -14,6 +14,8 @@ from PIL import Image +from shared.schemas.artifact import ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import DiffusionSpecStrict from ..utils.logging import configure_hf_library_logging @@ -45,6 +47,12 @@ logger = logging.getLogger(__name__) +class DiffusersResult(BaseExecutorResult): + ok: bool = True + model: str | None = None + images: list[ArtifactRef] = [] + + class DiffusersExecutor(DataMixin, Executor): """Executor that runs text-to-image generation via Hugging Face Diffusers.""" @@ -240,7 +248,7 @@ def _encode_and_combine_prompts( return combined_pos, combined_neg, user_pos_pooled, user_neg_pooled - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> DiffusersResult: configure_hf_library_logging() spec = self.require_spec(task, DiffusionSpecStrict) task_id = task.task_id.strip() @@ -250,7 +258,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: response = self._run_inner(spec, task_id, out_dir) maybe_upload_artifacts(task, out_dir, logger=logger) maybe_upload_traces(task, out_dir, logger=logger) - return response + return DiffusersResult.model_validate(response) def _run_inner( self, diff --git a/src/worker/executors/mp_executor.py b/src/worker/executors/mp_executor.py index f2379fe2..db92eebc 100644 --- a/src/worker/executors/mp_executor.py +++ b/src/worker/executors/mp_executor.py @@ -21,6 +21,7 @@ import psutil +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.worker_message import WorkerHardware from worker.config import WorkerConfig @@ -439,7 +440,9 @@ def _loop() -> None: self._log_thread = t t.start() - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run( + self, task: ExecutorTask, out_dir: Path + ) -> BaseExecutorResult | dict[str, Any]: with self._lock: if self._shutdown: logger.info("Starting worker subprocess for %s", self.name) diff --git a/src/worker/executors/omni_executor_base.py b/src/worker/executors/omni_executor_base.py index 87e36f93..86021bbd 100644 --- a/src/worker/executors/omni_executor_base.py +++ b/src/worker/executors/omni_executor_base.py @@ -19,6 +19,7 @@ import yaml +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import TaskSpecStrictBase from shared.utils.parsing import to_bool, to_int @@ -40,6 +41,11 @@ logger = logging.getLogger(__name__) +class OmniResult(BaseExecutorResult): + ok: bool = True + model: str | None = None + + class OmniExecutorBase(InferenceMixin, Executor): """Shared base for Omni-family executors. @@ -58,7 +64,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._omni_spec: tuple[Any, ...] | None = None self._stage_configs_tmp: Path | None = None - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> OmniResult: spec = self.require_spec(task, self._TASK_SPEC_TYPE) spec_dict = spec.model_dump(by_alias=True) out_dir = Path(out_dir).resolve() @@ -68,7 +74,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: result = self._run_inner(task, spec, spec_dict, out_dir) maybe_upload_artifacts(task, out_dir, logger=logger) maybe_upload_traces(task, out_dir, logger=logger) - return result + return OmniResult.model_validate(result) @abstractmethod def _run_inner( diff --git a/src/worker/executors/rag_executor.py b/src/worker/executors/rag_executor.py index 839041bf..a1634258 100644 --- a/src/worker/executors/rag_executor.py +++ b/src/worker/executors/rag_executor.py @@ -16,6 +16,7 @@ from datasets import load_dataset from qdrant_client import QdrantClient, models +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import RagSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask @@ -24,10 +25,20 @@ logger = logging.getLogger("worker.rag") +class RAGResult(BaseExecutorResult): + ok: bool = True + executor: str + qdrant: dict[str, Any] + embedding: dict[str, Any] + search: dict[str, Any] + queries: list[dict[str, Any]] = [] + usage: dict[str, Any] | None = None + + class RAGExecutor(Executor): name = "rag" - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> RAGResult: start_ts = time.time() spec = self.require_spec(task, RagSpecStrict) @@ -180,22 +191,19 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: } ) - # Compose response - out: dict[str, Any] = { - "ok": True, - "executor": self.name, - "qdrant": {"collection": collection, "url": url}, - "embedding": {"model": model_name}, - "search": {"top_k": top_k}, - "queries": results_per_query, - "usage": { + logger.info( + "RAG query completed queries=%d total_results=%d", len(queries), total_items + ) + return RAGResult( + ok=True, + executor=self.name, + qdrant={"collection": collection, "url": url}, + embedding={"model": model_name}, + search={"top_k": top_k}, + queries=results_per_query, + usage={ "latency_sec": round(time.time() - start_ts, 4), "num_queries": len(queries), "total_results": total_items, }, - } - - logger.info( - "RAG query completed queries=%d total_results=%d", len(queries), total_items ) - return out diff --git a/src/worker/executors/transformers_executor.py b/src/worker/executors/transformers_executor.py index c4a3b586..c8a32e95 100644 --- a/src/worker/executors/transformers_executor.py +++ b/src/worker/executors/transformers_executor.py @@ -56,6 +56,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any +from shared.schemas.artifact import ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult from shared.schemas.governance import SpanType from shared.tasks.specs import ( EmbeddingSpecStrict, @@ -115,6 +117,16 @@ logger = logging.getLogger(__name__) +class TransformersResult(BaseExecutorResult): + ok: bool = True + model: str | None = None + items: list[dict[str, Any]] = [] + usage: dict[str, Any] | None = None + count: int | None = None + embedding_file: ArtifactRef | None = None + image_group_sizes: list[int] | None = None + + class HFTransformersExecutor(InferenceMixin, Executor): """Executor that runs text generation via Hugging Face Transformers.""" @@ -384,7 +396,7 @@ def _detect_finish_reason( return "length" return None - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: # type: ignore[override] + def run(self, task: ExecutorTask, out_dir: Path) -> TransformersResult: configure_hf_library_logging() spec = task.spec if not isinstance(spec, (InferenceSpecStrict, EmbeddingSpecStrict)): @@ -399,7 +411,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: # type: ign result = self._run_inner(spec, task_id, out_dir) maybe_upload_artifacts(task, out_dir, logger=logger) maybe_upload_traces(task, out_dir, logger=logger) - return result + return TransformersResult.model_validate(result) def _run_inner( self, diff --git a/src/worker/executors/vllm_executor.py b/src/worker/executors/vllm_executor.py index 6ad6abd1..7319ea43 100644 --- a/src/worker/executors/vllm_executor.py +++ b/src/worker/executors/vllm_executor.py @@ -67,6 +67,7 @@ StructuredOutputsParams = None # type: ignore from shared.schemas.governance import SpanType +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import InferenceSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask @@ -81,6 +82,13 @@ logger = logging.getLogger(__name__) +class VLLMResult(BaseExecutorResult): + ok: bool = True + model: str | None = None + items: list[dict[str, Any]] = [] + usage: dict[str, Any] | None = None + + class _RawJsonSchema: """Tag wrapping a JSON schema so ``_build_sampling_params`` can ``isinstance``-dispatch raw schemas vs. named-fields pydantic kwargs.""" @@ -849,7 +857,7 @@ def _postprocess_prompts(self, parsed: InferenceEntry) -> PreparedInferenceEntry # --------------------------------------------------------------------- # # Execution # --------------------------------------------------------------------- # - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: # type: ignore[override] + def run(self, task: ExecutorTask, out_dir: Path) -> VLLMResult: spec = self.require_spec(task, InferenceSpecStrict) task_id = task.task_id.strip() if not task_id: @@ -868,7 +876,7 @@ def _run_inner( task: ExecutorTask, spec: InferenceSpecStrict, out_dir: Path, - ) -> dict[str, Any]: + ) -> VLLMResult: task_id = task.task_id.strip() merge_children = task.merged_children or [] entries: list[PreparedInferenceEntry] = [] @@ -1196,7 +1204,7 @@ def _collect( dependencies_by_task=dependencies_by_task, ) - return result + return VLLMResult.model_validate(result) def cleanup_after_run(self) -> None: if self._llm: diff --git a/tests/worker/test_transformers_chat_inference.py b/tests/worker/test_transformers_chat_inference.py index ef05f502..5f67fa80 100644 --- a/tests/worker/test_transformers_chat_inference.py +++ b/tests/worker/test_transformers_chat_inference.py @@ -116,17 +116,18 @@ def test_transformers_executor_supports_chat_prompts_and_jsonl_export( result = executor.run(task, tmp_path) mock_ensure_model.assert_called_once_with(spec) - assert [item["output"] for item in result["items"]] == [ + assert [item["output"] for item in result.items] == [ "first answer", "second answer", ] - assert [item["prompt"] for item in result["items"]] == [ + assert [item["prompt"] for item in result.items] == [ "hello", "world", ] - assert [item["metadata"]["row_id"] for item in result["items"]] == ["a", "b"] - assert result["usage"]["num_requests"] == 2 - assert "latency_sec" in result["usage"] + assert [item["metadata"]["row_id"] for item in result.items] == ["a", "b"] + assert result.usage is not None + assert result.usage["num_requests"] == 2 + assert "latency_sec" in result.usage exported = (tmp_path / "artifacts" / "rows.jsonl").read_text(encoding="utf-8") assert '"row_id": "a"' in exported From 08baf4ddbf514e3f663ccd6d2d98f8e4341b5fd4 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 15:10:58 +0000 Subject: [PATCH 06/26] refactor(worker): type training executor results SFT, LoRA-SFT, DPO, and PPO executors return typed ``BaseExecutorResult`` subclasses (``SFTResult``, ``LoRAResult``, ``DPOResult``, ``PPOResult``). The non-LoRA training shapes are kept permissive (most fields optional) because the distributed-spawn paths return a partial dict, either loaded from the subprocess IPC file or synthesized by the parent. Artifact-bearing fields are typed as ``ArtifactRef`` so the on-disk shape is validated end-to-end. Signed-off-by: Noppanat Wadlom --- src/worker/executors/dpo_executor.py | 33 +++++++++++++++------ src/worker/executors/lora_sft_executor.py | 22 ++++++++++++-- src/worker/executors/ppo_executor.py | 33 +++++++++++++++------ src/worker/executors/sft_executor.py | 35 +++++++++++++++++------ 4 files changed, 93 insertions(+), 30 deletions(-) diff --git a/src/worker/executors/dpo_executor.py b/src/worker/executors/dpo_executor.py index 7dc1e8d2..99e7920b 100644 --- a/src/worker/executors/dpo_executor.py +++ b/src/worker/executors/dpo_executor.py @@ -25,6 +25,8 @@ from trl.trainer.dpo_config import DPOConfig from trl.trainer.dpo_trainer import DPOTrainer +from shared.schemas.artifact import ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import DPOSpecStrict from shared.utils.manifest import scratch_dir @@ -45,6 +47,19 @@ logger = logging.getLogger("worker.dpo") +class DPOResult(BaseExecutorResult): + training_successful: bool = True + training_time_seconds: float | None = None + error_message: str | None = None + model_name: str | None = None + dataset_size: int = 0 + output_dir: str | None = None + checkpoints_dir: ArtifactRef | None = None + final_model: ArtifactRef | None = None + final_model_archive: ArtifactRef | None = None + spawned_torchrun: bool = False + + class DPOExecutor(TrainingMixin, Executor): """DPO training executor using TRL library.""" @@ -59,7 +74,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._current_trainer: DPOTrainer | None = None self._task_out_dir: Path | None = None - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> DPOResult: configure_hf_library_logging() logger.info("Starting DPO training task") spec = self.require_spec(task, DPOSpecStrict) @@ -80,24 +95,24 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: ) ipc_path = scratch_dir(out_dir) / "distributed_result.json" if ipc_path.exists(): - return self.load_json(ipc_path) - return { - "training_successful": True, - "spawned_torchrun": True, - "model_name": ( + return DPOResult.model_validate(self.load_json(ipc_path)) + return DPOResult( + training_successful=True, + spawned_torchrun=True, + model_name=( spec.model and spec.model.source and spec.model.source.identifier ), - "output_dir": out_dir.as_posix(), - } + output_dir=out_dir.as_posix(), + ) result = self._execute_training(task, out_dir) logger.info( "DPO training task completed in %.2f seconds", result.get("training_time_seconds", 0.0), ) - return result + return DPOResult.model_validate(result) finally: self._task_out_dir = None diff --git a/src/worker/executors/lora_sft_executor.py b/src/worker/executors/lora_sft_executor.py index f40c99ad..20f9a5f3 100644 --- a/src/worker/executors/lora_sft_executor.py +++ b/src/worker/executors/lora_sft_executor.py @@ -18,6 +18,8 @@ from trl.trainer.sft_config import SFTConfig from trl.trainer.sft_trainer import SFTTrainer +from shared.schemas.artifact import ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import LoRASFTSpecStrict from ..utils.logging import configure_hf_library_logging @@ -48,6 +50,20 @@ logger = logging.getLogger("worker.sft.lora") +class LoRAResult(BaseExecutorResult): + task_id: str + training_successful: bool + training_time_seconds: float + error_message: str | None = None + model_name: str | None = None + dataset_size: int = 0 + output_dir: str + checkpoints_dir: ArtifactRef + resume_from_path: str | None = None + final_lora: ArtifactRef | None = None + final_lora_archive: ArtifactRef | None = None + + class LoRASFTExecutor(TrainingMixin, Executor): """Execute LoRA-based supervised fine-tuning using TRL's SFTTrainer.""" @@ -59,7 +75,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._current_model: Any | None = None self._current_trainer: Any | None = None - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> LoRAResult: configure_hf_library_logging() spec = self.require_spec(task, LoRASFTSpecStrict) start_time = time.time() @@ -321,7 +337,7 @@ def _compute_loss_with_guard( final_adapter_path, archive_path, ) - return result_payload + return LoRAResult.model_validate(result_payload) except Exception as exc: # pylint: disable=broad-except error_msg = str(exc) @@ -345,7 +361,7 @@ def _compute_loss_with_guard( } if training_successful: - return result + return LoRAResult.model_validate(result) write_executor_result(out_dir / "results.json", task.task_id, task.spec, result) message = error_msg or "LoRA SFT training failed" diff --git a/src/worker/executors/ppo_executor.py b/src/worker/executors/ppo_executor.py index 1cc47183..c9320f58 100644 --- a/src/worker/executors/ppo_executor.py +++ b/src/worker/executors/ppo_executor.py @@ -33,6 +33,8 @@ from trl.trainer.ppo_config import PPOConfig from trl.trainer.ppo_trainer import PPOTrainer +from shared.schemas.artifact import ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import PPOSpecStrict from shared.utils.manifest import scratch_dir from shared.utils.parsing import safe_float, safe_int, to_bool @@ -54,6 +56,19 @@ logger = logging.getLogger("worker.ppo") +class PPOResult(BaseExecutorResult): + training_successful: bool = True + training_time_seconds: float | None = None + error_message: str | None = None + model_name: str | None = None + dataset_size: int = 0 + output_dir: str | None = None + checkpoints_dir: ArtifactRef | None = None + final_model: ArtifactRef | None = None + final_model_archive: ArtifactRef | None = None + spawned_torchrun: bool = False + + class _ExternalRewardModel(torch.nn.Module): """Wraps a sequence classification model to score decoded PPO responses.""" @@ -392,7 +407,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._reward_module: _ExternalRewardModel | _RewardAdapter | None = None self._task_out_dir: Path | None = None - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> PPOResult: configure_hf_library_logging() logger.info("Starting PPO training task") spec = self.require_spec(task, PPOSpecStrict) @@ -420,14 +435,14 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: if ipc_path.exists(): result = self.load_json(ipc_path) self._task_out_dir = None - return result + return PPOResult.model_validate(result) self._task_out_dir = None - return { - "training_successful": True, - "spawned_torchrun": True, - "model_name": spec.model_name, - "output_dir": out_dir.as_posix(), - } + return PPOResult( + training_successful=True, + spawned_torchrun=True, + model_name=spec.model_name, + output_dir=out_dir.as_posix(), + ) start_time = time.time() @@ -931,7 +946,7 @@ def build_trainer() -> _EarlyStopPPOTrainer: logger.info("PPO training task completed in %.2f seconds", training_time) self._task_out_dir = None - return results + return PPOResult.model_validate(results) def _ensure_jsonl_local(self, jsonl_cfg: dict[str, Any]) -> Path: headers_cfg = ( diff --git a/src/worker/executors/sft_executor.py b/src/worker/executors/sft_executor.py index 4361c94d..4c6cc82b 100644 --- a/src/worker/executors/sft_executor.py +++ b/src/worker/executors/sft_executor.py @@ -22,6 +22,8 @@ from trl.trainer.sft_config import SFTConfig from trl.trainer.sft_trainer import SFTTrainer +from shared.schemas.artifact import ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult from shared.tasks.specs import SFTSpecStrict, TaskSpecStrictBase from shared.utils.manifest import scratch_dir @@ -43,6 +45,21 @@ logger = logging.getLogger("worker.sft") +class SFTResult(BaseExecutorResult): + task_id: str | None = None + training_successful: bool = True + training_time_seconds: float | None = None + error_message: str | None = None + model_name: str | None = None + dataset_size: int = 0 + output_dir: str | None = None + checkpoints_dir: ArtifactRef | None = None + resume_from_path: str | None = None + final_model: ArtifactRef | None = None + final_model_archive: ArtifactRef | None = None + spawned_torchrun: bool = False + + class SFTExecutor(TrainingMixin, Executor): name = "sft_executor" @@ -55,7 +72,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._final_model_dir: Path | None = None self._task_out_dir: Path | None = None - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: configure_hf_library_logging() spec = self.require_spec(task, SFTSpecStrict) requested_gpu_count = self._requested_gpu_count(spec) @@ -191,14 +208,14 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: if ipc_path.exists(): distributed_result = self.load_json(ipc_path) self._task_out_dir = None - return distributed_result + return SFTResult.model_validate(distributed_result) self._task_out_dir = None - return { - "training_successful": True, - "spawned_torchrun": True, - "model_name": spec.model_name, - "output_dir": out_dir.as_posix(), - } + return SFTResult( + training_successful=True, + spawned_torchrun=True, + model_name=spec.model_name, + output_dir=out_dir.as_posix(), + ) except Exception as spawn_exc: logger.exception("Failed to launch distributed SFT: %s", spawn_exc) raise ExecutionError( @@ -467,7 +484,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: self._final_model_dir = None self._task_out_dir = None - return result_payload + return SFTResult.model_validate(result_payload) except Exception as exc: error_msg = str(exc) From 4ad515359a6f4442ee629f68432596b9add6ca14 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 15:15:40 +0000 Subject: [PATCH 07/26] test: cover BaseExecutorResult round-trip and fix subclass field serialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add round-trip tests for the executor result schema: subclass payload preservation through the base class, ``_artifacts`` alias in both directions, recursive ``children``, and the full envelope write→read path. The recursive-children test exposed a wire-format bug — Pydantic defaults to serializing fields using their declared type, so a subclass instance assigned to a ``BaseExecutorResult`` field (or nested inside ``children``) lost its executor-specific fields on ``model_dump_json``. Pass ``serialize_as_any=True`` at every worker-side serialization seam (``write_result_in_envelope``, ``_maybe_emit_http``, ``_dump_to_governance``) and on the server's stage-result load so the actual subclass shape reaches disk and downstream consumers. Signed-off-by: Noppanat Wadlom --- src/server/dispatcher/base.py | 4 +- src/shared/schemas/result.py | 4 +- src/worker/executors/mixins/governance.py | 4 +- src/worker/runner.py | 4 +- tests/shared/test_executor_result.py | 85 +++++++++++++++++++++++ 5 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 tests/shared/test_executor_result.py diff --git a/src/server/dispatcher/base.py b/src/server/dispatcher/base.py index 2f7c405b..d201c8dc 100644 --- a/src/server/dispatcher/base.py +++ b/src/server/dispatcher/base.py @@ -922,7 +922,9 @@ def _load_stage_result(self, stage_task_id: str) -> dict[str, Any]: f"Result for task {stage_task_id} not found at {path}" ) content = json.loads(path.read_text(encoding="utf-8")) - return ResultEnvelope.model_validate(content).result.model_dump() + return ResultEnvelope.model_validate(content).result.model_dump( + serialize_as_any=True + ) def _dig_path(self, data: Any, parts: list[str]) -> Any: current = data diff --git a/src/shared/schemas/result.py b/src/shared/schemas/result.py index 953e7fa7..eb44c67d 100644 --- a/src/shared/schemas/result.py +++ b/src/shared/schemas/result.py @@ -48,7 +48,9 @@ def write_result_in_envelope( """Wrap ``result`` in a ``ResultEnvelope`` and persist it at ``path``.""" path.parent.mkdir(parents=True, exist_ok=True) envelope = ResultEnvelope(task_id=task_id, result=result) - atomic_write_text(path, envelope.model_dump_json(indent=2)) + atomic_write_text( + path, envelope.model_dump_json(indent=2, serialize_as_any=True) + ) def read_result(base_dir: Path, task_id: str) -> str: diff --git a/src/worker/executors/mixins/governance.py b/src/worker/executors/mixins/governance.py index 7a7650ec..c623c934 100644 --- a/src/worker/executors/mixins/governance.py +++ b/src/worker/executors/mixins/governance.py @@ -243,7 +243,9 @@ def _dump_to_governance( """Write parent + merged-child results and emit asset/lineage rows.""" parent_deps = dependencies_by_task.get(task_id, []) payload = ( - result.model_dump() if isinstance(result, BaseExecutorResult) else result + result.model_dump(serialize_as_any=True) + if isinstance(result, BaseExecutorResult) + else result ) children_payload = payload.get("children", {}) or {} diff --git a/src/worker/runner.py b/src/worker/runner.py index f009532c..6d7b17b0 100644 --- a/src/worker/runner.py +++ b/src/worker/runner.py @@ -194,7 +194,9 @@ def _maybe_emit_http( url = destination.url ignore_error = destination.ignore_error result_dict = ( - result.model_dump() if isinstance(result, BaseExecutorResult) else result + result.model_dump(serialize_as_any=True) + if isinstance(result, BaseExecutorResult) + else result ) payload = { "task_id": task_id, diff --git a/tests/shared/test_executor_result.py b/tests/shared/test_executor_result.py new file mode 100644 index 00000000..0c71bdba --- /dev/null +++ b/tests/shared/test_executor_result.py @@ -0,0 +1,85 @@ +"""Round-trip tests for the shared executor-result schema.""" + +import json +from typing import Any + +from pydantic import Field + +from shared.schemas.artifact import ArtifactContext, ArtifactRef +from shared.schemas.executor_result import BaseExecutorResult + + +class _SampleResult(BaseExecutorResult): + ok: bool = True + items: list[dict[str, Any]] = Field(default_factory=list) + usage: dict[str, Any] | None = None + + +def test_subclass_round_trip_through_base_preserves_extra_fields() -> None: + """A subclass's executor-specific fields survive a JSON trip through the + base class via ``extra='allow'``.""" + original = _SampleResult( + ok=True, + items=[{"output": "hello"}], + usage={"latency_sec": 0.5}, + artifacts=ArtifactContext(base_dir="/tmp/t", base_url="http://h"), + ) + payload = original.model_dump_json(serialize_as_any=True) + + base = BaseExecutorResult.model_validate_json(payload) + redumped = json.loads(base.model_dump_json()) + + assert redumped["items"] == [{"output": "hello"}] + assert redumped["usage"] == {"latency_sec": 0.5} + assert redumped["_artifacts"] == {"base_dir": "/tmp/t", "base_url": "http://h"} + + +def test_artifacts_alias_round_trips_both_directions() -> None: + """The wire key is ``_artifacts`` (alias); field-name input is also accepted.""" + from_alias = BaseExecutorResult.model_validate({"_artifacts": {"base_dir": "/a"}}) + from_field = BaseExecutorResult.model_validate({"artifacts": {"base_dir": "/a"}}) + + assert from_alias.artifacts == from_field.artifacts == ArtifactContext(base_dir="/a") + + dumped = from_alias.model_dump() + assert "_artifacts" in dumped + assert "artifacts" not in dumped + + +def test_recursive_children_round_trip() -> None: + """Nested ``children`` deserialize as ``BaseExecutorResult`` and re-emit + their extra fields when serialized with ``serialize_as_any=True``.""" + parent = _SampleResult( + items=[{"output": "p"}], + children={ + "c1": _SampleResult(items=[{"output": "c"}], usage={"total_tokens": 3}), + }, + ) + payload = parent.model_dump_json(serialize_as_any=True) + base = BaseExecutorResult.model_validate_json(payload) + + assert "c1" in base.children + child = base.children["c1"] + redumped = json.loads(child.model_dump_json()) + assert redumped["items"] == [{"output": "c"}] + assert redumped["usage"] == {"total_tokens": 3} + + +def test_artifact_ref_is_a_typed_path_reference() -> None: + ref = ArtifactRef(path="lora/final") + assert ref.model_dump() == {"path": "lora/final"} + + +def test_envelope_round_trip_preserves_subclass_payload() -> None: + """The production write→read path (``write_result_in_envelope`` → + ``ResultEnvelope.model_validate``) round-trips subclass fields.""" + from shared.schemas.result import ResultEnvelope + + inner = _SampleResult(items=[{"output": "hello"}], usage={"total_tokens": 7}) + env = ResultEnvelope(task_id="tsk-x", result=inner) + raw = env.model_dump_json(serialize_as_any=True) + + parsed = ResultEnvelope.model_validate_json(raw) + dumped = parsed.result.model_dump() + assert dumped["items"] == [{"output": "hello"}] + assert dumped["usage"] == {"total_tokens": 7} From 4fa6a436517ac1f2db185054712a2445c02cc866 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 15:16:43 +0000 Subject: [PATCH 08/26] docs: describe typed executor result schema Architecture's task-merging bullet now points at the typed ``BaseExecutorResult``, and EXECUTORS.md gains a Result schema section covering the cross-cutting base fields, per-executor subclass placement, artifact resolution, and the ``serialize_as_any`` seam. Signed-off-by: Noppanat Wadlom --- docs/ARCHITECTURE.md | 7 ++++--- docs/EXECUTORS.md | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 08a34223..dac1b885 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -109,9 +109,10 @@ scripts/dev/ compile_protos, sync_requirements, check_env_examples - **Task merging.** Compatible adjacent tasks in a DAG (same `taskType`, model, hardware shape, and merge key) coalesce into a single dispatch. Merged children ride on `WorkerTaskMessage.merged_children`; the worker - writes per-child results into `result.children`; the dispatcher fans - out synthetic `TASK_SUCCEEDED` / `TASK_FAILED` events. Disable with - `ENABLE_TASK_MERGE=false`. + writes per-child results into `result.children` on the typed + `BaseExecutorResult` (`src/shared/schemas/executor_result.py`); the + dispatcher fans out synthetic `TASK_SUCCEEDED` / `TASK_FAILED` events. + Disable with `ENABLE_TASK_MERGE=false`. - **Stage stickiness** (`ENABLE_STAGE_WEIGHT_STICKINESS=true`) — the dispatcher pins stages that reference an upstream stage's checkpoint to the worker that produced it, falling back to normal selection when diff --git a/docs/EXECUTORS.md b/docs/EXECUTORS.md index ed02e903..959bc9c2 100644 --- a/docs/EXECUTORS.md +++ b/docs/EXECUTORS.md @@ -22,6 +22,33 @@ Helper utilities live in `src/worker/executors/utils/` (`artifacts`, `src/worker/executors/mixins/` (`data`, `governance`, `inference`, `training`). +## Result schema + +Every executor's `run()` returns a typed Pydantic subclass of +`BaseExecutorResult` (`src/shared/schemas/executor_result.py`). The base +class carries the cross-cutting fields the rest of the runtime relies +on: + +- `children: dict[str, BaseExecutorResult]` — per-child results when + merged tasks share a dispatch. +- `artifacts: ArtifactContext | None` (wire key `_artifacts`) — + resolution context for relative artifact refs. + +Per-executor subclasses live next to the executor they describe — e.g. +`VLLMResult` in `src/worker/executors/vllm_executor.py`, `LoRAResult` in +`src/worker/executors/lora_sft_executor.py`. They add executor-specific +fields (`items`, `usage`, `final_lora`, `command`, …). + +The base class is `extra="allow"`, so the server can deserialize an +incoming envelope as `BaseExecutorResult` without knowing the concrete +subclass; the executor-specific fields pass through. Artifact-bearing +fields use `ArtifactRef` (`{"path": rel_path}`); relative paths resolve +against the producer's `_artifacts` context via +`artifact_to_source` / `_render_artifact_ref`. + +Serializers at the wire seam pass `serialize_as_any=True` so subclass +fields survive the round-trip. + ## Agent executor (utu / youtu-agent) `AgentExecutor` requires the following env vars to run; the executor From 9d03ecd2a6dd925c6b63b621b14780db7d333d06 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Mon, 18 May 2026 15:43:23 +0000 Subject: [PATCH 09/26] chore: satisfy pre-commit hooks (black, isort, mypy) Apply isort / black formatting and resolve mypy errors surfaced by ``pre-commit run --all-files``: - ``BaseExecutorResult.artifacts`` switches to ``validation_alias=AliasChoices("artifacts", "_artifacts")`` + ``serialization_alias="_artifacts"`` so ``artifacts=...`` is the canonical constructor name while still accepting both keys on the wire. - ``write_result_in_envelope`` validates a dict input into ``BaseExecutorResult`` before constructing the envelope. - Dispatcher's skip-envelope path constructs ``BaseExecutorResult()`` instead of an empty dict. - ``SSHResult.command`` / ``entrypoint`` are typed ``list[str] | None`` to match ``SSHConfig``. - ``DiffusersExecutor`` types ``generated_images`` as ``list[ArtifactRef]``. - ``DPOExecutor`` narrows the ``model_name`` resolution to ``str | None`` via a conditional expression. - ``get_result`` returns ``result.model_dump(serialize_as_any=True)`` to match the router's ``dict[str, Any]`` signature. - Two MP-executor tests assert ``isinstance(result, dict)`` so mypy can subscript through the union. Signed-off-by: Noppanat Wadlom --- src/server/dispatcher/base.py | 3 ++- src/server/routers/v1/results.py | 4 +++- src/shared/schemas/executor_result.py | 5 +++-- src/shared/schemas/result.py | 6 +++--- src/worker/executors/diffusers_executor.py | 2 +- src/worker/executors/dpo_executor.py | 6 +++--- src/worker/executors/echo_executor.py | 4 +--- src/worker/executors/ssh_executor.py | 5 +++-- src/worker/executors/vllm_executor.py | 2 +- tests/shared/test_executor_result.py | 4 +++- tests/worker/test_connector_logging.py | 1 + tests/worker/test_mp_executor_lifecycle.py | 1 + 12 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/server/dispatcher/base.py b/src/server/dispatcher/base.py index d201c8dc..9ec8e5d8 100644 --- a/src/server/dispatcher/base.py +++ b/src/server/dispatcher/base.py @@ -9,6 +9,7 @@ from pydantic import BaseModel, ValidationError from shared.schemas.event import TaskEvent +from shared.schemas.executor_result import BaseExecutorResult from shared.schemas.result import ResultEnvelope, result_file_path, write_result from shared.tasks import ( MergedChildTaskStrict, @@ -994,7 +995,7 @@ def _evaluate_condition_skip( ) skip_envelope = ResultEnvelope( task_id=task_id, - result={}, + result=BaseExecutorResult(), metadata={ "skipped": True, "reason": "condition_not_met", diff --git a/src/server/routers/v1/results.py b/src/server/routers/v1/results.py index 00086507..85667f4b 100644 --- a/src/server/routers/v1/results.py +++ b/src/server/routers/v1/results.py @@ -143,7 +143,9 @@ async def get_result( detail=f"Result file is not valid JSON: {exc}", ) from exc try: - return ResultEnvelope.model_validate(content).result + return ResultEnvelope.model_validate(content).result.model_dump( + serialize_as_any=True + ) except ValidationError as exc: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, diff --git a/src/shared/schemas/executor_result.py b/src/shared/schemas/executor_result.py index 5ee39721..f352f9e5 100644 --- a/src/shared/schemas/executor_result.py +++ b/src/shared/schemas/executor_result.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, ConfigDict, Field +from pydantic import AliasChoices, BaseModel, ConfigDict, Field from .artifact import ArtifactContext @@ -22,7 +22,8 @@ class BaseExecutorResult(BaseModel): ) artifacts: ArtifactContext | None = Field( default=None, - alias="_artifacts", + validation_alias=AliasChoices("artifacts", "_artifacts"), + serialization_alias="_artifacts", description="Resolution context for relative artifact refs.", ) diff --git a/src/shared/schemas/result.py b/src/shared/schemas/result.py index eb44c67d..6b8831b0 100644 --- a/src/shared/schemas/result.py +++ b/src/shared/schemas/result.py @@ -47,10 +47,10 @@ def write_result_in_envelope( ) -> None: """Wrap ``result`` in a ``ResultEnvelope`` and persist it at ``path``.""" path.parent.mkdir(parents=True, exist_ok=True) + if isinstance(result, dict): + result = BaseExecutorResult.model_validate(result) envelope = ResultEnvelope(task_id=task_id, result=result) - atomic_write_text( - path, envelope.model_dump_json(indent=2, serialize_as_any=True) - ) + atomic_write_text(path, envelope.model_dump_json(indent=2, serialize_as_any=True)) def read_result(base_dir: Path, task_id: str) -> str: diff --git a/src/worker/executors/diffusers_executor.py b/src/worker/executors/diffusers_executor.py index 8855b1ac..b34ed6ff 100644 --- a/src/worker/executors/diffusers_executor.py +++ b/src/worker/executors/diffusers_executor.py @@ -344,7 +344,7 @@ def _run_inner( image_dir = out_dir / "artifacts" / "images" image_dir.mkdir(parents=True, exist_ok=True) - generated_images: list[dict[str, str]] = [] + generated_images: list[ArtifactRef] = [] for idx, img in enumerate(images): img.save(image_dir / f"image_{idx}.png", format="PNG") generated_images.append(artifact_ref(f"images/image_{idx}.png")) diff --git a/src/worker/executors/dpo_executor.py b/src/worker/executors/dpo_executor.py index 99e7920b..9dfbe449 100644 --- a/src/worker/executors/dpo_executor.py +++ b/src/worker/executors/dpo_executor.py @@ -100,9 +100,9 @@ def run(self, task: ExecutorTask, out_dir: Path) -> DPOResult: training_successful=True, spawned_torchrun=True, model_name=( - spec.model - and spec.model.source - and spec.model.source.identifier + spec.model.source.identifier + if spec.model and spec.model.source + else None ), output_dir=out_dir.as_posix(), ) diff --git a/src/worker/executors/echo_executor.py b/src/worker/executors/echo_executor.py index 884cdcf9..cec85193 100644 --- a/src/worker/executors/echo_executor.py +++ b/src/worker/executors/echo_executor.py @@ -88,9 +88,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> EchoResult: resolved = self._resolve_item(item, context) self._append_outputs(merged_items, resolved) - result = EchoResult( - ok=True, items=merged_items, count=len(merged_items) - ) + result = EchoResult(ok=True, items=merged_items, count=len(merged_items)) deps = self._extract_source_data_ids(spec) dependencies_by_task = {task_id: deps} diff --git a/src/worker/executors/ssh_executor.py b/src/worker/executors/ssh_executor.py index ac695e36..4677c532 100644 --- a/src/worker/executors/ssh_executor.py +++ b/src/worker/executors/ssh_executor.py @@ -60,8 +60,9 @@ class SSHResult(BaseExecutorResult): session_id: str exit_code: int - command: str | None = None - entrypoint: str | None = None + command: list[str] | None = None + entrypoint: list[str] | None = None + try: import docker diff --git a/src/worker/executors/vllm_executor.py b/src/worker/executors/vllm_executor.py index 7319ea43..4e6cea06 100644 --- a/src/worker/executors/vllm_executor.py +++ b/src/worker/executors/vllm_executor.py @@ -66,8 +66,8 @@ _HAS_VLLM = False StructuredOutputsParams = None # type: ignore -from shared.schemas.governance import SpanType from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.governance import SpanType from shared.tasks.specs import InferenceSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask diff --git a/tests/shared/test_executor_result.py b/tests/shared/test_executor_result.py index 0c71bdba..24bb93c8 100644 --- a/tests/shared/test_executor_result.py +++ b/tests/shared/test_executor_result.py @@ -39,7 +39,9 @@ def test_artifacts_alias_round_trips_both_directions() -> None: from_alias = BaseExecutorResult.model_validate({"_artifacts": {"base_dir": "/a"}}) from_field = BaseExecutorResult.model_validate({"artifacts": {"base_dir": "/a"}}) - assert from_alias.artifacts == from_field.artifacts == ArtifactContext(base_dir="/a") + assert ( + from_alias.artifacts == from_field.artifacts == ArtifactContext(base_dir="/a") + ) dumped = from_alias.model_dump() assert "_artifacts" in dumped diff --git a/tests/worker/test_connector_logging.py b/tests/worker/test_connector_logging.py index beda187e..0caa357c 100644 --- a/tests/worker/test_connector_logging.py +++ b/tests/worker/test_connector_logging.py @@ -90,6 +90,7 @@ def test_connector_logs_printed_to_stderr(tmp_path: Path) -> None: mp.cleanup_after_run() # Verify the executor ran successfully + assert isinstance(result, dict) assert result["ok"], f"Executor failed: {result}" assert ( result.get("result", {}).get("status") == "completed" diff --git a/tests/worker/test_mp_executor_lifecycle.py b/tests/worker/test_mp_executor_lifecycle.py index 3fe73be8..f6882cc5 100644 --- a/tests/worker/test_mp_executor_lifecycle.py +++ b/tests/worker/test_mp_executor_lifecycle.py @@ -54,6 +54,7 @@ def test_mp_executor_does_not_start_subprocess_until_first_run(tmp_path: Path) - with tempfile.TemporaryDirectory() as out_dir: result = mp.run(_simple_task_message(), Path(out_dir)) + assert isinstance(result, dict) assert result["ok"] is True assert mp._shutdown is False assert mp._proc is not None From cd5fd0c62fbf35277dab3d93cbb89093635615bb Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 15:22:46 +0000 Subject: [PATCH 10/26] refactor: tighten executor result typing end-to-end ``Executor.run`` is now strictly typed as ``-> BaseExecutorResult`` (no dict union), and every executor constructs its typed subclass directly instead of validating a dict at the return boundary. Schema: ``BaseExecutorResult`` is consolidated into ``shared/schemas/result.py``; ``children`` and ``ResultEnvelope.result`` use ``SerializeAsAny[...]`` so subclass-specific fields survive the declared-type annotation without per-call ``serialize_as_any=True``. ``upstreamResults`` on ``TaskSpecStrictBase`` / ``TaskSpecTemplateBase`` is typed as ``dict[str, BaseExecutorResult]``. Plumbing: ``write_result_in_envelope`` is folded into ``write_executor_result`` since the dict-input path is no longer needed. ``_dump_to_governance``, ``_populate_table``, and ``_maybe_export_jsonl`` take the values they actually use (typed result, items list) instead of mutating an opaque payload. The dispatcher's ``_load_stage_result`` returns ``ResultEnvelope``; ``_dig_path`` walks ``BaseModel`` attributes; ``_render_artifact_ref`` reads ``ctx.base_url`` / ``ctx.base_dir`` directly. Omni: ``OmniResult`` becomes strict (``executor``, ``mode``, ``items``, ``model``) and each concrete omni executor declares its own ``OmniText2*Result`` subclass with default ``executor`` / ``mode`` literals. Test fixtures (``test_mp_executor_lifecycle``, ``test_connector_logging``) define typed result subclasses so assertions use natural attribute access; the multiprocessing boundary pickles the subclass through to the parent. Signed-off-by: Noppanat Wadlom --- src/server/dispatcher/base.py | 60 ++++++++------- src/server/routers/v1/results.py | 4 +- src/shared/schemas/executor_result.py | 31 -------- src/shared/schemas/result.py | 44 +++++++---- src/shared/tasks/specs/common.py | 5 +- src/worker/executors/agent_executor.py | 64 ++++++++-------- src/worker/executors/api_executor.py | 4 +- src/worker/executors/base_executor.py | 29 ++++---- .../executors/data_profiling_executor.py | 2 +- .../executors/data_retrieval_executor.py | 2 +- src/worker/executors/diffusers_executor.py | 20 ++--- src/worker/executors/dpo_executor.py | 54 +++++++------- src/worker/executors/echo_executor.py | 2 +- src/worker/executors/lora_sft_executor.py | 70 +++++++++--------- src/worker/executors/mixins/data.py | 10 +-- src/worker/executors/mixins/governance.py | 17 ++--- src/worker/executors/mixins/inference.py | 3 +- src/worker/executors/mp_executor.py | 6 +- src/worker/executors/omni_executor_base.py | 11 ++- .../executors/omni_text2audio_executor.py | 42 ++++++----- .../executors/omni_text2general_executor.py | 41 ++++++----- .../executors/omni_text2image_executor.py | 28 +++---- .../executors/omni_text2speech_executor.py | 35 +++++---- src/worker/executors/ppo_executor.py | 54 +++++++------- src/worker/executors/rag_executor.py | 2 +- src/worker/executors/sft_executor.py | 73 +++++++++---------- src/worker/executors/ssh_executor.py | 2 +- src/worker/executors/transformers_executor.py | 37 ++++------ src/worker/executors/utils/checkpoints.py | 15 ++-- src/worker/executors/vllm_executor.py | 44 +++++------ src/worker/runner.py | 18 ++--- tests/shared/test_executor_result.py | 28 ++----- tests/worker/test_connector_logging.py | 29 +++++--- tests/worker/test_data_mixin_lineage.py | 19 +++-- tests/worker/test_executor_bootstrap.py | 5 +- tests/worker/test_mp_executor_lifecycle.py | 14 +++- 36 files changed, 443 insertions(+), 481 deletions(-) delete mode 100644 src/shared/schemas/executor_result.py diff --git a/src/server/dispatcher/base.py b/src/server/dispatcher/base.py index 9ec8e5d8..2ae68c25 100644 --- a/src/server/dispatcher/base.py +++ b/src/server/dispatcher/base.py @@ -9,13 +9,18 @@ from pydantic import BaseModel, ValidationError from shared.schemas.event import TaskEvent -from shared.schemas.executor_result import BaseExecutorResult -from shared.schemas.result import ResultEnvelope, result_file_path, write_result +from shared.schemas.result import ( + BaseExecutorResult, + ResultEnvelope, + result_file_path, + write_result, +) from shared.tasks import ( MergedChildTaskStrict, TaskEnvelope, TaskEnvelopeStrict, TaskEnvelopeTemplate, + TaskSpecStrict, ) from shared.tasks.placeholders import PLACEHOLDER_PATTERN from shared.tasks.specs import ( @@ -708,7 +713,7 @@ def _resolve_stage_references( self, task_id: str, task: TaskEnvelopeTemplate, record: TaskRecord ) -> TaskEnvelopeStrict: context = self._build_stage_context(record) - resolved_task = task + resolved_task: TaskEnvelopeTemplate = task if context and task.has_placeholder(): resolved_task = self._resolve_placeholders(task, context) @@ -717,7 +722,7 @@ def _resolve_stage_references( ) if upstream_results: existing = resolved_task.spec.upstreamResults or {} - merged: dict[str, Any] = {} + merged: dict[str, BaseExecutorResult] = {} if isinstance(existing, dict): merged.update(copy.deepcopy(existing)) merged.update(upstream_results) @@ -731,7 +736,7 @@ def _resolve_stage_references( return TaskEnvelopeStrict.model_validate(resolved_task) - def _resolve_placeholders(self, value: Any, context: dict[str, Any]) -> Any: + def _resolve_placeholders(self, value: Any, context: dict[str, TaskRecord]) -> Any: if isinstance(value, str): exact = PLACEHOLDER_PATTERN.fullmatch(value) if exact: @@ -815,7 +820,7 @@ def _stage_context_keys(record: TaskRecord) -> tuple[str, ...]: keys.append(record.graph_node_name) return tuple(keys) - def _resolve_reference(self, expr: str, context: dict[str, Any]) -> Any: + def _resolve_reference(self, expr: str, context: dict[str, TaskRecord]) -> Any: expr = expr.strip() if not expr: raise ValueError("Empty stage reference") @@ -832,49 +837,47 @@ def _resolve_reference(self, expr: str, context: dict[str, Any]) -> Any: return stage_record.task_id if stage_record.status != "DONE": raise StageReferenceNotReady(f"Stage '{stage_name}' has not completed") - data = self._load_stage_result(stage_record.task_id) - value = self._dig_path(data, path.split(".")) + envelope = self._load_stage_result(stage_record.task_id) + value = self._dig_path(envelope.result, path.split(".")) if value is None: raise ValueError(f"Missing value for reference '{expr}'") # If the referenced value is an artifact ref ({path: "..."}), render # it as a full URL (when base_url is set) or an absolute filesystem # path using the producing stage's top-level _artifacts context. - if rendered := self._render_artifact_ref(value, data): + if rendered := self._render_artifact_ref(value, envelope): return rendered return value @staticmethod - def _render_artifact_ref(value: Any, stage_result: Any) -> str | None: + def _render_artifact_ref(value: Any, stage_result: ResultEnvelope) -> str | None: if not isinstance(value, dict): return None path_value = value.get("path") if not isinstance(path_value, str) or not path_value: return None - if not isinstance(stage_result, dict): - return None - ctx = stage_result.get("_artifacts") - if not isinstance(ctx, dict): + ctx = stage_result.result.artifacts + if ctx is None: return None - base_url = ctx.get("base_url") - base_dir = ctx.get("base_dir") - if isinstance(base_url, str) and base_url and isinstance(base_dir, str): + base_url = ctx.base_url + base_dir = ctx.base_dir + if base_url and base_dir: task_id = Path(base_dir).name return f"{base_url.rstrip('/')}/api/v1/results/{task_id}/files/{path_value}" - if isinstance(base_dir, str) and base_dir: + if base_dir: return (Path(base_dir) / "artifacts" / path_value).as_posix() return None def _collect_upstream_results( self, context: dict[str, TaskRecord], current_task_id: str - ) -> dict[str, Any]: - results: dict[str, Any] = {} + ) -> dict[str, BaseExecutorResult]: + results: dict[str, BaseExecutorResult] = {} for name, record in context.items(): if not record or record.task_id == current_task_id: continue if record.status != TaskStatus.DONE: continue try: - data = self._load_stage_result(record.task_id) + envelope = self._load_stage_result(record.task_id) except StageReferenceNotReady as exc: raise exc except Exception as exc: @@ -885,11 +888,11 @@ def _collect_upstream_results( exc, ) continue - results[name] = data + results[name] = envelope.result return results def _resolve_upstream_task_ids( - self, record: TaskRecord, spec: Any + self, record: TaskRecord, spec: TaskSpecStrict ) -> dict[str, str] | None: if not isinstance(spec, SSHSpecStrict) or not spec.inputs: return None @@ -916,16 +919,14 @@ def _resolve_upstream_task_ids( resolved[stage_name] = upstream.task_id return resolved or None - def _load_stage_result(self, stage_task_id: str) -> dict[str, Any]: + def _load_stage_result(self, stage_task_id: str) -> ResultEnvelope: path = result_file_path(self._results_dir, stage_task_id) if not path.exists(): raise StageReferenceNotReady( f"Result for task {stage_task_id} not found at {path}" ) content = json.loads(path.read_text(encoding="utf-8")) - return ResultEnvelope.model_validate(content).result.model_dump( - serialize_as_any=True - ) + return ResultEnvelope.model_validate(content) def _dig_path(self, data: Any, parts: list[str]) -> Any: current = data @@ -949,6 +950,11 @@ def _dig_path(self, data: Any, parts: list[str]) -> Any: return None current = current[idx] continue + if isinstance(current, BaseModel): + if not hasattr(current, part): + return None + current = getattr(current, part) + continue return None return current diff --git a/src/server/routers/v1/results.py b/src/server/routers/v1/results.py index 85667f4b..99ac8236 100644 --- a/src/server/routers/v1/results.py +++ b/src/server/routers/v1/results.py @@ -143,9 +143,7 @@ async def get_result( detail=f"Result file is not valid JSON: {exc}", ) from exc try: - return ResultEnvelope.model_validate(content).result.model_dump( - serialize_as_any=True - ) + return ResultEnvelope.model_validate(content).result.model_dump() except ValidationError as exc: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, diff --git a/src/shared/schemas/executor_result.py b/src/shared/schemas/executor_result.py deleted file mode 100644 index f352f9e5..00000000 --- a/src/shared/schemas/executor_result.py +++ /dev/null @@ -1,31 +0,0 @@ -from pydantic import AliasChoices, BaseModel, ConfigDict, Field - -from .artifact import ArtifactContext - - -class BaseExecutorResult(BaseModel): - """Common shape for every executor's result payload. - - ``extra="allow"`` lets the server round-trip subclass payloads through - this base class without losing executor-specific fields. - """ - - model_config = ConfigDict( - extra="allow", - populate_by_name=True, - serialize_by_alias=True, - ) - - children: dict[str, "BaseExecutorResult"] = Field( - default_factory=dict, - description="Per-child result payloads for task merging.", - ) - artifacts: ArtifactContext | None = Field( - default=None, - validation_alias=AliasChoices("artifacts", "_artifacts"), - serialization_alias="_artifacts", - description="Resolution context for relative artifact refs.", - ) - - -BaseExecutorResult.model_rebuild() diff --git a/src/shared/schemas/result.py b/src/shared/schemas/result.py index 6b8831b0..832fa25c 100644 --- a/src/shared/schemas/result.py +++ b/src/shared/schemas/result.py @@ -1,20 +1,47 @@ """Result envelope schema shared by server and worker.""" +# This is necessary to allow for the recursive type hint of `children` in +# `BaseExecutorResult`. +from __future__ import annotations + from pathlib import Path from typing import Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field, SerializeAsAny from shared.utils.atomic import atomic_write_text from shared.utils.manifest import prepare_output_dir from shared.utils.time import now_iso -from .executor_result import BaseExecutorResult +from .artifact import ArtifactContext + + +class BaseExecutorResult(BaseModel): + """Common shape for every executor's result payload. + + ``extra="allow"`` lets the server round-trip subclass payloads through + this base class without losing executor-specific fields. + """ + + model_config = ConfigDict(extra="allow", serialize_by_alias=True) + + children: dict[str, SerializeAsAny[BaseExecutorResult]] = Field( + default_factory=dict, + exclude_if=lambda v: not v, + description="Per-child result payloads for task merging.", + ) + artifacts: ArtifactContext | None = Field( + default=None, + alias="_artifacts", + description="Resolution context for relative artifact refs.", + ) class ResultEnvelope(BaseModel): task_id: str = Field(description="Task identifier.") - result: BaseExecutorResult = Field(description="Result payload data.") + result: SerializeAsAny[BaseExecutorResult] = Field( + description="Result payload data." + ) worker_id: str | None = Field( default=None, description="Worker identifier submitting the result." ) @@ -42,17 +69,6 @@ def write_result(base_dir: Path, envelope: ResultEnvelope) -> Path: return path -def write_result_in_envelope( - path: Path, task_id: str, result: BaseExecutorResult | dict[str, Any] -) -> None: - """Wrap ``result`` in a ``ResultEnvelope`` and persist it at ``path``.""" - path.parent.mkdir(parents=True, exist_ok=True) - if isinstance(result, dict): - result = BaseExecutorResult.model_validate(result) - envelope = ResultEnvelope(task_id=task_id, result=result) - atomic_write_text(path, envelope.model_dump_json(indent=2, serialize_as_any=True)) - - def read_result(base_dir: Path, task_id: str) -> str: path = result_file_path(base_dir, task_id) return path.read_text(encoding="utf-8") diff --git a/src/shared/tasks/specs/common.py b/src/shared/tasks/specs/common.py index d6cb61db..e841e596 100644 --- a/src/shared/tasks/specs/common.py +++ b/src/shared/tasks/specs/common.py @@ -2,6 +2,7 @@ from pydantic import Field, model_validator +from ...schemas.result import BaseExecutorResult from .._base import StrictBaseModel, TemplateBaseModel from ..components import ( AdapterConfig, @@ -72,7 +73,7 @@ class TaskSpecStrictBase(StrictBaseModel): shard: ShardSpec | None = None # Server-injected stage context (reserve the user-facing key `_upstreamResults`) - upstreamResults: dict[str, Any] | None = Field( + upstreamResults: dict[str, BaseExecutorResult] | None = Field( default=None, alias="_upstreamResults" ) @@ -98,7 +99,7 @@ class TaskSpecTemplateBase(TemplateBaseModel): condition: ConditionSpec | None = None shard: ShardSpecTemplate | None = None - upstreamResults: dict[str, Any] | None = Field( + upstreamResults: dict[str, BaseExecutorResult] | None = Field( default=None, alias="_upstreamResults" ) diff --git a/src/worker/executors/agent_executor.py b/src/worker/executors/agent_executor.py index d0262dd7..7cff6d0a 100644 --- a/src/worker/executors/agent_executor.py +++ b/src/worker/executors/agent_executor.py @@ -17,7 +17,7 @@ from datasets import load_dataset from shared.schemas.artifact import ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import AgentSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask @@ -57,7 +57,7 @@ def _resolve_task_timeout(agent: dict[str, Any] | None) -> int: class AgentResult(BaseExecutorResult): - ok: bool + ok: bool = True model: str items: list[dict[str, Any]] = [] usage: dict[str, Any] | None = None @@ -260,32 +260,34 @@ def run(self, task: ExecutorTask, out_dir: Path) -> AgentResult: agent_config_name, self._tasks[0], out_dir, task_timeout ) ) + agent_output_ref = result.get("_agent_output_ref") - output: dict[str, Any] = { - "ok": True, - "model": agent_config_name, - "items": [ + output = AgentResult( + model=agent_config_name, + items=[ { "index": 0, "output": result.get("output", ""), "finish_reason": "completed", } ], - "usage": { + usage={ "execution_time_sec": result.get("usage", {}).get( "execution_time_sec", 0 ), "num_requests": 1, "agent_config": agent_config_name, }, - "metadata": { + metadata={ "task": self._tasks[0], "execution_log": result.get("log", []), }, - } - agent_output_ref = result.get("_agent_output_ref") - if isinstance(agent_output_ref, str): - output["agent_output"] = artifact_ref(agent_output_ref) + agent_output=( + artifact_ref(agent_output_ref) + if isinstance(agent_output_ref, str) + else None + ), + ) else: # Batch execution for multiple tasks results = asyncio.run( @@ -309,51 +311,53 @@ def run(self, task: ExecutorTask, out_dir: Path) -> AgentResult: } ) - output = { - "ok": True, - "model": agent_config_name, - "items": items, - "usage": { + batch_summary_ref = results.get("_batch_summary_ref") + output = AgentResult( + model=agent_config_name, + items=items, + usage={ "execution_time_sec": results.get("usage", {}).get( "execution_time_sec", 0 ), "num_requests": len(self._tasks), "agent_config": agent_config_name, }, - "metadata": { + metadata={ "tasks_count": len(self._tasks), "execution_log": results.get("log", []), "batch_summary": results.get("batch_summary", {}), }, - } - batch_summary_ref = results.get("_batch_summary_ref") - if isinstance(batch_summary_ref, str): - output["batch_summary_file"] = artifact_ref(batch_summary_ref) + batch_summary_file=( + artifact_ref(batch_summary_ref) + if isinstance(batch_summary_ref, str) + else None + ), + ) maybe_upload_artifacts(task, out_dir, logger=logger) logger.info(f"Agent execution completed: {len(self._tasks)} task(s)") - return AgentResult.model_validate(output) + return output except ExecutionError: raise except Exception as e: logger.exception(f"Agent execution failed: {e}") - error_output = { - "ok": False, - "model": agent_config_name, - "items": [], - "usage": { + error_output = AgentResult( + ok=False, + model=agent_config_name, + items=[], + usage={ "execution_time_sec": 0, "num_requests": len(self._tasks), "agent_config": agent_config_name, }, - "metadata": { + metadata={ "tasks_count": len(self._tasks), "error": str(e), "execution_log": [], }, - } + ) write_executor_result( out_dir / "results.json", task.task_id, task.spec, error_output ) diff --git a/src/worker/executors/api_executor.py b/src/worker/executors/api_executor.py index 45be3269..a845b31e 100644 --- a/src/worker/executors/api_executor.py +++ b/src/worker/executors/api_executor.py @@ -7,7 +7,7 @@ import httpx from pydantic import Field -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import ApiSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask @@ -19,7 +19,7 @@ class APIResult(BaseExecutorResult): - ok: bool + ok: bool = True executor: str method: str url: str diff --git a/src/worker/executors/base_executor.py b/src/worker/executors/base_executor.py index ebb58ed4..f17a3d7f 100644 --- a/src/worker/executors/base_executor.py +++ b/src/worker/executors/base_executor.py @@ -27,7 +27,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> dict: from pathlib import Path from typing import Any, TypeVar -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks import MergedChildTaskStrict from shared.tasks.specs import TaskSpecStrictBase from shared.tasks.worker_message import WorkerHardware, WorkerTaskMessage @@ -85,9 +85,7 @@ def prepare(self) -> None: return None @abstractmethod - def run( - self, task: ExecutorTask, out_dir: Path - ) -> BaseExecutorResult | dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> BaseExecutorResult: """Execute a single task. Args: @@ -96,8 +94,7 @@ def run( if needed. Returns: - A ``BaseExecutorResult`` subclass instance (preferred) or a - JSON-serializable dictionary summarizing the result. + A ``BaseExecutorResult`` subclass instance. Raises: ExecutionError: for expected, user-facing failures. @@ -143,13 +140,19 @@ def load_json(path: Path) -> dict[str, Any]: # -------- Minimal example implementation -------- +class EchoResult(BaseExecutorResult): + ok: bool = True + executor: str + task_id: str + task_type: str + + class EchoExecutor(Executor): name = "echo" - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: - return { - "ok": True, - "executor": self.name, - "task_id": task.task_id, - "task_type": task.spec.taskType, - } + def run(self, task: ExecutorTask, out_dir: Path) -> EchoResult: + return EchoResult( + executor=self.name, + task_id=task.task_id, + task_type=task.spec.taskType, + ) diff --git a/src/worker/executors/data_profiling_executor.py b/src/worker/executors/data_profiling_executor.py index 20b94645..57f4249c 100644 --- a/src/worker/executors/data_profiling_executor.py +++ b/src/worker/executors/data_profiling_executor.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import DataProfilingSpecStrict from shared.utils.json import to_json_serializable, validate_keys diff --git a/src/worker/executors/data_retrieval_executor.py b/src/worker/executors/data_retrieval_executor.py index 3775284b..22f964e5 100644 --- a/src/worker/executors/data_retrieval_executor.py +++ b/src/worker/executors/data_retrieval_executor.py @@ -10,7 +10,7 @@ import pandas as pd from PIL import Image -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import DataRetrievalSpecStrict from shared.utils.json import validate_keys diff --git a/src/worker/executors/diffusers_executor.py b/src/worker/executors/diffusers_executor.py index b34ed6ff..5a82e1fd 100644 --- a/src/worker/executors/diffusers_executor.py +++ b/src/worker/executors/diffusers_executor.py @@ -15,7 +15,7 @@ from PIL import Image from shared.schemas.artifact import ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import DiffusionSpecStrict from ..utils.logging import configure_hf_library_logging @@ -255,17 +255,17 @@ def run(self, task: ExecutorTask, out_dir: Path) -> DiffusersResult: with self._task_span( task_id, task.workflow_id, out_dir, owner_id=task.owner_id ): - response = self._run_inner(spec, task_id, out_dir) + result = self._run_inner(spec, task_id, out_dir) maybe_upload_artifacts(task, out_dir, logger=logger) maybe_upload_traces(task, out_dir, logger=logger) - return DiffusersResult.model_validate(response) + return result def _run_inner( self, spec: DiffusionSpecStrict, task_id: str, out_dir: Path, - ) -> dict[str, Any]: + ) -> DiffusersResult: self._ensure_pipeline(spec) assert self._pipe is not None @@ -349,19 +349,13 @@ def _run_inner( img.save(image_dir / f"image_{idx}.png", format="PNG") generated_images.append(artifact_ref(f"images/image_{idx}.png")) - response: dict[str, Any] = { - "ok": True, - "model": self._model_name, - "images": generated_images, - } - + result = DiffusersResult(model=self._model_name, images=generated_images) self._dump_to_governance( task_id=task_id, - result=response, + result=result, dependencies_by_task=dependencies_by_task, ) - - return response + return result def cleanup_after_run(self) -> None: if self._pipe is not None: diff --git a/src/worker/executors/dpo_executor.py b/src/worker/executors/dpo_executor.py index 9dfbe449..043e0978 100644 --- a/src/worker/executors/dpo_executor.py +++ b/src/worker/executors/dpo_executor.py @@ -26,7 +26,7 @@ from trl.trainer.dpo_trainer import DPOTrainer from shared.schemas.artifact import ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import DPOSpecStrict from shared.utils.manifest import scratch_dir @@ -110,9 +110,9 @@ def run(self, task: ExecutorTask, out_dir: Path) -> DPOResult: result = self._execute_training(task, out_dir) logger.info( "DPO training task completed in %.2f seconds", - result.get("training_time_seconds", 0.0), + result.training_time_seconds or 0.0, ) - return DPOResult.model_validate(result) + return result finally: self._task_out_dir = None @@ -369,7 +369,7 @@ def _ensure_jsonl_local(jsonl_cfg: dict[str, Any]) -> Path: return dataset # type: ignore[return-value] - def _execute_training(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: + def _execute_training(self, task: ExecutorTask, out_dir: Path) -> DPOResult: spec = self.require_spec(task, DPOSpecStrict) training_config = spec.training or {} artifacts_dir = out_dir / "artifacts" @@ -477,21 +477,21 @@ def _execute_training(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any] logger.warning("Failed to save model: %s", exc) training_time = time.time() - start_time - results: dict[str, Any] = { - "training_successful": True, - "training_time_seconds": training_time, - "error_message": None, - "model_name": self._model_name, - "dataset_size": len(dataset), - "output_dir": out_dir.as_posix(), - "checkpoints_dir": artifact_ref("checkpoints"), - } + result = DPOResult( + training_successful=True, + training_time_seconds=training_time, + error_message=None, + model_name=self._model_name, + dataset_size=len(dataset) if dataset is not None else 0, + output_dir=out_dir.as_posix(), + checkpoints_dir=artifact_ref("checkpoints"), + ) if final_model_path is not None: - results["final_model"] = artifact_ref( + result.final_model = artifact_ref( final_model_path.relative_to(artifacts_dir).as_posix() ) if final_archive_path is not None: - results["final_model_archive"] = artifact_ref( + result.final_model_archive = artifact_ref( final_archive_path.relative_to(artifacts_dir).as_posix() ) @@ -503,24 +503,22 @@ def _execute_training(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any] final_model_path, final_archive_path, ) - return results + return result except Exception as exc: training_time = time.time() - start_time - results = { - "training_successful": False, - "training_time_seconds": training_time, - "error_message": str(exc), - "model_name": self._model_name, - "dataset_size": len(dataset) if dataset is not None else 0, - "output_dir": out_dir.as_posix(), - } + result = DPOResult( + training_successful=False, + training_time_seconds=training_time, + error_message=str(exc), + model_name=self._model_name, + dataset_size=len(dataset) if dataset is not None else 0, + output_dir=out_dir.as_posix(), + ) write_executor_result( - out_dir / "results.json", task.task_id, task.spec, results + out_dir / "results.json", task.task_id, task.spec, result ) logger.exception("DPO training failed: %s", exc) - raise ExecutionError( - results["error_message"] or "DPO training failed" - ) from exc + raise ExecutionError(result.error_message or "DPO training failed") from exc def _spawn_distributed( self, diff --git a/src/worker/executors/echo_executor.py b/src/worker/executors/echo_executor.py index cec85193..ad9795ee 100644 --- a/src/worker/executors/echo_executor.py +++ b/src/worker/executors/echo_executor.py @@ -2,7 +2,7 @@ from pathlib import Path from typing import Any -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import EchoSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask diff --git a/src/worker/executors/lora_sft_executor.py b/src/worker/executors/lora_sft_executor.py index 20f9a5f3..55deca84 100644 --- a/src/worker/executors/lora_sft_executor.py +++ b/src/worker/executors/lora_sft_executor.py @@ -19,7 +19,7 @@ from trl.trainer.sft_trainer import SFTTrainer from shared.schemas.artifact import ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import LoRASFTSpecStrict from ..utils.logging import configure_hf_library_logging @@ -115,6 +115,8 @@ def run(self, task: ExecutorTask, out_dir: Path) -> LoRAResult: checkpoint_dir = artifacts_dir / "checkpoints" checkpoint_dir.mkdir(parents=True, exist_ok=True) + resume_path: Path | None = None + train_dataset: Dataset | None = None try: model_name = spec.model_name or "gpt2" self._model_name = model_name @@ -137,7 +139,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> LoRAResult: resume_path = determine_resume_path( spec, training_cfg, out_dir, logger=logger ) - resume_str = str(resume_path) if resume_path else None + resume_str = resume_path.as_posix() if resume_path else None if bool(training_cfg.get("gradient_checkpointing", False)): model.gradient_checkpointing_enable() @@ -155,9 +157,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> LoRAResult: "Resuming LoRA training from local checkpoint %s", resume_path ) peft_model = PeftModel.from_pretrained( - model, - str(resume_path), - is_trainable=True, + model, resume_path.as_posix(), is_trainable=True ) logger.info("Loaded existing LoRA adapters; continuing fine-tuning") else: @@ -198,7 +198,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> LoRAResult: logger.info("Initialized new LoRA adapters: %s", lora_target_modules) sft_config = SFTConfig( - output_dir=str(checkpoint_dir), + output_dir=checkpoint_dir.as_posix(), num_train_epochs=float(training_cfg.get("num_train_epochs", 1.0)), per_device_train_batch_size=int(training_cfg.get("batch_size", 2)), gradient_accumulation_steps=int( @@ -300,31 +300,29 @@ def _compute_loss_with_guard( archive_path: Path | None = None if bool(training_cfg.get("save_model", True)): model_path = artifacts_dir / "final_lora" - trainer.save_model(str(model_path)) + trainer.save_model(model_path.as_posix()) tokenizer.save_pretrained(model_path) final_adapter_path = model_path logger.info("Saved LoRA-adapted weights to %s", model_path) training_time = time.time() - start_time - result_payload: dict[str, Any] = { - "task_id": task.task_id, - "training_successful": training_successful, - "training_time_seconds": training_time, - "error_message": error_msg, - "model_name": self._model_name, - "dataset_size": ( - len(train_dataset) if "train_dataset" in locals() else 0 - ), - "output_dir": out_dir.as_posix(), - "checkpoints_dir": artifact_ref("checkpoints"), - "resume_from_path": resume_str, - } + result = LoRAResult( + task_id=task.task_id, + training_successful=training_successful, + training_time_seconds=training_time, + error_message=error_msg, + model_name=self._model_name, + dataset_size=len(train_dataset) if train_dataset is not None else 0, + output_dir=out_dir.as_posix(), + checkpoints_dir=artifact_ref("checkpoints"), + resume_from_path=resume_str, + ) if final_adapter_path is not None: - result_payload["final_lora"] = artifact_ref( + result.final_lora = artifact_ref( final_adapter_path.relative_to(artifacts_dir).as_posix() ) archive_path = archive_model_dir(final_adapter_path) - result_payload["final_lora_archive"] = artifact_ref( + result.final_lora_archive = artifact_ref( archive_path.relative_to(artifacts_dir).as_posix() ) logger.info("Prepared LoRA archive at %s", archive_path) @@ -337,7 +335,7 @@ def _compute_loss_with_guard( final_adapter_path, archive_path, ) - return LoRAResult.model_validate(result_payload) + return result except Exception as exc: # pylint: disable=broad-except error_msg = str(exc) @@ -346,22 +344,20 @@ def _compute_loss_with_guard( training_time = time.time() - start_time - result: dict[str, Any] = { - "task_id": task.task_id, - "training_successful": training_successful, - "training_time_seconds": training_time, - "error_message": error_msg, - "model_name": self._model_name, - "dataset_size": len(train_dataset) if "train_dataset" in locals() else 0, - "output_dir": out_dir.as_posix(), - "checkpoints_dir": artifact_ref("checkpoints"), - "resume_from_path": ( - str(resume_path) if "resume_path" in locals() and resume_path else None - ), - } + result = LoRAResult( + task_id=task.task_id, + training_successful=training_successful, + training_time_seconds=training_time, + error_message=error_msg, + model_name=self._model_name, + dataset_size=len(train_dataset) if train_dataset is not None else 0, + output_dir=out_dir.as_posix(), + checkpoints_dir=artifact_ref("checkpoints"), + resume_from_path=resume_path.as_posix() if resume_path else None, + ) if training_successful: - return LoRAResult.model_validate(result) + return result write_executor_result(out_dir / "results.json", task.task_id, task.spec, result) message = error_msg or "LoRA SFT training failed" diff --git a/src/worker/executors/mixins/data.py b/src/worker/executors/mixins/data.py index ee45aab7..77c2cff1 100644 --- a/src/worker/executors/mixins/data.py +++ b/src/worker/executors/mixins/data.py @@ -702,14 +702,11 @@ def _collect_prompts_for_spec( ) def _populate_table( - self, - payload: dict[str, Any], - table_stores_list: list[pd.DataFrame], - ): + self, items: list[dict[str, Any]], table_stores_list: list[pd.DataFrame] + ) -> list[dict[str, Any]]: """ Group row-level generation outputs back into per-table outputs. """ - items = payload["items"] cur = 0 grouped_items: list[dict[str, Any]] = [] for df in table_stores_list: @@ -724,8 +721,7 @@ def _populate_table( f"Output length {len(items)} does not match " f"the total number of rows {cur} in table stores." ) - payload["items"] = grouped_items - return payload + return grouped_items def _maybe_apply_dataset_shard(self, dataset, spec: TaskSpecStrictBase): shard_cfg = spec.shard diff --git a/src/worker/executors/mixins/governance.py b/src/worker/executors/mixins/governance.py index c623c934..a302f595 100644 --- a/src/worker/executors/mixins/governance.py +++ b/src/worker/executors/mixins/governance.py @@ -13,12 +13,12 @@ from opentelemetry.trace import Span as OTelSpan -from shared.schemas.executor_result import BaseExecutorResult from shared.schemas.governance import ( READY_SPAN_NAME, TASK_SPAN_NAME, SpanType, ) +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import TaskSpecStrictBase from shared.utils.time import now_iso @@ -237,32 +237,25 @@ def _extract_source_data_ids(self, spec: TaskSpecStrictBase) -> list[str]: def _dump_to_governance( self, task_id: str, - result: BaseExecutorResult | dict[str, Any], + result: BaseExecutorResult, dependencies_by_task: dict[str, list[str]], ) -> None: """Write parent + merged-child results and emit asset/lineage rows.""" parent_deps = dependencies_by_task.get(task_id, []) - payload = ( - result.model_dump(serialize_as_any=True) - if isinstance(result, BaseExecutorResult) - else result - ) - children_payload = payload.get("children", {}) or {} - collection_jobs: list[dict[str, Any]] = [ { "task_id": task_id, - "result": payload, + "result": result.model_dump(), "deps": parent_deps, "is_parent": True, } ] - for child_id, child_result in children_payload.items(): + for child_id, child_result in result.children.items(): child_deps = dependencies_by_task.get(child_id, []) collection_jobs.append( { "task_id": child_id, - "result": child_result, + "result": child_result.model_dump(), "deps": child_deps, "is_parent": False, } diff --git a/src/worker/executors/mixins/inference.py b/src/worker/executors/mixins/inference.py index b2c9d7e3..4076e901 100644 --- a/src/worker/executors/mixins/inference.py +++ b/src/worker/executors/mixins/inference.py @@ -228,7 +228,7 @@ def _maybe_export_jsonl( self, spec: InferenceSpecStrict, task_id: str, - result: dict[str, Any], + items: list[dict[str, Any]], out_dir: Path, ) -> None: post_cfg = (postprocess := spec.postprocess) and postprocess.jsonl_export @@ -260,7 +260,6 @@ def _maybe_export_jsonl( ) from exc target_path.parent.mkdir(parents=True, exist_ok=True) - items = result.get("items") or [] required_fields = post_cfg.required_fields or [] records: list[dict[str, Any]] = [] diff --git a/src/worker/executors/mp_executor.py b/src/worker/executors/mp_executor.py index db92eebc..f2b7c199 100644 --- a/src/worker/executors/mp_executor.py +++ b/src/worker/executors/mp_executor.py @@ -21,7 +21,7 @@ import psutil -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.worker_message import WorkerHardware from worker.config import WorkerConfig @@ -440,9 +440,7 @@ def _loop() -> None: self._log_thread = t t.start() - def run( - self, task: ExecutorTask, out_dir: Path - ) -> BaseExecutorResult | dict[str, Any]: + def run(self, task: ExecutorTask, out_dir: Path) -> BaseExecutorResult: with self._lock: if self._shutdown: logger.info("Starting worker subprocess for %s", self.name) diff --git a/src/worker/executors/omni_executor_base.py b/src/worker/executors/omni_executor_base.py index 86021bbd..073d71c8 100644 --- a/src/worker/executors/omni_executor_base.py +++ b/src/worker/executors/omni_executor_base.py @@ -19,7 +19,7 @@ import yaml -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import TaskSpecStrictBase from shared.utils.parsing import to_bool, to_int @@ -43,7 +43,10 @@ class OmniResult(BaseExecutorResult): ok: bool = True - model: str | None = None + executor: str + mode: str + model: str | None + items: list[dict[str, Any]] class OmniExecutorBase(InferenceMixin, Executor): @@ -74,7 +77,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> OmniResult: result = self._run_inner(task, spec, spec_dict, out_dir) maybe_upload_artifacts(task, out_dir, logger=logger) maybe_upload_traces(task, out_dir, logger=logger) - return OmniResult.model_validate(result) + return result @abstractmethod def _run_inner( @@ -83,7 +86,7 @@ def _run_inner( spec: TaskSpecStrictBase, spec_dict: dict[str, Any], out_dir: Path, - ) -> dict[str, Any]: + ) -> OmniResult: """Run the executor-specific body. ``spec`` is the concrete strict spec; subclasses ``assert isinstance(spec, ...)`` to narrow.""" raise NotImplementedError diff --git a/src/worker/executors/omni_text2audio_executor.py b/src/worker/executors/omni_text2audio_executor.py index 952dfb28..77168cbb 100644 --- a/src/worker/executors/omni_text2audio_executor.py +++ b/src/worker/executors/omni_text2audio_executor.py @@ -56,16 +56,27 @@ from shared.utils.parsing import to_float, to_int from .base_executor import ExecutionError, ExecutorTask -from .omni_executor_base import OmniExecutorBase, extract_multimodal_output +from .omni_executor_base import OmniExecutorBase, OmniResult, extract_multimodal_output from .utils.checkpoints import artifact_ref logger = logging.getLogger(__name__) +EXECUTOR_NAME = "omni_text2audio" + + +class OmniText2AudioResult(OmniResult): + executor: str = EXECUTOR_NAME + mode: str = "bgm" + audio: dict[str, Any] + sample_rate: int + num_waveforms: int + audio_length: float + storyboard: dict[str, Any] | None = None class OmniText2AudioExecutor(OmniExecutorBase): """Generate background music with Omni diffusion sampling.""" - name = "omni_text2audio" + name = EXECUTOR_NAME _TASK_SPEC_TYPE = OmniText2AudioSpecStrict def prepare(self) -> None: @@ -84,7 +95,7 @@ def _run_inner( spec: TaskSpecStrictBase, spec_dict: dict[str, Any], out_dir: Path, - ) -> dict[str, Any]: + ) -> OmniText2AudioResult: assert isinstance(spec, OmniText2AudioSpecStrict) prompts = self._collect_text_inputs(spec, task.task_id) @@ -202,22 +213,15 @@ def _run_inner( if not items: raise ExecutionError("omni_text2audio produced no savable waveforms.") - first = items[0]["audio"] if items else {} - result: dict[str, Any] = { - "ok": True, - "executor": self.name, - "mode": "bgm", - "model": self._model_name, - "audio": first, - "items": items, - "sample_rate": sample_rate, - "num_waveforms": len(items), - "audio_length": audio_length, - } - storyboard = spec_dict.get("storyboard") - if isinstance(storyboard, dict): - result["storyboard"] = dict(storyboard) - return result + return OmniText2AudioResult( + model=self._model_name, + audio=items[0]["audio"] if items else {}, + items=items, + sample_rate=sample_rate, + num_waveforms=len(items), + audio_length=audio_length, + storyboard=spec_dict.get("storyboard"), + ) # ── model ──────────────────────────────────────────────────────────── diff --git a/src/worker/executors/omni_text2general_executor.py b/src/worker/executors/omni_text2general_executor.py index 5664af99..222a446f 100644 --- a/src/worker/executors/omni_text2general_executor.py +++ b/src/worker/executors/omni_text2general_executor.py @@ -32,10 +32,16 @@ from shared.utils.parsing import as_list, to_bool, to_float, to_int, to_int_list from .base_executor import ExecutionError, ExecutorTask -from .omni_executor_base import OmniExecutorBase, extract_audio_from_mm, save_audio +from .omni_executor_base import ( + OmniExecutorBase, + OmniResult, + extract_audio_from_mm, + save_audio, +) from .utils.checkpoints import artifact_ref logger = logging.getLogger(__name__) +EXECUTOR_NAME = "omni_text2general" _DEFAULT_SYSTEM_PROMPT = ( "You are Qwen, a virtual human developed by the Qwen Team, " @@ -44,10 +50,18 @@ ) +class OmniText2GeneralResult(OmniResult): + executor: str = EXECUTOR_NAME + mode: str = "narration" + audio: dict[str, Any] + sample_rate: int + storyboard: dict[str, Any] | None = None + + class OmniText2GeneralExecutor(OmniExecutorBase): """Generate narration/speech audio using Qwen3-Omni through vllm_omni.Omni.""" - name = "omni_text2general" + name = EXECUTOR_NAME _TASK_SPEC_TYPE = OmniText2GeneralSpecStrict def prepare(self) -> None: @@ -67,7 +81,7 @@ def _run_inner( spec: TaskSpecStrictBase, spec_dict: dict[str, Any], out_dir: Path, - ) -> dict[str, Any]: + ) -> OmniText2GeneralResult: assert isinstance(spec, OmniText2GeneralSpecStrict) texts = self._collect_text_inputs(spec, task.task_id) @@ -177,20 +191,13 @@ def _run_inner( item["text"] = text_out items.append(item) - first = items[0]["audio"] if items else {} - result: dict[str, Any] = { - "ok": True, - "executor": self.name, - "mode": "narration", - "model": self._model_name, - "audio": first, - "items": items, - "sample_rate": sample_rate, - } - storyboard = spec_dict.get("storyboard") - if isinstance(storyboard, dict): - result["storyboard"] = dict(storyboard) - return result + return OmniText2GeneralResult( + model=self._model_name, + items=items, + audio=items[0]["audio"] if items else {}, + sample_rate=sample_rate, + storyboard=spec_dict.get("storyboard"), + ) # ── model ──────────────────────────────────────────────────────────── diff --git a/src/worker/executors/omni_text2image_executor.py b/src/worker/executors/omni_text2image_executor.py index e3b1f234..8254f162 100644 --- a/src/worker/executors/omni_text2image_executor.py +++ b/src/worker/executors/omni_text2image_executor.py @@ -21,16 +21,23 @@ from shared.utils.parsing import as_list from .base_executor import ExecutionError, ExecutorTask -from .omni_executor_base import OmniExecutorBase +from .omni_executor_base import OmniExecutorBase, OmniResult from .utils.checkpoints import artifact_ref logger = logging.getLogger(__name__) +EXECUTOR_NAME = "omni_text2image" + + +class OmniText2ImageResult(OmniResult): + executor: str = EXECUTOR_NAME + mode: str = "image" + image: dict[str, Any] class OmniText2ImageExecutor(OmniExecutorBase): """Generate images using vllm_omni.Omni.""" - name = "omni_text2image" + name = EXECUTOR_NAME _TASK_SPEC_TYPE = OmniText2ImageSpecStrict def prepare(self) -> None: @@ -45,7 +52,7 @@ def _run_inner( spec: TaskSpecStrictBase, spec_dict: dict[str, Any], out_dir: Path, - ) -> dict[str, Any]: + ) -> OmniText2ImageResult: assert isinstance(spec, OmniText2ImageSpecStrict) prompts = self._collect_text_inputs(spec, task.task_id) @@ -92,16 +99,11 @@ def _run_inner( } ) - first = items[0]["image"] if items else {} - result: dict[str, Any] = { - "ok": True, - "executor": self.name, - "mode": "image", - "model": self._model_name, - "image": first, - "items": items, - } - return result + return OmniText2ImageResult( + model=self._model_name, + image=items[0]["image"] if items else {}, + items=items, + ) # ── model ──────────────────────────────────────────────────────────── diff --git a/src/worker/executors/omni_text2speech_executor.py b/src/worker/executors/omni_text2speech_executor.py index 5d7be1b7..ff13753c 100644 --- a/src/worker/executors/omni_text2speech_executor.py +++ b/src/worker/executors/omni_text2speech_executor.py @@ -24,6 +24,7 @@ from .base_executor import ExecutionError, ExecutorTask from .omni_executor_base import ( OmniExecutorBase, + OmniResult, extract_audio_from_mm, extract_multimodal_output, save_audio, @@ -31,12 +32,21 @@ from .utils.checkpoints import artifact_ref logger = logging.getLogger(__name__) +EXECUTOR_NAME = "omni_text2speech" + + +class OmniText2SpeechResult(OmniResult): + executor: str = EXECUTOR_NAME + mode: str = "tts" + audio: dict[str, Any] + sample_rate: int + storyboard: dict[str, Any] | None = None class OmniText2SpeechExecutor(OmniExecutorBase): """Generate speech audio using vllm_omni.Omni.""" - name = "omni_text2speech" + name = EXECUTOR_NAME _TASK_SPEC_TYPE = OmniText2SpeechSpecStrict def prepare(self) -> None: @@ -51,7 +61,7 @@ def _run_inner( spec: TaskSpecStrictBase, spec_dict: dict[str, Any], out_dir: Path, - ) -> dict[str, Any]: + ) -> OmniText2SpeechResult: assert isinstance(spec, OmniText2SpeechSpecStrict) texts = self._collect_text_inputs(spec, task.task_id) @@ -101,20 +111,13 @@ def _run_inner( } ) - first = items[0]["audio"] if items else {} - result: dict[str, Any] = { - "ok": True, - "executor": self.name, - "mode": "tts", - "model": self._model_name, - "audio": first, - "items": items, - "sample_rate": sample_rate, - } - storyboard = spec_dict.get("storyboard") - if isinstance(storyboard, dict): - result["storyboard"] = dict(storyboard) - return result + return OmniText2SpeechResult( + model=self._model_name, + items=items, + audio=items[0]["audio"] if items else {}, + sample_rate=sample_rate, + storyboard=spec_dict.get("storyboard"), + ) # ── model ──────────────────────────────────────────────────────────── diff --git a/src/worker/executors/ppo_executor.py b/src/worker/executors/ppo_executor.py index c9320f58..4bbae544 100644 --- a/src/worker/executors/ppo_executor.py +++ b/src/worker/executors/ppo_executor.py @@ -14,10 +14,7 @@ from contextlib import contextmanager, nullcontext from pathlib import Path from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, cast - -if TYPE_CHECKING: - from deepspeed.runtime.engine import DeepSpeedEngine +from typing import Any, cast import torch from datasets import Dataset @@ -34,7 +31,7 @@ from trl.trainer.ppo_trainer import PPOTrainer from shared.schemas.artifact import ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import PPOSpecStrict from shared.utils.manifest import scratch_dir from shared.utils.parsing import safe_float, safe_int, to_bool @@ -433,9 +430,8 @@ def run(self, task: ExecutorTask, out_dir: Path) -> PPOResult: ) ipc_path = scratch_dir(out_dir) / "distributed_result.json" if ipc_path.exists(): - result = self.load_json(ipc_path) self._task_out_dir = None - return PPOResult.model_validate(result) + return PPOResult.model_validate(self.load_json(ipc_path)) self._task_out_dir = None return PPOResult( training_successful=True, @@ -901,37 +897,37 @@ def build_trainer() -> _EarlyStopPPOTrainer: logger.exception("PPO training failed: %s", exc) training_time = time.time() - start_time dataset_size = len(dataset) if "dataset" in locals() else 0 # type: ignore - results = { - "training_successful": training_successful, - "training_time_seconds": training_time, - "error_message": error_msg, - "model_name": self._model_name, - "dataset_size": dataset_size, - "output_dir": out_dir.as_posix(), - } + result = PPOResult( + training_successful=training_successful, + training_time_seconds=training_time, + error_message=error_msg, + model_name=self._model_name, + dataset_size=dataset_size, + output_dir=out_dir.as_posix(), + ) write_executor_result( - out_dir / "results.json", task.task_id, task.spec, results + out_dir / "results.json", task.task_id, task.spec, result ) self._task_out_dir = None raise ExecutionError(error_msg or "PPO training failed") from exc training_time = time.time() - start_time - results = { - "training_successful": training_successful, - "training_time_seconds": training_time, - "error_message": error_msg, - "model_name": self._model_name, - "dataset_size": len(dataset), - "output_dir": out_dir.as_posix(), - "checkpoints_dir": artifact_ref("checkpoints"), - } + result = PPOResult( + training_successful=training_successful, + training_time_seconds=training_time, + error_message=error_msg, + model_name=self._model_name, + dataset_size=len(dataset), + output_dir=out_dir.as_posix(), + checkpoints_dir=artifact_ref("checkpoints"), + ) if final_model_path is not None: - results["final_model"] = artifact_ref( + result.final_model = artifact_ref( final_model_path.relative_to(artifacts_dir).as_posix() ) if final_archive_path is not None: - results["final_model_archive"] = artifact_ref( + result.final_model_archive = artifact_ref( final_archive_path.relative_to(artifacts_dir).as_posix() ) @@ -946,7 +942,7 @@ def build_trainer() -> _EarlyStopPPOTrainer: logger.info("PPO training task completed in %.2f seconds", training_time) self._task_out_dir = None - return PPOResult.model_validate(results) + return result def _ensure_jsonl_local(self, jsonl_cfg: dict[str, Any]) -> Path: headers_cfg = ( @@ -1461,7 +1457,7 @@ def _wrapped_save_model( output_dir: str | None = None, _internal_call: bool = False ) -> None: backup_model = ppo_trainer.model - backup_deepspeed: DeepSpeedEngine | None = None + backup_deepspeed: Any = None ppo_trainer.model = self._resolve_model_for_save(backup_model) if ppo_trainer.is_deepspeed_enabled: backup_deepspeed = ppo_trainer.deepspeed diff --git a/src/worker/executors/rag_executor.py b/src/worker/executors/rag_executor.py index a1634258..24286bcd 100644 --- a/src/worker/executors/rag_executor.py +++ b/src/worker/executors/rag_executor.py @@ -16,7 +16,7 @@ from datasets import load_dataset from qdrant_client import QdrantClient, models -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import RagSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask diff --git a/src/worker/executors/sft_executor.py b/src/worker/executors/sft_executor.py index 4c6cc82b..97085617 100644 --- a/src/worker/executors/sft_executor.py +++ b/src/worker/executors/sft_executor.py @@ -23,7 +23,7 @@ from trl.trainer.sft_trainer import SFTTrainer from shared.schemas.artifact import ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import SFTSpecStrict, TaskSpecStrictBase from shared.utils.manifest import scratch_dir @@ -206,9 +206,8 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: ) ipc_path = scratch_dir(out_dir) / "distributed_result.json" if ipc_path.exists(): - distributed_result = self.load_json(ipc_path) self._task_out_dir = None - return SFTResult.model_validate(distributed_result) + return SFTResult.model_validate(self.load_json(ipc_path)) self._task_out_dir = None return SFTResult( training_successful=True, @@ -222,6 +221,8 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: "Failed to launch distributed SFT subprocess" ) from spawn_exc + resume_path: Path | None = None + train_dataset: Any = None try: # Proceed with in-process training (single GPU or inside torchrun) model_name = spec.model_name or "gpt2" @@ -230,7 +231,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: resume_path = determine_resume_path( spec, training_cfg, out_dir, logger=logger ) - resume_str = str(resume_path) if resume_path else None + resume_str = resume_path.as_posix() if resume_path else None if resume_path: logger.info( @@ -308,7 +309,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: ) sft_config = SFTConfig( - output_dir=str(checkpoint_dir), + output_dir=checkpoint_dir.as_posix(), num_train_epochs=float(training_cfg.get("num_train_epochs", 1.0)), per_device_train_batch_size=int(training_cfg.get("batch_size", 2)), gradient_accumulation_steps=int( @@ -418,7 +419,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: final_archive_path: Path | None = None if bool(training_cfg.get("save_model", True)): model_path = artifacts_dir / "final_model" - trainer.save_model(str(model_path)) + trainer.save_model(model_path.as_posix()) tokenizer.save_pretrained(model_path) final_model_path = model_path logger.info("Saved fine-tuned model to %s", model_path) @@ -439,28 +440,28 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: ) training_time = time.time() - start_time - result_payload: dict[str, Any] = { - "task_id": task.task_id, - "training_successful": training_successful, - "training_time_seconds": training_time, - "error_message": error_msg, - "model_name": self._model_name, - "dataset_size": len(train_dataset), - "output_dir": out_dir.as_posix(), - "checkpoints_dir": artifact_ref("checkpoints"), - "resume_from_path": resume_str, - } + result = SFTResult( + task_id=task.task_id, + training_successful=training_successful, + training_time_seconds=training_time, + error_message=error_msg, + model_name=self._model_name, + dataset_size=len(train_dataset), + output_dir=out_dir.as_posix(), + checkpoints_dir=artifact_ref("checkpoints"), + resume_from_path=resume_str, + ) if final_model_path is not None: resolved_model_path = Path(final_model_path) self._final_model_dir = ( resolved_model_path if resolved_model_path.exists() else None ) - result_payload["final_model"] = artifact_ref( + result.final_model = artifact_ref( final_model_path.relative_to(artifacts_dir).as_posix() ) if final_archive_path is not None: - result_payload["final_model_archive"] = artifact_ref( + result.final_model_archive = artifact_ref( final_archive_path.relative_to(artifacts_dir).as_posix() ) @@ -484,7 +485,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: self._final_model_dir = None self._task_out_dir = None - return SFTResult.model_validate(result_payload) + return result except Exception as exc: error_msg = str(exc) @@ -501,23 +502,21 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: self._final_model_dir = None training_time = time.time() - start_time - result: dict[str, Any] = { - "task_id": task.task_id, - "training_successful": training_successful, - "training_time_seconds": training_time, - "error_message": error_msg, - "model_name": self._model_name, - "dataset_size": ( + result = SFTResult( + task_id=task.task_id, + training_successful=training_successful, + training_time_seconds=training_time, + error_message=error_msg, + model_name=self._model_name, + dataset_size=( len(train_dataset) if "train_dataset" in locals() and train_dataset is not None else 0 ), - "output_dir": out_dir.as_posix(), - "checkpoints_dir": artifact_ref("checkpoints"), - "resume_from_path": ( - str(resume_path) if "resume_path" in locals() and resume_path else None - ), - } + output_dir=out_dir.as_posix(), + checkpoints_dir=artifact_ref("checkpoints"), + resume_from_path=resume_path.as_posix() if resume_path else None, + ) write_executor_result(out_dir / "results.json", task.task_id, task.spec, result) if caught_exc is not None: self._task_out_dir = None @@ -619,7 +618,7 @@ def _ensure_jsonl_local(self, jsonl_cfg: dict[str, Any]) -> Path: timeout=timeout, logger=logger, ) - jsonl_cfg["path"] = str(resolved) + jsonl_cfg["path"] = resolved.as_posix() return resolved except ExecutionError as exc: last_error = exc @@ -840,13 +839,13 @@ def _resolve_deepspeed_config(training_cfg: dict[str, Any], log) -> Any | None: if isinstance(cfg, dict): return cfg if isinstance(cfg, (str, Path)): - candidate = Path(str(cfg)).expanduser() + candidate = Path(cfg).expanduser() if candidate.exists(): log.info("Using DeepSpeed config at %s", candidate) - return str(candidate) + return candidate.as_posix() # Allow literal identifiers (e.g., 'auto') without file presence. log.info("Using DeepSpeed literal configuration '%s'", cfg) - return str(cfg) + return cfg if isinstance(cfg, str) else cfg.as_posix() raise ValueError("training.deepspeed must be a dict, path string, or falsy") @staticmethod diff --git a/src/worker/executors/ssh_executor.py b/src/worker/executors/ssh_executor.py index 4677c532..107e93df 100644 --- a/src/worker/executors/ssh_executor.py +++ b/src/worker/executors/ssh_executor.py @@ -29,7 +29,7 @@ import requests -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks.components.resources import GPURequirements from shared.tasks.specs.ssh import ( SSHInputSpec, diff --git a/src/worker/executors/transformers_executor.py b/src/worker/executors/transformers_executor.py index c8a32e95..f8aa58fa 100644 --- a/src/worker/executors/transformers_executor.py +++ b/src/worker/executors/transformers_executor.py @@ -57,8 +57,8 @@ from typing import TYPE_CHECKING, Any from shared.schemas.artifact import ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult from shared.schemas.governance import SpanType +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import ( EmbeddingSpecStrict, InferenceSpecStrict, @@ -411,14 +411,14 @@ def run(self, task: ExecutorTask, out_dir: Path) -> TransformersResult: result = self._run_inner(spec, task_id, out_dir) maybe_upload_artifacts(task, out_dir, logger=logger) maybe_upload_traces(task, out_dir, logger=logger) - return TransformersResult.model_validate(result) + return result def _run_inner( self, spec: "InferenceSpecStrict | EmbeddingSpecStrict", task_id: str, out_dir: Path, - ) -> dict[str, Any]: + ) -> TransformersResult: with self._span("model load", span_type=SpanType.COMPUTE): self._ensure_model(spec) @@ -435,8 +435,6 @@ def _run_inner( assert self._model is not None - result: dict[str, Any] = {} - if self._mode == "visual-embedding": assert self._image_processor is not None device = self._device or ("cuda" if torch.cuda.is_available() else "cpu") @@ -491,15 +489,13 @@ def _run_inner( emb_path = artifacts_dir / "visual_embeddings.pt" torch.save(grouped_visual_embeddings, emb_path) - result = { - "ok": True, - "model": self._model_name, - "items": [], # Embeddings are in file - "count": len(grouped_visual_embeddings), - "embedding_file": artifact_ref("visual_embeddings.pt"), - } - if image_group_sizes is not None: - result["image_group_sizes"] = image_group_sizes + result = TransformersResult( + model=self._model_name, + items=[], + count=len(grouped_visual_embeddings), + embedding_file=artifact_ref("visual_embeddings.pt"), + image_group_sizes=image_group_sizes, + ) self._dump_to_governance( task_id=task_id, @@ -605,21 +601,20 @@ def _run_inner( prompt_tokens += int(input_len) completion_tokens += int(gen_part.shape[0]) - result = { - "ok": True, - "model": self._model_name, - "items": items, - "usage": { + result = TransformersResult( + model=self._model_name, + items=items, + usage={ "prompt_tokens": int(prompt_tokens), "completion_tokens": int(completion_tokens), "total_tokens": int(prompt_tokens + completion_tokens), "num_requests": len(self._prompts), "latency_sec": latency, }, - } + ) if isinstance(spec, InferenceSpecStrict): - self._maybe_export_jsonl(spec, task_id, result, out_dir) + self._maybe_export_jsonl(spec, task_id, items, out_dir) self._dump_to_governance( task_id=task_id, diff --git a/src/worker/executors/utils/checkpoints.py b/src/worker/executors/utils/checkpoints.py index c5c6e59f..9814c976 100644 --- a/src/worker/executors/utils/checkpoints.py +++ b/src/worker/executors/utils/checkpoints.py @@ -12,9 +12,9 @@ import requests from shared.schemas.artifact import ArtifactContext, ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult -from shared.schemas.result import write_result_in_envelope +from shared.schemas.result import BaseExecutorResult, ResultEnvelope from shared.tasks.specs import TaskSpecStrictBase +from shared.utils.atomic import atomic_write_text from shared.utils.http import add_auth_headers from shared.utils.parsing import parse_bool_env @@ -416,16 +416,13 @@ def build_artifact_context(spec: TaskSpecStrictBase, out_dir: Path) -> ArtifactC def write_executor_result( - path: Path, - task_id: str, - spec: TaskSpecStrictBase, - result: BaseExecutorResult | dict[str, Any], + path: Path, task_id: str, spec: TaskSpecStrictBase, result: BaseExecutorResult ) -> None: """Stamp ``_artifacts`` onto ``result`` and persist the envelope.""" - if isinstance(result, dict): - result = BaseExecutorResult.model_validate(result) + path.parent.mkdir(parents=True, exist_ok=True) result.artifacts = build_artifact_context(spec, path.parent) - write_result_in_envelope(path, task_id, result) + envelope = ResultEnvelope(task_id=task_id, result=result) + atomic_write_text(path, envelope.model_dump_json(indent=2)) def maybe_upload_artifacts( diff --git a/src/worker/executors/vllm_executor.py b/src/worker/executors/vllm_executor.py index 4e6cea06..bb94b21d 100644 --- a/src/worker/executors/vllm_executor.py +++ b/src/worker/executors/vllm_executor.py @@ -66,8 +66,8 @@ _HAS_VLLM = False StructuredOutputsParams = None # type: ignore -from shared.schemas.executor_result import BaseExecutorResult from shared.schemas.governance import SpanType +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import InferenceSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask @@ -1157,54 +1157,48 @@ def _collect( "num_requests": len(self._batched_inputs), } - result: dict[str, Any] = { - "ok": True, - "model": self._model_name, - "items": per_task_items.get(task_id, []), - "usage": parent_usage, - } - - child_results: dict[str, Any] = {} + items = per_task_items.get(task_id, []) + child_results: dict[str, VLLMResult] = {} for child in merge_children: child_id = child.task_id.strip() if not child_id: continue - child_payload: dict[str, Any] = { - "items": per_task_items.get(child_id, []), - } + child_items = per_task_items.get(child_id, []) maybe_usage = usage_by_task.get(child_id) - if maybe_usage: - child_payload["usage"] = maybe_usage - child_results[child_id] = child_payload + child_results[child_id] = VLLMResult( + model=self._model_name, items=child_items, usage=maybe_usage + ) if parent_tables := parent_entry.tables: - result = self._populate_table(result, parent_tables) + items = self._populate_table(items, parent_tables) if child_results: for child_id, child_payload in child_results.items(): if (child_entry := entry_by_task_id.get(child_id)) and ( child_tables := child_entry.tables ): - child_results[child_id] = self._populate_table( - child_payload, child_tables + child_results[child_id].items = self._populate_table( + child_payload.items, child_tables ) - if child_results: - result["children"] = child_results + result = VLLMResult( + children=cast(dict[str, BaseExecutorResult], child_results), + model=self._model_name, + items=items, + usage=parent_usage, + ) with self._span( "JSONL export", span_type=SpanType.COMPUTE, attributes={"task_ids": task_ids}, ): - self._maybe_export_jsonl(spec, task_id, result, out_dir) + self._maybe_export_jsonl(spec, task_id, result.items, out_dir) self._dump_to_governance( - task_id=task_id, - result=result, - dependencies_by_task=dependencies_by_task, + task_id=task_id, result=result, dependencies_by_task=dependencies_by_task ) - return VLLMResult.model_validate(result) + return result def cleanup_after_run(self) -> None: if self._llm: diff --git a/src/worker/runner.py b/src/worker/runner.py index 6d7b17b0..96c2b2cc 100644 --- a/src/worker/runner.py +++ b/src/worker/runner.py @@ -10,7 +10,7 @@ import requests -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult from shared.tasks import MergedChildTaskStrict from shared.tasks.specs import InferenceSpecStrict, TaskSpecStrictBase from shared.tasks.worker_message import ( @@ -128,7 +128,7 @@ def _write_results( spec: TaskSpecStrictBase, merged_children: list[MergedChildTaskStrict], out_dir: Path, - result: BaseExecutorResult | dict[str, Any] | None, + result: BaseExecutorResult | None, ): if result is None: return @@ -153,7 +153,7 @@ def _write_single_result( task_id: str, spec: TaskSpecStrictBase, out_dir: Path, - payload: BaseExecutorResult | dict[str, Any] | None, + payload: BaseExecutorResult | None, ): if payload is None: return @@ -181,10 +181,7 @@ def _simulate_bandwidth_delay(self, payload_bytes: int, destination: str) -> Non time.sleep(delay) def _maybe_emit_http( - self, - task_id: str, - spec: TaskSpecStrictBase, - result: BaseExecutorResult | dict[str, Any], + self, task_id: str, spec: TaskSpecStrictBase, result: BaseExecutorResult ) -> None: """Send task results to an HTTP endpoint when requested by the spec.""" destination = get_http_destination(spec) @@ -193,14 +190,9 @@ def _maybe_emit_http( url = destination.url ignore_error = destination.ignore_error - result_dict = ( - result.model_dump(serialize_as_any=True) - if isinstance(result, BaseExecutorResult) - else result - ) payload = { "task_id": task_id, - "result": result_dict, + "result": result.model_dump(), "worker_id": self.lifecycle.worker_id, } payload_size = len(json.dumps(payload, ensure_ascii=False).encode("utf-8")) diff --git a/tests/shared/test_executor_result.py b/tests/shared/test_executor_result.py index 24bb93c8..4b5d79d3 100644 --- a/tests/shared/test_executor_result.py +++ b/tests/shared/test_executor_result.py @@ -6,7 +6,7 @@ from pydantic import Field from shared.schemas.artifact import ArtifactContext, ArtifactRef -from shared.schemas.executor_result import BaseExecutorResult +from shared.schemas.result import BaseExecutorResult, ResultEnvelope class _SampleResult(BaseExecutorResult): @@ -22,9 +22,9 @@ def test_subclass_round_trip_through_base_preserves_extra_fields() -> None: ok=True, items=[{"output": "hello"}], usage={"latency_sec": 0.5}, - artifacts=ArtifactContext(base_dir="/tmp/t", base_url="http://h"), + _artifacts=ArtifactContext(base_dir="/tmp/t", base_url="http://h"), ) - payload = original.model_dump_json(serialize_as_any=True) + payload = original.model_dump_json() base = BaseExecutorResult.model_validate_json(payload) redumped = json.loads(base.model_dump_json()) @@ -34,30 +34,16 @@ def test_subclass_round_trip_through_base_preserves_extra_fields() -> None: assert redumped["_artifacts"] == {"base_dir": "/tmp/t", "base_url": "http://h"} -def test_artifacts_alias_round_trips_both_directions() -> None: - """The wire key is ``_artifacts`` (alias); field-name input is also accepted.""" - from_alias = BaseExecutorResult.model_validate({"_artifacts": {"base_dir": "/a"}}) - from_field = BaseExecutorResult.model_validate({"artifacts": {"base_dir": "/a"}}) - - assert ( - from_alias.artifacts == from_field.artifacts == ArtifactContext(base_dir="/a") - ) - - dumped = from_alias.model_dump() - assert "_artifacts" in dumped - assert "artifacts" not in dumped - - def test_recursive_children_round_trip() -> None: """Nested ``children`` deserialize as ``BaseExecutorResult`` and re-emit - their extra fields when serialized with ``serialize_as_any=True``.""" + their extra fields when serialized.""" parent = _SampleResult( items=[{"output": "p"}], children={ "c1": _SampleResult(items=[{"output": "c"}], usage={"total_tokens": 3}), }, ) - payload = parent.model_dump_json(serialize_as_any=True) + payload = parent.model_dump_json() base = BaseExecutorResult.model_validate_json(payload) assert "c1" in base.children @@ -75,11 +61,9 @@ def test_artifact_ref_is_a_typed_path_reference() -> None: def test_envelope_round_trip_preserves_subclass_payload() -> None: """The production write→read path (``write_result_in_envelope`` → ``ResultEnvelope.model_validate``) round-trips subclass fields.""" - from shared.schemas.result import ResultEnvelope - inner = _SampleResult(items=[{"output": "hello"}], usage={"total_tokens": 7}) env = ResultEnvelope(task_id="tsk-x", result=inner) - raw = env.model_dump_json(serialize_as_any=True) + raw = env.model_dump_json() parsed = ResultEnvelope.model_validate_json(raw) dumped = parsed.result.model_dump() diff --git a/tests/worker/test_connector_logging.py b/tests/worker/test_connector_logging.py index 0caa357c..417e8a98 100644 --- a/tests/worker/test_connector_logging.py +++ b/tests/worker/test_connector_logging.py @@ -1,16 +1,24 @@ """Test that connector logs are properly redirected from worker subprocess to parent.""" +import logging import tempfile import time import uuid from pathlib import Path +from typing import Any +from shared.schemas.result import BaseExecutorResult from shared.tasks.worker_message import WorkerTaskMessage from tests.worker.factories import make_live_worker_config, make_worker_hardware from worker.executors.base_executor import Executor from worker.executors.mp_executor import MPExecutor +class ConnectorLoggingResult(BaseExecutorResult): + ok: bool = True + result: dict[str, Any] + + class ConnectorLoggingExecutor(Executor): """Simple test executor that uses a connector and logs messages.""" @@ -19,9 +27,8 @@ class ConnectorLoggingExecutor(Executor): def prepare(self) -> None: pass - def run(self, task, out_dir: Path) -> dict: + def run(self, task, out_dir: Path) -> ConnectorLoggingResult: """Run a simple test that logs from different modules.""" - import logging # Get loggers from different modules that would be used in real execution executor_logger = logging.getLogger("executors.test_executor") @@ -36,13 +43,12 @@ def run(self, task, out_dir: Path) -> dict: root_logger.info("Message from root logger") - return { - "ok": True, - "result": { + return ConnectorLoggingResult( + result={ "status": "completed", "log_test": "Messages logged from different modules", }, - } + ) def cleanup_after_run(self) -> None: pass @@ -89,9 +95,8 @@ def test_connector_logs_printed_to_stderr(tmp_path: Path) -> None: mp.cleanup_after_run() - # Verify the executor ran successfully - assert isinstance(result, dict) - assert result["ok"], f"Executor failed: {result}" - assert ( - result.get("result", {}).get("status") == "completed" - ), f"Unexpected result: {result}" + # Verify the executor ran successfully. The MP boundary pickles the + # subclass instance, so the parent receives a ``ConnectorLoggingResult``. + assert isinstance(result, ConnectorLoggingResult), f"Unexpected result: {result}" + assert result.ok is True + assert result.result.get("status") == "completed", f"Unexpected result: {result}" diff --git a/tests/worker/test_data_mixin_lineage.py b/tests/worker/test_data_mixin_lineage.py index 48892ee9..491d65a7 100644 --- a/tests/worker/test_data_mixin_lineage.py +++ b/tests/worker/test_data_mixin_lineage.py @@ -7,6 +7,7 @@ from PIL import Image +from shared.schemas.result import BaseExecutorResult from worker.executors.mixins.data import DataMixin @@ -96,14 +97,16 @@ def test_dump_to_governance_with_merged_children(tmp_path: Path) -> None: mixin = _Mixin() out_dir = tmp_path / "task" with mixin._task_span("tsk-parent", "wfl-1", out_dir, owner_id="alice"): - result = { - "ok": True, - "items": [{"output": "p"}], - "children": { - "tsk-c1": {"items": [{"output": "c1"}]}, - "tsk-c2": {"items": [{"output": "c2"}]}, - }, - } + result = BaseExecutorResult.model_validate( + { + "ok": True, + "items": [{"output": "p"}], + "children": { + "tsk-c1": {"items": [{"output": "c1"}]}, + "tsk-c2": {"items": [{"output": "c2"}]}, + }, + } + ) deps = { "tsk-parent": ["tsk-up-a"], "tsk-c1": ["tsk-up-b"], diff --git a/tests/worker/test_executor_bootstrap.py b/tests/worker/test_executor_bootstrap.py index 34f6596c..45277de9 100644 --- a/tests/worker/test_executor_bootstrap.py +++ b/tests/worker/test_executor_bootstrap.py @@ -10,6 +10,7 @@ from pathlib import Path from typing import Any +from shared.schemas.result import BaseExecutorResult from tests.worker.factories import make_live_worker_config, make_worker_hardware from worker.executors.base_executor import Executor, ExecutorTask from worker.main import initialize_executors @@ -23,8 +24,8 @@ class _PassthroughExecutor(Executor): def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) - def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: # noqa: D401 - return {"ok": True} + def run(self, task: ExecutorTask, out_dir: Path) -> BaseExecutorResult: + return BaseExecutorResult.model_validate({"ok": True}) class TestInitializeExecutorsHardware: diff --git a/tests/worker/test_mp_executor_lifecycle.py b/tests/worker/test_mp_executor_lifecycle.py index f6882cc5..6cea3af4 100644 --- a/tests/worker/test_mp_executor_lifecycle.py +++ b/tests/worker/test_mp_executor_lifecycle.py @@ -4,6 +4,7 @@ import uuid from pathlib import Path +from shared.schemas.result import BaseExecutorResult from shared.tasks import TaskType from shared.tasks.specs import EchoSpecStrict from shared.tasks.worker_message import WorkerTaskMessage @@ -16,11 +17,16 @@ from worker.executors.mp_executor import MPExecutor +class _SimpleMPResult(BaseExecutorResult): + ok: bool = True + task_id: str + + class _SimpleMPExecutor(Executor): name = "simple_mp" - def run(self, task, out_dir: Path) -> dict: - return {"ok": True, "task_id": task.task_id} + def run(self, task, out_dir: Path) -> _SimpleMPResult: + return _SimpleMPResult(task_id=task.task_id) def cleanup_after_run(self) -> None: return None @@ -54,8 +60,8 @@ def test_mp_executor_does_not_start_subprocess_until_first_run(tmp_path: Path) - with tempfile.TemporaryDirectory() as out_dir: result = mp.run(_simple_task_message(), Path(out_dir)) - assert isinstance(result, dict) - assert result["ok"] is True + assert isinstance(result, _SimpleMPResult) + assert result.ok is True assert mp._shutdown is False assert mp._proc is not None assert mp._proc.is_alive() From 75083c975d044d029e2097a6a23bddc0e0af481f Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 16:25:37 +0000 Subject: [PATCH 11/26] refactor(worker): type upstream result and artifact resolver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tightens the worker-side mirror of the server's ``_render_artifact_ref`` work: ``artifact_to_source`` and ``maybe_resolve_artifact_ref`` take ``context: dict[str, BaseExecutorResult] | None`` and read ``node_result.artifacts`` directly instead of walking dict shapes defensively. The legacy "result key" fallback (where some executors nested their payload under a ``"result"`` key) is removed — no executor produces that shape after the result-schema refactor. ``GovernanceMixin._spec_upstream_results`` and the data-mixin ``context`` local are retyped accordingly, and ``_evaluate_expr`` grows a ``BaseModel`` branch (with sentinel-default ``getattr``) so expressions can walk into typed upstream results. Signed-off-by: Noppanat Wadlom --- src/worker/executors/mixins/data.py | 3 +- src/worker/executors/mixins/governance.py | 4 +- src/worker/executors/utils/artifacts.py | 35 ++++------ src/worker/executors/utils/graph_templates.py | 20 ++++-- tests/worker/test_artifact_utils.py | 66 ++++++++----------- tests/worker/test_data_mixin_lineage.py | 19 +++--- 6 files changed, 72 insertions(+), 75 deletions(-) diff --git a/src/worker/executors/mixins/data.py b/src/worker/executors/mixins/data.py index 77c2cff1..d0ee4324 100644 --- a/src/worker/executors/mixins/data.py +++ b/src/worker/executors/mixins/data.py @@ -15,6 +15,7 @@ from datasets import Dataset, load_dataset from PIL import Image +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import TaskSpecStrictBase from shared.utils.json import safe_get @@ -403,7 +404,7 @@ def _collect_prompts_for_spec( metadata_raw.append(entry_meta) elif dtype == "list": items = data.get("items") - context: dict[str, Any] | None = None + context: dict[str, BaseExecutorResult] | None = None root_node: str | None = None if items is None: expr = data.get("expr") diff --git a/src/worker/executors/mixins/governance.py b/src/worker/executors/mixins/governance.py index a302f595..44617d25 100644 --- a/src/worker/executors/mixins/governance.py +++ b/src/worker/executors/mixins/governance.py @@ -210,7 +210,9 @@ def _record_output( ) @staticmethod - def _spec_upstream_results(spec: TaskSpecStrictBase) -> dict[str, Any]: + def _spec_upstream_results( + spec: TaskSpecStrictBase, + ) -> dict[str, BaseExecutorResult]: """Validated ``spec._upstreamResults`` (server-injected stage context).""" context = spec.upstreamResults or {} if not isinstance(context, dict): diff --git a/src/worker/executors/utils/artifacts.py b/src/worker/executors/utils/artifacts.py index 7f4df0be..6935bc22 100644 --- a/src/worker/executors/utils/artifacts.py +++ b/src/worker/executors/utils/artifacts.py @@ -6,48 +6,39 @@ import requests +from shared.schemas.result import BaseExecutorResult from shared.utils.http import auth_headers from ..base_executor import ExecutionError def artifact_to_source( - ref: dict[str, Any], context: dict[str, Any] | None, node: str | None + ref: dict[str, Any], context: dict[str, BaseExecutorResult] | None, node: str | None ) -> str: """Translate a `{path: ...}` artifact ref into a URL or local path.""" rel_path = ref.get("path") if not isinstance(rel_path, str) or not rel_path: raise ExecutionError("Artifact ref must include a non-empty 'path' field") - ctx: dict[str, Any] = {} - if context and isinstance(node, str) and node: - node_payload = context.get(node) - if isinstance(node_payload, dict): - raw_ctx = node_payload.get("_artifacts") - if isinstance(raw_ctx, dict): - ctx = raw_ctx - if not ctx: - # Some executors stuff their result under a "result" key. - result = node_payload.get("result") - if isinstance(result, dict): - inner = result.get("_artifacts") - if isinstance(inner, dict): - ctx = inner - - if ctx: - base_url = ctx.get("base_url") - base_dir = ctx.get("base_dir") + if ( + context + and node + and (node_result := context.get(node)) + and (ctx := node_result.artifacts) + ): + base_url = ctx.base_url + base_dir = ctx.base_dir else: base_url = base_dir = None # Check local filesystem first - base_dir_path = Path(base_dir) if isinstance(base_dir, str) and base_dir else None + base_dir_path = Path(base_dir) if base_dir else None local_path = base_dir_path / "artifacts" / rel_path if base_dir_path else None if local_path is not None and local_path.is_file(): return local_path.as_posix() # Fallback to a URL if base_url is provided - if isinstance(base_url, str) and base_url: + if base_url: if not base_dir_path: raise ExecutionError( "Artifact ref with base_url requires upstream base_dir to " @@ -66,7 +57,7 @@ def artifact_to_source( def maybe_resolve_artifact_ref( - value: Any, context: dict[str, Any] | None, node: str | None + value: Any, context: dict[str, BaseExecutorResult] | None, node: str | None ) -> Any: """Convert `{path: ...}` ref dicts to URL/path strings; pass others through.""" if isinstance(value, dict) and "path" in value: diff --git a/src/worker/executors/utils/graph_templates.py b/src/worker/executors/utils/graph_templates.py index eb01ea1a..17958dfa 100644 --- a/src/worker/executors/utils/graph_templates.py +++ b/src/worker/executors/utils/graph_templates.py @@ -5,7 +5,9 @@ from typing import Any import pandas as pd +from pydantic import BaseModel +from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import TaskSpecStrictBase from shared.utils.json import validate_keys @@ -13,6 +15,8 @@ from ..base_executor import ExecutionError from .safe_eval import safe_execute_function, safe_materialize_function +_MISSING: Any = object() + type MessageItem = dict[str, str] type Message = Sequence[MessageItem] type MaterializedMessage = Sequence[MessageItem] @@ -580,17 +584,17 @@ def _format_column_line(label: str, value: str) -> str: return f"• {label}: {indented}" -def _evaluate_expr(expr: str, context: dict[str, Any]) -> Any: +def _evaluate_expr(expr: str, context: dict[str, BaseExecutorResult]) -> Any: if not expr: return None parts = expr.split(".") root = parts[0] - data = context.get(root) - if data is None: + result = context.get(root) + if result is None: return None - value: Any = data + value: Any = result for token in parts[1:]: if not token: continue @@ -617,6 +621,14 @@ def _evaluate_expr(expr: str, context: dict[str, Any]) -> Any: f"{attr} not a valid column in DataFrame for {token}." ) value = value[attr].tolist() + elif isinstance(value, BaseModel): + resolved = getattr(value, attr, _MISSING) + if resolved is _MISSING: + raise ExecutionError( + f"{attr} not a valid attribute of {type(value).__name__} " + f"for {token}." + ) + value = resolved else: raise ExecutionError( f"{attr} in {parts} is not a valid key - " diff --git a/tests/worker/test_artifact_utils.py b/tests/worker/test_artifact_utils.py index a30528a5..189733b2 100644 --- a/tests/worker/test_artifact_utils.py +++ b/tests/worker/test_artifact_utils.py @@ -8,6 +8,8 @@ import pytest +from shared.schemas.artifact import ArtifactContext +from shared.schemas.result import BaseExecutorResult from worker.executors.base_executor import ExecutionError from worker.executors.utils.artifacts import ( artifact_to_source, @@ -16,42 +18,26 @@ class TestArtifactToSource: - def test_unwrapped_upstream_yields_url(self, tmp_path: Path) -> None: - upstream = { - "_artifacts": { - "base_url": "http://host:8010", - "base_dir": (tmp_path / "producer-tid").as_posix(), - }, - "result": {"images": [{"path": "a.png"}]}, - } + def test_url_resolution(self, tmp_path: Path) -> None: + upstream = BaseExecutorResult( + _artifacts=ArtifactContext( + base_dir=(tmp_path / "producer-tid").as_posix(), + base_url="http://host:8010", + ), + ) url = artifact_to_source({"path": "a.png"}, {"producer": upstream}, "producer") assert url == "http://host:8010/api/v1/results/producer-tid/files/a.png" - def test_envelope_wrapped_upstream_yields_url(self, tmp_path: Path) -> None: - """Server stores results.json as `{task_id, ..., result: {...}}`; the - helper must unwrap one level to find `_artifacts`.""" - upstream = { - "task_id": "producer-tid", - "result": { - "_artifacts": { - "base_url": "http://host:8010", - "base_dir": (tmp_path / "producer-tid").as_posix(), - }, - }, - } - url = artifact_to_source({"path": "x.png"}, {"producer": upstream}, "producer") - assert url == "http://host:8010/api/v1/results/producer-tid/files/x.png" - def test_local_file_takes_fast_path(self, tmp_path: Path) -> None: task_root = tmp_path / "producer-tid" (task_root / "artifacts").mkdir(parents=True) (task_root / "artifacts" / "a.png").write_bytes(b"\x89PNG") - upstream = { - "_artifacts": { - "base_url": "http://host:8010", - "base_dir": task_root.as_posix(), - } - } + upstream = BaseExecutorResult( + _artifacts=ArtifactContext( + base_dir=task_root.as_posix(), + base_url="http://host:8010", + ) + ) resolved = artifact_to_source( {"path": "a.png"}, {"producer": upstream}, "producer" ) @@ -59,7 +45,9 @@ def test_local_file_takes_fast_path(self, tmp_path: Path) -> None: def test_local_only_upstream_returns_local_path(self, tmp_path: Path) -> None: task_root = tmp_path / "producer-tid" - upstream = {"_artifacts": {"base_url": None, "base_dir": task_root.as_posix()}} + upstream = BaseExecutorResult( + _artifacts=ArtifactContext(base_dir=task_root.as_posix()) + ) resolved = artifact_to_source( {"path": "a.png"}, {"producer": upstream}, "producer" ) @@ -67,11 +55,13 @@ def test_local_only_upstream_returns_local_path(self, tmp_path: Path) -> None: def test_missing_context_raises(self) -> None: with pytest.raises(ExecutionError, match="_artifacts context is missing"): - artifact_to_source({"path": "a.png"}, {"producer": {}}, "producer") + artifact_to_source( + {"path": "a.png"}, {"producer": BaseExecutorResult()}, "producer" + ) def test_missing_path_raises(self) -> None: with pytest.raises(ExecutionError, match="non-empty 'path' field"): - artifact_to_source({}, {"producer": {}}, "producer") + artifact_to_source({}, {"producer": BaseExecutorResult()}, "producer") class TestMaybeResolveArtifactRef: @@ -85,12 +75,12 @@ def test_passes_through_dict_without_path(self) -> None: assert maybe_resolve_artifact_ref(value, None, None) is value def test_resolves_path_dict(self, tmp_path: Path) -> None: - upstream = { - "_artifacts": { - "base_url": "http://host:8010", - "base_dir": (tmp_path / "producer-tid").as_posix(), - } - } + upstream = BaseExecutorResult( + _artifacts=ArtifactContext( + base_url="http://host:8010", + base_dir=(tmp_path / "producer-tid").as_posix(), + ) + ) out = maybe_resolve_artifact_ref( {"path": "a.png"}, {"producer": upstream}, "producer" ) diff --git a/tests/worker/test_data_mixin_lineage.py b/tests/worker/test_data_mixin_lineage.py index 491d65a7..69e8856f 100644 --- a/tests/worker/test_data_mixin_lineage.py +++ b/tests/worker/test_data_mixin_lineage.py @@ -146,20 +146,21 @@ def test_collect_prompts_resolves_grouped_image_artifact_refs_after_flatten( for name, color in (("a.png", "red"), ("b.png", "green"), ("c.png", "blue")): Image.new("RGB", (2, 2), color=color).save(artifacts_dir / name) + result = BaseExecutorResult.model_validate( + { + "images": [ + [{"path": "images/a.png"}, {"path": "images/b.png"}], + [{"path": "images/c.png"}], + ], + "_artifacts": {"base_dir": upstream_dir.as_posix()}, + } + ) spec = cast( Any, SimpleNamespace( data={"type": "list", "expr": "vision.images"}, inference={}, - upstreamResults={ - "vision": { - "images": [ - [{"path": "images/a.png"}, {"path": "images/b.png"}], - [{"path": "images/c.png"}], - ], - "_artifacts": {"base_dir": upstream_dir.as_posix()}, - } - }, + upstreamResults={"vision": result}, ), ) From 3b2fe17b0685cbf79ebabc22af27eec1f54deedb Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 19:27:36 +0000 Subject: [PATCH 12/26] refactor(worker): inline ArtifactRef constructor at call sites The ``artifact_ref(rel_path)`` helper was a one-liner around ``ArtifactRef(path=rel_path)``; now that ``ArtifactRef`` is the typed schema callers reach for directly, the wrapper is dead weight. Drop it and the corresponding test, and inline the constructor at every call site across the nine executors that used it. Signed-off-by: Noppanat Wadlom --- src/worker/executors/agent_executor.py | 10 +++------- src/worker/executors/data_retrieval_executor.py | 9 +++------ src/worker/executors/diffusers_executor.py | 8 ++------ src/worker/executors/dpo_executor.py | 11 +++++------ src/worker/executors/lora_sft_executor.py | 13 ++++++------- src/worker/executors/omni_text2audio_executor.py | 6 +++--- src/worker/executors/omni_text2general_executor.py | 6 ++++-- src/worker/executors/omni_text2image_executor.py | 6 +++--- src/worker/executors/omni_text2speech_executor.py | 6 +++--- src/worker/executors/ppo_executor.py | 11 +++++------ src/worker/executors/sft_executor.py | 13 ++++++------- src/worker/executors/transformers_executor.py | 8 ++------ src/worker/executors/utils/checkpoints.py | 8 +------- tests/worker/test_checkpoint_utils.py | 9 +-------- 14 files changed, 47 insertions(+), 77 deletions(-) diff --git a/src/worker/executors/agent_executor.py b/src/worker/executors/agent_executor.py index 7cff6d0a..05b37aa4 100644 --- a/src/worker/executors/agent_executor.py +++ b/src/worker/executors/agent_executor.py @@ -21,11 +21,7 @@ from shared.tasks.specs import AgentSpecStrict from .base_executor import ExecutionError, Executor, ExecutorTask -from .utils.checkpoints import ( - artifact_ref, - maybe_upload_artifacts, - write_executor_result, -) +from .utils.checkpoints import maybe_upload_artifacts, write_executor_result from .utils.graph_templates import build_prompts_from_graph_template # Add agent directory to sys.path for utu imports @@ -283,7 +279,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> AgentResult: "execution_log": result.get("log", []), }, agent_output=( - artifact_ref(agent_output_ref) + ArtifactRef(path=agent_output_ref) if isinstance(agent_output_ref, str) else None ), @@ -328,7 +324,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> AgentResult: "batch_summary": results.get("batch_summary", {}), }, batch_summary_file=( - artifact_ref(batch_summary_ref) + ArtifactRef(path=batch_summary_ref) if isinstance(batch_summary_ref, str) else None ), diff --git a/src/worker/executors/data_retrieval_executor.py b/src/worker/executors/data_retrieval_executor.py index 22f964e5..1cde3031 100644 --- a/src/worker/executors/data_retrieval_executor.py +++ b/src/worker/executors/data_retrieval_executor.py @@ -10,6 +10,7 @@ import pandas as pd from PIL import Image +from shared.schemas.artifact import ArtifactRef from shared.schemas.result import BaseExecutorResult from shared.tasks.specs import DataRetrievalSpecStrict from shared.utils.json import validate_keys @@ -18,11 +19,7 @@ from ..utils.serialization import serialize_dataframe from .base_executor import ExecutionError, Executor, ExecutorTask from .mixins.data import DataMixin -from .utils.checkpoints import ( - artifact_ref, - maybe_upload_artifacts, - maybe_upload_traces, -) +from .utils.checkpoints import maybe_upload_artifacts, maybe_upload_traces from .utils.graph_templates import _render_template, _resolve_columns logger = logging.getLogger(__name__) @@ -346,5 +343,5 @@ def _serialize_s3_content(self, content: Any, out_dir: Path) -> Any: filename = f"{uuid.uuid4().hex}.png" file_path = images_dir / filename content.save(file_path, format="PNG") - return artifact_ref(f"s3_images/{filename}") + return ArtifactRef(path=f"s3_images/{filename}") return content diff --git a/src/worker/executors/diffusers_executor.py b/src/worker/executors/diffusers_executor.py index 5a82e1fd..0c782fce 100644 --- a/src/worker/executors/diffusers_executor.py +++ b/src/worker/executors/diffusers_executor.py @@ -21,11 +21,7 @@ from ..utils.logging import configure_hf_library_logging from .base_executor import ExecutionError, Executor, ExecutorTask from .mixins.data import DataMixin -from .utils.checkpoints import ( - artifact_ref, - maybe_upload_artifacts, - maybe_upload_traces, -) +from .utils.checkpoints import maybe_upload_artifacts, maybe_upload_traces try: import torch @@ -347,7 +343,7 @@ def _run_inner( generated_images: list[ArtifactRef] = [] for idx, img in enumerate(images): img.save(image_dir / f"image_{idx}.png", format="PNG") - generated_images.append(artifact_ref(f"images/image_{idx}.png")) + generated_images.append(ArtifactRef(path=f"images/image_{idx}.png")) result = DiffusersResult(model=self._model_name, images=generated_images) self._dump_to_governance( diff --git a/src/worker/executors/dpo_executor.py b/src/worker/executors/dpo_executor.py index 043e0978..a3a2da09 100644 --- a/src/worker/executors/dpo_executor.py +++ b/src/worker/executors/dpo_executor.py @@ -35,7 +35,6 @@ from .mixins.training import TrainingMixin from .utils.checkpoints import ( archive_model_dir, - artifact_ref, get_http_destination, maybe_upload_artifacts, write_executor_result, @@ -484,15 +483,15 @@ def _execute_training(self, task: ExecutorTask, out_dir: Path) -> DPOResult: model_name=self._model_name, dataset_size=len(dataset) if dataset is not None else 0, output_dir=out_dir.as_posix(), - checkpoints_dir=artifact_ref("checkpoints"), + checkpoints_dir=ArtifactRef(path="checkpoints"), ) if final_model_path is not None: - result.final_model = artifact_ref( - final_model_path.relative_to(artifacts_dir).as_posix() + result.final_model = ArtifactRef( + path=final_model_path.relative_to(artifacts_dir).as_posix() ) if final_archive_path is not None: - result.final_model_archive = artifact_ref( - final_archive_path.relative_to(artifacts_dir).as_posix() + result.final_model_archive = ArtifactRef( + path=final_archive_path.relative_to(artifacts_dir).as_posix() ) maybe_upload_artifacts(task, out_dir, logger=logger) diff --git a/src/worker/executors/lora_sft_executor.py b/src/worker/executors/lora_sft_executor.py index 55deca84..c17697ae 100644 --- a/src/worker/executors/lora_sft_executor.py +++ b/src/worker/executors/lora_sft_executor.py @@ -28,7 +28,6 @@ from .sft_executor import SFTExecutor from .utils.checkpoints import ( archive_model_dir, - artifact_ref, determine_resume_path, maybe_upload_artifacts, write_executor_result, @@ -314,16 +313,16 @@ def _compute_loss_with_guard( model_name=self._model_name, dataset_size=len(train_dataset) if train_dataset is not None else 0, output_dir=out_dir.as_posix(), - checkpoints_dir=artifact_ref("checkpoints"), + checkpoints_dir=ArtifactRef(path="checkpoints"), resume_from_path=resume_str, ) if final_adapter_path is not None: - result.final_lora = artifact_ref( - final_adapter_path.relative_to(artifacts_dir).as_posix() + result.final_lora = ArtifactRef( + path=final_adapter_path.relative_to(artifacts_dir).as_posix() ) archive_path = archive_model_dir(final_adapter_path) - result.final_lora_archive = artifact_ref( - archive_path.relative_to(artifacts_dir).as_posix() + result.final_lora_archive = ArtifactRef( + path=archive_path.relative_to(artifacts_dir).as_posix() ) logger.info("Prepared LoRA archive at %s", archive_path) @@ -352,7 +351,7 @@ def _compute_loss_with_guard( model_name=self._model_name, dataset_size=len(train_dataset) if train_dataset is not None else 0, output_dir=out_dir.as_posix(), - checkpoints_dir=artifact_ref("checkpoints"), + checkpoints_dir=ArtifactRef(path="checkpoints"), resume_from_path=resume_path.as_posix() if resume_path else None, ) diff --git a/src/worker/executors/omni_text2audio_executor.py b/src/worker/executors/omni_text2audio_executor.py index 77168cbb..d0913317 100644 --- a/src/worker/executors/omni_text2audio_executor.py +++ b/src/worker/executors/omni_text2audio_executor.py @@ -50,6 +50,7 @@ current_omni_platform = None _HAS_OMNI_PLATFORM = False +from shared.schemas.artifact import ArtifactRef from shared.schemas.governance import SpanType from shared.tasks.specs import TaskSpecStrictBase from shared.tasks.specs.omni import OmniText2AudioSpecStrict @@ -57,7 +58,6 @@ from .base_executor import ExecutionError, ExecutorTask from .omni_executor_base import OmniExecutorBase, OmniResult, extract_multimodal_output -from .utils.checkpoints import artifact_ref logger = logging.getLogger(__name__) EXECUTOR_NAME = "omni_text2audio" @@ -203,8 +203,8 @@ def _run_inner( "prompt_index": prompt_idx, "waveform_index": local_idx, "prompt": prompt, - "audio": artifact_ref( - self.relative_to(save_path, artifacts_dir) + "audio": ArtifactRef( + path=self.relative_to(save_path, artifacts_dir) ), } ) diff --git a/src/worker/executors/omni_text2general_executor.py b/src/worker/executors/omni_text2general_executor.py index 222a446f..a80314f5 100644 --- a/src/worker/executors/omni_text2general_executor.py +++ b/src/worker/executors/omni_text2general_executor.py @@ -26,6 +26,7 @@ Omni = None _HAS_OMNI = False +from shared.schemas.artifact import ArtifactRef from shared.schemas.governance import SpanType from shared.tasks.specs import TaskSpecStrictBase from shared.tasks.specs.omni import OmniText2GeneralSpecStrict @@ -38,7 +39,6 @@ extract_audio_from_mm, save_audio, ) -from .utils.checkpoints import artifact_ref logger = logging.getLogger(__name__) EXECUTOR_NAME = "omni_text2general" @@ -184,7 +184,9 @@ def _run_inner( "index": idx, "request_id": rid, "prompt": texts[idx] if idx < len(texts) else None, - "audio": artifact_ref(self.relative_to(save_path, artifacts_dir)), + "audio": ArtifactRef( + path=self.relative_to(save_path, artifacts_dir) + ), } text_out = text_results.get(rid) if text_out: diff --git a/src/worker/executors/omni_text2image_executor.py b/src/worker/executors/omni_text2image_executor.py index 8254f162..068202e5 100644 --- a/src/worker/executors/omni_text2image_executor.py +++ b/src/worker/executors/omni_text2image_executor.py @@ -15,6 +15,7 @@ Omni = None _HAS_OMNI = False +from shared.schemas.artifact import ArtifactRef from shared.schemas.governance import SpanType from shared.tasks.specs import TaskSpecStrictBase from shared.tasks.specs.omni import OmniText2ImageSpecStrict @@ -22,7 +23,6 @@ from .base_executor import ExecutionError, ExecutorTask from .omni_executor_base import OmniExecutorBase, OmniResult -from .utils.checkpoints import artifact_ref logger = logging.getLogger(__name__) EXECUTOR_NAME = "omni_text2image" @@ -93,8 +93,8 @@ def _run_inner( { "index": idx, "prompt": prompt, - "image": artifact_ref( - self.relative_to(save_path, artifacts_dir) + "image": ArtifactRef( + path=self.relative_to(save_path, artifacts_dir) ), } ) diff --git a/src/worker/executors/omni_text2speech_executor.py b/src/worker/executors/omni_text2speech_executor.py index ff13753c..1cf5e26c 100644 --- a/src/worker/executors/omni_text2speech_executor.py +++ b/src/worker/executors/omni_text2speech_executor.py @@ -16,6 +16,7 @@ Omni = None _HAS_OMNI = False +from shared.schemas.artifact import ArtifactRef from shared.schemas.governance import SpanType from shared.tasks.specs import TaskSpecStrictBase from shared.tasks.specs.omni import OmniText2SpeechSpecStrict @@ -29,7 +30,6 @@ extract_multimodal_output, save_audio, ) -from .utils.checkpoints import artifact_ref logger = logging.getLogger(__name__) EXECUTOR_NAME = "omni_text2speech" @@ -105,8 +105,8 @@ def _run_inner( { "index": idx, "text": text, - "audio": artifact_ref( - self.relative_to(save_path, artifacts_dir) + "audio": ArtifactRef( + path=self.relative_to(save_path, artifacts_dir) ), } ) diff --git a/src/worker/executors/ppo_executor.py b/src/worker/executors/ppo_executor.py index 4bbae544..be710dbb 100644 --- a/src/worker/executors/ppo_executor.py +++ b/src/worker/executors/ppo_executor.py @@ -41,7 +41,6 @@ from .mixins.training import TrainingMixin from .utils.checkpoints import ( archive_model_dir, - artifact_ref, get_http_destination, maybe_upload_artifacts, write_executor_result, @@ -920,15 +919,15 @@ def build_trainer() -> _EarlyStopPPOTrainer: model_name=self._model_name, dataset_size=len(dataset), output_dir=out_dir.as_posix(), - checkpoints_dir=artifact_ref("checkpoints"), + checkpoints_dir=ArtifactRef(path="checkpoints"), ) if final_model_path is not None: - result.final_model = artifact_ref( - final_model_path.relative_to(artifacts_dir).as_posix() + result.final_model = ArtifactRef( + path=final_model_path.relative_to(artifacts_dir).as_posix() ) if final_archive_path is not None: - result.final_model_archive = artifact_ref( - final_archive_path.relative_to(artifacts_dir).as_posix() + result.final_model_archive = ArtifactRef( + path=final_archive_path.relative_to(artifacts_dir).as_posix() ) maybe_upload_artifacts(task, out_dir, logger=logger) diff --git a/src/worker/executors/sft_executor.py b/src/worker/executors/sft_executor.py index 97085617..453b25eb 100644 --- a/src/worker/executors/sft_executor.py +++ b/src/worker/executors/sft_executor.py @@ -32,7 +32,6 @@ from .mixins.training import TrainingMixin from .utils.checkpoints import ( archive_model_dir, - artifact_ref, determine_resume_path, get_http_destination, maybe_upload_artifacts, @@ -448,7 +447,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: model_name=self._model_name, dataset_size=len(train_dataset), output_dir=out_dir.as_posix(), - checkpoints_dir=artifact_ref("checkpoints"), + checkpoints_dir=ArtifactRef(path="checkpoints"), resume_from_path=resume_str, ) @@ -457,12 +456,12 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: self._final_model_dir = ( resolved_model_path if resolved_model_path.exists() else None ) - result.final_model = artifact_ref( - final_model_path.relative_to(artifacts_dir).as_posix() + result.final_model = ArtifactRef( + path=final_model_path.relative_to(artifacts_dir).as_posix() ) if final_archive_path is not None: - result.final_model_archive = artifact_ref( - final_archive_path.relative_to(artifacts_dir).as_posix() + result.final_model_archive = ArtifactRef( + path=final_archive_path.relative_to(artifacts_dir).as_posix() ) maybe_upload_artifacts(task, out_dir, logger=logger) @@ -514,7 +513,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: else 0 ), output_dir=out_dir.as_posix(), - checkpoints_dir=artifact_ref("checkpoints"), + checkpoints_dir=ArtifactRef(path="checkpoints"), resume_from_path=resume_path.as_posix() if resume_path else None, ) write_executor_result(out_dir / "results.json", task.task_id, task.spec, result) diff --git a/src/worker/executors/transformers_executor.py b/src/worker/executors/transformers_executor.py index f8aa58fa..beaed479 100644 --- a/src/worker/executors/transformers_executor.py +++ b/src/worker/executors/transformers_executor.py @@ -68,11 +68,7 @@ from .base_executor import ExecutionError, Executor, ExecutorTask from .mixins.data import InferenceEntry from .mixins.inference import InferenceMixin -from .utils.checkpoints import ( - artifact_ref, - maybe_upload_artifacts, - maybe_upload_traces, -) +from .utils.checkpoints import maybe_upload_artifacts, maybe_upload_traces try: import torch @@ -493,7 +489,7 @@ def _run_inner( model=self._model_name, items=[], count=len(grouped_visual_embeddings), - embedding_file=artifact_ref("visual_embeddings.pt"), + embedding_file=ArtifactRef(path="visual_embeddings.pt"), image_group_sizes=image_group_sizes, ) diff --git a/src/worker/executors/utils/checkpoints.py b/src/worker/executors/utils/checkpoints.py index 9814c976..acf7d345 100644 --- a/src/worker/executors/utils/checkpoints.py +++ b/src/worker/executors/utils/checkpoints.py @@ -11,7 +11,7 @@ import requests -from shared.schemas.artifact import ArtifactContext, ArtifactRef +from shared.schemas.artifact import ArtifactContext from shared.schemas.result import BaseExecutorResult, ResultEnvelope from shared.tasks.specs import TaskSpecStrictBase from shared.utils.atomic import atomic_write_text @@ -396,12 +396,6 @@ def is_cleanup_enabled() -> bool: return normalized not in {"0", "false", "no", "off"} -def artifact_ref(rel_path: str) -> ArtifactRef: - """Build an artifact reference. ``rel_path`` is relative to - ``out_dir/artifacts/``.""" - return ArtifactRef(path=rel_path) - - def build_artifact_context(spec: TaskSpecStrictBase, out_dir: Path) -> ArtifactContext: """Top-level ``_artifacts`` context. ``base_url`` is the destination origin (scheme://host[:port]) for HTTP, else ``None``.""" diff --git a/tests/worker/test_checkpoint_utils.py b/tests/worker/test_checkpoint_utils.py index 171a0c1a..7c671641 100644 --- a/tests/worker/test_checkpoint_utils.py +++ b/tests/worker/test_checkpoint_utils.py @@ -6,7 +6,7 @@ import pytest -from shared.schemas.artifact import ArtifactContext, ArtifactRef +from shared.schemas.artifact import ArtifactContext from worker.executors.base_executor import TaskReference from worker.executors.utils import checkpoints @@ -30,13 +30,6 @@ def _task( return cast(TaskReference, SimpleNamespace(task_id="task-1", spec=spec)) -class TestArtifactRef: - def test_returns_path_only(self) -> None: - assert checkpoints.artifact_ref("images/foo.png") == ArtifactRef( - path="images/foo.png" - ) - - class TestBuildArtifactContext: def test_http_destination_strips_api_suffix(self, tmp_path: Path) -> None: out_dir = tmp_path / "task-1" From e541a6201d53e8dcb3813403c472cb257151325d Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 19:27:49 +0000 Subject: [PATCH 13/26] refactor(server): return typed BaseExecutorResult from get_result ``GET /results/{task_id}`` now returns the validated ``BaseExecutorResult`` directly instead of immediately flattening it back to ``dict[str, Any]`` via ``model_dump``. FastAPI serializes the model via Pydantic (alias and ``extra="allow"`` preserved), and the endpoint's signature documents the actual shape the client sees. Signed-off-by: Noppanat Wadlom --- src/server/routers/v1/results.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/server/routers/v1/results.py b/src/server/routers/v1/results.py index 99ac8236..e38e8d76 100644 --- a/src/server/routers/v1/results.py +++ b/src/server/routers/v1/results.py @@ -4,7 +4,6 @@ import tarfile import tempfile from pathlib import Path -from typing import Any from fastapi import ( APIRouter, @@ -20,6 +19,7 @@ from pydantic import ValidationError from shared.schemas.result import ( + BaseExecutorResult, ResultEnvelope, read_result, result_file_path, @@ -115,7 +115,7 @@ async def get_result( principal: PrincipalContext = Depends(authenticate_connection), results_dir: Path = Depends(get_results_dir), logger: logging.Logger = Depends(get_logger), -) -> dict[str, Any]: +) -> BaseExecutorResult: task_id = (task_id or "").strip() if not task_id: raise HTTPException( @@ -143,7 +143,7 @@ async def get_result( detail=f"Result file is not valid JSON: {exc}", ) from exc try: - return ResultEnvelope.model_validate(content).result.model_dump() + return ResultEnvelope.model_validate(content).result except ValidationError as exc: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, From 62cfd96f596960cd25813c33003ee205090e3459 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 19:37:08 +0000 Subject: [PATCH 14/26] refactor: unify sentinel attribute lookup in dispatcher and template walker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces ``_dig_path``'s ``hasattr`` + ``getattr`` double-lookup with the same sentinel-default pattern ``_evaluate_expr`` already uses, and unifies the constant name (``_MISSING`` → ``_SENTINEL``) across both files so the two dynamic-attribute walkers stay aligned. Signed-off-by: Noppanat Wadlom --- src/server/dispatcher/base.py | 6 ++++-- src/worker/executors/utils/graph_templates.py | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/server/dispatcher/base.py b/src/server/dispatcher/base.py index 2ae68c25..a370fb05 100644 --- a/src/server/dispatcher/base.py +++ b/src/server/dispatcher/base.py @@ -38,6 +38,8 @@ from ..utils.time import now_iso from .worker_selector import DEFAULT_WORKER_SELECTION, select_worker +_SENTINEL: Any = object() + class StageReferenceNotReady(Exception): """Raised when a task references a stage whose artifacts are not yet available.""" @@ -951,9 +953,9 @@ def _dig_path(self, data: Any, parts: list[str]) -> Any: current = current[idx] continue if isinstance(current, BaseModel): - if not hasattr(current, part): + current = getattr(current, part, _SENTINEL) + if current is _SENTINEL: return None - current = getattr(current, part) continue return None return current diff --git a/src/worker/executors/utils/graph_templates.py b/src/worker/executors/utils/graph_templates.py index 17958dfa..c21cd339 100644 --- a/src/worker/executors/utils/graph_templates.py +++ b/src/worker/executors/utils/graph_templates.py @@ -15,7 +15,7 @@ from ..base_executor import ExecutionError from .safe_eval import safe_execute_function, safe_materialize_function -_MISSING: Any = object() +_SENTINEL: Any = object() type MessageItem = dict[str, str] type Message = Sequence[MessageItem] @@ -622,8 +622,8 @@ def _evaluate_expr(expr: str, context: dict[str, BaseExecutorResult]) -> Any: ) value = value[attr].tolist() elif isinstance(value, BaseModel): - resolved = getattr(value, attr, _MISSING) - if resolved is _MISSING: + resolved = getattr(value, attr, _SENTINEL) + if resolved is _SENTINEL: raise ExecutionError( f"{attr} not a valid attribute of {type(value).__name__} " f"for {token}." From 3c9835808b01c2df0e151b1bca4fd73752f3ab5a Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 20:33:19 +0000 Subject: [PATCH 15/26] refactor(sdk): port typed result and artifact schemas Mirrors the server's ``BaseExecutorResult``, ``ArtifactContext``, and ``ArtifactRef`` Pydantic models on the SDK side so ``materialize()`` walks typed attributes instead of poking raw dicts under ``envelope.result["_artifacts"]``. ``ResultEnvelope.result`` becomes ``SerializeAsAny[BaseExecutorResult]`` to preserve subclass payloads through the declared base type, matching the server's wire shape. Shared ``ArtifactContext.base_url`` gains ``exclude_if=lambda v: v is None`` so setting the field to ``None`` drops the key on serialization, replacing the previous ``ctx.pop("base_url", None)`` dict mutation. ``tests/sdk/test_schema_compat.py`` parametrizes the three new model pairs so server / SDK drift is caught at CI time. Signed-off-by: Noppanat Wadlom --- sdk/src/flowmesh/models/__init__.py | 6 +++++- sdk/src/flowmesh/models/artifacts.py | 10 ++++++++++ sdk/src/flowmesh/models/results.py | 19 +++++++++++++++++-- sdk/src/flowmesh/resources/results.py | 7 +++---- src/shared/schemas/artifact.py | 4 +++- tests/sdk/test_schema_compat.py | 10 ++++++++++ 6 files changed, 48 insertions(+), 8 deletions(-) create mode 100644 sdk/src/flowmesh/models/artifacts.py diff --git a/sdk/src/flowmesh/models/__init__.py b/sdk/src/flowmesh/models/__init__.py index e2043805..ea08aa5a 100644 --- a/sdk/src/flowmesh/models/__init__.py +++ b/sdk/src/flowmesh/models/__init__.py @@ -1,5 +1,6 @@ """FlowMesh SDK data models.""" +from .artifacts import ArtifactContext, ArtifactRef from .common import ( LogEntry, LogEvent, @@ -19,7 +20,7 @@ NodeWorkerInfo, WorkerRegisterResponse, ) -from .results import PathResponse, ResultEnvelope +from .results import BaseExecutorResult, PathResponse, ResultEnvelope from .tasks import HardwareUsage, TaskInfo, TaskUsage from .traces import ( ActiveWaitBreakdown, @@ -54,7 +55,10 @@ __all__ = [ "ActiveWaitBreakdown", + "ArtifactContext", + "ArtifactRef", "AssetSummary", + "BaseExecutorResult", "CPUInfo", "CriticalPathSummary", "E2EBreakdown", diff --git a/sdk/src/flowmesh/models/artifacts.py b/sdk/src/flowmesh/models/artifacts.py new file mode 100644 index 00000000..fdbb3edb --- /dev/null +++ b/sdk/src/flowmesh/models/artifacts.py @@ -0,0 +1,10 @@ +from pydantic import BaseModel, Field + + +class ArtifactRef(BaseModel): + path: str + + +class ArtifactContext(BaseModel): + base_dir: str + base_url: str | None = Field(default=None, exclude_if=lambda v: v is None) diff --git a/sdk/src/flowmesh/models/results.py b/sdk/src/flowmesh/models/results.py index 3368f1bc..b872d31b 100644 --- a/sdk/src/flowmesh/models/results.py +++ b/sdk/src/flowmesh/models/results.py @@ -1,8 +1,14 @@ """Result-related models.""" +# This is necessary to allow for the recursive type hint of `children` in +# `BaseExecutorResult`. +from __future__ import annotations + from typing import Any -from pydantic import BaseModel, Field +from pydantic import BaseModel, ConfigDict, Field, SerializeAsAny + +from .artifacts import ArtifactContext class PathResponse(BaseModel): @@ -10,11 +16,20 @@ class PathResponse(BaseModel): path: str +class BaseExecutorResult(BaseModel): + model_config = ConfigDict(extra="allow", serialize_by_alias=True) + + children: dict[str, SerializeAsAny[BaseExecutorResult]] = Field( + default_factory=dict, exclude_if=lambda v: not v + ) + artifacts: ArtifactContext | None = Field(default=None, alias="_artifacts") + + class ResultEnvelope(BaseModel): """Canonical on-disk shape of ``results.json`` (mirrors the server).""" task_id: str - result: dict[str, Any] + result: SerializeAsAny[BaseExecutorResult] worker_id: str | None = None metadata: dict[str, Any] | None = None received_at: str | None = Field(default=None) diff --git a/sdk/src/flowmesh/resources/results.py b/sdk/src/flowmesh/resources/results.py index 1f5e99cf..671aaa4b 100644 --- a/sdk/src/flowmesh/resources/results.py +++ b/sdk/src/flowmesh/resources/results.py @@ -198,10 +198,9 @@ def _finalize_materialize( return {}, json_path, extracted envelope = ResultEnvelope.model_validate_json(json_path.read_text()) - if _wants_artifacts(sections): - ctx = envelope.result["_artifacts"] - ctx["base_dir"] = (output_dir / task_id).resolve().as_posix() - ctx.pop("base_url", None) + if _wants_artifacts(sections) and (ctx := envelope.result.artifacts): + ctx.base_dir = (output_dir / task_id).resolve().as_posix() + ctx.base_url = None payload = envelope.model_dump(mode="json") json_path.write_text(json.dumps(payload, indent=2)) return payload, json_path, extracted diff --git a/src/shared/schemas/artifact.py b/src/shared/schemas/artifact.py index d9a232cc..724d8819 100644 --- a/src/shared/schemas/artifact.py +++ b/src/shared/schemas/artifact.py @@ -8,5 +8,7 @@ class ArtifactRef(BaseModel): class ArtifactContext(BaseModel): base_dir: str = Field(description="Producing task's output directory.") base_url: str | None = Field( - default=None, description="HTTP origin (scheme://host[:port]) for upload." + default=None, + exclude_if=lambda v: v is None, + description="HTTP origin (scheme://host[:port]) for upload.", ) diff --git a/tests/sdk/test_schema_compat.py b/tests/sdk/test_schema_compat.py index b757060f..d4092a7e 100644 --- a/tests/sdk/test_schema_compat.py +++ b/tests/sdk/test_schema_compat.py @@ -8,6 +8,9 @@ # SDK-side imports from flowmesh.models import ( + ArtifactContext, + ArtifactRef, + BaseExecutorResult, CPUInfo, GpuInfo, GpuPlatformInfo, @@ -79,6 +82,9 @@ ) from server.task.models import TaskInfo as SrvTaskInfo from server.task.models import TaskUsage as SrvTaskUsage +from shared.schemas.artifact import ArtifactContext as SrvArtifactContext +from shared.schemas.artifact import ArtifactRef as SrvArtifactRef +from shared.schemas.result import BaseExecutorResult as SrvBaseExecutorResult from shared.schemas.result import ResultEnvelope as SrvResultEnvelope from shared.schemas.worker import SSHLimits as SrvSSHLimits from shared.tasks.task_type import TaskType as SrvTaskType @@ -123,7 +129,11 @@ # Common (SrvOkResponse, OkResponse), # Results + (SrvBaseExecutorResult, BaseExecutorResult), (SrvResultEnvelope, ResultEnvelope), + # Artifacts + (SrvArtifactContext, ArtifactContext), + (SrvArtifactRef, ArtifactRef), ] From 576231282bf649b1dd0a4ca56cb1eb84997dfbeb Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 20:33:40 +0000 Subject: [PATCH 16/26] refactor(worker): construct executor results without post-hoc mutation Each typed executor result (``AgentResult``, ``DataRetrievalResult``, ``DPOResult``, ``EchoResult``, ``LoRAResult``, ``PPOResult``, ``RAGResult``, ``SFTResult``) is now built with every field passed at construction time instead of being mutated afterwards (``result.final_model = ArtifactRef(...)``). Conditional ``ArtifactRef`` fields are folded into the constructor call as inline ternaries; the SFT/LoRA pair stages locals first because the truthy branch also runs ``archive_model_dir`` / sets ``self._final_model_dir``. Executors whose result schema already pins a default (``ok=True``, ``executor=EXECUTOR_NAME``) drop the redundant kwarg at the call site. ``Runner._write_results`` now reads ``result.children`` directly: the caller already guarantees a ``BaseExecutorResult``, so the ``isinstance`` + ``result.get("children")`` dict fallback is dead. Signed-off-by: Noppanat Wadlom --- src/worker/executors/agent_executor.py | 12 +++---- .../executors/data_retrieval_executor.py | 3 -- src/worker/executors/dpo_executor.py | 22 ++++++++----- src/worker/executors/echo_executor.py | 2 +- src/worker/executors/lora_sft_executor.py | 22 +++++++------ src/worker/executors/ppo_executor.py | 20 +++++++----- src/worker/executors/rag_executor.py | 7 ++-- src/worker/executors/sft_executor.py | 32 +++++++++++-------- src/worker/runner.py | 6 +--- 9 files changed, 69 insertions(+), 57 deletions(-) diff --git a/src/worker/executors/agent_executor.py b/src/worker/executors/agent_executor.py index 05b37aa4..db59687b 100644 --- a/src/worker/executors/agent_executor.py +++ b/src/worker/executors/agent_executor.py @@ -279,9 +279,9 @@ def run(self, task: ExecutorTask, out_dir: Path) -> AgentResult: "execution_log": result.get("log", []), }, agent_output=( - ArtifactRef(path=agent_output_ref) - if isinstance(agent_output_ref, str) - else None + None + if agent_output_ref is None + else ArtifactRef(path=agent_output_ref) ), ) else: @@ -324,9 +324,9 @@ def run(self, task: ExecutorTask, out_dir: Path) -> AgentResult: "batch_summary": results.get("batch_summary", {}), }, batch_summary_file=( - ArtifactRef(path=batch_summary_ref) - if isinstance(batch_summary_ref, str) - else None + None + if batch_summary_ref is None + else ArtifactRef(path=batch_summary_ref) ), ) diff --git a/src/worker/executors/data_retrieval_executor.py b/src/worker/executors/data_retrieval_executor.py index 1cde3031..86e75145 100644 --- a/src/worker/executors/data_retrieval_executor.py +++ b/src/worker/executors/data_retrieval_executor.py @@ -151,7 +151,6 @@ def _run_sql( ) return DataRetrievalResult( - ok=True, items=items, count=len(items), ) @@ -214,7 +213,6 @@ def _run_s3( items.append(item) return DataRetrievalResult( - ok=True, type="s3", items=items, metadata=s3_result["metadata"], # type: ignore @@ -320,7 +318,6 @@ def _run_agent( ) return DataRetrievalResult( - ok=True, type="agent", items=items, count=len(items), diff --git a/src/worker/executors/dpo_executor.py b/src/worker/executors/dpo_executor.py index a3a2da09..824cda27 100644 --- a/src/worker/executors/dpo_executor.py +++ b/src/worker/executors/dpo_executor.py @@ -484,15 +484,21 @@ def _execute_training(self, task: ExecutorTask, out_dir: Path) -> DPOResult: dataset_size=len(dataset) if dataset is not None else 0, output_dir=out_dir.as_posix(), checkpoints_dir=ArtifactRef(path="checkpoints"), + final_model=( + ArtifactRef( + path=final_model_path.relative_to(artifacts_dir).as_posix() + ) + if final_model_path + else None + ), + final_model_archive=( + ArtifactRef( + path=final_archive_path.relative_to(artifacts_dir).as_posix() + ) + if final_archive_path + else None + ), ) - if final_model_path is not None: - result.final_model = ArtifactRef( - path=final_model_path.relative_to(artifacts_dir).as_posix() - ) - if final_archive_path is not None: - result.final_model_archive = ArtifactRef( - path=final_archive_path.relative_to(artifacts_dir).as_posix() - ) maybe_upload_artifacts(task, out_dir, logger=logger) diff --git a/src/worker/executors/echo_executor.py b/src/worker/executors/echo_executor.py index ad9795ee..52c0e25d 100644 --- a/src/worker/executors/echo_executor.py +++ b/src/worker/executors/echo_executor.py @@ -88,7 +88,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> EchoResult: resolved = self._resolve_item(item, context) self._append_outputs(merged_items, resolved) - result = EchoResult(ok=True, items=merged_items, count=len(merged_items)) + result = EchoResult(items=merged_items, count=len(merged_items)) deps = self._extract_source_data_ids(spec) dependencies_by_task = {task_id: deps} diff --git a/src/worker/executors/lora_sft_executor.py b/src/worker/executors/lora_sft_executor.py index c17697ae..157bbb25 100644 --- a/src/worker/executors/lora_sft_executor.py +++ b/src/worker/executors/lora_sft_executor.py @@ -305,6 +305,17 @@ def _compute_loss_with_guard( logger.info("Saved LoRA-adapted weights to %s", model_path) training_time = time.time() - start_time + final_lora: ArtifactRef | None = None + final_lora_archive: ArtifactRef | None = None + if final_adapter_path: + final_lora = ArtifactRef( + path=final_adapter_path.relative_to(artifacts_dir).as_posix() + ) + archive_path = archive_model_dir(final_adapter_path) + final_lora_archive = ArtifactRef( + path=archive_path.relative_to(artifacts_dir).as_posix() + ) + logger.info("Prepared LoRA archive at %s", archive_path) result = LoRAResult( task_id=task.task_id, training_successful=training_successful, @@ -315,16 +326,9 @@ def _compute_loss_with_guard( output_dir=out_dir.as_posix(), checkpoints_dir=ArtifactRef(path="checkpoints"), resume_from_path=resume_str, + final_lora=final_lora, + final_lora_archive=final_lora_archive, ) - if final_adapter_path is not None: - result.final_lora = ArtifactRef( - path=final_adapter_path.relative_to(artifacts_dir).as_posix() - ) - archive_path = archive_model_dir(final_adapter_path) - result.final_lora_archive = ArtifactRef( - path=archive_path.relative_to(artifacts_dir).as_posix() - ) - logger.info("Prepared LoRA archive at %s", archive_path) maybe_upload_artifacts(task, out_dir, logger=logger) diff --git a/src/worker/executors/ppo_executor.py b/src/worker/executors/ppo_executor.py index be710dbb..cd7acedb 100644 --- a/src/worker/executors/ppo_executor.py +++ b/src/worker/executors/ppo_executor.py @@ -920,15 +920,19 @@ def build_trainer() -> _EarlyStopPPOTrainer: dataset_size=len(dataset), output_dir=out_dir.as_posix(), checkpoints_dir=ArtifactRef(path="checkpoints"), + final_model=( + ArtifactRef(path=final_model_path.relative_to(artifacts_dir).as_posix()) + if final_model_path + else None + ), + final_model_archive=( + ArtifactRef( + path=final_archive_path.relative_to(artifacts_dir).as_posix() + ) + if final_archive_path + else None + ), ) - if final_model_path is not None: - result.final_model = ArtifactRef( - path=final_model_path.relative_to(artifacts_dir).as_posix() - ) - if final_archive_path is not None: - result.final_model_archive = ArtifactRef( - path=final_archive_path.relative_to(artifacts_dir).as_posix() - ) maybe_upload_artifacts(task, out_dir, logger=logger) diff --git a/src/worker/executors/rag_executor.py b/src/worker/executors/rag_executor.py index 24286bcd..0b9c41ae 100644 --- a/src/worker/executors/rag_executor.py +++ b/src/worker/executors/rag_executor.py @@ -23,11 +23,12 @@ from .utils.graph_templates import Message, build_prompts_from_graph_template logger = logging.getLogger("worker.rag") +EXECUTOR_NAME = "rag" class RAGResult(BaseExecutorResult): ok: bool = True - executor: str + executor: str = EXECUTOR_NAME qdrant: dict[str, Any] embedding: dict[str, Any] search: dict[str, Any] @@ -36,7 +37,7 @@ class RAGResult(BaseExecutorResult): class RAGExecutor(Executor): - name = "rag" + name = EXECUTOR_NAME def run(self, task: ExecutorTask, out_dir: Path) -> RAGResult: start_ts = time.time() @@ -195,8 +196,6 @@ def run(self, task: ExecutorTask, out_dir: Path) -> RAGResult: "RAG query completed queries=%d total_results=%d", len(queries), total_items ) return RAGResult( - ok=True, - executor=self.name, qdrant={"collection": collection, "url": url}, embedding={"model": model_name}, search={"top_k": top_k}, diff --git a/src/worker/executors/sft_executor.py b/src/worker/executors/sft_executor.py index 453b25eb..ec194087 100644 --- a/src/worker/executors/sft_executor.py +++ b/src/worker/executors/sft_executor.py @@ -439,6 +439,23 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: ) training_time = time.time() - start_time + final_model: ArtifactRef | None = None + final_model_archive: ArtifactRef | None = None + if final_model_path: + resolved_model_path = Path(final_model_path) + self._final_model_dir = ( + resolved_model_path if resolved_model_path.exists() else None + ) + final_model = ArtifactRef( + path=final_model_path.relative_to(artifacts_dir).as_posix() + ) + final_model_archive = ( + ArtifactRef( + path=final_archive_path.relative_to(artifacts_dir).as_posix() + ) + if final_archive_path + else None + ) result = SFTResult( task_id=task.task_id, training_successful=training_successful, @@ -449,21 +466,10 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: output_dir=out_dir.as_posix(), checkpoints_dir=ArtifactRef(path="checkpoints"), resume_from_path=resume_str, + final_model=final_model, + final_model_archive=final_model_archive, ) - if final_model_path is not None: - resolved_model_path = Path(final_model_path) - self._final_model_dir = ( - resolved_model_path if resolved_model_path.exists() else None - ) - result.final_model = ArtifactRef( - path=final_model_path.relative_to(artifacts_dir).as_posix() - ) - if final_archive_path is not None: - result.final_model_archive = ArtifactRef( - path=final_archive_path.relative_to(artifacts_dir).as_posix() - ) - maybe_upload_artifacts(task, out_dir, logger=logger) self._cleanup_local_artifacts( diff --git a/src/worker/runner.py b/src/worker/runner.py index 96c2b2cc..7b7086de 100644 --- a/src/worker/runner.py +++ b/src/worker/runner.py @@ -135,11 +135,7 @@ def _write_results( self._write_single_result(task_id, spec, out_dir, result) child_lookup = {entry.task_id: entry for entry in merged_children} - if isinstance(result, BaseExecutorResult): - children_payload: dict[str, Any] = dict(result.children) - else: - children_payload = result.get("children") or {} - for child_id, child_result in children_payload.items(): + for child_id, child_result in result.children.items(): child_info = child_lookup.get(child_id) if child_info is None: continue From f75aba81a4e76d7f4fb36b6aa2e61d9f89c1d10f Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 20:45:19 +0000 Subject: [PATCH 17/26] docs: correct result schema references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both ARCHITECTURE.md and EXECUTORS.md pointed at ``src/shared/schemas/executor_result.py``, which was consolidated into ``src/shared/schemas/result.py`` later in the refactor. Update the paths. While here, drop the ``extra="allow"`` round-trip explanation and the ``SerializeAsAny`` paragraph from the EXECUTORS.md result schema section — both are implementation justifications that already live in ``BaseExecutorResult``'s docstring / field annotations, and the executor-author guide only needs the contract: return type, base fields, and how artifact refs resolve. Signed-off-by: Noppanat Wadlom --- docs/ARCHITECTURE.md | 7 +++---- docs/EXECUTORS.md | 17 +++++------------ 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index dac1b885..08a34223 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -109,10 +109,9 @@ scripts/dev/ compile_protos, sync_requirements, check_env_examples - **Task merging.** Compatible adjacent tasks in a DAG (same `taskType`, model, hardware shape, and merge key) coalesce into a single dispatch. Merged children ride on `WorkerTaskMessage.merged_children`; the worker - writes per-child results into `result.children` on the typed - `BaseExecutorResult` (`src/shared/schemas/executor_result.py`); the - dispatcher fans out synthetic `TASK_SUCCEEDED` / `TASK_FAILED` events. - Disable with `ENABLE_TASK_MERGE=false`. + writes per-child results into `result.children`; the dispatcher fans + out synthetic `TASK_SUCCEEDED` / `TASK_FAILED` events. Disable with + `ENABLE_TASK_MERGE=false`. - **Stage stickiness** (`ENABLE_STAGE_WEIGHT_STICKINESS=true`) — the dispatcher pins stages that reference an upstream stage's checkpoint to the worker that produced it, falling back to normal selection when diff --git a/docs/EXECUTORS.md b/docs/EXECUTORS.md index 959bc9c2..155efb08 100644 --- a/docs/EXECUTORS.md +++ b/docs/EXECUTORS.md @@ -24,10 +24,9 @@ Helper utilities live in `src/worker/executors/utils/` (`artifacts`, ## Result schema -Every executor's `run()` returns a typed Pydantic subclass of -`BaseExecutorResult` (`src/shared/schemas/executor_result.py`). The base -class carries the cross-cutting fields the rest of the runtime relies -on: +Every executor's `run()` returns a subclass of `BaseExecutorResult` +(`src/shared/schemas/result.py`). The base class carries two +cross-cutting fields: - `children: dict[str, BaseExecutorResult]` — per-child results when merged tasks share a dispatch. @@ -39,16 +38,10 @@ Per-executor subclasses live next to the executor they describe — e.g. `src/worker/executors/lora_sft_executor.py`. They add executor-specific fields (`items`, `usage`, `final_lora`, `command`, …). -The base class is `extra="allow"`, so the server can deserialize an -incoming envelope as `BaseExecutorResult` without knowing the concrete -subclass; the executor-specific fields pass through. Artifact-bearing -fields use `ArtifactRef` (`{"path": rel_path}`); relative paths resolve -against the producer's `_artifacts` context via +Artifact-bearing fields use `ArtifactRef` (`{"path": rel_path}`); +relative paths resolve against the producer's `_artifacts` context via `artifact_to_source` / `_render_artifact_ref`. -Serializers at the wire seam pass `serialize_as_any=True` so subclass -fields survive the round-trip. - ## Agent executor (utu / youtu-agent) `AgentExecutor` requires the following env vars to run; the executor From 034a21a26c71a0377c15726335b3ba698bc6604f Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 21:01:04 +0000 Subject: [PATCH 18/26] fix(worker): retype omni summary fields as ArtifactRef MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The four Omni result subclasses declared their singular summary field (``image`` / ``audio``) as ``dict[str, Any]``, but each constructor passed ``items[0]["image" | "audio"]`` — an ``ArtifactRef`` instance — which Pydantic v2 rejects with ``Input should be a valid dictionary``. The validation fired on the success path (whenever ``items`` was non-empty), so every successful Omni run would have raised ``ValidationError`` at result construction. Retype each field as ``ArtifactRef | None`` and replace the empty-dict fallback with ``None`` so both the populated and empty-items paths validate cleanly. Signed-off-by: Noppanat Wadlom --- src/worker/executors/omni_text2audio_executor.py | 4 ++-- src/worker/executors/omni_text2general_executor.py | 4 ++-- src/worker/executors/omni_text2image_executor.py | 4 ++-- src/worker/executors/omni_text2speech_executor.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/worker/executors/omni_text2audio_executor.py b/src/worker/executors/omni_text2audio_executor.py index d0913317..ef4ff3c1 100644 --- a/src/worker/executors/omni_text2audio_executor.py +++ b/src/worker/executors/omni_text2audio_executor.py @@ -66,7 +66,7 @@ class OmniText2AudioResult(OmniResult): executor: str = EXECUTOR_NAME mode: str = "bgm" - audio: dict[str, Any] + audio: ArtifactRef | None sample_rate: int num_waveforms: int audio_length: float @@ -215,7 +215,7 @@ def _run_inner( return OmniText2AudioResult( model=self._model_name, - audio=items[0]["audio"] if items else {}, + audio=items[0]["audio"] if items else None, items=items, sample_rate=sample_rate, num_waveforms=len(items), diff --git a/src/worker/executors/omni_text2general_executor.py b/src/worker/executors/omni_text2general_executor.py index a80314f5..9c48763c 100644 --- a/src/worker/executors/omni_text2general_executor.py +++ b/src/worker/executors/omni_text2general_executor.py @@ -53,7 +53,7 @@ class OmniText2GeneralResult(OmniResult): executor: str = EXECUTOR_NAME mode: str = "narration" - audio: dict[str, Any] + audio: ArtifactRef | None sample_rate: int storyboard: dict[str, Any] | None = None @@ -196,7 +196,7 @@ def _run_inner( return OmniText2GeneralResult( model=self._model_name, items=items, - audio=items[0]["audio"] if items else {}, + audio=items[0]["audio"] if items else None, sample_rate=sample_rate, storyboard=spec_dict.get("storyboard"), ) diff --git a/src/worker/executors/omni_text2image_executor.py b/src/worker/executors/omni_text2image_executor.py index 068202e5..f11db25b 100644 --- a/src/worker/executors/omni_text2image_executor.py +++ b/src/worker/executors/omni_text2image_executor.py @@ -31,7 +31,7 @@ class OmniText2ImageResult(OmniResult): executor: str = EXECUTOR_NAME mode: str = "image" - image: dict[str, Any] + image: ArtifactRef | None class OmniText2ImageExecutor(OmniExecutorBase): @@ -101,7 +101,7 @@ def _run_inner( return OmniText2ImageResult( model=self._model_name, - image=items[0]["image"] if items else {}, + image=items[0]["image"] if items else None, items=items, ) diff --git a/src/worker/executors/omni_text2speech_executor.py b/src/worker/executors/omni_text2speech_executor.py index 1cf5e26c..b37aa74a 100644 --- a/src/worker/executors/omni_text2speech_executor.py +++ b/src/worker/executors/omni_text2speech_executor.py @@ -38,7 +38,7 @@ class OmniText2SpeechResult(OmniResult): executor: str = EXECUTOR_NAME mode: str = "tts" - audio: dict[str, Any] + audio: ArtifactRef | None sample_rate: int storyboard: dict[str, Any] | None = None @@ -114,7 +114,7 @@ def _run_inner( return OmniText2SpeechResult( model=self._model_name, items=items, - audio=items[0]["audio"] if items else {}, + audio=items[0]["audio"] if items else None, sample_rate=sample_rate, storyboard=spec_dict.get("storyboard"), ) From 210c6d2523a89c1c268ae954327af75b6702c504 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Tue, 19 May 2026 21:01:18 +0000 Subject: [PATCH 19/26] refactor(worker): align lora/sft result schemas with peer training executors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drops the inner ``task_id`` field from ``SFTResult`` and ``LoRAResult`` — ``ResultEnvelope`` already owns ``task_id`` at the envelope level, so duplicating it inside the result is redundant — and loosens ``LoRAResult``'s formerly-required fields (``training_successful``, ``training_time_seconds``, ``output_dir``, ``checkpoints_dir``) to the default-bearing shape ``DPOResult`` / ``PPOResult`` / ``SFTResult`` already use. The lax defaults are alive code: ``SFTExecutor``'s torchrun-spawn early-return path constructs a partial ``SFTResult`` that omits ``checkpoints_dir`` and the error / timing fields. Signed-off-by: Noppanat Wadlom --- src/worker/executors/lora_sft_executor.py | 11 ++++------- src/worker/executors/sft_executor.py | 3 --- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/worker/executors/lora_sft_executor.py b/src/worker/executors/lora_sft_executor.py index 157bbb25..4453e6cb 100644 --- a/src/worker/executors/lora_sft_executor.py +++ b/src/worker/executors/lora_sft_executor.py @@ -50,14 +50,13 @@ class LoRAResult(BaseExecutorResult): - task_id: str - training_successful: bool - training_time_seconds: float + training_successful: bool = True + training_time_seconds: float | None = None error_message: str | None = None model_name: str | None = None dataset_size: int = 0 - output_dir: str - checkpoints_dir: ArtifactRef + output_dir: str | None = None + checkpoints_dir: ArtifactRef | None = None resume_from_path: str | None = None final_lora: ArtifactRef | None = None final_lora_archive: ArtifactRef | None = None @@ -317,7 +316,6 @@ def _compute_loss_with_guard( ) logger.info("Prepared LoRA archive at %s", archive_path) result = LoRAResult( - task_id=task.task_id, training_successful=training_successful, training_time_seconds=training_time, error_message=error_msg, @@ -348,7 +346,6 @@ def _compute_loss_with_guard( training_time = time.time() - start_time result = LoRAResult( - task_id=task.task_id, training_successful=training_successful, training_time_seconds=training_time, error_message=error_msg, diff --git a/src/worker/executors/sft_executor.py b/src/worker/executors/sft_executor.py index ec194087..065bb122 100644 --- a/src/worker/executors/sft_executor.py +++ b/src/worker/executors/sft_executor.py @@ -45,7 +45,6 @@ class SFTResult(BaseExecutorResult): - task_id: str | None = None training_successful: bool = True training_time_seconds: float | None = None error_message: str | None = None @@ -457,7 +456,6 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: else None ) result = SFTResult( - task_id=task.task_id, training_successful=training_successful, training_time_seconds=training_time, error_message=error_msg, @@ -508,7 +506,6 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: training_time = time.time() - start_time result = SFTResult( - task_id=task.task_id, training_successful=training_successful, training_time_seconds=training_time, error_message=error_msg, From 909a45a123743ba2b1b3dc13486b72ca878e03d4 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Wed, 20 May 2026 12:56:55 +0000 Subject: [PATCH 20/26] fix(worker): type upstream context in graph-template and data-retrieval helpers Tightens ``context: dict[str, Any]`` to ``dict[str, BaseExecutorResult]`` on ``_build_graph_template_messages``, ``_resolve_image_embedding_count``, ``_resolve_columns`` (``graph_templates.py``); ``_run_sql``, ``_run_s3``, ``_run_agent`` (``data_retrieval_executor.py``); and ``_resolve_expr_item``, ``_resolve_item`` (``echo_executor.py``). Fixes a silent regression in ``_resolve_image_embedding_count``: after the typed-result refactor, ``isinstance(upstream, dict)`` was always ``False`` (upstream is a ``BaseExecutorResult``), so the image-embedding broadcast multiplier always returned ``None``. Replace the dict-isinstance + dict ``.get("count")`` with a sentinel-default ``getattr(upstream, "count", _SENTINEL)``, which resolves declared fields on subclasses (``EchoResult``, ``DataRetrievalResult``) and ``model_extra``-stored extras on the base. Signed-off-by: Noppanat Wadlom --- src/worker/executors/data_retrieval_executor.py | 8 ++++---- src/worker/executors/echo_executor.py | 8 ++++++-- src/worker/executors/utils/graph_templates.py | 12 +++++++----- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/worker/executors/data_retrieval_executor.py b/src/worker/executors/data_retrieval_executor.py index 86e75145..bb9dfe9b 100644 --- a/src/worker/executors/data_retrieval_executor.py +++ b/src/worker/executors/data_retrieval_executor.py @@ -77,7 +77,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> DataRetrievalResult: return result def _run_sql( - self, data_cfg: dict[str, Any], context: dict[str, Any] + self, data_cfg: dict[str, Any], context: dict[str, BaseExecutorResult] ) -> DataRetrievalResult: """ Execute SQL queries based on the provided data configuration and context. @@ -85,7 +85,7 @@ def _run_sql( :param data_cfg: Description :type data_cfg: dict[str, Any] :param context: Description - :type context: dict[str, Any] + :type context: dict[str, BaseExecutorResult] :return: Description :rtype: dict[str, Any] """ @@ -158,7 +158,7 @@ def _run_sql( def _run_s3( self, data_cfg: dict[str, Any], - context: dict[str, Any], + context: dict[str, BaseExecutorResult], out_dir: Path, ) -> DataRetrievalResult: validate_keys( @@ -221,7 +221,7 @@ def _run_s3( def _run_agent( self, data_cfg: dict[str, Any], - context: dict[str, Any], + context: dict[str, BaseExecutorResult], out_dir: Path, ) -> DataRetrievalResult: """Drive lumid.data's data agent for NL-driven retrieval.""" diff --git a/src/worker/executors/echo_executor.py b/src/worker/executors/echo_executor.py index 52c0e25d..e70f1412 100644 --- a/src/worker/executors/echo_executor.py +++ b/src/worker/executors/echo_executor.py @@ -32,7 +32,9 @@ def _append_outputs(self, out_items: list[dict[str, Any]], value: Any) -> None: out_items.append({"output": value}) @staticmethod - def _resolve_expr_item(item: dict[str, Any], context: dict[str, Any]) -> Any: + def _resolve_expr_item( + item: dict[str, Any], context: dict[str, BaseExecutorResult] + ) -> Any: expr = item.get("expr") if not expr: node = item.get("node") @@ -51,7 +53,9 @@ def _resolve_expr_item(item: dict[str, Any], context: dict[str, Any]) -> Any: ) return resolved - def _resolve_item(self, item: EchoItem, context: dict[str, Any]) -> Any: + def _resolve_item( + self, item: EchoItem, context: dict[str, BaseExecutorResult] + ) -> Any: if isinstance(item, str): return item elif isinstance(item, dict): diff --git a/src/worker/executors/utils/graph_templates.py b/src/worker/executors/utils/graph_templates.py index c21cd339..e51aad54 100644 --- a/src/worker/executors/utils/graph_templates.py +++ b/src/worker/executors/utils/graph_templates.py @@ -85,7 +85,7 @@ def build_prompts_from_graph_template( def _maybe_broadcast_image_prompts( prompts: Sequence[str | Message], data_cfg: dict[str, Any], - context: dict[str, Any], + context: dict[str, BaseExecutorResult], ) -> list[str | Message]: image_embedding = data_cfg.get("image_embedding") if not isinstance(image_embedding, dict): @@ -99,7 +99,7 @@ def _maybe_broadcast_image_prompts( def _resolve_image_embedding_count( - image_embedding: dict[str, Any], context: dict[str, Any] + image_embedding: dict[str, Any], context: dict[str, BaseExecutorResult] ) -> int | None: node = image_embedding.get("node") if not isinstance(node, str): @@ -109,15 +109,17 @@ def _resolve_image_embedding_count( if not isinstance(node, str) or not node: return None upstream = context.get(node) - if not isinstance(upstream, dict): + if upstream is None: return None - count = upstream.get("count") + count = getattr(upstream, "count", _SENTINEL) if isinstance(count, int) and count > 0: return count return None -def _resolve_columns(columns_cfg: Any, context: dict[str, Any]) -> list[dict[str, Any]]: +def _resolve_columns( + columns_cfg: Any, context: dict[str, BaseExecutorResult] +) -> list[dict[str, Any]]: if not isinstance(columns_cfg, list): raise ExecutionError("graph_template.template.columns must be a list.") From 0a42ae22d9d465b7fd35eb6930e07c59de686b25 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Wed, 20 May 2026 12:57:55 +0000 Subject: [PATCH 21/26] refactor: standardize ok across executor results and remove training_successful MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lifts ``ok: bool = True`` to ``BaseExecutorResult`` (shared and SDK). Drops the now-redundant declaration from ``APIResult``, ``EchoResult``, ``DataRetrievalResult``, ``OmniResult``, ``RAGResult``, and replaces ``training_successful: bool`` with the inherited ``ok`` across ``SFTResult``, ``LoRAResult``, ``DPOResult``, ``PPOResult`` — including the corresponding constructor kwargs, local variables, and the LoRA ``if training_successful: return result`` guard. Wire-format consequence: on-disk training result JSONs no longer carry a ``training_successful`` key; ``ok`` is emitted by every result. Signed-off-by: Noppanat Wadlom --- sdk/src/flowmesh/models/results.py | 1 + src/shared/schemas/result.py | 1 + src/worker/executors/api_executor.py | 1 - src/worker/executors/data_retrieval_executor.py | 1 - src/worker/executors/dpo_executor.py | 5 +---- src/worker/executors/echo_executor.py | 1 - src/worker/executors/lora_sft_executor.py | 13 ++++++------- src/worker/executors/omni_executor_base.py | 1 - src/worker/executors/ppo_executor.py | 10 ++++------ src/worker/executors/rag_executor.py | 1 - src/worker/executors/sft_executor.py | 12 +++++------- 11 files changed, 18 insertions(+), 29 deletions(-) diff --git a/sdk/src/flowmesh/models/results.py b/sdk/src/flowmesh/models/results.py index b872d31b..8a1ae05d 100644 --- a/sdk/src/flowmesh/models/results.py +++ b/sdk/src/flowmesh/models/results.py @@ -19,6 +19,7 @@ class PathResponse(BaseModel): class BaseExecutorResult(BaseModel): model_config = ConfigDict(extra="allow", serialize_by_alias=True) + ok: bool = True children: dict[str, SerializeAsAny[BaseExecutorResult]] = Field( default_factory=dict, exclude_if=lambda v: not v ) diff --git a/src/shared/schemas/result.py b/src/shared/schemas/result.py index 832fa25c..5a876f81 100644 --- a/src/shared/schemas/result.py +++ b/src/shared/schemas/result.py @@ -25,6 +25,7 @@ class BaseExecutorResult(BaseModel): model_config = ConfigDict(extra="allow", serialize_by_alias=True) + ok: bool = Field(default=True, description="Whether task execution succeeded.") children: dict[str, SerializeAsAny[BaseExecutorResult]] = Field( default_factory=dict, exclude_if=lambda v: not v, diff --git a/src/worker/executors/api_executor.py b/src/worker/executors/api_executor.py index a845b31e..bc0e9bde 100644 --- a/src/worker/executors/api_executor.py +++ b/src/worker/executors/api_executor.py @@ -19,7 +19,6 @@ class APIResult(BaseExecutorResult): - ok: bool = True executor: str method: str url: str diff --git a/src/worker/executors/data_retrieval_executor.py b/src/worker/executors/data_retrieval_executor.py index bb9dfe9b..4347a262 100644 --- a/src/worker/executors/data_retrieval_executor.py +++ b/src/worker/executors/data_retrieval_executor.py @@ -26,7 +26,6 @@ class DataRetrievalResult(BaseExecutorResult): - ok: bool = True type: str | None = None items: list[dict[str, Any]] = [] count: int | None = None diff --git a/src/worker/executors/dpo_executor.py b/src/worker/executors/dpo_executor.py index 824cda27..b62256f2 100644 --- a/src/worker/executors/dpo_executor.py +++ b/src/worker/executors/dpo_executor.py @@ -47,7 +47,6 @@ class DPOResult(BaseExecutorResult): - training_successful: bool = True training_time_seconds: float | None = None error_message: str | None = None model_name: str | None = None @@ -96,7 +95,6 @@ def run(self, task: ExecutorTask, out_dir: Path) -> DPOResult: if ipc_path.exists(): return DPOResult.model_validate(self.load_json(ipc_path)) return DPOResult( - training_successful=True, spawned_torchrun=True, model_name=( spec.model.source.identifier @@ -477,7 +475,6 @@ def _execute_training(self, task: ExecutorTask, out_dir: Path) -> DPOResult: training_time = time.time() - start_time result = DPOResult( - training_successful=True, training_time_seconds=training_time, error_message=None, model_name=self._model_name, @@ -512,7 +509,7 @@ def _execute_training(self, task: ExecutorTask, out_dir: Path) -> DPOResult: except Exception as exc: training_time = time.time() - start_time result = DPOResult( - training_successful=False, + ok=False, training_time_seconds=training_time, error_message=str(exc), model_name=self._model_name, diff --git a/src/worker/executors/echo_executor.py b/src/worker/executors/echo_executor.py index e70f1412..1a03662f 100644 --- a/src/worker/executors/echo_executor.py +++ b/src/worker/executors/echo_executor.py @@ -16,7 +16,6 @@ class EchoResult(BaseExecutorResult): - ok: bool = True items: list[dict[str, Any]] = [] count: int = 0 diff --git a/src/worker/executors/lora_sft_executor.py b/src/worker/executors/lora_sft_executor.py index 4453e6cb..9e36e220 100644 --- a/src/worker/executors/lora_sft_executor.py +++ b/src/worker/executors/lora_sft_executor.py @@ -50,7 +50,6 @@ class LoRAResult(BaseExecutorResult): - training_successful: bool = True training_time_seconds: float | None = None error_message: str | None = None model_name: str | None = None @@ -77,7 +76,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> LoRAResult: configure_hf_library_logging() spec = self.require_spec(task, LoRASFTSpecStrict) start_time = time.time() - training_successful = False + ok = False error_msg: str | None = None if ( @@ -291,7 +290,7 @@ def _compute_loss_with_guard( logger.info("Starting LoRA supervised fine-tuning run") trainer.train() - training_successful = True + ok = True logger.info("LoRA SFT training completed") final_adapter_path: Path | None = None @@ -316,7 +315,7 @@ def _compute_loss_with_guard( ) logger.info("Prepared LoRA archive at %s", archive_path) result = LoRAResult( - training_successful=training_successful, + ok=ok, training_time_seconds=training_time, error_message=error_msg, model_name=self._model_name, @@ -340,13 +339,13 @@ def _compute_loss_with_guard( except Exception as exc: # pylint: disable=broad-except error_msg = str(exc) - training_successful = False + ok = False logger.exception("LoRA SFT training failed: %s", exc) training_time = time.time() - start_time result = LoRAResult( - training_successful=training_successful, + ok=ok, training_time_seconds=training_time, error_message=error_msg, model_name=self._model_name, @@ -356,7 +355,7 @@ def _compute_loss_with_guard( resume_from_path=resume_path.as_posix() if resume_path else None, ) - if training_successful: + if ok: return result write_executor_result(out_dir / "results.json", task.task_id, task.spec, result) diff --git a/src/worker/executors/omni_executor_base.py b/src/worker/executors/omni_executor_base.py index 073d71c8..8fd9ec21 100644 --- a/src/worker/executors/omni_executor_base.py +++ b/src/worker/executors/omni_executor_base.py @@ -42,7 +42,6 @@ class OmniResult(BaseExecutorResult): - ok: bool = True executor: str mode: str model: str | None diff --git a/src/worker/executors/ppo_executor.py b/src/worker/executors/ppo_executor.py index cd7acedb..4c859719 100644 --- a/src/worker/executors/ppo_executor.py +++ b/src/worker/executors/ppo_executor.py @@ -53,7 +53,6 @@ class PPOResult(BaseExecutorResult): - training_successful: bool = True training_time_seconds: float | None = None error_message: str | None = None model_name: str | None = None @@ -433,7 +432,6 @@ def run(self, task: ExecutorTask, out_dir: Path) -> PPOResult: return PPOResult.model_validate(self.load_json(ipc_path)) self._task_out_dir = None return PPOResult( - training_successful=True, spawned_torchrun=True, model_name=spec.model_name, output_dir=out_dir.as_posix(), @@ -863,7 +861,7 @@ def build_trainer() -> _EarlyStopPPOTrainer: pass logger.info("PPO training completed") - training_successful = True + ok = True error_msg = None # Save model if requested @@ -891,13 +889,13 @@ def build_trainer() -> _EarlyStopPPOTrainer: logger.warning("Failed to save model: %s", exc) except Exception as exc: - training_successful = False + ok = False error_msg = str(exc) logger.exception("PPO training failed: %s", exc) training_time = time.time() - start_time dataset_size = len(dataset) if "dataset" in locals() else 0 # type: ignore result = PPOResult( - training_successful=training_successful, + ok=ok, training_time_seconds=training_time, error_message=error_msg, model_name=self._model_name, @@ -913,7 +911,7 @@ def build_trainer() -> _EarlyStopPPOTrainer: training_time = time.time() - start_time result = PPOResult( - training_successful=training_successful, + ok=ok, training_time_seconds=training_time, error_message=error_msg, model_name=self._model_name, diff --git a/src/worker/executors/rag_executor.py b/src/worker/executors/rag_executor.py index 0b9c41ae..06b3c705 100644 --- a/src/worker/executors/rag_executor.py +++ b/src/worker/executors/rag_executor.py @@ -27,7 +27,6 @@ class RAGResult(BaseExecutorResult): - ok: bool = True executor: str = EXECUTOR_NAME qdrant: dict[str, Any] embedding: dict[str, Any] diff --git a/src/worker/executors/sft_executor.py b/src/worker/executors/sft_executor.py index 065bb122..030dcd81 100644 --- a/src/worker/executors/sft_executor.py +++ b/src/worker/executors/sft_executor.py @@ -45,7 +45,6 @@ class SFTResult(BaseExecutorResult): - training_successful: bool = True training_time_seconds: float | None = None error_message: str | None = None model_name: str | None = None @@ -75,7 +74,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: spec = self.require_spec(task, SFTSpecStrict) requested_gpu_count = self._requested_gpu_count(spec) start_time = time.time() - training_successful = False + ok = False error_msg: str | None = None caught_exc: Exception | None = None self._task_out_dir = out_dir @@ -208,7 +207,6 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: return SFTResult.model_validate(self.load_json(ipc_path)) self._task_out_dir = None return SFTResult( - training_successful=True, spawned_torchrun=True, model_name=spec.model_name, output_dir=out_dir.as_posix(), @@ -410,7 +408,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: logger.info("Starting supervised fine-tuning") trainer.train() - training_successful = True + ok = True logger.info("Training finished") final_model_path: Path | None = None @@ -456,7 +454,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: else None ) result = SFTResult( - training_successful=training_successful, + ok=ok, training_time_seconds=training_time, error_message=error_msg, model_name=self._model_name, @@ -492,7 +490,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: except Exception as exc: error_msg = str(exc) - training_successful = False + ok = False caught_exc = exc logger.exception("SFT training failed: %s", exc) trainer = None @@ -506,7 +504,7 @@ def run(self, task: ExecutorTask, out_dir: Path) -> SFTResult: training_time = time.time() - start_time result = SFTResult( - training_successful=training_successful, + ok=ok, training_time_seconds=training_time, error_message=error_msg, model_name=self._model_name, From 9a24320b0a17f9a83ebbe6d62fa50dd6eeb87236 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Wed, 20 May 2026 12:58:26 +0000 Subject: [PATCH 22/26] fix(shared): rename internal artifacts field to artifacts_ and lock against subclass redef MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renames ``BaseExecutorResult.artifacts`` to ``artifacts_`` (wire alias ``_artifacts`` unchanged) and adds a ``__pydantic_init_subclass__`` hook that rejects any subclass declaring its own ``artifacts_`` annotation. The trailing-underscore Python name + locked field signals that this context is internal — set by the runner via ``result.artifacts_ = build_artifact_context(...)``, never by executor code — while freeing executor result subclasses to declare their own ``artifacts`` field without colliding with the base. Updates the four attribute-access sites: ``_render_artifact_ref`` (``dispatcher/base.py``), ``artifact_to_source`` (``worker/utils/ artifacts.py``), ``write_executor_result`` (``worker/utils/ checkpoints.py``), and ``_finalize_materialize`` (``sdk/resources/ results.py``). Signed-off-by: Noppanat Wadlom --- sdk/src/flowmesh/models/results.py | 11 ++++++++++- sdk/src/flowmesh/resources/results.py | 2 +- src/server/dispatcher/base.py | 2 +- src/shared/schemas/result.py | 11 ++++++++++- src/worker/executors/utils/artifacts.py | 2 +- src/worker/executors/utils/checkpoints.py | 2 +- 6 files changed, 24 insertions(+), 6 deletions(-) diff --git a/sdk/src/flowmesh/models/results.py b/sdk/src/flowmesh/models/results.py index 8a1ae05d..399ea7b8 100644 --- a/sdk/src/flowmesh/models/results.py +++ b/sdk/src/flowmesh/models/results.py @@ -23,7 +23,16 @@ class BaseExecutorResult(BaseModel): children: dict[str, SerializeAsAny[BaseExecutorResult]] = Field( default_factory=dict, exclude_if=lambda v: not v ) - artifacts: ArtifactContext | None = Field(default=None, alias="_artifacts") + artifacts_: ArtifactContext | None = Field(default=None, alias="_artifacts") + + @classmethod + def __pydantic_init_subclass__(cls, **kwargs: Any) -> None: + super().__pydantic_init_subclass__(**kwargs) + if "artifacts_" in cls.__annotations__: + raise TypeError( + f"{cls.__name__} may not redefine the internal " + "BaseExecutorResult.artifacts_ field" + ) class ResultEnvelope(BaseModel): diff --git a/sdk/src/flowmesh/resources/results.py b/sdk/src/flowmesh/resources/results.py index 671aaa4b..3742106b 100644 --- a/sdk/src/flowmesh/resources/results.py +++ b/sdk/src/flowmesh/resources/results.py @@ -198,7 +198,7 @@ def _finalize_materialize( return {}, json_path, extracted envelope = ResultEnvelope.model_validate_json(json_path.read_text()) - if _wants_artifacts(sections) and (ctx := envelope.result.artifacts): + if _wants_artifacts(sections) and (ctx := envelope.result.artifacts_): ctx.base_dir = (output_dir / task_id).resolve().as_posix() ctx.base_url = None payload = envelope.model_dump(mode="json") diff --git a/src/server/dispatcher/base.py b/src/server/dispatcher/base.py index a370fb05..501a9b2d 100644 --- a/src/server/dispatcher/base.py +++ b/src/server/dispatcher/base.py @@ -857,7 +857,7 @@ def _render_artifact_ref(value: Any, stage_result: ResultEnvelope) -> str | None path_value = value.get("path") if not isinstance(path_value, str) or not path_value: return None - ctx = stage_result.result.artifacts + ctx = stage_result.result.artifacts_ if ctx is None: return None base_url = ctx.base_url diff --git a/src/shared/schemas/result.py b/src/shared/schemas/result.py index 5a876f81..e4378f90 100644 --- a/src/shared/schemas/result.py +++ b/src/shared/schemas/result.py @@ -31,12 +31,21 @@ class BaseExecutorResult(BaseModel): exclude_if=lambda v: not v, description="Per-child result payloads for task merging.", ) - artifacts: ArtifactContext | None = Field( + artifacts_: ArtifactContext | None = Field( default=None, alias="_artifacts", description="Resolution context for relative artifact refs.", ) + @classmethod + def __pydantic_init_subclass__(cls, **kwargs: Any) -> None: + super().__pydantic_init_subclass__(**kwargs) + if "artifacts_" in cls.__annotations__: + raise TypeError( + f"{cls.__name__} may not redefine the internal " + "BaseExecutorResult.artifacts_ field" + ) + class ResultEnvelope(BaseModel): task_id: str = Field(description="Task identifier.") diff --git a/src/worker/executors/utils/artifacts.py b/src/worker/executors/utils/artifacts.py index 6935bc22..8605433b 100644 --- a/src/worker/executors/utils/artifacts.py +++ b/src/worker/executors/utils/artifacts.py @@ -24,7 +24,7 @@ def artifact_to_source( context and node and (node_result := context.get(node)) - and (ctx := node_result.artifacts) + and (ctx := node_result.artifacts_) ): base_url = ctx.base_url base_dir = ctx.base_dir diff --git a/src/worker/executors/utils/checkpoints.py b/src/worker/executors/utils/checkpoints.py index acf7d345..ed57f369 100644 --- a/src/worker/executors/utils/checkpoints.py +++ b/src/worker/executors/utils/checkpoints.py @@ -414,7 +414,7 @@ def write_executor_result( ) -> None: """Stamp ``_artifacts`` onto ``result`` and persist the envelope.""" path.parent.mkdir(parents=True, exist_ok=True) - result.artifacts = build_artifact_context(spec, path.parent) + result.artifacts_ = build_artifact_context(spec, path.parent) envelope = ResultEnvelope(task_id=task_id, result=result) atomic_write_text(path, envelope.model_dump_json(indent=2)) From bdbf74b650e9d7df88873a18cec1aef79e1012a6 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Wed, 20 May 2026 12:58:47 +0000 Subject: [PATCH 23/26] chore: tighten SDK pydantic pin to >=2.12.3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDK's ``pydantic>=2.0.0`` floor was too loose — ``exclude_if`` (used on ``BaseExecutorResult.children`` and ``ArtifactContext. base_url``) was added in pydantic v2.12.0, so older 2.x versions would silently ignore the kwarg and emit unwanted null/empty keys. Pin the SDK floor to ``>=2.12.3``, matching the runtime workspace. (The reviewer's request to bump to ``>=2.12.4`` is blocked upstream by ``vllm-omni 0.18 → gradio 5.50 → pydantic<=2.12.3``; the 2.12.4 change only refines JSON Schema's ``required`` list interaction with ``exclude_if`` and doesn't affect the ``model_dump`` path FlowMesh uses.) Signed-off-by: Noppanat Wadlom --- sdk/pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index 881e7689..97ca5abc 100644 --- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -13,7 +13,7 @@ license-files = ["LICENSE"] dependencies = [ "httpx>=0.27.0", "pandas>=2.3.3", - "pydantic>=2.0.0", + "pydantic>=2.12.3", "pyyaml>=6.0.0", ] diff --git a/uv.lock b/uv.lock index ea21fb32..650e27a2 100644 --- a/uv.lock +++ b/uv.lock @@ -2355,7 +2355,7 @@ dependencies = [ requires-dist = [ { name = "httpx", specifier = ">=0.27.0" }, { name = "pandas", specifier = ">=2.3.3" }, - { name = "pydantic", specifier = ">=2.0.0" }, + { name = "pydantic", specifier = ">=2.12.3" }, { name = "pyyaml", specifier = ">=6.0.0" }, ] From 48406cf2ffe44a735e59c0f0bf69e7985c0257f9 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Wed, 20 May 2026 12:58:56 +0000 Subject: [PATCH 24/26] docs(worker): describe typed result contract in base_executor docstring Updates the module docstring's usage example and contract block to use ``BaseExecutorResult`` instead of the legacy ``-> dict`` shape. The example now declares a ``MyResult(BaseExecutorResult)`` subclass and returns it, matching the actual typed contract every executor follows post-result-schema refactor. Import-line in the snippet now points at the canonical locations: ``BaseExecutorResult`` from ``shared.schemas.result``, ``Executor`` / ``ExecutionError`` from ``worker.executors.base_executor``. Signed-off-by: Noppanat Wadlom --- src/worker/executors/base_executor.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/worker/executors/base_executor.py b/src/worker/executors/base_executor.py index f17a3d7f..f5a20245 100644 --- a/src/worker/executors/base_executor.py +++ b/src/worker/executors/base_executor.py @@ -2,19 +2,23 @@ Executor base class and a minimal example implementation. Usage: - from executor_base import Executor, ExecutionError, EchoExecutor + from shared.schemas.result import BaseExecutorResult + from worker.executors.base_executor import Executor, ExecutionError + + class MyResult(BaseExecutorResult): + echo: str class MyExecutor(Executor): name = "my-executor" - def run(self, task: ExecutorTask, out_dir: Path) -> dict: + def run(self, task: ExecutorTask, out_dir: Path) -> MyResult: # ... your logic ... - return {"ok": True, "echo": task.task_id} + return MyResult(echo=task.task_id) Contract: -- Implement `run(task: ExecutorTask, out_dir: Path) -> dict`. The runner - writes the returned dict to `out_dir/results.json` and injects the - top-level `_artifacts` context — executors should not write that file - themselves on the success path. +- Implement `run(task: ExecutorTask, out_dir: Path) -> BaseExecutorResult`. + The runner writes the returned model to `out_dir/results.json` and + injects the top-level `_artifacts` context — executors should not write + that file themselves on the success path. - Drop generated files under `out_dir/artifacts/` (uploaded to the server when the task has an HTTP destination) or `scratch_dir(out_dir)` for local-only scratch data. From 2ef78e03b7df6d2e1a11ebfc1ae8a6cb4db2ea56 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Wed, 20 May 2026 13:15:54 +0000 Subject: [PATCH 25/26] chore(security): ignore CVEs with no fix versions in pip-audit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pip-audit flagged 22 new advisories (torch, transformers, nltk, pyjwt, joblib) — all with no fix version published. Add ``--ignore-vuln`` flags for each PYSEC ID in the worker CPU and worker GPU steps, and extend the documented advisory table in ``docs/CODE_STYLE.md`` (renaming the column from GHSA to Advisory since PYSEC IDs now appear alongside GHSA IDs). Signed-off-by: Noppanat Wadlom --- .github/workflows/security.yml | 44 ++++++++++++++++++++++++++++++++++ docs/CODE_STYLE.md | 34 +++++++++++++++++++++----- 2 files changed, 72 insertions(+), 6 deletions(-) diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 0859dfd0..41a753ea 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -105,6 +105,28 @@ jobs: --ignore-vuln GHSA-vfmq-68hx-4jfw \ --ignore-vuln GHSA-j7w6-vpvq-j3gm \ --ignore-vuln GHSA-98h9-4798-4q5v \ + --ignore-vuln PYSEC-2025-189 \ + --ignore-vuln PYSEC-2025-190 \ + --ignore-vuln PYSEC-2025-191 \ + --ignore-vuln PYSEC-2025-192 \ + --ignore-vuln PYSEC-2025-193 \ + --ignore-vuln PYSEC-2025-194 \ + --ignore-vuln PYSEC-2025-195 \ + --ignore-vuln PYSEC-2025-196 \ + --ignore-vuln PYSEC-2025-197 \ + --ignore-vuln PYSEC-2025-210 \ + --ignore-vuln PYSEC-2026-139 \ + --ignore-vuln PYSEC-2025-211 \ + --ignore-vuln PYSEC-2025-212 \ + --ignore-vuln PYSEC-2025-213 \ + --ignore-vuln PYSEC-2025-214 \ + --ignore-vuln PYSEC-2025-215 \ + --ignore-vuln PYSEC-2025-216 \ + --ignore-vuln PYSEC-2025-217 \ + --ignore-vuln PYSEC-2025-218 \ + --ignore-vuln PYSEC-2026-97 \ + --ignore-vuln PYSEC-2025-183 \ + --ignore-vuln PYSEC-2024-277 \ -r /tmp/requirements-worker-cpu-audit.txt - name: Run pip-audit (worker GPU delta) # no --strict: flashinfer-jit-cache is unauditable on PyPI run: | @@ -130,4 +152,26 @@ jobs: --ignore-vuln GHSA-83vm-p52w-f9pw \ --ignore-vuln GHSA-j7w6-vpvq-j3gm \ --ignore-vuln GHSA-98h9-4798-4q5v \ + --ignore-vuln PYSEC-2025-189 \ + --ignore-vuln PYSEC-2025-190 \ + --ignore-vuln PYSEC-2025-191 \ + --ignore-vuln PYSEC-2025-192 \ + --ignore-vuln PYSEC-2025-193 \ + --ignore-vuln PYSEC-2025-194 \ + --ignore-vuln PYSEC-2025-195 \ + --ignore-vuln PYSEC-2025-196 \ + --ignore-vuln PYSEC-2025-197 \ + --ignore-vuln PYSEC-2025-210 \ + --ignore-vuln PYSEC-2026-139 \ + --ignore-vuln PYSEC-2025-211 \ + --ignore-vuln PYSEC-2025-212 \ + --ignore-vuln PYSEC-2025-213 \ + --ignore-vuln PYSEC-2025-214 \ + --ignore-vuln PYSEC-2025-215 \ + --ignore-vuln PYSEC-2025-216 \ + --ignore-vuln PYSEC-2025-217 \ + --ignore-vuln PYSEC-2025-218 \ + --ignore-vuln PYSEC-2026-97 \ + --ignore-vuln PYSEC-2025-183 \ + --ignore-vuln PYSEC-2024-277 \ -r src/worker/requirements/requirements.gpu.txt diff --git a/docs/CODE_STYLE.md b/docs/CODE_STYLE.md index 33ed7383..c63edf27 100644 --- a/docs/CODE_STYLE.md +++ b/docs/CODE_STYLE.md @@ -89,13 +89,13 @@ CI runs `pip-audit` against each generated requirements file When pip-audit reports a new CVE, the only real fix is to bump the offending dep in `pyproject.toml`, then `uv lock` and `uv run scripts/dev/sync_requirements.py --write`. Silencing via `--ignore-vuln` -is a last resort; every silenced GHSA needs a written upgrade-blocker. -The currently-ignored advisories and the upgrade blocker that justifies -each are listed below; the same list is encoded as `--ignore-vuln` -flags in `.github/workflows/security.yml`. +is a last resort; every silenced advisory needs a written +upgrade-blocker. The currently-ignored advisories and the upgrade +blocker that justifies each are listed below; the same list is encoded +as `--ignore-vuln` flags in `.github/workflows/security.yml`. -| GHSA | Package | Fix version | Why ignored | -|------|---------|-------------|-------------| +| Advisory | Package | Fix version | Why ignored | +|----------|---------|-------------|-------------| | GHSA-69w3-r845-3855 | transformers | 5.0.0rc3 | held by vllm/vllm-omni 0.18 compatibility | | GHSA-pf3h-qjgv-vcpr | vllm | 0.19.0 | held by transformers 4.57 + adjacent inference deps | | GHSA-pq5c-rjhq-qp7p | vllm | 0.19.0 | same | @@ -117,6 +117,28 @@ flags in `.github/workflows/security.yml`. | GHSA-w8v5-vhqr-4h9v | diskcache | (none) | upstream unmaintained, no fixed version published | | GHSA-j7w6-vpvq-j3gm | diffusers | 0.38.0 | fix requires safetensors>=0.8.0rc0 pre-release; uv lock won't pick up pre-releases without explicit opt-in | | GHSA-98h9-4798-4q5v | diffusers | 0.38.0 | same blocker as GHSA-j7w6-vpvq-j3gm — both fixed in 0.38.0 | +| PYSEC-2025-189 | torch | (none) | no fix version published | +| PYSEC-2025-190 | torch | (none) | same | +| PYSEC-2025-191 | torch | (none) | same | +| PYSEC-2025-192 | torch | (none) | same | +| PYSEC-2025-193 | torch | (none) | same | +| PYSEC-2025-194 | torch | (none) | same | +| PYSEC-2025-195 | torch | (none) | same | +| PYSEC-2025-196 | torch | (none) | same | +| PYSEC-2025-197 | torch | (none) | same | +| PYSEC-2025-210 | torch | (none) | same | +| PYSEC-2026-139 | torch | (none) | same | +| PYSEC-2025-211 | transformers | (none) | no fix version published; transformers also held by vllm-omni 0.18 | +| PYSEC-2025-212 | transformers | (none) | same | +| PYSEC-2025-213 | transformers | (none) | same | +| PYSEC-2025-214 | transformers | (none) | same | +| PYSEC-2025-215 | transformers | (none) | same | +| PYSEC-2025-216 | transformers | (none) | same | +| PYSEC-2025-217 | transformers | (none) | same | +| PYSEC-2025-218 | transformers | (none) | same | +| PYSEC-2026-97 | nltk | (none) | no fix version published | +| PYSEC-2025-183 | pyjwt | (none) | no fix version published | +| PYSEC-2024-277 | joblib | (none) | no fix version published | When a blocker lifts (e.g. transformers 5 ↔ vllm 0.19 line stabilizes), drop the corresponding `--ignore-vuln` flag from the workflow and the From c99082e32b704971c0b24858c01195c6f845dea6 Mon Sep 17 00:00:00 2001 From: Noppanat Wadlom Date: Wed, 20 May 2026 13:31:08 +0000 Subject: [PATCH 26/26] chore(security): ignore vllm/gradio CVEs surfaced by GPU pip-audit step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PYSEC-2025-222 (vllm 0.18.0) and PYSEC-2024-274 (gradio 5.50.0) — both with no fix version published and both blocked by the vllm-omni 0.18 pin. Add ``--ignore-vuln`` flags to the worker-GPU step and document each in ``docs/CODE_STYLE.md``. Signed-off-by: Noppanat Wadlom --- .github/workflows/security.yml | 2 ++ docs/CODE_STYLE.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml index 41a753ea..d2540862 100644 --- a/.github/workflows/security.yml +++ b/.github/workflows/security.yml @@ -174,4 +174,6 @@ jobs: --ignore-vuln PYSEC-2026-97 \ --ignore-vuln PYSEC-2025-183 \ --ignore-vuln PYSEC-2024-277 \ + --ignore-vuln PYSEC-2025-222 \ + --ignore-vuln PYSEC-2024-274 \ -r src/worker/requirements/requirements.gpu.txt diff --git a/docs/CODE_STYLE.md b/docs/CODE_STYLE.md index c63edf27..394628a9 100644 --- a/docs/CODE_STYLE.md +++ b/docs/CODE_STYLE.md @@ -139,6 +139,8 @@ as `--ignore-vuln` flags in `.github/workflows/security.yml`. | PYSEC-2026-97 | nltk | (none) | no fix version published | | PYSEC-2025-183 | pyjwt | (none) | no fix version published | | PYSEC-2024-277 | joblib | (none) | no fix version published | +| PYSEC-2025-222 | vllm | (none) | no fix version published; held by vllm-omni 0.18 pin | +| PYSEC-2024-274 | gradio | (none) | no fix version published; vllm-omni 0.18 pins gradio==5.50 | When a blocker lifts (e.g. transformers 5 ↔ vllm 0.19 line stabilizes), drop the corresponding `--ignore-vuln` flag from the workflow and the