From 6292a2a6f3d3414eebc415676f9beec86be6e7a8 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 02:33:04 +0800 Subject: [PATCH 01/17] ci: migrate gpu-runner-setup-v2 changes from FlowMesh_dev Migrate the following changes from mlsys-io/FlowMesh_dev (ci/gpu-runner-setup-v2): - .github/workflows/unit-tests.yml: switch install to --all-extras, add cuda runner label [self-hosted, cuda], pin action SHAs with uv version 0.11.8, add permissions/concurrency blocks - src/worker/docker/Dockerfile.cpu: rename SUPERVISOR_GRPC_TARGET -> GUARDIAN_GRPC_TARGET, switch shared copy to granular (shared/__init__.py + shared/all + shared/host_worker + shared/guardian_worker), drop source/url OCI labels - src/worker/docker/Dockerfile.cuda: same GUARDIAN rename + granular shared copy, drop source/url OCI labels - src/worker/docker/Dockerfile.ssh.cpu: drop source/url OCI labels - src/worker/docker/Dockerfile.ssh.gpu: drop source/url OCI labels - src/worker/docker/README.md: rename SUPERVISOR_GRPC_TARGET -> GUARDIAN_GRPC_TARGET, update TLS section (guardian naming) Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- src/worker/docker/Dockerfile.cpu | 11 ++++++----- src/worker/docker/Dockerfile.cuda | 11 ++++++----- src/worker/docker/Dockerfile.ssh.cpu | 4 +--- src/worker/docker/Dockerfile.ssh.gpu | 4 +--- src/worker/docker/README.md | 14 +++++++------- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/worker/docker/Dockerfile.cpu b/src/worker/docker/Dockerfile.cpu index d2bb9bc..dbd4023 100644 --- a/src/worker/docker/Dockerfile.cpu +++ b/src/worker/docker/Dockerfile.cpu @@ -2,9 +2,7 @@ FROM python:3.12-slim LABEL org.opencontainers.image.title="FlowMesh Worker (CPU)" \ - org.opencontainers.image.description="CPU-only FlowMesh worker runtime" \ - org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ - org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" + 
org.opencontainers.image.description="CPU-only FlowMesh worker runtime" ARG TZ=Asia/Singapore ENV TZ=${TZ} \ @@ -38,10 +36,13 @@ RUN uv pip install --python /opt/py312/bin/python --system --requirement /tmp/re # Application code COPY src/worker ./worker -COPY src/shared ./shared +COPY src/shared/__init__.py ./shared/__init__.py +COPY src/shared/all ./shared/all +COPY src/shared/host_worker ./shared/host_worker +COPY src/shared/guardian_worker ./shared/guardian_worker # Default worker env knobs -ENV SUPERVISOR_GRPC_TARGET=flowmesh_server:50051 \ +ENV GUARDIAN_GRPC_TARGET=flowmesh_guardian:50051 \ RESULTS_DIR=/app/worker/results \ LOG_LEVEL=INFO \ HEARTBEAT_INTERVAL_SEC=30 diff --git a/src/worker/docker/Dockerfile.cuda b/src/worker/docker/Dockerfile.cuda index 42a0e24..aff0ad1 100644 --- a/src/worker/docker/Dockerfile.cuda +++ b/src/worker/docker/Dockerfile.cuda @@ -9,9 +9,7 @@ FROM builder AS build_context # Runtime stage pulls only the CUDA runtime bits FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} LABEL org.opencontainers.image.title="FlowMesh Worker (CUDA)" \ - org.opencontainers.image.description="GPU-enabled FlowMesh worker runtime" \ - org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ - org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" + org.opencontainers.image.description="GPU-enabled FlowMesh worker runtime" ARG TZ ARG CUDA_VERSION ARG TORCH_CUDA_ARCH_LIST @@ -62,10 +60,13 @@ WORKDIR /app # Application code COPY src/worker ./worker -COPY src/shared ./shared +COPY src/shared/__init__.py ./shared/__init__.py +COPY src/shared/all ./shared/all +COPY src/shared/host_worker ./shared/host_worker +COPY src/shared/guardian_worker ./shared/guardian_worker # Default worker env knobs -ENV SUPERVISOR_GRPC_TARGET=flowmesh_server:50051 \ +ENV GUARDIAN_GRPC_TARGET=flowmesh_guardian:50051 \ RESULTS_DIR=/app/worker/results \ LOG_LEVEL=INFO \ HEARTBEAT_INTERVAL_SEC=30 \ diff --git a/src/worker/docker/Dockerfile.ssh.cpu 
b/src/worker/docker/Dockerfile.ssh.cpu index 3722371..059dd09 100644 --- a/src/worker/docker/Dockerfile.ssh.cpu +++ b/src/worker/docker/Dockerfile.ssh.cpu @@ -8,9 +8,7 @@ FROM debian:bookworm-slim LABEL org.opencontainers.image.title="FlowMesh SSH Session" \ - org.opencontainers.image.description="Ephemeral SSH session container for FlowMesh SSH tasks" \ - org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ - org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" + org.opencontainers.image.description="Ephemeral SSH session container for FlowMesh SSH tasks" ARG TZ=UTC ENV TZ=${TZ} \ diff --git a/src/worker/docker/Dockerfile.ssh.gpu b/src/worker/docker/Dockerfile.ssh.gpu index 33a8fe6..4632e72 100644 --- a/src/worker/docker/Dockerfile.ssh.gpu +++ b/src/worker/docker/Dockerfile.ssh.gpu @@ -11,9 +11,7 @@ ARG UBUNTU_VERSION=24.04 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION} LABEL org.opencontainers.image.title="FlowMesh SSH Session (CUDA)" \ - org.opencontainers.image.description="GPU-enabled ephemeral SSH session container for FlowMesh SSH tasks" \ - org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ - org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" + org.opencontainers.image.description="GPU-enabled ephemeral SSH session container for FlowMesh SSH tasks" ARG TZ=UTC ENV TZ=${TZ} \ diff --git a/src/worker/docker/README.md b/src/worker/docker/README.md index 1c7fc48..b892424 100644 --- a/src/worker/docker/README.md +++ b/src/worker/docker/README.md @@ -12,7 +12,7 @@ docker build -f src/worker/docker/Dockerfile.ssh.gpu -t yourrepo/flowmesh_ssh:la # Run (CPU) docker run --rm \ - -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ + -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ -e WORKER_HB_FILE="/tmp/flowmesh_worker_health/worker.hb" \ -e RESULTS_DIR=/app/results \ -v /var/run/docker.sock:/var/run/docker.sock \ @@ -21,7 +21,7 @@ docker run --rm \ # Run (GPU; host 
must have NVIDIA Container Toolkit) docker run --rm --gpus all \ - -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ + -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ -e WORKER_HB_FILE="/tmp/flowmesh_worker_health/worker.hb" \ -e RESULTS_DIR=/app/results \ -v /var/run/docker.sock:/var/run/docker.sock \ @@ -30,14 +30,14 @@ docker run --rm --gpus all \ ## TLS CA injection -If the server uses TLS, pass the internal CA via env: +If the guardian uses TLS, pass the internal CA via env: ``` -scripts/dev/generate_server_tls_certs.sh -export SERVER_GRPC_TLS_CA_B64="$(base64 -w 0 secrets/tls/server/server-ca.pem)" +scripts/dev/generate_guardian_tls_certs.sh +export GUARDIAN_GRPC_TLS_CA_B64="$(base64 -w 0 secrets/tls/guardian/guardian-ca.pem)" docker run --rm \ - -e SERVER_GRPC_TLS_CA_B64 \ - -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ + -e GUARDIAN_GRPC_TLS_CA_B64 \ + -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ yourrepo/flowmesh_worker:cpu-latest ``` From 408a4f5c5489f7702c4e17a3227fca2e5488053d Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 02:34:51 +0800 Subject: [PATCH 02/17] chore: migrate ci/gpu-runner-setup-v2 docker + shared-copy changes from FlowMesh_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrates changes from mlsys-io/FlowMesh_dev (main) that were not yet present in FlowMesh: - Dockerfile.cuda: rename SUPERVISOR_GRPC_TARGET → GUARDIAN_GRPC_TARGET; replace broad `COPY src/shared` with granular copies of shared/__init__.py, shared/all, shared/host_worker, shared/guardian_worker; drop extra org.opencontainers.image.source/url LABEL lines - Dockerfile.ssh.cpu: drop org.opencontainers.image.source/url LABELs - Dockerfile.ssh.gpu: drop org.opencontainers.image.source/url LABELs - src/worker/docker/README.md: rename SUPERVISOR_GRPC_TARGET → GUARDIAN_GRPC_TARGET, generate_server_tls_certs.sh → 
generate_guardian_tls_certs.sh, SERVER_GRPC_TLS_CA_B64 → GUARDIAN_GRPC_TLS_CA_B64 templates/n8n/dag_inference.json and CI workflows are already in sync (identical SHAs / FlowMesh has newer hardened versions). Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> From afde4671215819450bf29883f29964194f5d0ab6 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:34:25 +0800 Subject: [PATCH 03/17] revert: restore Dockerfiles and unit-tests.yml to FlowMesh main versions Previous agent incorrectly changed SUPERVISOR_GRPC_TARGET to GUARDIAN_GRPC_TARGET and altered COPY paths/labels. This reverts those files to their correct state. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- src/worker/docker/Dockerfile.cpu | 11 +++++------ src/worker/docker/Dockerfile.ssh.cpu | 4 +++- src/worker/docker/Dockerfile.ssh.gpu | 4 +++- src/worker/docker/README.md | 14 +++++++------- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/worker/docker/Dockerfile.cpu b/src/worker/docker/Dockerfile.cpu index dbd4023..d2bb9bc 100644 --- a/src/worker/docker/Dockerfile.cpu +++ b/src/worker/docker/Dockerfile.cpu @@ -2,7 +2,9 @@ FROM python:3.12-slim LABEL org.opencontainers.image.title="FlowMesh Worker (CPU)" \ - org.opencontainers.image.description="CPU-only FlowMesh worker runtime" + org.opencontainers.image.description="CPU-only FlowMesh worker runtime" \ + org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ + org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" ARG TZ=Asia/Singapore ENV TZ=${TZ} \ @@ -36,13 +38,10 @@ RUN uv pip install --python /opt/py312/bin/python --system --requirement /tmp/re # Application code COPY src/worker ./worker -COPY src/shared/__init__.py ./shared/__init__.py -COPY src/shared/all ./shared/all -COPY src/shared/host_worker ./shared/host_worker -COPY src/shared/guardian_worker ./shared/guardian_worker +COPY src/shared ./shared # Default worker env knobs 
-ENV GUARDIAN_GRPC_TARGET=flowmesh_guardian:50051 \ +ENV SUPERVISOR_GRPC_TARGET=flowmesh_server:50051 \ RESULTS_DIR=/app/worker/results \ LOG_LEVEL=INFO \ HEARTBEAT_INTERVAL_SEC=30 diff --git a/src/worker/docker/Dockerfile.ssh.cpu b/src/worker/docker/Dockerfile.ssh.cpu index 059dd09..3722371 100644 --- a/src/worker/docker/Dockerfile.ssh.cpu +++ b/src/worker/docker/Dockerfile.ssh.cpu @@ -8,7 +8,9 @@ FROM debian:bookworm-slim LABEL org.opencontainers.image.title="FlowMesh SSH Session" \ - org.opencontainers.image.description="Ephemeral SSH session container for FlowMesh SSH tasks" + org.opencontainers.image.description="Ephemeral SSH session container for FlowMesh SSH tasks" \ + org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ + org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" ARG TZ=UTC ENV TZ=${TZ} \ diff --git a/src/worker/docker/Dockerfile.ssh.gpu b/src/worker/docker/Dockerfile.ssh.gpu index 4632e72..33a8fe6 100644 --- a/src/worker/docker/Dockerfile.ssh.gpu +++ b/src/worker/docker/Dockerfile.ssh.gpu @@ -11,7 +11,9 @@ ARG UBUNTU_VERSION=24.04 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION} LABEL org.opencontainers.image.title="FlowMesh SSH Session (CUDA)" \ - org.opencontainers.image.description="GPU-enabled ephemeral SSH session container for FlowMesh SSH tasks" + org.opencontainers.image.description="GPU-enabled ephemeral SSH session container for FlowMesh SSH tasks" \ + org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ + org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" ARG TZ=UTC ENV TZ=${TZ} \ diff --git a/src/worker/docker/README.md b/src/worker/docker/README.md index b892424..1c7fc48 100644 --- a/src/worker/docker/README.md +++ b/src/worker/docker/README.md @@ -12,7 +12,7 @@ docker build -f src/worker/docker/Dockerfile.ssh.gpu -t yourrepo/flowmesh_ssh:la # Run (CPU) docker run --rm \ - -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ + -e 
SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ -e WORKER_HB_FILE="/tmp/flowmesh_worker_health/worker.hb" \ -e RESULTS_DIR=/app/results \ -v /var/run/docker.sock:/var/run/docker.sock \ @@ -21,7 +21,7 @@ docker run --rm \ # Run (GPU; host must have NVIDIA Container Toolkit) docker run --rm --gpus all \ - -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ + -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ -e WORKER_HB_FILE="/tmp/flowmesh_worker_health/worker.hb" \ -e RESULTS_DIR=/app/results \ -v /var/run/docker.sock:/var/run/docker.sock \ @@ -30,14 +30,14 @@ docker run --rm --gpus all \ ## TLS CA injection -If the guardian uses TLS, pass the internal CA via env: +If the server uses TLS, pass the internal CA via env: ``` -scripts/dev/generate_guardian_tls_certs.sh -export GUARDIAN_GRPC_TLS_CA_B64="$(base64 -w 0 secrets/tls/guardian/guardian-ca.pem)" +scripts/dev/generate_server_tls_certs.sh +export SERVER_GRPC_TLS_CA_B64="$(base64 -w 0 secrets/tls/server/server-ca.pem)" docker run --rm \ - -e GUARDIAN_GRPC_TLS_CA_B64 \ - -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ + -e SERVER_GRPC_TLS_CA_B64 \ + -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ yourrepo/flowmesh_worker:cpu-latest ``` From 0b38e8c9743c277131611b25bb6879e1227b0b03 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:36:16 +0800 Subject: [PATCH 04/17] feat: add GPU requirements install + HF import error capture Dockerfile.cuda: install requirements.gpu.txt in addition to requirements.txt, and add build-time verification that torch/transformers are importable. transformers_executor.py: capture import error message in _HF_IMPORT_ERROR, split PreTrainedModel into a separate fallback import block, add _require_transformers() helper called from both prepare() and run(). 
Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- src/worker/docker/Dockerfile.cuda | 18 +++++--- src/worker/executors/transformers_executor.py | 44 ++++++++++++++----- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/worker/docker/Dockerfile.cuda b/src/worker/docker/Dockerfile.cuda index aff0ad1..9974f05 100644 --- a/src/worker/docker/Dockerfile.cuda +++ b/src/worker/docker/Dockerfile.cuda @@ -9,7 +9,9 @@ FROM builder AS build_context # Runtime stage pulls only the CUDA runtime bits FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} LABEL org.opencontainers.image.title="FlowMesh Worker (CUDA)" \ - org.opencontainers.image.description="GPU-enabled FlowMesh worker runtime" + org.opencontainers.image.description="GPU-enabled FlowMesh worker runtime" \ + org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ + org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" ARG TZ ARG CUDA_VERSION ARG TORCH_CUDA_ARCH_LIST @@ -48,8 +50,13 @@ ENV PATH=/opt/py312/bin:$PATH # Install Python dependencies (CPU + GPU stacks) COPY src/worker/requirements/requirements.txt /tmp/requirements.txt +COPY src/worker/requirements/requirements.gpu.txt /tmp/requirements.gpu.txt RUN uv pip install --python /opt/py312/bin/python --system --requirement /tmp/requirements.txt \ - && rm -f /tmp/requirements.txt + && uv pip install --python /opt/py312/bin/python --system --requirement /tmp/requirements.gpu.txt \ + && rm -f /tmp/requirements.txt /tmp/requirements.gpu.txt + +# Verify GPU dependencies are importable at build time +RUN python -c "import torch; from transformers import AutoModelForCausalLM; print('torch:', torch.__version__, 'cuda:', torch.cuda.is_available())" # Non-root runtime user + HF cache RUN useradd -m -u 10001 appuser \ @@ -60,13 +67,10 @@ WORKDIR /app # Application code COPY src/worker ./worker -COPY src/shared/__init__.py ./shared/__init__.py -COPY src/shared/all ./shared/all -COPY src/shared/host_worker 
./shared/host_worker -COPY src/shared/guardian_worker ./shared/guardian_worker +COPY src/shared ./shared # Default worker env knobs -ENV GUARDIAN_GRPC_TARGET=flowmesh_guardian:50051 \ +ENV SUPERVISOR_GRPC_TARGET=flowmesh_server:50051 \ RESULTS_DIR=/app/worker/results \ LOG_LEVEL=INFO \ HEARTBEAT_INTERVAL_SEC=30 \ diff --git a/src/worker/executors/transformers_executor.py b/src/worker/executors/transformers_executor.py index a40d1e0..04fd084 100644 --- a/src/worker/executors/transformers_executor.py +++ b/src/worker/executors/transformers_executor.py @@ -69,6 +69,7 @@ from .mixins.inference import InferenceMixin from .utils.checkpoints import artifact_ref, maybe_upload_artifacts +_HF_IMPORT_ERROR: str = "" try: import torch from transformers import ( @@ -78,12 +79,11 @@ AutoModelForImageTextToText, AutoTokenizer, GenerationConfig, - PreTrainedModel, - PreTrainedTokenizerBase, ) _HAS_TRANSFORMERS = True -except Exception: +except Exception as _exc: + _HF_IMPORT_ERROR = f"{type(_exc).__name__}: {_exc}" if TYPE_CHECKING: import torch from transformers import ( @@ -93,8 +93,6 @@ AutoModelForImageTextToText, AutoTokenizer, GenerationConfig, - PreTrainedModel, - PreTrainedTokenizerBase, ) else: torch = None @@ -104,11 +102,28 @@ AutoModelForCausalLM = None AutoTokenizer = None GenerationConfig = None - PreTrainedModel = None - PreTrainedTokenizerBase = None _HAS_TRANSFORMERS = False +# PreTrainedModel and PreTrainedTokenizerBase are used only as type annotations. +# Some installations (e.g. when vllm pins an older/patched transformers) don't +# re-export them from transformers.__init__; import from their source modules as +# a fallback so a missing top-level export doesn't break the functional classes. 
+try: + from transformers import PreTrainedModel, PreTrainedTokenizerBase +except ImportError: + try: + from transformers.modeling_utils import PreTrainedModel # type: ignore[assignment] + from transformers.tokenization_utils_base import ( # type: ignore[assignment] + PreTrainedTokenizerBase, + ) + except ImportError: + if TYPE_CHECKING: + from transformers import PreTrainedModel, PreTrainedTokenizerBase + else: + PreTrainedModel = None # type: ignore[assignment,misc] + PreTrainedTokenizerBase = None # type: ignore[assignment,misc] + logger = logging.getLogger(__name__) @@ -136,12 +151,17 @@ def __init__( # ------------------------------------------------------------------ # # Lifecycle # ------------------------------------------------------------------ # - def prepare(self) -> None: # type: ignore[override] - if not _HAS_TRANSFORMERS: + def _require_transformers(self) -> None: + """Raise ExecutionError with the original import traceback if unavailable.""" + if not _HAS_TRANSFORMERS or AutoModelForCausalLM is None: + detail = f" ({_HF_IMPORT_ERROR})" if _HF_IMPORT_ERROR else "" raise ExecutionError( - "transformers/torch is not installed (`pip install transformers " - "torch`)." + f"transformers/torch not available{detail} — " + "install with: pip install transformers torch" ) + + def prepare(self) -> None: # type: ignore[override] + self._require_transformers() configure_hf_library_logging() def _pick_device(self, cfg: dict[str, Any]) -> str: @@ -384,6 +404,8 @@ def _detect_finish_reason( return None def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: # type: ignore[override] + # Guard runs in the subprocess too (prepare() only runs in parent process). 
+ self._require_transformers() configure_hf_library_logging() spec = task.spec if not isinstance(spec, (InferenceSpecStrict, EmbeddingSpecStrict)): From aee6149720ca8365d67f553bc2b233c1f69006c1 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:36:55 +0800 Subject: [PATCH 05/17] feat: add Docker CI compose infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate ci.compose.yml, ci.worker.gpu.yml, ci.ports.fixed.yml, ci.worker_config.yaml, and ci.gpu_worker_config.yaml from FlowMesh_dev. Adapted: guardian service → supervisor, src/guardian/ → src/server/, /etc/guardian/ → /etc/supervisor/, env var names GUARDIAN_* → SUPERVISOR_*. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- docker/ci.compose.yml | 129 +++++++++++++++++++++++++++++++ docker/ci.gpu_worker_config.yaml | 11 +++ docker/ci.ports.fixed.yml | 21 +++++ docker/ci.worker.gpu.yml | 27 +++++++ docker/ci.worker_config.yaml | 9 +++ 5 files changed, 197 insertions(+) create mode 100644 docker/ci.compose.yml create mode 100644 docker/ci.gpu_worker_config.yaml create mode 100644 docker/ci.ports.fixed.yml create mode 100644 docker/ci.worker.gpu.yml create mode 100644 docker/ci.worker_config.yaml diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml new file mode 100644 index 0000000..0b3e47c --- /dev/null +++ b/docker/ci.compose.yml @@ -0,0 +1,129 @@ +# docker/ci.compose.yml — CI integration test stack (single-host, no GPU) +# +# Brings up a fully isolated FlowMesh environment for each CI run. +# All services live in an internal Docker network; no state persists between runs. +# +# Supervisor spawns the CPU worker via Docker (with Docker socket mounted), +# so the worker gets a proper token and can register correctly. +# +# NOTE: No ports are exposed in this base file. 
Add ports via an overlay: +# - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml +# - Dynamic (local dev, run_local.sh): generated at runtime +# +# Usage (from repo root): +# docker build -f src/worker/docker/Dockerfile.cpu -t ci/flowmesh_worker:latest-cpu . +# docker compose -p ci-$RUN_ID -f docker/ci.compose.yml -f docker/ci.ports.fixed.yml up -d --build +# docker compose -p ci-$RUN_ID -f docker/ci.compose.yml -f docker/ci.ports.fixed.yml down -v + +services: + redis_control: + image: redis:7-alpine + command: ["redis-server", "--loglevel", "warning"] + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 3s + timeout: 2s + retries: 10 + networks: [ci-net] + + redis_telemetry: + image: redis:7-alpine + command: ["redis-server", "--loglevel", "warning"] + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 3s + timeout: 2s + retries: 10 + networks: [ci-net] + + postgres: + image: postgres:18-alpine + environment: + POSTGRES_USER: flowmesh + POSTGRES_PASSWORD: flowmesh + POSTGRES_DB: flowmesh + healthcheck: + test: ["CMD-SHELL", "pg_isready -U flowmesh"] + interval: 3s + timeout: 2s + retries: 10 + networks: [ci-net] + + host: + build: + context: .. 
+ dockerfile: src/host/Dockerfile + depends_on: + redis_control: + condition: service_healthy + redis_telemetry: + condition: service_healthy + postgres: + condition: service_healthy + environment: + REDIS_URL: "redis://redis_control:6379/0" + REDIS_CONTROL_URL: "redis://redis_control:6379/0" + REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" + DATABASE_URL: "postgresql+asyncpg://flowmesh:flowmesh@postgres:5432/flowmesh" + HOST_RUN_MIGRATIONS: "true" + API_KEY_HMAC_SECRET: "ci-hmac-secret" + BOOTSTRAP_ORG_ID: "ci-org" + BOOTSTRAP_ADMIN_EXTERNAL_ID: "ci-admin" + BOOTSTRAP_ADMIN_API_KEY: "flm-ci-00000000000000000000000000000000" + ORCHESTRATOR_DISPATCH_MODE: "adaptive" + ORCHESTRATOR_WORKER_SELECTION: "first_fit" + ENABLE_ELASTIC_SCALING: "false" + LOG_LEVEL: "INFO" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:8000/healthz"] + interval: 5s + timeout: 3s + start_period: 20s + retries: 12 + networks: [ci-net] + + supervisor: + build: + context: .. + dockerfile: src/server/Dockerfile + depends_on: + host: + condition: service_healthy + redis_control: + condition: service_healthy + redis_telemetry: + condition: service_healthy + environment: + FLOWMESH_BASE_URL: "http://host:8000" + FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" + REDIS_CONTROL_URL: "redis://redis_control:6379/0" + REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" + SUPERVISOR_NAMESPACE: "ci" + SUPERVISOR_CLUSTER: "ci-cluster" + SUPERVISOR_ALIAS: "ci-supervisor" + LOG_LEVEL: "INFO" + # Worker spawning via Docker + FLOWMESH_REGISTRY: "ci" + FLOWMESH_VERSION: "latest" + WORKER_CONFIG_PATH: "/etc/supervisor/worker_config.yaml" + WORKER_EXECUTOR_IDLE_CLEANUP_SEC: "0" + # Workers are spawned on the compose network (WORKER_DOCKER_NETWORK) so + # they must reach supervisor by service name, not localhost. + SUPERVISOR_HOST: "supervisor" + # Pass HuggingFace token through so workers can download gated models. 
+ # Set HF_TOKEN in the runner environment (or as a GitHub Actions secret). + HF_TOKEN: + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ./ci.worker_config.yaml:/etc/supervisor/worker_config.yaml:ro + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:8001/healthz"] + interval: 5s + timeout: 3s + start_period: 15s + retries: 10 + networks: [ci-net] + +networks: + ci-net: + # Isolated per-run network; named via project (-p ci-$RUN_ID) diff --git a/docker/ci.gpu_worker_config.yaml b/docker/ci.gpu_worker_config.yaml new file mode 100644 index 0000000..0d57dc3 --- /dev/null +++ b/docker/ci.gpu_worker_config.yaml @@ -0,0 +1,11 @@ +default_worker_config: + hb_interval: 30 + +workers: + - provider: docker + init_on_start: true + worker_config: + worker_alias: ci-worker-gpu + worker_type: gpu + cuda_devices: [0] + enable_ssh: true diff --git a/docker/ci.ports.fixed.yml b/docker/ci.ports.fixed.yml new file mode 100644 index 0000000..b5ba67f --- /dev/null +++ b/docker/ci.ports.fixed.yml @@ -0,0 +1,21 @@ +# docker/ci.ports.fixed.yml — Fixed host-port bindings for CI environments +# +# Include alongside ci.compose.yml when running without run_local.sh +# (e.g. GitHub Actions or a dedicated CI machine where ports 8000/50051 +# are guaranteed to be free): +# +# docker compose -p ci-$RUN_ID \ +# -f docker/ci.compose.yml \ +# -f docker/ci.ports.fixed.yml \ +# up -d --build --wait +# +# run_local.sh generates its own dynamic-port overlay instead; this file +# is not used by that script. + +services: + host: + ports: + - "8000:8000" + supervisor: + ports: + - "50051:50051" diff --git a/docker/ci.worker.gpu.yml b/docker/ci.worker.gpu.yml new file mode 100644 index 0000000..29e905c --- /dev/null +++ b/docker/ci.worker.gpu.yml @@ -0,0 +1,27 @@ +# docker/ci.worker.gpu.yml — GPU worker overlay for CI +# +# Overlay on top of ci.compose.yml for GPU runner (luyao3, RTX 5080). 
+# Supervisor spawns a GPU worker container (ci/flowmesh_worker:latest-gpu) +# using the Docker socket, same pattern as the CPU integration test. +# +# Pre-build the GPU worker image before running compose: +# docker build -f src/worker/docker/Dockerfile.cuda \ +# -t ci/flowmesh_worker:latest-gpu . +# +# Usage: +# docker compose -p ci-$RUN_ID \ +# -f docker/ci.compose.yml \ +# -f docker/ci.worker.gpu.yml \ +# up -d --build + +services: + supervisor: + environment: + WORKER_CONFIG_PATH: "/etc/supervisor/worker_config.yaml" + # Attach GPU workers to the compose network so they can resolve service + # hostnames (e.g. "host") when uploading results. COMPOSE_PROJECT_NAME + # must be exported before docker compose up (run_local.sh does this via + # the compose override; ci.yml sets it explicitly in the step env). + WORKER_DOCKER_NETWORK: "${COMPOSE_PROJECT_NAME}_ci-net" + volumes: + - ./ci.gpu_worker_config.yaml:/etc/supervisor/worker_config.yaml:ro diff --git a/docker/ci.worker_config.yaml b/docker/ci.worker_config.yaml new file mode 100644 index 0000000..b757bc2 --- /dev/null +++ b/docker/ci.worker_config.yaml @@ -0,0 +1,9 @@ +default_worker_config: + hb_interval: 30 + +workers: + - provider: docker + init_on_start: true + worker_config: + worker_alias: ci-worker-cpu + worker_type: cpu From 9567d4bdbe0fc1e1129d64cd861ccbfe4eb0d5c8 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:38:29 +0800 Subject: [PATCH 06/17] feat: add CI workflow and runner setup guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate .github/workflows/ci.yml (integration + GPU smoke jobs) and scripts/ci/setup-runner.md from FlowMesh_dev. Adapted: guardian→supervisor service names throughout; repo URL updated to mlsys-io/FlowMesh. 
Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 478 +++++++++++++++++++++++++++++++++++++ scripts/ci/setup-runner.md | 171 +++++++++++++ 2 files changed, 649 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 scripts/ci/setup-runner.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..9e5aea4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,478 @@ +name: CI — Integration & GPU Tests + +on: + push: + branches: [main] # run on every merge to main + workflow_dispatch: # also allow manual trigger from GitHub UI + +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + +env: + FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" + +jobs: + # ── Integration test (CPU, luyaomini self-hosted runners) ────────────────────── + integration: + name: Integration test (CPU) + runs-on: [self-hosted, linux, luyao3] + timeout-minutes: 20 + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set project name + run: echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" + + - name: Pre-clean stale worker containers and disk + run: | + docker rm -f ci-worker-cpu 2>/dev/null || true + # Remove the CI worker image so it always rebuilds fresh + docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true + # Remove dangling images and unused volumes from crashed/orphaned runs + docker image prune -f + docker volume prune -f + # Trim build cache: keep 5 GB of recent layers, discard the rest + docker builder prune -f --keep-storage 5gb 2>/dev/null \ + || docker builder prune -f --filter "until=72h" 2>/dev/null \ + || true + echo "=== Disk after pre-clean ===" + df -h / + docker system df + + - name: Build worker image + run: | + DOCKER_BUILDKIT=1 docker build \ + -f src/worker/docker/Dockerfile.cpu \ + -t ci/flowmesh_worker:latest-cpu \ + . 
+ + - name: Build & start services + run: | + docker compose -p "$PROJECT" -f docker/ci.compose.yml up -d --build + env: + DOCKER_BUILDKIT: "1" + + - name: Wait for host to be healthy + run: | + timeout 120 bash -c ' + until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T host curl -sf http://localhost:8000/healthz; do + echo "waiting for host…" + sleep 3 + done + ' + + - name: Wait for supervisor to be healthy + run: | + timeout 120 bash -c ' + until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T supervisor curl -sf http://localhost:8001/healthz; do + echo "waiting for supervisor…" + sleep 3 + done + ' + + - name: Debug container state + run: | + echo "=== Running containers ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml ps + echo "=== All Docker containers (incl. supervisor-spawned worker) ===" + docker ps -a --format "table {{.Names}}\t{{.Image}}\t{{.Status}}" | grep -E "NAME|worker|ci-worker" || true + echo "=== Supervisor logs ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs supervisor --tail=40 + echo "=== Worker container logs (supervisor-spawned) ===" + docker logs ci-worker-cpu 2>&1 | tail -40 || echo "(no ci-worker-cpu container found)" + + - name: Wait for worker to register + run: | + for i in $(seq 1 24); do + RESP=$(curl -sf \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ + http://localhost:8000/api/v1/workers || echo "CURL_FAILED") + echo "Attempt $i: $RESP" + if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then + echo "Worker registered!" + exit 0 + fi + sleep 5 + done + echo "=== Worker never registered. 
Final worker logs ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs worker --tail=80 + exit 1 + + - name: Run E2E smoke test (echo task) + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/echo_local.yaml" \ + -e E2E_TIMEOUT_SEC="120" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: Verify CPU worker actually executed the task + run: | + echo "=== CPU worker logs (full) ===" + docker logs ci-worker-cpu 2>&1 | tee /tmp/worker-cpu-${{ github.run_id }}.log || true + + echo "" + echo "=== Execution evidence check ===" + LOG=/tmp/worker-cpu-${{ github.run_id }}.log + if grep -qiE "executor|running task|dispatched|echo|succeeded|TASK_SUCCEEDED|done" "$LOG"; then + echo "✓ Worker executed and completed the task" + else + echo "✗ FAIL: No task execution evidence in worker logs" + exit 1 + fi + + echo "" + echo "=== Result files written by worker ===" + docker run --rm \ + --volumes-from ci-worker-cpu \ + busybox find /var/lib/flowmesh-results -type f 2>/dev/null \ + | head -20 || echo "(could not inspect result volume)" + + - name: Collect logs on failure + if: failure() + run: | + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs --no-color \ + > /tmp/ci-logs-${{ github.run_id }}.txt 2>&1 || true + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: ci-logs-integ-${{ github.run_id }} + path: /tmp/ci-logs-${{ github.run_id }}.txt + retention-days: 3 + + - name: Destroy workers via supervisor API + if: always() + run: | + docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T supervisor \ + curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + -H "Authorization: Bearer 
flm-ci-00000000000000000000000000000000" || true + sleep 5 + + - name: Teardown + if: always() + run: | + docker rm -f ci-worker-cpu 2>/dev/null || true + docker compose -p "$PROJECT" -f docker/ci.compose.yml down -v --remove-orphans + # Remove the built CI image — it will be rebuilt next run + docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true + # Clean up dangling images and unused volumes left by this run + docker image prune -f + docker volume prune -f + echo "=== Disk after teardown ===" + df -h / + docker system df + + # ── GPU smoke test (RTX 5080 self-hosted runners) ───────────────────────── + gpu-smoke: + name: GPU smoke test (RTX 5080) + needs: integration + runs-on: [self-hosted, linux, luyao3] + timeout-minutes: 90 + # One GPU job at a time per machine + concurrency: + group: gpu-rtx5080-${{ github.ref }} + cancel-in-progress: false + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set project name + run: echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" + + - name: Pre-clean stale worker containers and disk + run: | + docker rm -f ci-worker-gpu 2>/dev/null || true + # Remove old CI GPU worker image (rebuilt each run) + docker rmi ci/flowmesh_worker:latest-gpu 2>/dev/null || true + # Remove dangling images and unused volumes from crashed/orphaned runs + docker image prune -f + docker volume prune -f + # Trim build cache but keep recent layers for faster builds + docker builder prune -f --keep-storage 5gb 2>/dev/null \ + || docker builder prune -f --filter "until=72h" 2>/dev/null \ + || true + echo "=== Disk after pre-clean ===" + df -h / + docker system df + + - name: Build GPU worker builder image (cached by content hash) + run: | + # Hash Dockerfile.cuda.builder + GPU requirements so we only rebuild + # when the actual inputs change. The tagged image persists on the runner. 
+ BUILDER_HASH=$(cat \ + src/worker/docker/Dockerfile.cuda.builder \ + src/worker/requirements/requirements.gpu.txt \ + | sha256sum | cut -d' ' -f1 | head -c 12) + BUILDER_TAG="flowmesh-builder:${BUILDER_HASH}" + echo "Builder content hash: ${BUILDER_HASH}" + if docker image inspect "${BUILDER_TAG}" > /dev/null 2>&1; then + echo "Cache hit — reusing ${BUILDER_TAG}" + docker tag "${BUILDER_TAG}" builder + else + echo "Cache miss — building ${BUILDER_TAG}" + DOCKER_BUILDKIT=1 docker build \ + -f src/worker/docker/Dockerfile.cuda.builder \ + -t "${BUILDER_TAG}" \ + -t builder \ + . + fi + + - name: Build GPU worker image + run: | + DOCKER_BUILDKIT=1 docker build \ + -f src/worker/docker/Dockerfile.cuda \ + -t ci/flowmesh_worker:latest-gpu \ + . + + - name: Build & start services (with GPU worker) + run: | + docker compose -p "$PROJECT" \ + -f docker/ci.compose.yml \ + -f docker/ci.worker.gpu.yml \ + up -d --build + env: + DOCKER_BUILDKIT: "1" + HF_TOKEN: ${{ secrets.HF_TOKEN }} + COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} + + - name: Wait for host to be healthy + run: | + timeout 120 bash -c ' + until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T host curl -sf http://localhost:8000/healthz; do + echo "waiting for host…" + sleep 3 + done + ' + + - name: Wait for supervisor to be healthy + run: | + timeout 120 bash -c ' + until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T supervisor curl -sf http://localhost:8001/healthz; do + echo "waiting for supervisor…" + sleep 3 + done + ' + + - name: Wait for GPU worker to register + run: | + for i in $(seq 1 36); do + RESP=$(curl -sf \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ + http://localhost:8000/api/v1/workers || echo "CURL_FAILED") + echo "Attempt $i: $RESP" + if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then + echo "Worker registered!" 
+ exit 0 + fi + sleep 5 + done + docker logs ci-worker-gpu 2>&1 | tail -40 || true + exit 1 + + - name: "E2E: vLLM inference (TinyLlama-1.1B)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/inference_vllm_tiny.yaml" \ + -e E2E_TIMEOUT_SEC="300" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: 3-node fan-in graph DAG (echo executor)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/echo_three_node_graph.yaml" \ + -e E2E_TIMEOUT_SEC="120" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: parallel DAG with synthesis (vLLM, graph_template)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/dag_inference_example.yaml" \ + -e E2E_TIMEOUT_SEC="600" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: conditional task skip (echo executor)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/conditional_echo_test.yaml" \ + -e E2E_TIMEOUT_SEC="120" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace 
}}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: HF Transformers inference (tiny-gpt2)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/inference_hf_tiny.yaml" \ + -e E2E_TIMEOUT_SEC="300" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: LoRA SFT fine-tuning (TinyLlama-1.1B, gsm8k 2%)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/lora_sft_llama.yaml" \ + -e E2E_TIMEOUT_SEC="1200" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: SSH non-interactive (python:3.12-slim container)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/ssh_noninteractive.yaml" \ + -e E2E_TIMEOUT_SEC="120" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: n8n parallel DAG inference (dag_inference.json)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/n8n/dag_inference.json" \ + -e E2E_TIMEOUT_SEC="600" \ + -v "${{ github.workspace 
}}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: Verify GPU worker actually executed the task + run: | + echo "=== GPU worker logs (full) ===" + docker logs ci-worker-gpu 2>&1 | tee /tmp/worker-gpu-${{ github.run_id }}.log || true + + echo "" + echo "=== Execution evidence check ===" + LOG=/tmp/worker-gpu-${{ github.run_id }}.log + + # Must have received and run a task + if grep -qiE "executor|running task|dispatched|inference|model" "$LOG"; then + echo "✓ Worker received and processed a task" + else + echo "✗ FAIL: No task execution evidence in worker logs" + exit 1 + fi + + # Must show task succeeded (not just status update) + if grep -qiE "succeeded|TASK_SUCCEEDED|done|completed" "$LOG"; then + echo "✓ Task completed successfully in worker" + else + echo "✗ FAIL: No task completion evidence in worker logs" + exit 1 + fi + + echo "" + echo "=== GPU utilization during test ===" + nvidia-smi --query-gpu=name,memory.used,memory.total,utilization.gpu \ + --format=csv,noheader,nounits 2>/dev/null || echo "(nvidia-smi not available)" + + echo "" + echo "=== Result files written by worker ===" + docker run --rm \ + --volumes-from ci-worker-gpu \ + busybox find /var/lib/flowmesh-results -type f 2>/dev/null \ + | head -20 || echo "(could not inspect result volume)" + + - name: Collect logs on failure + if: failure() + run: | + docker compose -p "$PROJECT" \ + -f docker/ci.compose.yml \ + -f docker/ci.worker.gpu.yml \ + logs --no-color > /tmp/ci-gpu-logs-${{ github.run_id }}.txt 2>&1 || true + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: ci-logs-gpu-${{ github.run_id }} + path: /tmp/ci-gpu-logs-${{ github.run_id }}.txt + retention-days: 3 + + - name: Destroy workers via supervisor API + if: always() + run: | + docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec 
-T supervisor \ + curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" || true + sleep 5 + + - name: Teardown + if: always() + run: | + docker rm -f ci-worker-gpu 2>/dev/null || true + docker compose -p "$PROJECT" \ + -f docker/ci.compose.yml \ + -f docker/ci.worker.gpu.yml \ + down -v --remove-orphans + # Remove the CI GPU worker image — rebuilt next run + docker rmi ci/flowmesh_worker:latest-gpu 2>/dev/null || true + # Remove old flowmesh-builder images (keep only the current content hash) + CURRENT_HASH=$(cat \ + src/worker/docker/Dockerfile.cuda.builder \ + src/worker/requirements/requirements.gpu.txt \ + | sha256sum | cut -d' ' -f1 | head -c 12) + docker images --format "{{.Repository}}:{{.Tag}}" \ + | grep "^flowmesh-builder:" \ + | grep -v ":${CURRENT_HASH}$" \ + | xargs -r docker rmi 2>/dev/null || true + # Clean up dangling images and unused volumes + docker image prune -f + docker volume prune -f + echo "=== Disk after teardown ===" + df -h / + docker system df diff --git a/scripts/ci/setup-runner.md b/scripts/ci/setup-runner.md new file mode 100644 index 0000000..f96fd4f --- /dev/null +++ b/scripts/ci/setup-runner.md @@ -0,0 +1,171 @@ +# FlowMesh CI — Self-Hosted Runner Setup + +This guide sets up GitHub Actions self-hosted runners on the FlowMesh GPU and CPU machines. + +## Overview + +| Machine | Role | Labels | +|---------|------|--------| +| luyao3 | Integration tests (CPU) | `self-hosted,linux,luyao3` | +| luyao3 | GPU smoke tests | `self-hosted,linux,luyao3` | + +Each machine runs one runner. Multiple runners on the same machine would cause GPU memory conflicts. 
+ +--- + +## Part 1 — Prerequisites (all machines) + +### 1.1 Create a dedicated runner user + +Run as root: + +```bash +sudo useradd -m -s /bin/bash github-runner +sudo usermod -aG docker github-runner # allow Docker without sudo +``` + +### 1.2 Install Docker + +```bash +curl -fsSL https://get.docker.com | sudo bash +sudo systemctl enable --now docker +``` + +Verify: + +```bash +docker run --rm hello-world +``` + +--- + +## Part 2 — GPU machines only (RTX 5080) + +### 2.1 Install nvidia-container-toolkit + +```bash +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` + +Verify: + +```bash +docker run --rm --gpus all nvidia/cuda:12.0.0-base-ubuntu22.04 nvidia-smi +``` + +--- + +## Part 3 — Install the GitHub Actions runner + +Repeat this section on **each machine** with the appropriate labels. + +### 3.1 Get a runner registration token + +In the GitHub repo: +**Settings → Actions → Runners → New self-hosted runner** + +Copy the token shown (valid for 1 hour). 
+ +### 3.2 Download and configure the runner + +Run as `github-runner` user: + +```bash +sudo -u github-runner -i # switch to runner user + +mkdir -p ~/actions-runner && cd ~/actions-runner + +# Download latest runner (check https://github.com/actions/runner/releases for latest version) +curl -sL https://github.com/actions/runner/releases/download/v2.322.0/actions-runner-linux-x64-2.322.0.tar.gz \ + -o actions-runner.tar.gz +tar xzf actions-runner.tar.gz +rm actions-runner.tar.gz +``` + +Configure — **luyao3 (CPU + GPU)** (replace YOUR_RUNNER_TOKEN with the registration token from step 3.1): + +```bash +./config.sh \ + --url https://github.com/mlsys-io/FlowMesh \ + --token YOUR_RUNNER_TOKEN \ + --name "luyao3" \ + --labels "self-hosted,linux,luyao3" \ + --work "_work" \ + --unattended +``` + +### 3.3 Install as a systemd service + +```bash +# Still as github-runner user inside ~/actions-runner +exit # back to root or sudo user + +sudo /home/github-runner/actions-runner/svc.sh install github-runner +sudo /home/github-runner/actions-runner/svc.sh start +``` + +Verify the service is running: + +```bash +sudo /home/github-runner/actions-runner/svc.sh status +# or +sudo systemctl status actions.runner.mlsys-io-FlowMesh.*.service +``` + +--- + +## Part 4 — GitHub Secrets + +Add these in **Settings → Secrets and variables → Actions**: + +| Secret | Value | Used by | +|--------|-------|---------| +| `HF_TOKEN` | HuggingFace API token | GPU worker (model downloads) | + +The CI API key (`flm-ci-00000000000000000000000000000000`) is hardcoded in the CI compose and test script — it is a fixed test credential, not a real secret. + +--- + +## Part 5 — Verify the runner appears in GitHub + +Go to **Settings → Actions → Runners** in the repo. +Each machine should show as **Idle** within a minute of starting the service. 
+ +--- + +## Maintenance + +### View runner logs + +```bash +journalctl -u "actions.runner.*" -f +``` + +### Remove a runner (requires a fresh removal token from Settings → Actions → Runners) + +```bash +cd ~/actions-runner +sudo ./svc.sh stop +sudo ./svc.sh uninstall +./config.sh remove --token YOUR_RUNNER_TOKEN +``` + +### Disk cleanup (CI build cache accumulates over time) + +Add a cron job on each runner machine: + +```bash +# As root — weekly Docker prune +echo "0 3 * * 0 root docker system prune -f --filter until=168h" \ + > /etc/cron.d/docker-prune +``` From 6600c687ee6d70b5d671314aca1c86c8e0ab8876 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:40:45 +0800 Subject: [PATCH 07/17] feat: add local CI runner script and fix template output destinations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/ci/run_local.sh: migrate from FlowMesh_dev, adapted guardian→supervisor throughout (service exec, compose override, health checks, log references). templates: fix output.destination from http to local in conditional_echo_test.yaml and ssh_noninteractive.yaml; use dev version of echo_three_node_graph.yaml. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- scripts/ci/run_local.sh | 363 +++++++++++++++++++++++++++ templates/conditional_echo_test.yaml | 3 +- templates/echo_three_node_graph.yaml | 8 - templates/ssh_noninteractive.yaml | 3 +- 4 files changed, 367 insertions(+), 10 deletions(-) create mode 100644 scripts/ci/run_local.sh diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh new file mode 100644 index 0000000..f569dcc --- /dev/null +++ b/scripts/ci/run_local.sh @@ -0,0 +1,363 @@ +#!/usr/bin/env bash +# scripts/ci/run_local.sh — Run the full FlowMesh CI pipeline locally +# +# Mirrors the GitHub Actions CI workflow end-to-end so you can test without +# pushing to GitHub. Requires: docker, docker compose v2, uv. 
+# +# Fully isolated from any running FlowMesh services: +# - Host and supervisor ports are dynamically assigned (no fixed 8000/50051) +# - Worker container name is scoped to the process PID +# - Each run gets its own Docker network via compose project name +# +# Usage: +# ./scripts/ci/run_local.sh [OPTIONS] +# +# Options: +# --gpu Run the GPU smoke test instead of the CPU integration test +# --task-yaml PATH Override the workflow YAML submitted to the host +# --timeout SEC Override E2E wait timeout (default: 120, GPU default: 300) +# --no-clean Skip the pre-run docker prune step +# --no-build Skip rebuilding the worker image (use cached) +# --keep Do not tear down services after the run +# -h, --help Show this help + +set -euo pipefail + +# ── Paths ───────────────────────────────────────────────────────────────────────────────────── +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +DOCKER_DIR="$REPO_ROOT/docker" + +# ── Defaults ──────────────────────────────────────────────────────────────────────────────────────── +PROJECT="ci-local-$$" +API_KEY="flm-ci-00000000000000000000000000000000" +GPU=false +TASK_YAML="" +TIMEOUT="" +DO_CLEAN=true +DO_BUILD=true +DO_TEARDOWN=true + +WORKER_IMAGE_CPU="ci/flowmesh_worker:latest-cpu" +WORKER_IMAGE_GPU="ci/flowmesh_worker:latest-gpu" + +# Populated in section 0; referenced in teardown. 
+WORKER_NAME="" +_WORKER_CFG="" +_COMPOSE_OVERRIDE="" +HOST_URL="http://localhost:8000" # overwritten after dc up + +# ── Argument parsing ─────────────────────────────────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu) GPU=true; shift ;; + --task-yaml) TASK_YAML="$2"; shift 2 ;; + --timeout) TIMEOUT="$2"; shift 2 ;; + --no-clean) DO_CLEAN=false; shift ;; + --no-build) DO_BUILD=false; shift ;; + --keep) DO_TEARDOWN=false; shift ;; + -h|--help) sed -n '2,23p' "$0"; exit 0 ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +# ── Colors ────────────────────────────────────────────────────────────────────────────────────── +if [[ -t 1 ]]; then + _B='\033[0;34m' _G='\033[0;32m' _Y='\033[1;33m' _R='\033[0;31m' _N='\033[0m' +else + _B='' _G='' _Y='' _R='' _N='' +fi +log() { echo -e "${_B}[ci]${_N} $*"; } +ok() { echo -e "${_G}[ok]${_N} $*"; } +warn() { echo -e "${_Y}[warn]${_N} $*"; } +fail() { echo -e "${_R}[FAIL]${_N} $*" >&2; } + +# ── Compose helpers ──────────────────────────────────────────────────────────────────────────────────────── +COMPOSE_FILES=(-f "$DOCKER_DIR/ci.compose.yml") +if $GPU; then + COMPOSE_FILES+=(-f "$DOCKER_DIR/ci.worker.gpu.yml") +fi + +dc() { docker compose -p "$PROJECT" "${COMPOSE_FILES[@]}" "$@"; } + +# ── Teardown (trap runs on any exit) ────────────────────────────────────────────────────────────────── +_teardown() { + local code=$? + if ! $DO_TEARDOWN; then + warn "Skipping teardown (--keep). To clean up manually:" + echo " docker compose -p $PROJECT ${COMPOSE_FILES[*]} down -v --remove-orphans" + return + fi + + log "Tearing down..." + + # Always dump service logs before removal — essential for diagnosing failures. 
+ echo + log "Supervisor logs (last 40 lines):" + dc logs supervisor --tail=40 2>/dev/null || true + echo + + if [[ -n "$WORKER_NAME" ]]; then + log "Worker logs ($WORKER_NAME):" + docker logs "$WORKER_NAME" 2>&1 | tail -60 || true + echo + fi + + # Ask supervisor to stop managed workers gracefully. + dc exec -T supervisor \ + curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + -H "Authorization: Bearer $API_KEY" 2>/dev/null || true + sleep 3 + + docker rm -f "$WORKER_NAME" 2>/dev/null || true + dc down -v --remove-orphans 2>/dev/null || true + + # Worker image is intentionally kept: the next build overwrites the tag in-place, + # so there is always exactly one cached image available for --no-build runs. + docker image prune -f >/dev/null + docker volume prune -f >/dev/null + + # Clean up isolation temp files. + rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true + + if [[ $code -eq 0 ]]; then + ok "Local CI run PASSED" + else + fail "Local CI run FAILED (exit $code)" + fi +} +trap _teardown EXIT + +# ── 0. Resolve defaults ────────────────────────────────────────────────────────────────────────────────────────────────────── +if $GPU; then + WORKER_NAME="ci-worker-gpu-$$" + WORKER_IMAGE="$WORKER_IMAGE_GPU" + WORKER_DOCKERFILE="src/worker/docker/Dockerfile.cuda" + [[ -z "$TIMEOUT" ]] && TIMEOUT=300 + # If --task-yaml was given, run only that one; otherwise run the full GPU suite. 
+ if [[ -n "$TASK_YAML" ]]; then + GPU_TASK_YAMLS=("$TASK_YAML") + else + GPU_TASK_YAMLS=( + "$REPO_ROOT/templates/inference_vllm_tiny.yaml" + "$REPO_ROOT/templates/echo_three_node_graph.yaml" + "$REPO_ROOT/templates/dag_inference_example.yaml" + "$REPO_ROOT/templates/conditional_echo_test.yaml" + "$REPO_ROOT/templates/inference_hf_tiny.yaml" + "$REPO_ROOT/templates/lora_sft_llama.yaml" + "$REPO_ROOT/templates/ssh_noninteractive.yaml" + "$REPO_ROOT/templates/n8n/dag_inference.json" + ) + fi +else + WORKER_NAME="ci-worker-cpu-$$" + WORKER_IMAGE="$WORKER_IMAGE_CPU" + WORKER_DOCKERFILE="src/worker/docker/Dockerfile.cpu" + [[ -z "$TASK_YAML" ]] && TASK_YAML="$REPO_ROOT/templates/echo_local.yaml" + [[ -z "$TIMEOUT" ]] && TIMEOUT=120 +fi + +cd "$REPO_ROOT" + +# ── 0b. Create isolation artifacts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +# Worker config: project-scoped alias prevents container name clashes when a +# second local CI run or a dev worker with the same name is already running. +_WORKER_CFG="$(mktemp /tmp/ci-worker-cfg-XXXXXX.yml)" +if $GPU; then + sed "s/ci-worker-gpu/$WORKER_NAME/g" \ + "$DOCKER_DIR/ci.gpu_worker_config.yaml" > "$_WORKER_CFG" +else + cat > "$_WORKER_CFG" < "$_COMPOSE_OVERRIDE" </dev/null || true + # Tear down any stale ci-local-* compose stacks (e.g. from a disconnected SSH session). + docker ps -a --format '{{.Labels}}' \ + | grep -oP 'com\.docker\.compose\.project=ci-local-\d+' \ + | sort -u \ + | sed 's/com\.docker\.compose\.project=//' \ + | xargs -r -I{} docker compose -p {} -f "$DOCKER_DIR/ci.compose.yml" down -v --remove-orphans 2>/dev/null || true + docker image prune -f >/dev/null + docker volume prune -f >/dev/null + docker builder prune -f --keep-storage 5gb 2>/dev/null \ + || docker builder prune -f --filter "until=72h" 2>/dev/null \ + || true +fi + +# ── 2. 
Build worker image ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +if $DO_BUILD; then + log "Building worker image ($WORKER_IMAGE)..." + DOCKER_BUILDKIT=1 docker build \ + -f "$WORKER_DOCKERFILE" \ + -t "$WORKER_IMAGE" \ + . + ok "Worker image built" +else + if ! docker image inspect "$WORKER_IMAGE" >/dev/null 2>&1; then + fail "--no-build specified but image '$WORKER_IMAGE' not found locally." + fail "Run without --no-build first, or: docker build -f $WORKER_DOCKERFILE -t $WORKER_IMAGE ." + exit 1 + fi + log "Using cached worker image: $WORKER_IMAGE" +fi + +# ── 3. Build & start services ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── +# --wait blocks until every healthcheck passes. +log "Starting services (redis × 2, postgres, host, supervisor)..." +if ! DOCKER_BUILDKIT=1 dc up -d --build --wait; then + fail "Services failed to start — supervisor logs:" + dc logs supervisor --tail=60 2>/dev/null || true + exit 1 +fi +ok "All services healthy" + +# ── 4. Resolve the dynamically assigned host port ───────────────────────────────────────────────────────────────────────────────────── +# docker compose port returns 0.0.0.0:0 for 127.0.0.1-only bindings; use docker port instead. +HOST_PORT=$(docker port "$(dc ps -q host)" 8000/tcp \ + | grep '127.0.0.1:' | awk -F: '{print $NF}' | head -1) +HOST_URL="http://localhost:$HOST_PORT" +log "Host bound to $HOST_URL" + +curl -sf "$HOST_URL/healthz" >/dev/null \ + || { fail "Host not reachable at $HOST_URL"; dc logs host --tail=40; exit 1; } +ok "Host healthy at $HOST_URL" + +# ── 5. Confirm supervisor ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── +# Supervisor has start_period:15s; retry up to ~45s to let it fully start. 
+SUPERVISOR_OK=false +for i in $(seq 1 9); do + if dc exec -T supervisor curl -sf http://localhost:8001/healthz >/dev/null 2>&1; then + SUPERVISOR_OK=true; break + fi + echo " supervisor attempt $i/9" + sleep 5 +done +if ! $SUPERVISOR_OK; then + fail "Supervisor never became healthy" + dc logs supervisor --tail=40 || true + exit 1 +fi +ok "Supervisor healthy" + +# ── 6. Debug snapshot ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +echo +log "Container state:" +dc ps +echo +log "Supervisor logs (last 20 lines):" +dc logs supervisor --tail=20 +echo + +# ── 7. Wait for worker to register ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +log "Waiting for worker to register with host..." +REGISTERED=false +for i in $(seq 1 24); do + RESP=$(curl -sf \ + -H "Authorization: Bearer $API_KEY" \ + "$HOST_URL/api/v1/workers" 2>/dev/null || echo "CURL_FAILED") + if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then + REGISTERED=true + break + fi + echo " attempt $i/24 — $RESP" + sleep 5 +done + +if ! $REGISTERED; then + fail "Worker never registered. Supervisor + worker logs:" + dc logs supervisor --tail=40 || true + docker logs "$WORKER_NAME" 2>&1 | tail -40 || true + exit 1 +fi +ok "Worker registered" + +# ── 8. Run E2E smoke test(s) ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── +echo +log "Running E2E smoke test(s)..." +log " HOST=$HOST_URL" + +if $GPU; then + YAML_LIST=("${GPU_TASK_YAMLS[@]}") +else + YAML_LIST=("$TASK_YAML") +fi + +for _YAML in "${YAML_LIST[@]}"; do + log " → $(basename "$_YAML")" + FLOWMESH_HOST_URL="$HOST_URL" \ + FLOWMESH_API_KEY="$API_KEY" \ + TASK_YAML="$_YAML" \ + E2E_TIMEOUT_SEC="$TIMEOUT" \ + uv run --with pytest --with pytest-asyncio --with requests \ + pytest tests/integration/test_e2e.py -v -s +done + +# ── 9. 
Verify worker execution evidence ─────────────────────────────────────────────────────────────────────────────────────────────────────── +echo +log "Verifying worker execution evidence..." +LOG_FILE="/tmp/flowmesh-local-worker-$$.log" +docker logs "$WORKER_NAME" 2>&1 | tee "$LOG_FILE" || true + +if grep -qiE "executor|running task|dispatched|echo|inference|succeeded|TASK_SUCCEEDED|done" "$LOG_FILE"; then + ok "Worker executed and completed the task" +else + fail "No task execution evidence found in worker logs ($LOG_FILE)" + exit 1 +fi + +if $GPU; then + echo + log "GPU utilisation during test:" + nvidia-smi --query-gpu=name,memory.used,memory.total,utilization.gpu \ + --format=csv,noheader,nounits 2>/dev/null \ + || warn "nvidia-smi not available" +fi + +echo +ok "All checks passed" diff --git a/templates/conditional_echo_test.yaml b/templates/conditional_echo_test.yaml index acbc07c..6c5f8ca 100644 --- a/templates/conditional_echo_test.yaml +++ b/templates/conditional_echo_test.yaml @@ -87,6 +87,7 @@ spec: output: destination: - type: "http" + type: "local" + path: "./conditional_echo_test" artifacts: - "results.json" diff --git a/templates/echo_three_node_graph.yaml b/templates/echo_three_node_graph.yaml index 2c503be..405068b 100644 --- a/templates/echo_three_node_graph.yaml +++ b/templates/echo_three_node_graph.yaml @@ -50,11 +50,3 @@ spec: path: "result.items[0].output" - node: "echo-b" path: "result.items[0].output" - - output: - destination: - type: "http" - artifacts: - - "results.json" - - "logs" - - "artifacts" diff --git a/templates/ssh_noninteractive.yaml b/templates/ssh_noninteractive.yaml index e33e71b..8669a89 100644 --- a/templates/ssh_noninteractive.yaml +++ b/templates/ssh_noninteractive.yaml @@ -18,7 +18,8 @@ spec: MY_CUSTOM_VAR: "hello" output: destination: - type: http + type: local + path: "./ssh_noninteractive_output" artifacts: - "results.json" - "logs" From 37bef386179749f3e7a88ab80ad33721d5519df7 Mon Sep 17 00:00:00 2001 From: Qruixuan 
<154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 18:36:36 +0800 Subject: [PATCH 08/17] fix: redesign CI compose for FlowMesh's single-server architecture FlowMesh has no separate host/guardian/postgres services. A single src/server/Dockerfile exposes both HTTP API (8000) and gRPC supervisor (50051). Updated ci.compose.yml, ci.worker.gpu.yml, ci.ports.fixed.yml: - server service built from src/server/Dockerfile - redis only (no postgres) - WORKER_DOCKER_NETWORK uses ${COMPOSE_PROJECT_NAME}_ci-net interpolation - SERVER_HOST=server so spawned workers get SUPERVISOR_GRPC_TARGET=server:50051 Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- docker/ci.compose.yml | 96 ++++++++++----------------------------- docker/ci.ports.fixed.yml | 4 +- docker/ci.worker.gpu.yml | 14 ++---- 3 files changed, 30 insertions(+), 84 deletions(-) diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml index 0b3e47c..df2d298 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -1,19 +1,15 @@ # docker/ci.compose.yml — CI integration test stack (single-host, no GPU) # # Brings up a fully isolated FlowMesh environment for each CI run. -# All services live in an internal Docker network; no state persists between runs. +# FlowMesh uses a single server container (HTTP API port 8000 + gRPC +# supervisor port 50051); no separate host or database service needed. # -# Supervisor spawns the CPU worker via Docker (with Docker socket mounted), -# so the worker gets a proper token and can register correctly. +# The server spawns worker containers via Docker (socket mounted) and +# attaches them to WORKER_DOCKER_NETWORK so they can resolve "server". # # NOTE: No ports are exposed in this base file. 
Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml # - Dynamic (local dev, run_local.sh): generated at runtime -# -# Usage (from repo root): -# docker build -f src/worker/docker/Dockerfile.cpu -t ci/flowmesh_worker:latest-cpu . -# docker compose -p ci-$RUN_ID -f docker/ci.compose.yml -f docker/ci.ports.fixed.yml up -d --build -# docker compose -p ci-$RUN_ID -f docker/ci.compose.yml -f docker/ci.ports.fixed.yml down -v services: redis_control: @@ -36,92 +32,48 @@ services: retries: 10 networks: [ci-net] - postgres: - image: postgres:18-alpine - environment: - POSTGRES_USER: flowmesh - POSTGRES_PASSWORD: flowmesh - POSTGRES_DB: flowmesh - healthcheck: - test: ["CMD-SHELL", "pg_isready -U flowmesh"] - interval: 3s - timeout: 2s - retries: 10 - networks: [ci-net] - - host: - build: - context: .. - dockerfile: src/host/Dockerfile - depends_on: - redis_control: - condition: service_healthy - redis_telemetry: - condition: service_healthy - postgres: - condition: service_healthy - environment: - REDIS_URL: "redis://redis_control:6379/0" - REDIS_CONTROL_URL: "redis://redis_control:6379/0" - REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" - DATABASE_URL: "postgresql+asyncpg://flowmesh:flowmesh@postgres:5432/flowmesh" - HOST_RUN_MIGRATIONS: "true" - API_KEY_HMAC_SECRET: "ci-hmac-secret" - BOOTSTRAP_ORG_ID: "ci-org" - BOOTSTRAP_ADMIN_EXTERNAL_ID: "ci-admin" - BOOTSTRAP_ADMIN_API_KEY: "flm-ci-00000000000000000000000000000000" - ORCHESTRATOR_DISPATCH_MODE: "adaptive" - ORCHESTRATOR_WORKER_SELECTION: "first_fit" - ENABLE_ELASTIC_SCALING: "false" - LOG_LEVEL: "INFO" - healthcheck: - test: ["CMD", "curl", "-sf", "http://localhost:8000/healthz"] - interval: 5s - timeout: 3s - start_period: 20s - retries: 12 - networks: [ci-net] - - supervisor: + server: build: context: .. 
dockerfile: src/server/Dockerfile depends_on: - host: - condition: service_healthy redis_control: condition: service_healthy redis_telemetry: condition: service_healthy environment: - FLOWMESH_BASE_URL: "http://host:8000" - FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" REDIS_CONTROL_URL: "redis://redis_control:6379/0" REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" - SUPERVISOR_NAMESPACE: "ci" - SUPERVISOR_CLUSTER: "ci-cluster" - SUPERVISOR_ALIAS: "ci-supervisor" + FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" + # FLOWMESH_BASE_URL lets the server know its own HTTP address so it + # can embed the correct URL in tokens passed to spawned workers. + FLOWMESH_BASE_URL: "http://server:8000" + # SERVER_HOST tells the server its own gRPC hostname so spawned + # workers receive SUPERVISOR_GRPC_TARGET=server:50051. + SERVER_HOST: "server" + NODE_NAMESPACE: "ci" + NODE_CLUSTER: "ci-cluster" + NODE_ALIAS: "ci-server" LOG_LEVEL: "INFO" # Worker spawning via Docker + ENABLE_SUPERVISOR: "true" FLOWMESH_REGISTRY: "ci" FLOWMESH_VERSION: "latest" - WORKER_CONFIG_PATH: "/etc/supervisor/worker_config.yaml" - WORKER_EXECUTOR_IDLE_CLEANUP_SEC: "0" - # Workers are spawned on the compose network (WORKER_DOCKER_NETWORK) so - # they must reach supervisor by service name, not localhost. - SUPERVISOR_HOST: "supervisor" + WORKER_CONFIG_PATH: "/etc/flowmesh/worker_config.yaml" + # Attach spawned workers to this compose network so they can + # resolve "server" by hostname. Set via COMPOSE_PROJECT_NAME. + WORKER_DOCKER_NETWORK: "${COMPOSE_PROJECT_NAME}_ci-net" # Pass HuggingFace token through so workers can download gated models. - # Set HF_TOKEN in the runner environment (or as a GitHub Actions secret). 
HF_TOKEN: volumes: - /var/run/docker.sock:/var/run/docker.sock - - ./ci.worker_config.yaml:/etc/supervisor/worker_config.yaml:ro + - ./ci.worker_config.yaml:/etc/flowmesh/worker_config.yaml:ro healthcheck: - test: ["CMD", "curl", "-sf", "http://localhost:8001/healthz"] + test: ["CMD", "curl", "-sf", "http://localhost:8000/healthz"] interval: 5s timeout: 3s - start_period: 15s - retries: 10 + start_period: 20s + retries: 12 networks: [ci-net] networks: diff --git a/docker/ci.ports.fixed.yml b/docker/ci.ports.fixed.yml index b5ba67f..71676ea 100644 --- a/docker/ci.ports.fixed.yml +++ b/docker/ci.ports.fixed.yml @@ -13,9 +13,7 @@ # is not used by that script. services: - host: + server: ports: - "8000:8000" - supervisor: - ports: - "50051:50051" diff --git a/docker/ci.worker.gpu.yml b/docker/ci.worker.gpu.yml index 29e905c..a96335d 100644 --- a/docker/ci.worker.gpu.yml +++ b/docker/ci.worker.gpu.yml @@ -1,8 +1,8 @@ # docker/ci.worker.gpu.yml — GPU worker overlay for CI # # Overlay on top of ci.compose.yml for GPU runner (luyao3, RTX 5080). -# Supervisor spawns a GPU worker container (ci/flowmesh_worker:latest-gpu) -# using the Docker socket, same pattern as the CPU integration test. +# Overrides the worker config to use the GPU image and passes the +# compose network name so GPU workers can reach the server by hostname. # # Pre-build the GPU worker image before running compose: # docker build -f src/worker/docker/Dockerfile.cuda \ @@ -15,13 +15,9 @@ # up -d --build services: - supervisor: + server: environment: - WORKER_CONFIG_PATH: "/etc/supervisor/worker_config.yaml" - # Attach GPU workers to the compose network so they can resolve service - # hostnames (e.g. "host") when uploading results. COMPOSE_PROJECT_NAME - # must be exported before docker compose up (run_local.sh does this via - # the compose override; ci.yml sets it explicitly in the step env). 
+ WORKER_CONFIG_PATH: "/etc/flowmesh/worker_config.yaml" WORKER_DOCKER_NETWORK: "${COMPOSE_PROJECT_NAME}_ci-net" volumes: - - ./ci.gpu_worker_config.yaml:/etc/supervisor/worker_config.yaml:ro + - ./ci.gpu_worker_config.yaml:/etc/flowmesh/worker_config.yaml:ro From b20602a45732f4a49fe382064a884f1e99e51b7c Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 18:39:19 +0800 Subject: [PATCH 09/17] fix: update CI workflow and run_local.sh for single-server architecture Key changes: - Single "Wait for server" health check (port 8000) instead of separate host + supervisor - Worker registration check uses docker compose exec -T server (no exposed port needed) - E2E tests use http://server:8000 (internal compose network name) - Destroy workers via server API on port 8000 - COMPOSE_PROJECT_NAME exported so ${COMPOSE_PROJECT_NAME}_ci-net interpolation works - run_local.sh: dc() wrapper exports COMPOSE_PROJECT_NAME; single server port block in compose override; step numbering adjusted (no separate supervisor confirm step) Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 117 +++++++++++++++------------------------ scripts/ci/run_local.sh | 102 +++++++++++----------------------- 2 files changed, 77 insertions(+), 142 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e5aea4..de7b9ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,8 +2,8 @@ name: CI — Integration & GPU Tests on: push: - branches: [main] # run on every merge to main - workflow_dispatch: # also allow manual trigger from GitHub UI + branches: [main] + workflow_dispatch: concurrency: group: ci-${{ github.ref }} @@ -13,7 +13,7 @@ env: FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" jobs: - # ── Integration test (CPU, luyaomini self-hosted runners) ────────────────────── + # ── Integration test (CPU, luyao3 self-hosted runner) ────────────────────── 
integration: name: Integration test (CPU) runs-on: [self-hosted, linux, luyao3] @@ -29,12 +29,9 @@ jobs: - name: Pre-clean stale worker containers and disk run: | docker rm -f ci-worker-cpu 2>/dev/null || true - # Remove the CI worker image so it always rebuilds fresh docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true - # Remove dangling images and unused volumes from crashed/orphaned runs docker image prune -f docker volume prune -f - # Trim build cache: keep 5 GB of recent layers, discard the rest docker builder prune -f --keep-storage 5gb 2>/dev/null \ || docker builder prune -f --filter "until=72h" 2>/dev/null \ || true @@ -54,23 +51,14 @@ jobs: docker compose -p "$PROJECT" -f docker/ci.compose.yml up -d --build env: DOCKER_BUILDKIT: "1" + COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} - - name: Wait for host to be healthy - run: | - timeout 120 bash -c ' - until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T host curl -sf http://localhost:8000/healthz; do - echo "waiting for host…" - sleep 3 - done - ' - - - name: Wait for supervisor to be healthy + - name: Wait for server to be healthy run: | timeout 120 bash -c ' until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T supervisor curl -sf http://localhost:8001/healthz; do - echo "waiting for supervisor…" + exec -T server curl -sf http://localhost:8000/healthz; do + echo "waiting for server…" sleep 3 done ' @@ -79,19 +67,22 @@ jobs: run: | echo "=== Running containers ===" docker compose -p "$PROJECT" -f docker/ci.compose.yml ps - echo "=== All Docker containers (incl. supervisor-spawned worker) ===" + echo "=== All Docker containers (incl. 
server-spawned worker) ===" docker ps -a --format "table {{.Names}}\t{{.Image}}\t{{.Status}}" | grep -E "NAME|worker|ci-worker" || true - echo "=== Supervisor logs ===" - docker compose -p "$PROJECT" -f docker/ci.compose.yml logs supervisor --tail=40 - echo "=== Worker container logs (supervisor-spawned) ===" + echo "=== Server logs ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs server --tail=40 + echo "=== Worker container logs (server-spawned) ===" docker logs ci-worker-cpu 2>&1 | tail -40 || echo "(no ci-worker-cpu container found)" - name: Wait for worker to register run: | for i in $(seq 1 24); do - RESP=$(curl -sf \ - -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ - http://localhost:8000/api/v1/workers || echo "CURL_FAILED") + RESP=$(docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T server \ + curl -sf \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ + http://localhost:8000/api/v1/workers 2>/dev/null \ + || echo "CURL_FAILED") echo "Attempt $i: $RESP" if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then echo "Worker registered!" @@ -99,15 +90,15 @@ jobs: fi sleep 5 done - echo "=== Worker never registered. Final worker logs ===" - docker compose -p "$PROJECT" -f docker/ci.compose.yml logs worker --tail=80 + echo "=== Worker never registered. 
Final server logs ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs server --tail=80 exit 1 - name: Run E2E smoke test (echo task) run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/echo_local.yaml" \ -e E2E_TIMEOUT_SEC="120" \ @@ -152,12 +143,12 @@ jobs: path: /tmp/ci-logs-${{ github.run_id }}.txt retention-days: 3 - - name: Destroy workers via supervisor API + - name: Destroy workers via server API if: always() run: | docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T supervisor \ - curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + exec -T server \ + curl -sf -X DELETE http://localhost:8000/api/v1/workers \ -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" || true sleep 5 @@ -166,22 +157,19 @@ jobs: run: | docker rm -f ci-worker-cpu 2>/dev/null || true docker compose -p "$PROJECT" -f docker/ci.compose.yml down -v --remove-orphans - # Remove the built CI image — it will be rebuilt next run docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true - # Clean up dangling images and unused volumes left by this run docker image prune -f docker volume prune -f echo "=== Disk after teardown ===" df -h / docker system df - # ── GPU smoke test (RTX 5080 self-hosted runners) ───────────────────────── + # ── GPU smoke test (RTX 5080 self-hosted runner) ───────────────────────── gpu-smoke: name: GPU smoke test (RTX 5080) needs: integration runs-on: [self-hosted, linux, luyao3] timeout-minutes: 90 - # One GPU job at a time per machine concurrency: group: gpu-rtx5080-${{ github.ref }} cancel-in-progress: false @@ -196,12 +184,9 @@ jobs: - name: Pre-clean stale worker containers and disk run: | docker rm -f ci-worker-gpu 2>/dev/null || true - # Remove old CI GPU worker image (rebuilt each run) docker rmi ci/flowmesh_worker:latest-gpu 2>/dev/null || true - # 
Remove dangling images and unused volumes from crashed/orphaned runs docker image prune -f docker volume prune -f - # Trim build cache but keep recent layers for faster builds docker builder prune -f --keep-storage 5gb 2>/dev/null \ || docker builder prune -f --filter "until=72h" 2>/dev/null \ || true @@ -211,8 +196,6 @@ jobs: - name: Build GPU worker builder image (cached by content hash) run: | - # Hash Dockerfile.cuda.builder + GPU requirements so we only rebuild - # when the actual inputs change. The tagged image persists on the runner. BUILDER_HASH=$(cat \ src/worker/docker/Dockerfile.cuda.builder \ src/worker/requirements/requirements.gpu.txt \ @@ -249,22 +232,12 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} - - name: Wait for host to be healthy - run: | - timeout 120 bash -c ' - until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T host curl -sf http://localhost:8000/healthz; do - echo "waiting for host…" - sleep 3 - done - ' - - - name: Wait for supervisor to be healthy + - name: Wait for server to be healthy run: | timeout 120 bash -c ' until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T supervisor curl -sf http://localhost:8001/healthz; do - echo "waiting for supervisor…" + exec -T server curl -sf http://localhost:8000/healthz; do + echo "waiting for server…" sleep 3 done ' @@ -272,9 +245,12 @@ jobs: - name: Wait for GPU worker to register run: | for i in $(seq 1 36); do - RESP=$(curl -sf \ - -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ - http://localhost:8000/api/v1/workers || echo "CURL_FAILED") + RESP=$(docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T server \ + curl -sf \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ + http://localhost:8000/api/v1/workers 2>/dev/null \ + || echo "CURL_FAILED") echo "Attempt $i: $RESP" if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then echo "Worker registered!" 
@@ -289,7 +265,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_vllm_tiny.yaml" \ -e E2E_TIMEOUT_SEC="300" \ @@ -302,7 +278,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/echo_three_node_graph.yaml" \ -e E2E_TIMEOUT_SEC="120" \ @@ -315,7 +291,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/dag_inference_example.yaml" \ -e E2E_TIMEOUT_SEC="600" \ @@ -328,7 +304,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/conditional_echo_test.yaml" \ -e E2E_TIMEOUT_SEC="120" \ @@ -341,7 +317,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_hf_tiny.yaml" \ -e E2E_TIMEOUT_SEC="300" \ @@ -354,7 +330,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/lora_sft_llama.yaml" \ -e E2E_TIMEOUT_SEC="1200" \ @@ -367,7 +343,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ 
-e TASK_YAML="/templates/ssh_noninteractive.yaml" \ -e E2E_TIMEOUT_SEC="120" \ @@ -380,7 +356,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/n8n/dag_inference.json" \ -e E2E_TIMEOUT_SEC="600" \ @@ -398,7 +374,6 @@ jobs: echo "=== Execution evidence check ===" LOG=/tmp/worker-gpu-${{ github.run_id }}.log - # Must have received and run a task if grep -qiE "executor|running task|dispatched|inference|model" "$LOG"; then echo "✓ Worker received and processed a task" else @@ -406,7 +381,6 @@ jobs: exit 1 fi - # Must show task succeeded (not just status update) if grep -qiE "succeeded|TASK_SUCCEEDED|done|completed" "$LOG"; then echo "✓ Task completed successfully in worker" else @@ -442,12 +416,12 @@ jobs: path: /tmp/ci-gpu-logs-${{ github.run_id }}.txt retention-days: 3 - - name: Destroy workers via supervisor API + - name: Destroy workers via server API if: always() run: | docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T supervisor \ - curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + exec -T server \ + curl -sf -X DELETE http://localhost:8000/api/v1/workers \ -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" || true sleep 5 @@ -459,9 +433,7 @@ jobs: -f docker/ci.compose.yml \ -f docker/ci.worker.gpu.yml \ down -v --remove-orphans - # Remove the CI GPU worker image — rebuilt next run docker rmi ci/flowmesh_worker:latest-gpu 2>/dev/null || true - # Remove old flowmesh-builder images (keep only the current content hash) CURRENT_HASH=$(cat \ src/worker/docker/Dockerfile.cuda.builder \ src/worker/requirements/requirements.gpu.txt \ @@ -470,7 +442,6 @@ jobs: | grep "^flowmesh-builder:" \ | grep -v ":${CURRENT_HASH}$" \ | xargs -r docker rmi 2>/dev/null || true - # Clean up dangling images and unused volumes docker image prune -f docker volume prune -f 
echo "=== Disk after teardown ===" diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index f569dcc..99d52e1 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -5,7 +5,8 @@ # pushing to GitHub. Requires: docker, docker compose v2, uv. # # Fully isolated from any running FlowMesh services: -# - Host and supervisor ports are dynamically assigned (no fixed 8000/50051) +# - Server HTTP port is dynamically assigned (no fixed 8000) +# - gRPC port 50051 is fixed (workers cannot follow a dynamic port) # - Worker container name is scoped to the process PID # - Each run gets its own Docker network via compose project name # @@ -14,7 +15,7 @@ # # Options: # --gpu Run the GPU smoke test instead of the CPU integration test -# --task-yaml PATH Override the workflow YAML submitted to the host +# --task-yaml PATH Override the workflow YAML submitted to the server # --timeout SEC Override E2E wait timeout (default: 120, GPU default: 300) # --no-clean Skip the pre-run docker prune step # --no-build Skip rebuilding the worker image (use cached) @@ -40,13 +41,12 @@ DO_TEARDOWN=true WORKER_IMAGE_CPU="ci/flowmesh_worker:latest-cpu" WORKER_IMAGE_GPU="ci/flowmesh_worker:latest-gpu" -# Populated in section 0; referenced in teardown. 
WORKER_NAME="" _WORKER_CFG="" _COMPOSE_OVERRIDE="" -HOST_URL="http://localhost:8000" # overwritten after dc up +HOST_URL="http://localhost:8000" -# ── Argument parsing ─────────────────────────────────────────────────────────────────────────────────────── +# ── Argument parsing ────────────────────────────────────────────────────────────────────────────────────── while [[ $# -gt 0 ]]; do case "$1" in --gpu) GPU=true; shift ;; @@ -77,23 +77,22 @@ if $GPU; then COMPOSE_FILES+=(-f "$DOCKER_DIR/ci.worker.gpu.yml") fi -dc() { docker compose -p "$PROJECT" "${COMPOSE_FILES[@]}" "$@"; } +dc() { COMPOSE_PROJECT_NAME="$PROJECT" docker compose -p "$PROJECT" "${COMPOSE_FILES[@]}" "$@"; } # ── Teardown (trap runs on any exit) ────────────────────────────────────────────────────────────────── _teardown() { local code=$? if ! $DO_TEARDOWN; then warn "Skipping teardown (--keep). To clean up manually:" - echo " docker compose -p $PROJECT ${COMPOSE_FILES[*]} down -v --remove-orphans" + echo " COMPOSE_PROJECT_NAME=$PROJECT docker compose -p $PROJECT ${COMPOSE_FILES[*]} down -v --remove-orphans" return fi log "Tearing down..." - # Always dump service logs before removal — essential for diagnosing failures. echo - log "Supervisor logs (last 40 lines):" - dc logs supervisor --tail=40 2>/dev/null || true + log "Server logs (last 40 lines):" + dc logs server --tail=40 2>/dev/null || true echo if [[ -n "$WORKER_NAME" ]]; then @@ -102,21 +101,16 @@ _teardown() { echo fi - # Ask supervisor to stop managed workers gracefully. 
- dc exec -T supervisor \ - curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + dc exec -T server \ + curl -sf -X DELETE http://localhost:8000/api/v1/workers \ -H "Authorization: Bearer $API_KEY" 2>/dev/null || true sleep 3 docker rm -f "$WORKER_NAME" 2>/dev/null || true dc down -v --remove-orphans 2>/dev/null || true - # Worker image is intentionally kept: the next build overwrites the tag in-place, - # so there is always exactly one cached image available for --no-build runs. docker image prune -f >/dev/null docker volume prune -f >/dev/null - - # Clean up isolation temp files. rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true if [[ $code -eq 0 ]]; then @@ -133,7 +127,6 @@ if $GPU; then WORKER_IMAGE="$WORKER_IMAGE_GPU" WORKER_DOCKERFILE="src/worker/docker/Dockerfile.cuda" [[ -z "$TIMEOUT" ]] && TIMEOUT=300 - # If --task-yaml was given, run only that one; otherwise run the full GPU suite. if [[ -n "$TASK_YAML" ]]; then GPU_TASK_YAMLS=("$TASK_YAML") else @@ -159,8 +152,6 @@ fi cd "$REPO_ROOT" # ── 0b. Create isolation artifacts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── -# Worker config: project-scoped alias prevents container name clashes when a -# second local CI run or a dev worker with the same name is already running. _WORKER_CFG="$(mktemp /tmp/ci-worker-cfg-XXXXXX.yml)" if $GPU; then sed "s/ci-worker-gpu/$WORKER_NAME/g" \ @@ -179,23 +170,18 @@ workers: EOF fi -# Compose override: host port is dynamic (avoids silently hitting a production -# host on 8000). Supervisor gRPC stays on fixed 50051 — workers are spawned with -# SUPERVISOR_GRPC_TARGET=localhost:50051 and cannot follow a random port. -# If 50051 is already taken, dc up fails loudly at startup. +# Compose override: HTTP port is dynamic, gRPC port stays fixed at 50051. +# Workers receive SUPERVISOR_GRPC_TARGET=server:50051 (set via SERVER_HOST +# in ci.compose.yml) and cannot follow a dynamic port. 
_COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" </dev/null || true - # Tear down any stale ci-local-* compose stacks (e.g. from a disconnected SSH session). docker ps -a --format '{{.Labels}}' \ | grep -oP 'com\.docker\.compose\.project=ci-local-\d+' \ | sort -u \ @@ -241,61 +225,41 @@ if $DO_BUILD; then else if ! docker image inspect "$WORKER_IMAGE" >/dev/null 2>&1; then fail "--no-build specified but image '$WORKER_IMAGE' not found locally." - fail "Run without --no-build first, or: docker build -f $WORKER_DOCKERFILE -t $WORKER_IMAGE ." exit 1 fi log "Using cached worker image: $WORKER_IMAGE" fi -# ── 3. Build & start services ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── -# --wait blocks until every healthcheck passes. -log "Starting services (redis × 2, postgres, host, supervisor)..." +# ── 3. Build & start services ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +log "Starting services (redis × 2, server)..." if ! DOCKER_BUILDKIT=1 dc up -d --build --wait; then - fail "Services failed to start — supervisor logs:" - dc logs supervisor --tail=60 2>/dev/null || true + fail "Services failed to start — server logs:" + dc logs server --tail=60 2>/dev/null || true exit 1 fi ok "All services healthy" # ── 4. Resolve the dynamically assigned host port ───────────────────────────────────────────────────────────────────────────────────── -# docker compose port returns 0.0.0.0:0 for 127.0.0.1-only bindings; use docker port instead. 
-HOST_PORT=$(docker port "$(dc ps -q host)" 8000/tcp \ +HOST_PORT=$(docker port "$(dc ps -q server)" 8000/tcp \ | grep '127.0.0.1:' | awk -F: '{print $NF}' | head -1) HOST_URL="http://localhost:$HOST_PORT" -log "Host bound to $HOST_URL" +log "Server HTTP bound to $HOST_URL" curl -sf "$HOST_URL/healthz" >/dev/null \ - || { fail "Host not reachable at $HOST_URL"; dc logs host --tail=40; exit 1; } -ok "Host healthy at $HOST_URL" - -# ── 5. Confirm supervisor ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── -# Supervisor has start_period:15s; retry up to ~45s to let it fully start. -SUPERVISOR_OK=false -for i in $(seq 1 9); do - if dc exec -T supervisor curl -sf http://localhost:8001/healthz >/dev/null 2>&1; then - SUPERVISOR_OK=true; break - fi - echo " supervisor attempt $i/9" - sleep 5 -done -if ! $SUPERVISOR_OK; then - fail "Supervisor never became healthy" - dc logs supervisor --tail=40 || true - exit 1 -fi -ok "Supervisor healthy" + || { fail "Server not reachable at $HOST_URL"; dc logs server --tail=40; exit 1; } +ok "Server healthy at $HOST_URL" -# ── 6. Debug snapshot ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 5. Debug snapshot ──────────────────────────────────────────────────────────────────────────────────────────────────────────── echo log "Container state:" dc ps echo -log "Supervisor logs (last 20 lines):" -dc logs supervisor --tail=20 +log "Server logs (last 20 lines):" +dc logs server --tail=20 echo -# ── 7. Wait for worker to register ─────────────────────────────────────────────────────────────────────────────────────────────────────────── -log "Waiting for worker to register with host..." +# ── 6. Wait for worker to register ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +log "Waiting for worker to register with server..." 
REGISTERED=false for i in $(seq 1 24); do RESP=$(curl -sf \ @@ -310,14 +274,14 @@ for i in $(seq 1 24); do done if ! $REGISTERED; then - fail "Worker never registered. Supervisor + worker logs:" - dc logs supervisor --tail=40 || true + fail "Worker never registered. Server + worker logs:" + dc logs server --tail=40 || true docker logs "$WORKER_NAME" 2>&1 | tail -40 || true exit 1 fi ok "Worker registered" -# ── 8. Run E2E smoke test(s) ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 7. Run E2E smoke test(s) ──────────────────────────────────────────────────────────────────────────────────────────────────────────── echo log "Running E2E smoke test(s)..." log " HOST=$HOST_URL" @@ -338,7 +302,7 @@ for _YAML in "${YAML_LIST[@]}"; do pytest tests/integration/test_e2e.py -v -s done -# ── 9. Verify worker execution evidence ─────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 8. Verify worker execution evidence ─────────────────────────────────────────────────────────────────────────────────────────────────────── echo log "Verifying worker execution evidence..." LOG_FILE="/tmp/flowmesh-local-worker-$$.log" From 9c5733db636c0c5c06069a6b21290c92d2a06a52 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 18:46:11 +0800 Subject: [PATCH 10/17] =?UTF-8?q?fix:=20correct=20worker=20networking=20?= =?UTF-8?q?=E2=80=94=20host-mode=20workers=20need=20localhost=20URLs=20and?= =?UTF-8?q?=20exposed=20ports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workers are spawned by the Docker adapter with network_mode: host (see supervisor/adapters/docker.py _start()). They connect to the gRPC supervisor at localhost:50051 and download results via FLOWMESH_BASE_URL. Three bugs in the previous CI setup: 1. 
WORKER_DOCKER_NETWORK env var doesn't exist in FlowMesh — removed. 2. FLOWMESH_BASE_URL was "http://server:8000" but workers on host network can't resolve "server"; changed to "http://localhost:8000". 3. CI workflow never exposed ports 8000/50051 on the host, so workers (network_mode: host) couldn't reach the server container at all; added ci.ports.fixed.yml to both build steps. 4. run_local.sh used a dynamic HTTP port, but FLOWMESH_BASE_URL in the compose is a static value set before start; changed to fixed 127.0.0.1:8000:8000 so workers can always reach http://localhost:8000. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 6 ++++- docker/ci.compose.yml | 18 +++++-------- scripts/ci/run_local.sh | 55 ++++++++++++++++++++-------------------- 3 files changed, 40 insertions(+), 39 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index de7b9ae..8bb9122 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,7 +48,10 @@ jobs: - name: Build & start services run: | - docker compose -p "$PROJECT" -f docker/ci.compose.yml up -d --build + docker compose -p "$PROJECT" \ + -f docker/ci.compose.yml \ + -f docker/ci.ports.fixed.yml \ + up -d --build env: DOCKER_BUILDKIT: "1" COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} @@ -226,6 +229,7 @@ jobs: docker compose -p "$PROJECT" \ -f docker/ci.compose.yml \ -f docker/ci.worker.gpu.yml \ + -f docker/ci.ports.fixed.yml \ up -d --build env: DOCKER_BUILDKIT: "1" diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml index df2d298..448ca31 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -4,12 +4,13 @@ # FlowMesh uses a single server container (HTTP API port 8000 + gRPC # supervisor port 50051); no separate host or database service needed. # -# The server spawns worker containers via Docker (socket mounted) and -# attaches them to WORKER_DOCKER_NETWORK so they can resolve "server". 
+# Workers are spawned by the server's Docker adapter with network_mode: host. +# They connect to gRPC at localhost:50051 and HTTP at http://localhost:8000. +# Ports 8000 and 50051 MUST therefore be bound on the Docker host machine. # # NOTE: No ports are exposed in this base file. Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml -# - Dynamic (local dev, run_local.sh): generated at runtime +# - Fixed local (run_local.sh): generated at runtime services: redis_control: @@ -45,11 +46,9 @@ services: REDIS_CONTROL_URL: "redis://redis_control:6379/0" REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" - # FLOWMESH_BASE_URL lets the server know its own HTTP address so it - # can embed the correct URL in tokens passed to spawned workers. - FLOWMESH_BASE_URL: "http://server:8000" - # SERVER_HOST tells the server its own gRPC hostname so spawned - # workers receive SUPERVISOR_GRPC_TARGET=server:50051. + # Workers run with network_mode: host, so FLOWMESH_BASE_URL must be + # reachable from the Docker host (not the compose overlay network). + FLOWMESH_BASE_URL: "http://localhost:8000" SERVER_HOST: "server" NODE_NAMESPACE: "ci" NODE_CLUSTER: "ci-cluster" @@ -60,9 +59,6 @@ services: FLOWMESH_REGISTRY: "ci" FLOWMESH_VERSION: "latest" WORKER_CONFIG_PATH: "/etc/flowmesh/worker_config.yaml" - # Attach spawned workers to this compose network so they can - # resolve "server" by hostname. Set via COMPOSE_PROJECT_NAME. - WORKER_DOCKER_NETWORK: "${COMPOSE_PROJECT_NAME}_ci-net" # Pass HuggingFace token through so workers can download gated models. HF_TOKEN: volumes: diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index 99d52e1..bef117d 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -5,11 +5,15 @@ # pushing to GitHub. Requires: docker, docker compose v2, uv. 
# # Fully isolated from any running FlowMesh services: -# - Server HTTP port is dynamically assigned (no fixed 8000) +# - Server HTTP port is fixed at 8000 (workers need a known address) # - gRPC port 50051 is fixed (workers cannot follow a dynamic port) # - Worker container name is scoped to the process PID # - Each run gets its own Docker network via compose project name # +# IMPORTANT: Ports 8000 and 50051 must be free on your machine. +# Workers are spawned with network_mode: host and connect to these +# ports on localhost to reach the server container. +# # Usage: # ./scripts/ci/run_local.sh [OPTIONS] # @@ -24,11 +28,11 @@ set -euo pipefail -# ── Paths ───────────────────────────────────────────────────────────────────────────────────── +# ── Paths ───────────────────────────────────────────────────────────────────── REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" DOCKER_DIR="$REPO_ROOT/docker" -# ── Defaults ──────────────────────────────────────────────────────────────────────────────────────── +# ── Defaults ────────────────────────────────────────────────────────────────── PROJECT="ci-local-$$" API_KEY="flm-ci-00000000000000000000000000000000" GPU=false @@ -46,7 +50,7 @@ _WORKER_CFG="" _COMPOSE_OVERRIDE="" HOST_URL="http://localhost:8000" -# ── Argument parsing ────────────────────────────────────────────────────────────────────────────────────── +# ── Argument parsing ─────────────────────────────────────────────────────────── while [[ $# -gt 0 ]]; do case "$1" in --gpu) GPU=true; shift ;; @@ -55,12 +59,12 @@ while [[ $# -gt 0 ]]; do --no-clean) DO_CLEAN=false; shift ;; --no-build) DO_BUILD=false; shift ;; --keep) DO_TEARDOWN=false; shift ;; - -h|--help) sed -n '2,23p' "$0"; exit 0 ;; + -h|--help) sed -n '2,25p' "$0"; exit 0 ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done -# ── Colors ────────────────────────────────────────────────────────────────────────────────────── +# ── Colors 
──────────────────────────────────────────────────────────────────── if [[ -t 1 ]]; then _B='\033[0;34m' _G='\033[0;32m' _Y='\033[1;33m' _R='\033[0;31m' _N='\033[0m' else @@ -71,7 +75,7 @@ ok() { echo -e "${_G}[ok]${_N} $*"; } warn() { echo -e "${_Y}[warn]${_N} $*"; } fail() { echo -e "${_R}[FAIL]${_N} $*" >&2; } -# ── Compose helpers ──────────────────────────────────────────────────────────────────────────────────────── +# ── Compose helpers ─────────────────────────────────────────────────────────── COMPOSE_FILES=(-f "$DOCKER_DIR/ci.compose.yml") if $GPU; then COMPOSE_FILES+=(-f "$DOCKER_DIR/ci.worker.gpu.yml") @@ -79,7 +83,7 @@ fi dc() { COMPOSE_PROJECT_NAME="$PROJECT" docker compose -p "$PROJECT" "${COMPOSE_FILES[@]}" "$@"; } -# ── Teardown (trap runs on any exit) ────────────────────────────────────────────────────────────────── +# ── Teardown (trap runs on any exit) ────────────────────────────────────────── _teardown() { local code=$? if ! $DO_TEARDOWN; then @@ -121,7 +125,7 @@ _teardown() { } trap _teardown EXIT -# ── 0. Resolve defaults ────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 0. Resolve defaults ─────────────────────────────────────────────────────── if $GPU; then WORKER_NAME="ci-worker-gpu-$$" WORKER_IMAGE="$WORKER_IMAGE_GPU" @@ -151,7 +155,7 @@ fi cd "$REPO_ROOT" -# ── 0b. Create isolation artifacts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 0b. Create isolation artifacts ──────────────────────────────────────────── _WORKER_CFG="$(mktemp /tmp/ci-worker-cfg-XXXXXX.yml)" if $GPU; then sed "s/ci-worker-gpu/$WORKER_NAME/g" \ @@ -170,15 +174,16 @@ workers: EOF fi -# Compose override: HTTP port is dynamic, gRPC port stays fixed at 50051. -# Workers receive SUPERVISOR_GRPC_TARGET=server:50051 (set via SERVER_HOST -# in ci.compose.yml) and cannot follow a dynamic port. 
+# Compose override: both ports are fixed so workers (network_mode: host) +# can reach localhost:8000 (HTTP) and localhost:50051 (gRPC). +# FLOWMESH_BASE_URL in ci.compose.yml is http://localhost:8000 — this +# must match the HTTP port binding below. _COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" </dev/null \ || { fail "Server not reachable at $HOST_URL"; dc logs server --tail=40; exit 1; } ok "Server healthy at $HOST_URL" -# ── 5. Debug snapshot ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 5. Debug snapshot ───────────────────────────────────────────────────────── echo log "Container state:" dc ps @@ -258,7 +259,7 @@ log "Server logs (last 20 lines):" dc logs server --tail=20 echo -# ── 6. Wait for worker to register ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 6. Wait for worker to register ─────────────────────────────────────────── log "Waiting for worker to register with server..." REGISTERED=false for i in $(seq 1 24); do @@ -281,7 +282,7 @@ if ! $REGISTERED; then fi ok "Worker registered" -# ── 7. Run E2E smoke test(s) ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 7. Run E2E smoke test(s) ────────────────────────────────────────────────── echo log "Running E2E smoke test(s)..." log " HOST=$HOST_URL" @@ -302,7 +303,7 @@ for _YAML in "${YAML_LIST[@]}"; do pytest tests/integration/test_e2e.py -v -s done -# ── 8. Verify worker execution evidence ─────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 8. Verify worker execution evidence ────────────────────────────────────── echo log "Verifying worker execution evidence..." 
LOG_FILE="/tmp/flowmesh-local-worker-$$.log" From 62f2fca1656c239f8c19cf1c9af26b3b2010e161 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 18:55:21 +0800 Subject: [PATCH 11/17] =?UTF-8?q?feat:=20add=20tests/integration/test=5Fe2?= =?UTF-8?q?e.py=20=E2=80=94=20E2E=20smoke=20test=20for=20CI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrated from FlowMesh_dev ci/gpu-runner-setup-v2 branch unchanged: - Submits a workflow YAML to a live server and polls until DONE/FAILED - Skips automatically when FLOWMESH_HOST_URL is unset (safe for unit test runs) - Handles n8n JSON and native YAML formats - Skips (not fails) when executor package is unavailable on the worker Used by run_local.sh (step 7) and .github/workflows/ci.yml E2E steps. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- tests/integration/__init__.py | 0 tests/integration/conftest.py | 68 ++++++++++ tests/integration/test_e2e.py | 230 ++++++++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/conftest.py create mode 100644 tests/integration/test_e2e.py diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 0000000..53f9ea2 --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,68 @@ +""" +Pytest configuration for FlowMesh end-to-end integration tests. + +Registers CLI options so the suite can be driven without pre-setting env vars: + + pytest tests/integration/ --host-url http://myserver:8000 --api-key flm-... + +The options are synced into environment variables during pytest_configure so +that module-level constants and the pytestmark skip-condition in test_e2e.py +pick them up at collection time (before any fixtures run). 
+""" + +import os + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + group = parser.getgroup("e2e", "FlowMesh end-to-end tests") + group.addoption( + "--host-url", + default=None, + metavar="URL", + help="FlowMesh host base URL (overrides FLOWMESH_HOST_URL env var)", + ) + group.addoption( + "--api-key", + default=None, + metavar="KEY", + help="FlowMesh API key (overrides FLOWMESH_API_KEY env var)", + ) + group.addoption( + "--task-yaml", + default=None, + metavar="PATH", + help="Path to workflow YAML to submit (overrides TASK_YAML env var)", + ) + group.addoption( + "--e2e-timeout", + type=int, + default=None, + metavar="SEC", + help="Max seconds to wait for task completion (overrides E2E_TIMEOUT_SEC)", + ) + + +def pytest_configure(config: pytest.Config) -> None: + config.addinivalue_line( + "markers", + "e2e: end-to-end integration tests that require a live FlowMesh host", + ) + + # Sync CLI options into env vars *before* test modules are collected so + # that module-level constants and pytestmark conditions in test_e2e.py see + # the right values. os.environ.setdefault is used so an explicit env var + # always takes precedence over a CLI flag. + _sync_opt(config, "--host-url", "FLOWMESH_HOST_URL") + _sync_opt(config, "--api-key", "FLOWMESH_API_KEY") + _sync_opt(config, "--task-yaml", "TASK_YAML") + if (timeout := config.getoption("--e2e-timeout")) is not None: + os.environ.setdefault("E2E_TIMEOUT_SEC", str(timeout)) + + +def _sync_opt(config: pytest.Config, opt: str, env_var: str) -> None: + """If *opt* was passed on the CLI, set *env_var* unless already present.""" + value: str | None = config.getoption(opt) + if value is not None: + os.environ.setdefault(env_var, value) diff --git a/tests/integration/test_e2e.py b/tests/integration/test_e2e.py new file mode 100644 index 0000000..5c22f0d --- /dev/null +++ b/tests/integration/test_e2e.py @@ -0,0 +1,230 @@ +""" +End-to-end integration test for FlowMesh CI. 
+ +Submits a workflow YAML to a running FlowMesh host and asserts the task +reaches DONE status within the timeout. + +Skipped automatically when FLOWMESH_HOST_URL is not set in the environment +so this file does not break the regular unit-test suite. + +Environment variables: + FLOWMESH_HOST_URL Base URL of the host (default: http://localhost:8000) + FLOWMESH_API_KEY API key for authentication + TASK_YAML Path to a workflow YAML or n8n JSON file to submit + (default: /templates/echo_local.yaml) + Files ending in .json are submitted as n8n format. + E2E_TIMEOUT_SEC Max seconds to wait for task completion (default: 120) +""" + +import os +import re +import sys +import time +from pathlib import Path +from typing import Any + +import pytest +import requests + +# Task errors that indicate the executor package is missing/broken on this +# worker rather than a genuine workflow logic failure. The test skips instead +# of failing so CI stays green while the gap is clearly surfaced. +_EXECUTOR_UNAVAILABLE_RE = re.compile( + r"not available|not installed|not importable", + re.IGNORECASE, +) + +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent + +HOST_URL = os.getenv("FLOWMESH_HOST_URL", "http://localhost:8000").rstrip("/") +API_KEY = os.getenv("FLOWMESH_API_KEY", "flm-ci-00000000000000000000000000000000") +TASK_YAML = os.getenv("TASK_YAML", str(_REPO_ROOT / "templates" / "echo_local.yaml")) +TIMEOUT = int(os.getenv("E2E_TIMEOUT_SEC", "120")) +POLL_INTERVAL = 3 + +HEADERS = {"Authorization": f"Bearer {API_KEY}"} + +# Skip the whole module when no host is configured — keeps the unit-test suite +# clean. The E2E CI job always sets FLOWMESH_HOST_URL explicitly. 
+pytestmark = pytest.mark.skipif( + os.getenv("FLOWMESH_HOST_URL") is None, + reason="requires a running FlowMesh host; set FLOWMESH_HOST_URL to enable", +) + + +def _wait_for_host(timeout: int = 60) -> None: + deadline = time.time() + timeout + while time.time() < deadline: + try: + r = requests.get(f"{HOST_URL}/healthz", timeout=3) + if r.status_code == 200: + print(f"[e2e] Host is up at {HOST_URL}") + return + except requests.RequestException: + pass + time.sleep(2) + pytest.fail(f"[e2e] Host did not become healthy within {timeout}s") + + +def _submit_workflow() -> tuple[str, str]: + """Submit workflow file, return (workflow_id, first_task_id). + + Files ending in .json are submitted as n8n format (Workflow-Format: n8n). + All other files are submitted as native YAML (text/plain). + """ + try: + with open(TASK_YAML) as f: + content = f.read() + except FileNotFoundError: + pytest.fail(f"[e2e] Task YAML not found: {TASK_YAML}") + + is_n8n = Path(TASK_YAML).suffix.lower() == ".json" + fmt_label = "n8n" if is_n8n else "native" + print(f"[e2e] Submitting {fmt_label} workflow from {TASK_YAML}") + + extra_headers: dict[str, str] = {} + if is_n8n: + extra_headers["Workflow-Format"] = "n8n" + extra_headers["Content-Type"] = "application/json" + else: + extra_headers["Content-Type"] = "text/plain" + + r = requests.post( + f"{HOST_URL}/api/v1/workflows", + data=content.encode("utf-8"), + headers={**HEADERS, **extra_headers}, + timeout=10, + ) + if r.status_code not in (200, 201): + pytest.fail(f"[e2e] Workflow submission failed {r.status_code}: {r.text}") + + body: dict[str, Any] = r.json() + workflow_id: str = body["workflow_id"] + task_id: str = body["tasks"][0]["task_id"] + print(f"[e2e] Submitted workflow {workflow_id}, task {task_id}") + return workflow_id, task_id + + +def _dump_task_logs(task_id: str) -> str: + """Print task logs to stderr and return them as a single string for matching.""" + try: + r = requests.get( + f"{HOST_URL}/api/v1/tasks/{task_id}/logs", + 
headers=HEADERS, + params={"limit": 100}, + timeout=5, + ) + if r.status_code == 200: + entries = r.json().get("entries") or r.json() + print(f"[e2e] === task logs for {task_id} ===", file=sys.stderr) + messages: list[str] = [] + for entry in entries if isinstance(entries, list) else []: + print(f" {entry}", file=sys.stderr) + msg = ( + entry.get("event", {}).get("message", "") + if isinstance(entry, dict) + else str(entry) + ) + if msg: + messages.append(msg) + return " ".join(messages) + else: + print( + f"[e2e] (could not fetch task logs: {r.status_code})", + file=sys.stderr, + ) + except Exception as exc: + print(f"[e2e] (error fetching task logs: {exc})", file=sys.stderr) + return "" + + +def _poll_task(task_id: str) -> dict[str, Any]: + deadline = time.time() + TIMEOUT + last_status = None + while time.time() < deadline: + r = requests.get( + f"{HOST_URL}/api/v1/tasks/{task_id}", + headers=HEADERS, + timeout=5, + ) + if r.status_code != 200: + print( + f"[e2e] WARNING: GET task returned {r.status_code}", + file=sys.stderr, + ) + time.sleep(POLL_INTERVAL) + continue + + task: dict[str, Any] = r.json() + status = task.get("status") + if status != last_status: + print(f"[e2e] Task {task_id}: {last_status} -> {status}") + last_status = status + + if status == "DONE": + return task + if status == "FAILED": + error = task.get("error") or "" + log_text = _dump_task_logs(task_id) + if _EXECUTOR_UNAVAILABLE_RE.search(error): + pytest.skip( + f"[e2e] Executor not available on this worker: {error}" + ) + # max_attempts_exceeded means the host retried until giving up. + # Inspect logs for the root cause; skip if the executor was + # unavailable (e.g. Docker socket missing for SSH executor). 
+ if error == "max_attempts_exceeded" and _EXECUTOR_UNAVAILABLE_RE.search( + log_text + ): + pytest.skip( + f"[e2e] Executor not available (retries exhausted): " + f"{log_text[:300]}" + ) + pytest.fail(f"[e2e] Task FAILED: {error}") + + time.sleep(POLL_INTERVAL) + + pytest.fail( + f"[e2e] Task {task_id} did not complete within {TIMEOUT}s" + f" (last status: {last_status})" + ) + + +def _assert_result(task: dict[str, Any]) -> None: + task_id: str = task["task_id"] + + assert task.get("status") == "DONE", f"Expected DONE, got {task.get('status')}" + + # Check the results endpoint — executor should have written responses.json + r = requests.get( + f"{HOST_URL}/api/v1/results/{task_id}", + headers=HEADERS, + timeout=5, + ) + if r.status_code == 200: + result: dict[str, Any] = r.json() + print(f"[e2e] Result OK: status={result.get('status')} task_id={task_id}") + if result.get("payload"): + print(f"[e2e] Executor output: {str(result['payload'])[:200]}") + elif r.status_code == 404: + # Echo tasks may not write a result file — DONE is sufficient + print(f"[e2e] No result record for {task_id} — DONE is sufficient") + else: + print( + f"[e2e] WARNING: results endpoint returned {r.status_code}", + file=sys.stderr, + ) + + +def test_workflow_runs_to_done() -> None: + """Submit a workflow and verify it reaches DONE status.""" + print(f"[e2e] FlowMesh E2E smoke test -> {HOST_URL}") + print(f"[e2e] Task YAML: {TASK_YAML}") + _wait_for_host() + _, task_id = _submit_workflow() + task = _poll_task(task_id) + _assert_result(task) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v", "-s", *sys.argv[1:]])) From f3325f353c4c5c88e94fc59b13fe6148ed760860 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 19:11:20 +0800 Subject: [PATCH 12/17] fix: use host bind-mount for worker results to avoid _VolumeInitializer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 
server's _VolumeInitializer runs busybox:1.36.1 to chown the named Docker volume to UID 10001, but if busybox isn't cached it fails silently and marks the volume as initialized anyway — so all subsequent workers also get PermissionError writing to /var/lib/flowmesh-results. Fix: set RESULTS_DIR to an absolute host path. The docker adapter skips _VolumeInitializer for absolute paths (see _ensure_volume_access). Workers receive a bind-mount of a pre-created host dir with chmod 777, which UID 10001 (appuser) can write to without any chown step. - ci.compose.yml: RESULTS_DIR=/tmp/flowmesh-ci-results - ci.yml: mkdir + chmod 777 before 'docker compose up' in both jobs, rm -rf in teardown - run_local.sh: per-PID dir /tmp/flowmesh-ci-results-$PROJECT, overridden in compose overlay; cleaned up in teardown Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 12 ++++++++++++ docker/ci.compose.yml | 8 ++++++++ scripts/ci/run_local.sh | 32 +++++++++++++++++++++----------- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8bb9122..f5441eb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,6 +39,11 @@ jobs: df -h / docker system df + - name: Create worker results directory + run: | + mkdir -p /tmp/flowmesh-ci-results + chmod 777 /tmp/flowmesh-ci-results + - name: Build worker image run: | DOCKER_BUILDKIT=1 docker build \ @@ -161,6 +166,7 @@ jobs: docker rm -f ci-worker-cpu 2>/dev/null || true docker compose -p "$PROJECT" -f docker/ci.compose.yml down -v --remove-orphans docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true + rm -rf /tmp/flowmesh-ci-results 2>/dev/null || true docker image prune -f docker volume prune -f echo "=== Disk after teardown ===" @@ -197,6 +203,11 @@ jobs: df -h / docker system df + - name: Create worker results directory + run: | + mkdir -p /tmp/flowmesh-ci-results + chmod 777 /tmp/flowmesh-ci-results + - name: Build GPU 
worker builder image (cached by content hash) run: | BUILDER_HASH=$(cat \ @@ -446,6 +457,7 @@ jobs: | grep "^flowmesh-builder:" \ | grep -v ":${CURRENT_HASH}$" \ | xargs -r docker rmi 2>/dev/null || true + rm -rf /tmp/flowmesh-ci-results 2>/dev/null || true docker image prune -f docker volume prune -f echo "=== Disk after teardown ===" diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml index 448ca31..539ce1f 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -8,6 +8,10 @@ # They connect to gRPC at localhost:50051 and HTTP at http://localhost:8000. # Ports 8000 and 50051 MUST therefore be bound on the Docker host machine. # +# RESULTS_DIR is set to an absolute host path so workers can write results +# without relying on the _VolumeInitializer busybox chown mechanism. +# Caller must create /tmp/flowmesh-ci-results with chmod 777 before 'up'. +# # NOTE: No ports are exposed in this base file. Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml # - Fixed local (run_local.sh): generated at runtime @@ -59,6 +63,10 @@ services: FLOWMESH_REGISTRY: "ci" FLOWMESH_VERSION: "latest" WORKER_CONFIG_PATH: "/etc/flowmesh/worker_config.yaml" + # Absolute host path for worker results (chmod 777 before 'up'). + # Using an absolute path bypasses the _VolumeInitializer busybox chown + # so workers (UID 10001) can write without depending on image pulls. + RESULTS_DIR: "/tmp/flowmesh-ci-results" # Pass HuggingFace token through so workers can download gated models. 
HF_TOKEN: volumes: diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index bef117d..841409d 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -8,7 +8,7 @@ # - Server HTTP port is fixed at 8000 (workers need a known address) # - gRPC port 50051 is fixed (workers cannot follow a dynamic port) # - Worker container name is scoped to the process PID -# - Each run gets its own Docker network via compose project name +# - Each run gets its own Docker network and results directory # # IMPORTANT: Ports 8000 and 50051 must be free on your machine. # Workers are spawned with network_mode: host and connect to these @@ -48,6 +48,7 @@ WORKER_IMAGE_GPU="ci/flowmesh_worker:latest-gpu" WORKER_NAME="" _WORKER_CFG="" _COMPOSE_OVERRIDE="" +_RESULTS_DIR="" HOST_URL="http://localhost:8000" # ── Argument parsing ─────────────────────────────────────────────────────────── @@ -116,6 +117,7 @@ _teardown() { docker image prune -f >/dev/null docker volume prune -f >/dev/null rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true + rm -rf "${_RESULTS_DIR:-}" 2>/dev/null || true if [[ $code -eq 0 ]]; then ok "Local CI run PASSED" @@ -174,14 +176,21 @@ workers: EOF fi -# Compose override: both ports are fixed so workers (network_mode: host) -# can reach localhost:8000 (HTTP) and localhost:50051 (gRPC). -# FLOWMESH_BASE_URL in ci.compose.yml is http://localhost:8000 — this -# must match the HTTP port binding below. +# Per-run results dir: absolute host path so workers (UID 10001) can write +# without depending on _VolumeInitializer / busybox chown. +_RESULTS_DIR="/tmp/flowmesh-ci-results-${PROJECT}" +mkdir -p "$_RESULTS_DIR" +chmod 777 "$_RESULTS_DIR" + +# Compose override: fixed ports + per-run RESULTS_DIR override. +# RESULTS_DIR in ci.compose.yml defaults to /tmp/flowmesh-ci-results (CI); +# here we use a PID-scoped path so parallel local runs don't collide. 
_COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" < Date: Mon, 4 May 2026 00:45:24 +0800 Subject: [PATCH 13/17] fix: persist HF model cache across CI runs via host bind-mount docker volume prune -f (in pre-clean) deleted the named volume flowmesh_server_hf_cache between runs, forcing TinyLlama to be re-downloaded every time (~50s) and causing the 300s vLLM test to time out by a few seconds. Fix: set HF_CACHE_DIR to the host's ~/.cache/huggingface so workers receive a bind mount of an absolute path. _ensure_volume_access skips _VolumeInitializer for absolute paths; models downloaded on the first run persist for every subsequent run on the same machine. - ci.compose.yml: pass HF_CACHE_DIR through from compose env - run_local.sh: resolve _HF_CACHE_DIR (host ~/.cache/huggingface), mkdir+chmod 777, inject into compose override - ci.yml: set HF_CACHE_DIR=$HOME/.cache/huggingface in project-name step; mkdir+chmod 777 in setup step; pass to docker compose env Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 28 ++++++++++++++++------------ docker/ci.compose.yml | 10 ++++++++++ scripts/ci/run_local.sh | 16 +++++++++++++--- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f5441eb..8e9692c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,9 @@ jobs: uses: actions/checkout@v6 - name: Set project name - run: echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" + run: | + echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" + echo "HF_CACHE_DIR=$HOME/.cache/huggingface" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -39,10 +41,12 @@ jobs: df -h / docker system df - - name: Create worker results directory + - name: Create CI directories run: | mkdir -p /tmp/flowmesh-ci-results chmod 777 /tmp/flowmesh-ci-results + mkdir -p "$HOME/.cache/huggingface" + 
chmod 777 "$HOME/.cache/huggingface" - name: Build worker image run: | @@ -60,6 +64,7 @@ jobs: env: DOCKER_BUILDKIT: "1" COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} + HF_CACHE_DIR: ${{ env.HF_CACHE_DIR }} - name: Wait for server to be healthy run: | @@ -132,10 +137,7 @@ jobs: echo "" echo "=== Result files written by worker ===" - docker run --rm \ - --volumes-from ci-worker-cpu \ - busybox find /var/lib/flowmesh-results -type f 2>/dev/null \ - | head -20 || echo "(could not inspect result volume)" + ls -la /tmp/flowmesh-ci-results/ 2>/dev/null | head -20 || echo "(result dir empty or missing)" - name: Collect logs on failure if: failure() @@ -188,7 +190,9 @@ jobs: uses: actions/checkout@v6 - name: Set project name - run: echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" + run: | + echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" + echo "HF_CACHE_DIR=$HOME/.cache/huggingface" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -203,10 +207,12 @@ jobs: df -h / docker system df - - name: Create worker results directory + - name: Create CI directories run: | mkdir -p /tmp/flowmesh-ci-results chmod 777 /tmp/flowmesh-ci-results + mkdir -p "$HOME/.cache/huggingface" + chmod 777 "$HOME/.cache/huggingface" - name: Build GPU worker builder image (cached by content hash) run: | @@ -246,6 +252,7 @@ jobs: DOCKER_BUILDKIT: "1" HF_TOKEN: ${{ secrets.HF_TOKEN }} COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} + HF_CACHE_DIR: ${{ env.HF_CACHE_DIR }} - name: Wait for server to be healthy run: | @@ -410,10 +417,7 @@ jobs: echo "" echo "=== Result files written by worker ===" - docker run --rm \ - --volumes-from ci-worker-gpu \ - busybox find /var/lib/flowmesh-results -type f 2>/dev/null \ - | head -20 || echo "(could not inspect result volume)" + ls -la /tmp/flowmesh-ci-results/ 2>/dev/null | head -20 || echo "(result dir empty or missing)" - name: Collect logs on failure if: failure() diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml 
index 539ce1f..9592041 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -12,6 +12,12 @@ # without relying on the _VolumeInitializer busybox chown mechanism. # Caller must create /tmp/flowmesh-ci-results with chmod 777 before 'up'. # +# HF_CACHE_DIR should be set to the host's ~/.cache/huggingface so that +# model weights survive 'docker volume prune' between runs. When set, +# _mount_hf_cache uses a bind mount (absolute path bypasses +# _VolumeInitializer). If unset the adapter falls back to the named +# volume flowmesh_server_hf_cache (fine for one-off runs). +# # NOTE: No ports are exposed in this base file. Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml # - Fixed local (run_local.sh): generated at runtime @@ -67,6 +73,10 @@ services: # Using an absolute path bypasses the _VolumeInitializer busybox chown # so workers (UID 10001) can write without depending on image pulls. RESULTS_DIR: "/tmp/flowmesh-ci-results" + # Host HF cache dir — pass from compose runtime env so model weights + # persist across runs and survive 'docker volume prune'. + # Unset → adapter falls back to named volume flowmesh_server_hf_cache. + HF_CACHE_DIR: # Pass HuggingFace token through so workers can download gated models. HF_TOKEN: volumes: diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index 841409d..a27b8f4 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -49,6 +49,7 @@ WORKER_NAME="" _WORKER_CFG="" _COMPOSE_OVERRIDE="" _RESULTS_DIR="" +_HF_CACHE_DIR="" HOST_URL="http://localhost:8000" # ── Argument parsing ─────────────────────────────────────────────────────────── @@ -118,6 +119,7 @@ _teardown() { docker volume prune -f >/dev/null rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true rm -rf "${_RESULTS_DIR:-}" 2>/dev/null || true + # _HF_CACHE_DIR is a persistent host path — intentionally NOT deleted. 
if [[ $code -eq 0 ]]; then ok "Local CI run PASSED" @@ -182,15 +184,22 @@ _RESULTS_DIR="/tmp/flowmesh-ci-results-${PROJECT}" mkdir -p "$_RESULTS_DIR" chmod 777 "$_RESULTS_DIR" -# Compose override: fixed ports + per-run RESULTS_DIR override. -# RESULTS_DIR in ci.compose.yml defaults to /tmp/flowmesh-ci-results (CI); -# here we use a PID-scoped path so parallel local runs don't collide. +# HF model cache: bind-mount the host path so downloaded model weights survive +# 'docker volume prune' between runs. The server passes HF_CACHE_DIR to each +# spawned worker; _mount_hf_cache uses a bind-mount for absolute paths +# (bypasses _VolumeInitializer). Falls back to named volume if unset. +_HF_CACHE_DIR="${HF_CACHE_DIR:-${HOME}/.cache/huggingface}" +mkdir -p "$_HF_CACHE_DIR" +chmod 777 "$_HF_CACHE_DIR" + +# Compose override: fixed ports + per-run RESULTS_DIR + persistent HF cache. _COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" < Date: Mon, 4 May 2026 01:07:24 +0800 Subject: [PATCH 14/17] fix: revert HF cache to named volume (same as FlowMesh_dev), bump vLLM timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HF_CACHE_DIR bind-mount was reverted — using the named Docker volume flowmesh_server_hf_cache (identical to FlowMesh_dev) avoids accumulating model weights on the host between CI runs; docker volume prune cleans it up. The timeout issue is fixed by bumping the GPU E2E timeout: cold-start (model download ~50s + load ~53s + compile ~17s + CUDA graphs) takes ~250s, leaving only ~50s for inference at the old 300s limit. 
- run_local.sh: GPU default timeout 300 → 600s - ci.yml: inference_vllm_tiny E2E_TIMEOUT_SEC 300 → 600s Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 20 +++++--------------- docker/ci.compose.yml | 10 ---------- scripts/ci/run_local.sh | 20 +++++--------------- 3 files changed, 10 insertions(+), 40 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8e9692c..f2e8a01 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,9 +24,7 @@ jobs: uses: actions/checkout@v6 - name: Set project name - run: | - echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" - echo "HF_CACHE_DIR=$HOME/.cache/huggingface" >> "$GITHUB_ENV" + run: echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -41,12 +39,10 @@ jobs: df -h / docker system df - - name: Create CI directories + - name: Create worker results directory run: | mkdir -p /tmp/flowmesh-ci-results chmod 777 /tmp/flowmesh-ci-results - mkdir -p "$HOME/.cache/huggingface" - chmod 777 "$HOME/.cache/huggingface" - name: Build worker image run: | @@ -64,7 +60,6 @@ jobs: env: DOCKER_BUILDKIT: "1" COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} - HF_CACHE_DIR: ${{ env.HF_CACHE_DIR }} - name: Wait for server to be healthy run: | @@ -190,9 +185,7 @@ jobs: uses: actions/checkout@v6 - name: Set project name - run: | - echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" - echo "HF_CACHE_DIR=$HOME/.cache/huggingface" >> "$GITHUB_ENV" + run: echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -207,12 +200,10 @@ jobs: df -h / docker system df - - name: Create CI directories + - name: Create worker results directory run: | mkdir -p /tmp/flowmesh-ci-results chmod 777 /tmp/flowmesh-ci-results - mkdir -p "$HOME/.cache/huggingface" - chmod 777 "$HOME/.cache/huggingface" - name: Build GPU worker builder image (cached 
by content hash) run: | @@ -252,7 +243,6 @@ jobs: DOCKER_BUILDKIT: "1" HF_TOKEN: ${{ secrets.HF_TOKEN }} COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} - HF_CACHE_DIR: ${{ env.HF_CACHE_DIR }} - name: Wait for server to be healthy run: | @@ -290,7 +280,7 @@ jobs: -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_vllm_tiny.yaml" \ - -e E2E_TIMEOUT_SEC="300" \ + -e E2E_TIMEOUT_SEC="600" \ -v "${{ github.workspace }}/tests:/tests:ro" \ -v "${{ github.workspace }}/templates:/templates:ro" \ python:3.11-slim \ diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml index 9592041..539ce1f 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -12,12 +12,6 @@ # without relying on the _VolumeInitializer busybox chown mechanism. # Caller must create /tmp/flowmesh-ci-results with chmod 777 before 'up'. # -# HF_CACHE_DIR should be set to the host's ~/.cache/huggingface so that -# model weights survive 'docker volume prune' between runs. When set, -# _mount_hf_cache uses a bind mount (absolute path bypasses -# _VolumeInitializer). If unset the adapter falls back to the named -# volume flowmesh_server_hf_cache (fine for one-off runs). -# # NOTE: No ports are exposed in this base file. Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml # - Fixed local (run_local.sh): generated at runtime @@ -73,10 +67,6 @@ services: # Using an absolute path bypasses the _VolumeInitializer busybox chown # so workers (UID 10001) can write without depending on image pulls. RESULTS_DIR: "/tmp/flowmesh-ci-results" - # Host HF cache dir — pass from compose runtime env so model weights - # persist across runs and survive 'docker volume prune'. - # Unset → adapter falls back to named volume flowmesh_server_hf_cache. - HF_CACHE_DIR: # Pass HuggingFace token through so workers can download gated models. 
HF_TOKEN: volumes: diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index a27b8f4..3d46f9a 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -20,7 +20,7 @@ # Options: # --gpu Run the GPU smoke test instead of the CPU integration test # --task-yaml PATH Override the workflow YAML submitted to the server -# --timeout SEC Override E2E wait timeout (default: 120, GPU default: 300) +# --timeout SEC Override E2E wait timeout (default: 120, GPU default: 600) # --no-clean Skip the pre-run docker prune step # --no-build Skip rebuilding the worker image (use cached) # --keep Do not tear down services after the run @@ -49,7 +49,6 @@ WORKER_NAME="" _WORKER_CFG="" _COMPOSE_OVERRIDE="" _RESULTS_DIR="" -_HF_CACHE_DIR="" HOST_URL="http://localhost:8000" # ── Argument parsing ─────────────────────────────────────────────────────────── @@ -119,7 +118,6 @@ _teardown() { docker volume prune -f >/dev/null rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true rm -rf "${_RESULTS_DIR:-}" 2>/dev/null || true - # _HF_CACHE_DIR is a persistent host path — intentionally NOT deleted. if [[ $code -eq 0 ]]; then ok "Local CI run PASSED" @@ -134,7 +132,7 @@ if $GPU; then WORKER_NAME="ci-worker-gpu-$$" WORKER_IMAGE="$WORKER_IMAGE_GPU" WORKER_DOCKERFILE="src/worker/docker/Dockerfile.cuda" - [[ -z "$TIMEOUT" ]] && TIMEOUT=300 + [[ -z "$TIMEOUT" ]] && TIMEOUT=600 if [[ -n "$TASK_YAML" ]]; then GPU_TASK_YAMLS=("$TASK_YAML") else @@ -184,22 +182,15 @@ _RESULTS_DIR="/tmp/flowmesh-ci-results-${PROJECT}" mkdir -p "$_RESULTS_DIR" chmod 777 "$_RESULTS_DIR" -# HF model cache: bind-mount the host path so downloaded model weights survive -# 'docker volume prune' between runs. The server passes HF_CACHE_DIR to each -# spawned worker; _mount_hf_cache uses a bind-mount for absolute paths -# (bypasses _VolumeInitializer). Falls back to named volume if unset. 
-_HF_CACHE_DIR="${HF_CACHE_DIR:-${HOME}/.cache/huggingface}" -mkdir -p "$_HF_CACHE_DIR" -chmod 777 "$_HF_CACHE_DIR" - -# Compose override: fixed ports + per-run RESULTS_DIR + persistent HF cache. +# Compose override: fixed ports + per-run RESULTS_DIR override. +# RESULTS_DIR in ci.compose.yml defaults to /tmp/flowmesh-ci-results (CI); +# here we use a PID-scoped path so parallel local runs don't collide. _COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" < Date: Sun, 3 May 2026 17:22:39 +0000 Subject: [PATCH 15/17] fix: replace gated llama model with open Qwen model in dag_inference.json meta-llama/Llama-3.2-1B-Instruct requires HF_TOKEN; use the non-gated Qwen/Qwen2.5-0.5B-Instruct instead, matching FlowMesh_dev's fix. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- templates/n8n/dag_inference.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/templates/n8n/dag_inference.json b/templates/n8n/dag_inference.json index cd6f75b..05faf83 100644 --- a/templates/n8n/dag_inference.json +++ b/templates/n8n/dag_inference.json @@ -17,7 +17,7 @@ }, { "parameters": { - "model": "meta-llama/Llama-3.2-1B-Instruct", + "model": "Qwen/Qwen2.5-0.5B-Instruct", "options": { "maxTokens": 128, "temperature": 1, @@ -136,7 +136,7 @@ }, { "parameters": { - "model": "meta-llama/Llama-3.2-1B-Instruct", + "model": "Qwen/Qwen2.5-0.5B-Instruct", "options": { "maxTokens": 128, "temperature": 1, @@ -236,7 +236,7 @@ }, { "parameters": { - "model": "meta-llama/Llama-3.2-1B-Instruct", + "model": "Qwen/Qwen2.5-0.5B-Instruct", "options": { "maxTokens": 128, "temperature": 1, From 344d783ac5c603b6e468e7962b4c7905f0bb474b Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 12:12:52 +0000 Subject: [PATCH 16/17] fix: harden ci.yml for zizmor pedantic audit - Pin actions/checkout and actions/upload-artifact to commit SHAs - Add persist-credentials: false to all checkout steps - Add top-level permissions: contents: read - 
Move github.workspace and github.run_id out of run: blocks into step-level env: to eliminate template-expansion injection warnings Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 97 +++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2e8a01..ffc9a10 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,9 @@ on: branches: [main] workflow_dispatch: +permissions: + contents: read + concurrency: group: ci-${{ github.ref }} cancel-in-progress: true @@ -21,10 +24,14 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Set project name - run: echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" + env: + RUN_ID: ${{ github.run_id }} + run: echo "PROJECT=ci-${RUN_ID}-integ" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -103,6 +110,8 @@ jobs: exit 1 - name: Run E2E smoke test (echo task) + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -110,19 +119,21 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/echo_local.yaml" \ -e E2E_TIMEOUT_SEC="120" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: Verify CPU worker actually executed the task + env: + RUN_ID: ${{ github.run_id }} run: | echo "=== CPU worker logs (full) ===" - docker logs ci-worker-cpu 2>&1 | tee /tmp/worker-cpu-${{ github.run_id }}.log || true + docker logs ci-worker-cpu 2>&1 | tee "/tmp/worker-cpu-${RUN_ID}.log" || true echo "" echo "=== Execution evidence 
check ===" - LOG=/tmp/worker-cpu-${{ github.run_id }}.log + LOG="/tmp/worker-cpu-${RUN_ID}.log" if grep -qiE "executor|running task|dispatched|echo|succeeded|TASK_SUCCEEDED|done" "$LOG"; then echo "✓ Worker executed and completed the task" else @@ -136,13 +147,15 @@ jobs: - name: Collect logs on failure if: failure() + env: + RUN_ID: ${{ github.run_id }} run: | docker compose -p "$PROJECT" -f docker/ci.compose.yml logs --no-color \ - > /tmp/ci-logs-${{ github.run_id }}.txt 2>&1 || true + > "/tmp/ci-logs-${RUN_ID}.txt" 2>&1 || true - name: Upload logs on failure if: failure() - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: name: ci-logs-integ-${{ github.run_id }} path: /tmp/ci-logs-${{ github.run_id }}.txt @@ -182,10 +195,14 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Set project name - run: echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" + env: + RUN_ID: ${{ github.run_id }} + run: echo "PROJECT=ci-${RUN_ID}-gpu" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -274,6 +291,8 @@ jobs: exit 1 - name: "E2E: vLLM inference (TinyLlama-1.1B)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -281,12 +300,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_vllm_tiny.yaml" \ -e E2E_TIMEOUT_SEC="600" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: 3-node fan-in graph DAG (echo executor)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ 
-294,12 +315,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/echo_three_node_graph.yaml" \ -e E2E_TIMEOUT_SEC="120" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: parallel DAG with synthesis (vLLM, graph_template)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -307,12 +330,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/dag_inference_example.yaml" \ -e E2E_TIMEOUT_SEC="600" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: conditional task skip (echo executor)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -320,12 +345,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/conditional_echo_test.yaml" \ -e E2E_TIMEOUT_SEC="120" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: HF Transformers inference (tiny-gpt2)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -333,12 +360,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_hf_tiny.yaml" \ -e E2E_TIMEOUT_SEC="300" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - 
-v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: LoRA SFT fine-tuning (TinyLlama-1.1B, gsm8k 2%)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -346,12 +375,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/lora_sft_llama.yaml" \ -e E2E_TIMEOUT_SEC="1200" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: SSH non-interactive (python:3.12-slim container)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -359,12 +390,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/ssh_noninteractive.yaml" \ -e E2E_TIMEOUT_SEC="120" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: n8n parallel DAG inference (dag_inference.json)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -372,19 +405,21 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/n8n/dag_inference.json" \ -e E2E_TIMEOUT_SEC="600" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q 
&& pytest /tests/integration/test_e2e.py -v" - name: Verify GPU worker actually executed the task + env: + RUN_ID: ${{ github.run_id }} run: | echo "=== GPU worker logs (full) ===" - docker logs ci-worker-gpu 2>&1 | tee /tmp/worker-gpu-${{ github.run_id }}.log || true + docker logs ci-worker-gpu 2>&1 | tee "/tmp/worker-gpu-${RUN_ID}.log" || true echo "" echo "=== Execution evidence check ===" - LOG=/tmp/worker-gpu-${{ github.run_id }}.log + LOG="/tmp/worker-gpu-${RUN_ID}.log" if grep -qiE "executor|running task|dispatched|inference|model" "$LOG"; then echo "✓ Worker received and processed a task" @@ -411,15 +446,17 @@ jobs: - name: Collect logs on failure if: failure() + env: + RUN_ID: ${{ github.run_id }} run: | docker compose -p "$PROJECT" \ -f docker/ci.compose.yml \ -f docker/ci.worker.gpu.yml \ - logs --no-color > /tmp/ci-gpu-logs-${{ github.run_id }}.txt 2>&1 || true + logs --no-color > "/tmp/ci-gpu-logs-${RUN_ID}.txt" 2>&1 || true - name: Upload logs on failure if: failure() - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: name: ci-logs-gpu-${{ github.run_id }} path: /tmp/ci-gpu-logs-${{ github.run_id }}.txt From e6af60130e71722d0cc11caed48efb79fbb08d00 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 12:19:53 +0000 Subject: [PATCH 17/17] style: apply isort and black fixes Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- src/server/supervisor/adapters/docker.py | 2 +- src/server/utils/helpers.py | 3 ++- src/worker/executors/ssh_executor.py | 10 ++++++---- src/worker/executors/transformers_executor.py | 4 +++- tests/integration/test_e2e.py | 4 +--- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/server/supervisor/adapters/docker.py b/src/server/supervisor/adapters/docker.py index 1f5af9b..efe2fca 100644 --- a/src/server/supervisor/adapters/docker.py +++ b/src/server/supervisor/adapters/docker.py @@ -7,12 +7,12 @@ from enum import StrEnum from typing 
import Any -from docker import DockerClient from docker.errors import NotFound from docker.models.containers import Container from docker.types import DeviceRequest from pydantic import BaseModel, Field +from docker import DockerClient from shared.utils.docker import sanitize_container_name from ... import env diff --git a/src/server/utils/helpers.py b/src/server/utils/helpers.py index ca92caf..cfe6a30 100644 --- a/src/server/utils/helpers.py +++ b/src/server/utils/helpers.py @@ -10,10 +10,11 @@ from typing import Any import aiohttp -import docker import requests from redis.client import PubSub +import docker + _logger: logging.Logger | None = None _docker_client: docker.DockerClient | None = None diff --git a/src/worker/executors/ssh_executor.py b/src/worker/executors/ssh_executor.py index 1a088e2..cdd423b 100644 --- a/src/worker/executors/ssh_executor.py +++ b/src/worker/executors/ssh_executor.py @@ -50,19 +50,21 @@ ) try: - import docker - from docker import DockerClient from docker.models.containers import Container from docker.types import DeviceRequest + import docker + from docker import DockerClient + _HAS_DOCKER = True except Exception: _HAS_DOCKER = False if TYPE_CHECKING: - import docker - from docker import DockerClient from docker.models.containers import Container from docker.types import DeviceRequest + + import docker + from docker import DockerClient else: docker = None DockerClient = Any diff --git a/src/worker/executors/transformers_executor.py b/src/worker/executors/transformers_executor.py index 04fd084..b959a82 100644 --- a/src/worker/executors/transformers_executor.py +++ b/src/worker/executors/transformers_executor.py @@ -113,7 +113,9 @@ from transformers import PreTrainedModel, PreTrainedTokenizerBase except ImportError: try: - from transformers.modeling_utils import PreTrainedModel # type: ignore[assignment] + from transformers.modeling_utils import ( + PreTrainedModel, # type: ignore[assignment] + ) from 
transformers.tokenization_utils_base import ( # type: ignore[assignment] PreTrainedTokenizerBase, ) diff --git a/tests/integration/test_e2e.py b/tests/integration/test_e2e.py index 5c22f0d..6a1c8f8 100644 --- a/tests/integration/test_e2e.py +++ b/tests/integration/test_e2e.py @@ -167,9 +167,7 @@ def _poll_task(task_id: str) -> dict[str, Any]: error = task.get("error") or "" log_text = _dump_task_logs(task_id) if _EXECUTOR_UNAVAILABLE_RE.search(error): - pytest.skip( - f"[e2e] Executor not available on this worker: {error}" - ) + pytest.skip(f"[e2e] Executor not available on this worker: {error}") # max_attempts_exceeded means the host retried until giving up. # Inspect logs for the root cause; skip if the executor was # unavailable (e.g. Docker socket missing for SSH executor).