From 6292a2a6f3d3414eebc415676f9beec86be6e7a8 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 02:33:04 +0800 Subject: [PATCH 01/17] ci: migrate gpu-runner-setup-v2 changes from FlowMesh_dev Migrate the following changes from mlsys-io/FlowMesh_dev (ci/gpu-runner-setup-v2): - .github/workflows/unit-tests.yml: switch install to --all-extras, add cuda runner label [self-hosted, cuda], pin action SHAs with uv version 0.11.8, add permissions/concurrency blocks - src/worker/docker/Dockerfile.cpu: rename SUPERVISOR_GRPC_TARGET -> GUARDIAN_GRPC_TARGET, switch shared copy to granular (shared/__init__.py + shared/all + shared/host_worker + shared/guardian_worker), drop source/url OCI labels - src/worker/docker/Dockerfile.cuda: same GUARDIAN rename + granular shared copy, drop source/url OCI labels - src/worker/docker/Dockerfile.ssh.cpu: drop source/url OCI labels - src/worker/docker/Dockerfile.ssh.gpu: drop source/url OCI labels - src/worker/docker/README.md: rename SUPERVISOR_GRPC_TARGET -> GUARDIAN_GRPC_TARGET, update TLS section (guardian naming) Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- src/worker/docker/Dockerfile.cpu | 11 ++++++----- src/worker/docker/Dockerfile.cuda | 11 ++++++----- src/worker/docker/Dockerfile.ssh.cpu | 4 +--- src/worker/docker/Dockerfile.ssh.gpu | 4 +--- src/worker/docker/README.md | 14 +++++++------- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/worker/docker/Dockerfile.cpu b/src/worker/docker/Dockerfile.cpu index d2bb9bc..dbd4023 100644 --- a/src/worker/docker/Dockerfile.cpu +++ b/src/worker/docker/Dockerfile.cpu @@ -2,9 +2,7 @@ FROM python:3.12-slim LABEL org.opencontainers.image.title="FlowMesh Worker (CPU)" \ - org.opencontainers.image.description="CPU-only FlowMesh worker runtime" \ - org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ - org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" + 
org.opencontainers.image.description="CPU-only FlowMesh worker runtime" ARG TZ=Asia/Singapore ENV TZ=${TZ} \ @@ -38,10 +36,13 @@ RUN uv pip install --python /opt/py312/bin/python --system --requirement /tmp/re # Application code COPY src/worker ./worker -COPY src/shared ./shared +COPY src/shared/__init__.py ./shared/__init__.py +COPY src/shared/all ./shared/all +COPY src/shared/host_worker ./shared/host_worker +COPY src/shared/guardian_worker ./shared/guardian_worker # Default worker env knobs -ENV SUPERVISOR_GRPC_TARGET=flowmesh_server:50051 \ +ENV GUARDIAN_GRPC_TARGET=flowmesh_guardian:50051 \ RESULTS_DIR=/app/worker/results \ LOG_LEVEL=INFO \ HEARTBEAT_INTERVAL_SEC=30 diff --git a/src/worker/docker/Dockerfile.cuda b/src/worker/docker/Dockerfile.cuda index 42a0e24..aff0ad1 100644 --- a/src/worker/docker/Dockerfile.cuda +++ b/src/worker/docker/Dockerfile.cuda @@ -9,9 +9,7 @@ FROM builder AS build_context # Runtime stage pulls only the CUDA runtime bits FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} LABEL org.opencontainers.image.title="FlowMesh Worker (CUDA)" \ - org.opencontainers.image.description="GPU-enabled FlowMesh worker runtime" \ - org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ - org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" + org.opencontainers.image.description="GPU-enabled FlowMesh worker runtime" ARG TZ ARG CUDA_VERSION ARG TORCH_CUDA_ARCH_LIST @@ -62,10 +60,13 @@ WORKDIR /app # Application code COPY src/worker ./worker -COPY src/shared ./shared +COPY src/shared/__init__.py ./shared/__init__.py +COPY src/shared/all ./shared/all +COPY src/shared/host_worker ./shared/host_worker +COPY src/shared/guardian_worker ./shared/guardian_worker # Default worker env knobs -ENV SUPERVISOR_GRPC_TARGET=flowmesh_server:50051 \ +ENV GUARDIAN_GRPC_TARGET=flowmesh_guardian:50051 \ RESULTS_DIR=/app/worker/results \ LOG_LEVEL=INFO \ HEARTBEAT_INTERVAL_SEC=30 \ diff --git a/src/worker/docker/Dockerfile.ssh.cpu 
b/src/worker/docker/Dockerfile.ssh.cpu index 3722371..059dd09 100644 --- a/src/worker/docker/Dockerfile.ssh.cpu +++ b/src/worker/docker/Dockerfile.ssh.cpu @@ -8,9 +8,7 @@ FROM debian:bookworm-slim LABEL org.opencontainers.image.title="FlowMesh SSH Session" \ - org.opencontainers.image.description="Ephemeral SSH session container for FlowMesh SSH tasks" \ - org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ - org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" + org.opencontainers.image.description="Ephemeral SSH session container for FlowMesh SSH tasks" ARG TZ=UTC ENV TZ=${TZ} \ diff --git a/src/worker/docker/Dockerfile.ssh.gpu b/src/worker/docker/Dockerfile.ssh.gpu index 33a8fe6..4632e72 100644 --- a/src/worker/docker/Dockerfile.ssh.gpu +++ b/src/worker/docker/Dockerfile.ssh.gpu @@ -11,9 +11,7 @@ ARG UBUNTU_VERSION=24.04 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION} LABEL org.opencontainers.image.title="FlowMesh SSH Session (CUDA)" \ - org.opencontainers.image.description="GPU-enabled ephemeral SSH session container for FlowMesh SSH tasks" \ - org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ - org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" + org.opencontainers.image.description="GPU-enabled ephemeral SSH session container for FlowMesh SSH tasks" ARG TZ=UTC ENV TZ=${TZ} \ diff --git a/src/worker/docker/README.md b/src/worker/docker/README.md index 1c7fc48..b892424 100644 --- a/src/worker/docker/README.md +++ b/src/worker/docker/README.md @@ -12,7 +12,7 @@ docker build -f src/worker/docker/Dockerfile.ssh.gpu -t yourrepo/flowmesh_ssh:la # Run (CPU) docker run --rm \ - -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ + -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ -e WORKER_HB_FILE="/tmp/flowmesh_worker_health/worker.hb" \ -e RESULTS_DIR=/app/results \ -v /var/run/docker.sock:/var/run/docker.sock \ @@ -21,7 +21,7 @@ docker run --rm \ # Run (GPU; host 
must have NVIDIA Container Toolkit) docker run --rm --gpus all \ - -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ + -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ -e WORKER_HB_FILE="/tmp/flowmesh_worker_health/worker.hb" \ -e RESULTS_DIR=/app/results \ -v /var/run/docker.sock:/var/run/docker.sock \ @@ -30,14 +30,14 @@ docker run --rm --gpus all \ ## TLS CA injection -If the server uses TLS, pass the internal CA via env: +If the guardian uses TLS, pass the internal CA via env: ``` -scripts/dev/generate_server_tls_certs.sh -export SERVER_GRPC_TLS_CA_B64="$(base64 -w 0 secrets/tls/server/server-ca.pem)" +scripts/dev/generate_guardian_tls_certs.sh +export GUARDIAN_GRPC_TLS_CA_B64="$(base64 -w 0 secrets/tls/guardian/guardian-ca.pem)" docker run --rm \ - -e SERVER_GRPC_TLS_CA_B64 \ - -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ + -e GUARDIAN_GRPC_TLS_CA_B64 \ + -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ yourrepo/flowmesh_worker:cpu-latest ``` From 408a4f5c5489f7702c4e17a3227fca2e5488053d Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 02:34:51 +0800 Subject: [PATCH 02/17] chore: migrate ci/gpu-runner-setup-v2 docker + shared-copy changes from FlowMesh_dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrates changes from mlsys-io/FlowMesh_dev (main) that were not yet present in FlowMesh: - Dockerfile.cuda: rename SUPERVISOR_GRPC_TARGET → GUARDIAN_GRPC_TARGET; replace broad `COPY src/shared` with granular copies of shared/__init__.py, shared/all, shared/host_worker, shared/guardian_worker; drop extra org.opencontainers.image.source/url LABEL lines - Dockerfile.ssh.cpu: drop org.opencontainers.image.source/url LABELs - Dockerfile.ssh.gpu: drop org.opencontainers.image.source/url LABELs - src/worker/docker/README.md: rename SUPERVISOR_GRPC_TARGET → GUARDIAN_GRPC_TARGET, generate_server_tls_certs.sh → 
generate_guardian_tls_certs.sh, SERVER_GRPC_TLS_CA_B64 → GUARDIAN_GRPC_TLS_CA_B64 templates/n8n/dag_inference.json and CI workflows are already in sync (identical SHAs / FlowMesh has newer hardened versions). Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> From afde4671215819450bf29883f29964194f5d0ab6 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:34:25 +0800 Subject: [PATCH 03/17] revert: restore Dockerfiles and unit-tests.yml to FlowMesh main versions Previous agent incorrectly changed SUPERVISOR_GRPC_TARGET to GUARDIAN_GRPC_TARGET and altered COPY paths/labels. This reverts those files to their correct state. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- src/worker/docker/Dockerfile.cpu | 11 +++++------ src/worker/docker/Dockerfile.ssh.cpu | 4 +++- src/worker/docker/Dockerfile.ssh.gpu | 4 +++- src/worker/docker/README.md | 14 +++++++------- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/worker/docker/Dockerfile.cpu b/src/worker/docker/Dockerfile.cpu index dbd4023..d2bb9bc 100644 --- a/src/worker/docker/Dockerfile.cpu +++ b/src/worker/docker/Dockerfile.cpu @@ -2,7 +2,9 @@ FROM python:3.12-slim LABEL org.opencontainers.image.title="FlowMesh Worker (CPU)" \ - org.opencontainers.image.description="CPU-only FlowMesh worker runtime" + org.opencontainers.image.description="CPU-only FlowMesh worker runtime" \ + org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ + org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" ARG TZ=Asia/Singapore ENV TZ=${TZ} \ @@ -36,13 +38,10 @@ RUN uv pip install --python /opt/py312/bin/python --system --requirement /tmp/re # Application code COPY src/worker ./worker -COPY src/shared/__init__.py ./shared/__init__.py -COPY src/shared/all ./shared/all -COPY src/shared/host_worker ./shared/host_worker -COPY src/shared/guardian_worker ./shared/guardian_worker +COPY src/shared ./shared # Default worker env knobs 
-ENV GUARDIAN_GRPC_TARGET=flowmesh_guardian:50051 \ +ENV SUPERVISOR_GRPC_TARGET=flowmesh_server:50051 \ RESULTS_DIR=/app/worker/results \ LOG_LEVEL=INFO \ HEARTBEAT_INTERVAL_SEC=30 diff --git a/src/worker/docker/Dockerfile.ssh.cpu b/src/worker/docker/Dockerfile.ssh.cpu index 059dd09..3722371 100644 --- a/src/worker/docker/Dockerfile.ssh.cpu +++ b/src/worker/docker/Dockerfile.ssh.cpu @@ -8,7 +8,9 @@ FROM debian:bookworm-slim LABEL org.opencontainers.image.title="FlowMesh SSH Session" \ - org.opencontainers.image.description="Ephemeral SSH session container for FlowMesh SSH tasks" + org.opencontainers.image.description="Ephemeral SSH session container for FlowMesh SSH tasks" \ + org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ + org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" ARG TZ=UTC ENV TZ=${TZ} \ diff --git a/src/worker/docker/Dockerfile.ssh.gpu b/src/worker/docker/Dockerfile.ssh.gpu index 4632e72..33a8fe6 100644 --- a/src/worker/docker/Dockerfile.ssh.gpu +++ b/src/worker/docker/Dockerfile.ssh.gpu @@ -11,7 +11,9 @@ ARG UBUNTU_VERSION=24.04 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu${UBUNTU_VERSION} LABEL org.opencontainers.image.title="FlowMesh SSH Session (CUDA)" \ - org.opencontainers.image.description="GPU-enabled ephemeral SSH session container for FlowMesh SSH tasks" + org.opencontainers.image.description="GPU-enabled ephemeral SSH session container for FlowMesh SSH tasks" \ + org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ + org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" ARG TZ=UTC ENV TZ=${TZ} \ diff --git a/src/worker/docker/README.md b/src/worker/docker/README.md index b892424..1c7fc48 100644 --- a/src/worker/docker/README.md +++ b/src/worker/docker/README.md @@ -12,7 +12,7 @@ docker build -f src/worker/docker/Dockerfile.ssh.gpu -t yourrepo/flowmesh_ssh:la # Run (CPU) docker run --rm \ - -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ + -e 
SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ -e WORKER_HB_FILE="/tmp/flowmesh_worker_health/worker.hb" \ -e RESULTS_DIR=/app/results \ -v /var/run/docker.sock:/var/run/docker.sock \ @@ -21,7 +21,7 @@ docker run --rm \ # Run (GPU; host must have NVIDIA Container Toolkit) docker run --rm --gpus all \ - -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ + -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ -e WORKER_HB_FILE="/tmp/flowmesh_worker_health/worker.hb" \ -e RESULTS_DIR=/app/results \ -v /var/run/docker.sock:/var/run/docker.sock \ @@ -30,14 +30,14 @@ docker run --rm --gpus all \ ## TLS CA injection -If the guardian uses TLS, pass the internal CA via env: +If the server uses TLS, pass the internal CA via env: ``` -scripts/dev/generate_guardian_tls_certs.sh -export GUARDIAN_GRPC_TLS_CA_B64="$(base64 -w 0 secrets/tls/guardian/guardian-ca.pem)" +scripts/dev/generate_server_tls_certs.sh +export SERVER_GRPC_TLS_CA_B64="$(base64 -w 0 secrets/tls/server/server-ca.pem)" docker run --rm \ - -e GUARDIAN_GRPC_TLS_CA_B64 \ - -e GUARDIAN_GRPC_TARGET="host.docker.internal:50051" \ + -e SERVER_GRPC_TLS_CA_B64 \ + -e SUPERVISOR_GRPC_TARGET="host.docker.internal:50051" \ yourrepo/flowmesh_worker:cpu-latest ``` From 0b38e8c9743c277131611b25bb6879e1227b0b03 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:36:16 +0800 Subject: [PATCH 04/17] feat: add GPU requirements install + HF import error capture Dockerfile.cuda: install requirements.gpu.txt in addition to requirements.txt, and add build-time verification that torch/transformers are importable. transformers_executor.py: capture import error message in _HF_IMPORT_ERROR, split PreTrainedModel into a separate fallback import block, add _require_transformers() helper called from both prepare() and run(). 
Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- src/worker/docker/Dockerfile.cuda | 18 +++++--- src/worker/executors/transformers_executor.py | 44 ++++++++++++++----- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/src/worker/docker/Dockerfile.cuda b/src/worker/docker/Dockerfile.cuda index aff0ad1..9974f05 100644 --- a/src/worker/docker/Dockerfile.cuda +++ b/src/worker/docker/Dockerfile.cuda @@ -9,7 +9,9 @@ FROM builder AS build_context # Runtime stage pulls only the CUDA runtime bits FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} LABEL org.opencontainers.image.title="FlowMesh Worker (CUDA)" \ - org.opencontainers.image.description="GPU-enabled FlowMesh worker runtime" + org.opencontainers.image.description="GPU-enabled FlowMesh worker runtime" \ + org.opencontainers.image.source="https://github.com/mlsys-io/FlowMesh" \ + org.opencontainers.image.url="https://github.com/mlsys-io/FlowMesh" ARG TZ ARG CUDA_VERSION ARG TORCH_CUDA_ARCH_LIST @@ -48,8 +50,13 @@ ENV PATH=/opt/py312/bin:$PATH # Install Python dependencies (CPU + GPU stacks) COPY src/worker/requirements/requirements.txt /tmp/requirements.txt +COPY src/worker/requirements/requirements.gpu.txt /tmp/requirements.gpu.txt RUN uv pip install --python /opt/py312/bin/python --system --requirement /tmp/requirements.txt \ - && rm -f /tmp/requirements.txt + && uv pip install --python /opt/py312/bin/python --system --requirement /tmp/requirements.gpu.txt \ + && rm -f /tmp/requirements.txt /tmp/requirements.gpu.txt + +# Verify GPU dependencies are importable at build time +RUN python -c "import torch; from transformers import AutoModelForCausalLM; print('torch:', torch.__version__, 'cuda:', torch.cuda.is_available())" # Non-root runtime user + HF cache RUN useradd -m -u 10001 appuser \ @@ -60,13 +67,10 @@ WORKDIR /app # Application code COPY src/worker ./worker -COPY src/shared/__init__.py ./shared/__init__.py -COPY src/shared/all ./shared/all -COPY src/shared/host_worker 
./shared/host_worker -COPY src/shared/guardian_worker ./shared/guardian_worker +COPY src/shared ./shared # Default worker env knobs -ENV GUARDIAN_GRPC_TARGET=flowmesh_guardian:50051 \ +ENV SUPERVISOR_GRPC_TARGET=flowmesh_server:50051 \ RESULTS_DIR=/app/worker/results \ LOG_LEVEL=INFO \ HEARTBEAT_INTERVAL_SEC=30 \ diff --git a/src/worker/executors/transformers_executor.py b/src/worker/executors/transformers_executor.py index a40d1e0..04fd084 100644 --- a/src/worker/executors/transformers_executor.py +++ b/src/worker/executors/transformers_executor.py @@ -69,6 +69,7 @@ from .mixins.inference import InferenceMixin from .utils.checkpoints import artifact_ref, maybe_upload_artifacts +_HF_IMPORT_ERROR: str = "" try: import torch from transformers import ( @@ -78,12 +79,11 @@ AutoModelForImageTextToText, AutoTokenizer, GenerationConfig, - PreTrainedModel, - PreTrainedTokenizerBase, ) _HAS_TRANSFORMERS = True -except Exception: +except Exception as _exc: + _HF_IMPORT_ERROR = f"{type(_exc).__name__}: {_exc}" if TYPE_CHECKING: import torch from transformers import ( @@ -93,8 +93,6 @@ AutoModelForImageTextToText, AutoTokenizer, GenerationConfig, - PreTrainedModel, - PreTrainedTokenizerBase, ) else: torch = None @@ -104,11 +102,28 @@ AutoModelForCausalLM = None AutoTokenizer = None GenerationConfig = None - PreTrainedModel = None - PreTrainedTokenizerBase = None _HAS_TRANSFORMERS = False +# PreTrainedModel and PreTrainedTokenizerBase are used only as type annotations. +# Some installations (e.g. when vllm pins an older/patched transformers) don't +# re-export them from transformers.__init__; import from their source modules as +# a fallback so a missing top-level export doesn't break the functional classes. 
+try: + from transformers import PreTrainedModel, PreTrainedTokenizerBase +except ImportError: + try: + from transformers.modeling_utils import PreTrainedModel # type: ignore[assignment] + from transformers.tokenization_utils_base import ( # type: ignore[assignment] + PreTrainedTokenizerBase, + ) + except ImportError: + if TYPE_CHECKING: + from transformers import PreTrainedModel, PreTrainedTokenizerBase + else: + PreTrainedModel = None # type: ignore[assignment,misc] + PreTrainedTokenizerBase = None # type: ignore[assignment,misc] + logger = logging.getLogger(__name__) @@ -136,12 +151,17 @@ def __init__( # ------------------------------------------------------------------ # # Lifecycle # ------------------------------------------------------------------ # - def prepare(self) -> None: # type: ignore[override] - if not _HAS_TRANSFORMERS: + def _require_transformers(self) -> None: + """Raise ExecutionError with the original import traceback if unavailable.""" + if not _HAS_TRANSFORMERS or AutoModelForCausalLM is None: + detail = f" ({_HF_IMPORT_ERROR})" if _HF_IMPORT_ERROR else "" raise ExecutionError( - "transformers/torch is not installed (`pip install transformers " - "torch`)." + f"transformers/torch not available{detail} — " + "install with: pip install transformers torch" ) + + def prepare(self) -> None: # type: ignore[override] + self._require_transformers() configure_hf_library_logging() def _pick_device(self, cfg: dict[str, Any]) -> str: @@ -384,6 +404,8 @@ def _detect_finish_reason( return None def run(self, task: ExecutorTask, out_dir: Path) -> dict[str, Any]: # type: ignore[override] + # Guard runs in the subprocess too (prepare() only runs in parent process). 
+ self._require_transformers() configure_hf_library_logging() spec = task.spec if not isinstance(spec, (InferenceSpecStrict, EmbeddingSpecStrict)): From aee6149720ca8365d67f553bc2b233c1f69006c1 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:36:55 +0800 Subject: [PATCH 05/17] feat: add Docker CI compose infrastructure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate ci.compose.yml, ci.worker.gpu.yml, ci.ports.fixed.yml, ci.worker_config.yaml, and ci.gpu_worker_config.yaml from FlowMesh_dev. Adapted: guardian service → supervisor, src/guardian/ → src/server/, /etc/guardian/ → /etc/supervisor/, env var names GUARDIAN_* → SUPERVISOR_*. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- docker/ci.compose.yml | 129 +++++++++++++++++++++++++++++++ docker/ci.gpu_worker_config.yaml | 11 +++ docker/ci.ports.fixed.yml | 21 +++++ docker/ci.worker.gpu.yml | 27 +++++++ docker/ci.worker_config.yaml | 9 +++ 5 files changed, 197 insertions(+) create mode 100644 docker/ci.compose.yml create mode 100644 docker/ci.gpu_worker_config.yaml create mode 100644 docker/ci.ports.fixed.yml create mode 100644 docker/ci.worker.gpu.yml create mode 100644 docker/ci.worker_config.yaml diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml new file mode 100644 index 0000000..0b3e47c --- /dev/null +++ b/docker/ci.compose.yml @@ -0,0 +1,129 @@ +# docker/ci.compose.yml — CI integration test stack (single-host, no GPU) +# +# Brings up a fully isolated FlowMesh environment for each CI run. +# All services live in an internal Docker network; no state persists between runs. +# +# Supervisor spawns the CPU worker via Docker (with Docker socket mounted), +# so the worker gets a proper token and can register correctly. +# +# NOTE: No ports are exposed in this base file. 
Add ports via an overlay: +# - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml +# - Dynamic (local dev, run_local.sh): generated at runtime +# +# Usage (from repo root): +# docker build -f src/worker/docker/Dockerfile.cpu -t ci/flowmesh_worker:latest-cpu . +# docker compose -p ci-$RUN_ID -f docker/ci.compose.yml -f docker/ci.ports.fixed.yml up -d --build +# docker compose -p ci-$RUN_ID -f docker/ci.compose.yml -f docker/ci.ports.fixed.yml down -v + +services: + redis_control: + image: redis:7-alpine + command: ["redis-server", "--loglevel", "warning"] + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 3s + timeout: 2s + retries: 10 + networks: [ci-net] + + redis_telemetry: + image: redis:7-alpine + command: ["redis-server", "--loglevel", "warning"] + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 3s + timeout: 2s + retries: 10 + networks: [ci-net] + + postgres: + image: postgres:18-alpine + environment: + POSTGRES_USER: flowmesh + POSTGRES_PASSWORD: flowmesh + POSTGRES_DB: flowmesh + healthcheck: + test: ["CMD-SHELL", "pg_isready -U flowmesh"] + interval: 3s + timeout: 2s + retries: 10 + networks: [ci-net] + + host: + build: + context: .. 
+ dockerfile: src/host/Dockerfile + depends_on: + redis_control: + condition: service_healthy + redis_telemetry: + condition: service_healthy + postgres: + condition: service_healthy + environment: + REDIS_URL: "redis://redis_control:6379/0" + REDIS_CONTROL_URL: "redis://redis_control:6379/0" + REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" + DATABASE_URL: "postgresql+asyncpg://flowmesh:flowmesh@postgres:5432/flowmesh" + HOST_RUN_MIGRATIONS: "true" + API_KEY_HMAC_SECRET: "ci-hmac-secret" + BOOTSTRAP_ORG_ID: "ci-org" + BOOTSTRAP_ADMIN_EXTERNAL_ID: "ci-admin" + BOOTSTRAP_ADMIN_API_KEY: "flm-ci-00000000000000000000000000000000" + ORCHESTRATOR_DISPATCH_MODE: "adaptive" + ORCHESTRATOR_WORKER_SELECTION: "first_fit" + ENABLE_ELASTIC_SCALING: "false" + LOG_LEVEL: "INFO" + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:8000/healthz"] + interval: 5s + timeout: 3s + start_period: 20s + retries: 12 + networks: [ci-net] + + supervisor: + build: + context: .. + dockerfile: src/server/Dockerfile + depends_on: + host: + condition: service_healthy + redis_control: + condition: service_healthy + redis_telemetry: + condition: service_healthy + environment: + FLOWMESH_BASE_URL: "http://host:8000" + FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" + REDIS_CONTROL_URL: "redis://redis_control:6379/0" + REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" + SUPERVISOR_NAMESPACE: "ci" + SUPERVISOR_CLUSTER: "ci-cluster" + SUPERVISOR_ALIAS: "ci-supervisor" + LOG_LEVEL: "INFO" + # Worker spawning via Docker + FLOWMESH_REGISTRY: "ci" + FLOWMESH_VERSION: "latest" + WORKER_CONFIG_PATH: "/etc/supervisor/worker_config.yaml" + WORKER_EXECUTOR_IDLE_CLEANUP_SEC: "0" + # Workers are spawned on the compose network (WORKER_DOCKER_NETWORK) so + # they must reach supervisor by service name, not localhost. + SUPERVISOR_HOST: "supervisor" + # Pass HuggingFace token through so workers can download gated models. 
+ # Set HF_TOKEN in the runner environment (or as a GitHub Actions secret). + HF_TOKEN: + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ./ci.worker_config.yaml:/etc/supervisor/worker_config.yaml:ro + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:8001/healthz"] + interval: 5s + timeout: 3s + start_period: 15s + retries: 10 + networks: [ci-net] + +networks: + ci-net: + # Isolated per-run network; named via project (-p ci-$RUN_ID) diff --git a/docker/ci.gpu_worker_config.yaml b/docker/ci.gpu_worker_config.yaml new file mode 100644 index 0000000..0d57dc3 --- /dev/null +++ b/docker/ci.gpu_worker_config.yaml @@ -0,0 +1,11 @@ +default_worker_config: + hb_interval: 30 + +workers: + - provider: docker + init_on_start: true + worker_config: + worker_alias: ci-worker-gpu + worker_type: gpu + cuda_devices: [0] + enable_ssh: true diff --git a/docker/ci.ports.fixed.yml b/docker/ci.ports.fixed.yml new file mode 100644 index 0000000..b5ba67f --- /dev/null +++ b/docker/ci.ports.fixed.yml @@ -0,0 +1,21 @@ +# docker/ci.ports.fixed.yml — Fixed host-port bindings for CI environments +# +# Include alongside ci.compose.yml when running without run_local.sh +# (e.g. GitHub Actions or a dedicated CI machine where ports 8000/50051 +# are guaranteed to be free): +# +# docker compose -p ci-$RUN_ID \ +# -f docker/ci.compose.yml \ +# -f docker/ci.ports.fixed.yml \ +# up -d --build --wait +# +# run_local.sh generates its own dynamic-port overlay instead; this file +# is not used by that script. + +services: + host: + ports: + - "8000:8000" + supervisor: + ports: + - "50051:50051" diff --git a/docker/ci.worker.gpu.yml b/docker/ci.worker.gpu.yml new file mode 100644 index 0000000..29e905c --- /dev/null +++ b/docker/ci.worker.gpu.yml @@ -0,0 +1,27 @@ +# docker/ci.worker.gpu.yml — GPU worker overlay for CI +# +# Overlay on top of ci.compose.yml for GPU runner (luyao3, RTX 5080). 
+# Supervisor spawns a GPU worker container (ci/flowmesh_worker:latest-gpu) +# using the Docker socket, same pattern as the CPU integration test. +# +# Pre-build the GPU worker image before running compose: +# docker build -f src/worker/docker/Dockerfile.cuda \ +# -t ci/flowmesh_worker:latest-gpu . +# +# Usage: +# docker compose -p ci-$RUN_ID \ +# -f docker/ci.compose.yml \ +# -f docker/ci.worker.gpu.yml \ +# up -d --build + +services: + supervisor: + environment: + WORKER_CONFIG_PATH: "/etc/supervisor/worker_config.yaml" + # Attach GPU workers to the compose network so they can resolve service + # hostnames (e.g. "host") when uploading results. COMPOSE_PROJECT_NAME + # must be exported before docker compose up (run_local.sh does this via + # the compose override; ci.yml sets it explicitly in the step env). + WORKER_DOCKER_NETWORK: "${COMPOSE_PROJECT_NAME}_ci-net" + volumes: + - ./ci.gpu_worker_config.yaml:/etc/supervisor/worker_config.yaml:ro diff --git a/docker/ci.worker_config.yaml b/docker/ci.worker_config.yaml new file mode 100644 index 0000000..b757bc2 --- /dev/null +++ b/docker/ci.worker_config.yaml @@ -0,0 +1,9 @@ +default_worker_config: + hb_interval: 30 + +workers: + - provider: docker + init_on_start: true + worker_config: + worker_alias: ci-worker-cpu + worker_type: cpu From 9567d4bdbe0fc1e1129d64cd861ccbfe4eb0d5c8 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:38:29 +0800 Subject: [PATCH 06/17] feat: add CI workflow and runner setup guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate .github/workflows/ci.yml (integration + GPU smoke jobs) and scripts/ci/setup-runner.md from FlowMesh_dev. Adapted: guardian→supervisor service names throughout; repo URL updated to mlsys-io/FlowMesh. 
Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 478 +++++++++++++++++++++++++++++++++++++ scripts/ci/setup-runner.md | 171 +++++++++++++ 2 files changed, 649 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 scripts/ci/setup-runner.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..9e5aea4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,478 @@ +name: CI — Integration & GPU Tests + +on: + push: + branches: [main] # run on every merge to main + workflow_dispatch: # also allow manual trigger from GitHub UI + +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + +env: + FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" + +jobs: + # ── Integration test (CPU, luyaomini self-hosted runners) ────────────────────── + integration: + name: Integration test (CPU) + runs-on: [self-hosted, linux, luyao3] + timeout-minutes: 20 + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set project name + run: echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" + + - name: Pre-clean stale worker containers and disk + run: | + docker rm -f ci-worker-cpu 2>/dev/null || true + # Remove the CI worker image so it always rebuilds fresh + docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true + # Remove dangling images and unused volumes from crashed/orphaned runs + docker image prune -f + docker volume prune -f + # Trim build cache: keep 5 GB of recent layers, discard the rest + docker builder prune -f --keep-storage 5gb 2>/dev/null \ + || docker builder prune -f --filter "until=72h" 2>/dev/null \ + || true + echo "=== Disk after pre-clean ===" + df -h / + docker system df + + - name: Build worker image + run: | + DOCKER_BUILDKIT=1 docker build \ + -f src/worker/docker/Dockerfile.cpu \ + -t ci/flowmesh_worker:latest-cpu \ + . 
+ + - name: Build & start services + run: | + docker compose -p "$PROJECT" -f docker/ci.compose.yml up -d --build + env: + DOCKER_BUILDKIT: "1" + + - name: Wait for host to be healthy + run: | + timeout 120 bash -c ' + until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T host curl -sf http://localhost:8000/healthz; do + echo "waiting for host…" + sleep 3 + done + ' + + - name: Wait for supervisor to be healthy + run: | + timeout 120 bash -c ' + until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T supervisor curl -sf http://localhost:8001/healthz; do + echo "waiting for supervisor…" + sleep 3 + done + ' + + - name: Debug container state + run: | + echo "=== Running containers ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml ps + echo "=== All Docker containers (incl. supervisor-spawned worker) ===" + docker ps -a --format "table {{.Names}}\t{{.Image}}\t{{.Status}}" | grep -E "NAME|worker|ci-worker" || true + echo "=== Supervisor logs ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs supervisor --tail=40 + echo "=== Worker container logs (supervisor-spawned) ===" + docker logs ci-worker-cpu 2>&1 | tail -40 || echo "(no ci-worker-cpu container found)" + + - name: Wait for worker to register + run: | + for i in $(seq 1 24); do + RESP=$(curl -sf \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ + http://localhost:8000/api/v1/workers || echo "CURL_FAILED") + echo "Attempt $i: $RESP" + if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then + echo "Worker registered!" + exit 0 + fi + sleep 5 + done + echo "=== Worker never registered. 
Final worker logs ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs worker --tail=80 + exit 1 + + - name: Run E2E smoke test (echo task) + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/echo_local.yaml" \ + -e E2E_TIMEOUT_SEC="120" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: Verify CPU worker actually executed the task + run: | + echo "=== CPU worker logs (full) ===" + docker logs ci-worker-cpu 2>&1 | tee /tmp/worker-cpu-${{ github.run_id }}.log || true + + echo "" + echo "=== Execution evidence check ===" + LOG=/tmp/worker-cpu-${{ github.run_id }}.log + if grep -qiE "executor|running task|dispatched|echo|succeeded|TASK_SUCCEEDED|done" "$LOG"; then + echo "✓ Worker executed and completed the task" + else + echo "✗ FAIL: No task execution evidence in worker logs" + exit 1 + fi + + echo "" + echo "=== Result files written by worker ===" + docker run --rm \ + --volumes-from ci-worker-cpu \ + busybox find /var/lib/flowmesh-results -type f 2>/dev/null \ + | head -20 || echo "(could not inspect result volume)" + + - name: Collect logs on failure + if: failure() + run: | + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs --no-color \ + > /tmp/ci-logs-${{ github.run_id }}.txt 2>&1 || true + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: ci-logs-integ-${{ github.run_id }} + path: /tmp/ci-logs-${{ github.run_id }}.txt + retention-days: 3 + + - name: Destroy workers via supervisor API + if: always() + run: | + docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T supervisor \ + curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + -H "Authorization: Bearer 
flm-ci-00000000000000000000000000000000" || true + sleep 5 + + - name: Teardown + if: always() + run: | + docker rm -f ci-worker-cpu 2>/dev/null || true + docker compose -p "$PROJECT" -f docker/ci.compose.yml down -v --remove-orphans + # Remove the built CI image — it will be rebuilt next run + docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true + # Clean up dangling images and unused volumes left by this run + docker image prune -f + docker volume prune -f + echo "=== Disk after teardown ===" + df -h / + docker system df + + # ── GPU smoke test (RTX 5080 self-hosted runners) ───────────────────────── + gpu-smoke: + name: GPU smoke test (RTX 5080) + needs: integration + runs-on: [self-hosted, linux, luyao3] + timeout-minutes: 90 + # One GPU job at a time per machine + concurrency: + group: gpu-rtx5080-${{ github.ref }} + cancel-in-progress: false + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set project name + run: echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" + + - name: Pre-clean stale worker containers and disk + run: | + docker rm -f ci-worker-gpu 2>/dev/null || true + # Remove old CI GPU worker image (rebuilt each run) + docker rmi ci/flowmesh_worker:latest-gpu 2>/dev/null || true + # Remove dangling images and unused volumes from crashed/orphaned runs + docker image prune -f + docker volume prune -f + # Trim build cache but keep recent layers for faster builds + docker builder prune -f --keep-storage 5gb 2>/dev/null \ + || docker builder prune -f --filter "until=72h" 2>/dev/null \ + || true + echo "=== Disk after pre-clean ===" + df -h / + docker system df + + - name: Build GPU worker builder image (cached by content hash) + run: | + # Hash Dockerfile.cuda.builder + GPU requirements so we only rebuild + # when the actual inputs change. The tagged image persists on the runner. 
+ BUILDER_HASH=$(cat \ + src/worker/docker/Dockerfile.cuda.builder \ + src/worker/requirements/requirements.gpu.txt \ + | sha256sum | cut -d' ' -f1 | head -c 12) + BUILDER_TAG="flowmesh-builder:${BUILDER_HASH}" + echo "Builder content hash: ${BUILDER_HASH}" + if docker image inspect "${BUILDER_TAG}" > /dev/null 2>&1; then + echo "Cache hit — reusing ${BUILDER_TAG}" + docker tag "${BUILDER_TAG}" builder + else + echo "Cache miss — building ${BUILDER_TAG}" + DOCKER_BUILDKIT=1 docker build \ + -f src/worker/docker/Dockerfile.cuda.builder \ + -t "${BUILDER_TAG}" \ + -t builder \ + . + fi + + - name: Build GPU worker image + run: | + DOCKER_BUILDKIT=1 docker build \ + -f src/worker/docker/Dockerfile.cuda \ + -t ci/flowmesh_worker:latest-gpu \ + . + + - name: Build & start services (with GPU worker) + run: | + docker compose -p "$PROJECT" \ + -f docker/ci.compose.yml \ + -f docker/ci.worker.gpu.yml \ + up -d --build + env: + DOCKER_BUILDKIT: "1" + HF_TOKEN: ${{ secrets.HF_TOKEN }} + COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} + + - name: Wait for host to be healthy + run: | + timeout 120 bash -c ' + until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T host curl -sf http://localhost:8000/healthz; do + echo "waiting for host…" + sleep 3 + done + ' + + - name: Wait for supervisor to be healthy + run: | + timeout 120 bash -c ' + until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T supervisor curl -sf http://localhost:8001/healthz; do + echo "waiting for supervisor…" + sleep 3 + done + ' + + - name: Wait for GPU worker to register + run: | + for i in $(seq 1 36); do + RESP=$(curl -sf \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ + http://localhost:8000/api/v1/workers || echo "CURL_FAILED") + echo "Attempt $i: $RESP" + if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then + echo "Worker registered!" 
+ exit 0 + fi + sleep 5 + done + docker logs ci-worker-gpu 2>&1 | tail -40 || true + exit 1 + + - name: "E2E: vLLM inference (TinyLlama-1.1B)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/inference_vllm_tiny.yaml" \ + -e E2E_TIMEOUT_SEC="300" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: 3-node fan-in graph DAG (echo executor)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/echo_three_node_graph.yaml" \ + -e E2E_TIMEOUT_SEC="120" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: parallel DAG with synthesis (vLLM, graph_template)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/dag_inference_example.yaml" \ + -e E2E_TIMEOUT_SEC="600" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: conditional task skip (echo executor)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/conditional_echo_test.yaml" \ + -e E2E_TIMEOUT_SEC="120" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace 
}}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: HF Transformers inference (tiny-gpt2)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/inference_hf_tiny.yaml" \ + -e E2E_TIMEOUT_SEC="300" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: LoRA SFT fine-tuning (TinyLlama-1.1B, gsm8k 2%)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/lora_sft_llama.yaml" \ + -e E2E_TIMEOUT_SEC="1200" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: SSH non-interactive (python:3.12-slim container)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/ssh_noninteractive.yaml" \ + -e E2E_TIMEOUT_SEC="120" \ + -v "${{ github.workspace }}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: "E2E: n8n parallel DAG inference (dag_inference.json)" + run: | + docker run --rm \ + --network "${PROJECT}_ci-net" \ + -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ + -e TASK_YAML="/templates/n8n/dag_inference.json" \ + -e E2E_TIMEOUT_SEC="600" \ + -v "${{ github.workspace 
}}/tests:/tests:ro" \ + -v "${{ github.workspace }}/templates:/templates:ro" \ + python:3.11-slim \ + sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" + + - name: Verify GPU worker actually executed the task + run: | + echo "=== GPU worker logs (full) ===" + docker logs ci-worker-gpu 2>&1 | tee /tmp/worker-gpu-${{ github.run_id }}.log || true + + echo "" + echo "=== Execution evidence check ===" + LOG=/tmp/worker-gpu-${{ github.run_id }}.log + + # Must have received and run a task + if grep -qiE "executor|running task|dispatched|inference|model" "$LOG"; then + echo "✓ Worker received and processed a task" + else + echo "✗ FAIL: No task execution evidence in worker logs" + exit 1 + fi + + # Must show task succeeded (not just status update) + if grep -qiE "succeeded|TASK_SUCCEEDED|done|completed" "$LOG"; then + echo "✓ Task completed successfully in worker" + else + echo "✗ FAIL: No task completion evidence in worker logs" + exit 1 + fi + + echo "" + echo "=== GPU utilization during test ===" + nvidia-smi --query-gpu=name,memory.used,memory.total,utilization.gpu \ + --format=csv,noheader,nounits 2>/dev/null || echo "(nvidia-smi not available)" + + echo "" + echo "=== Result files written by worker ===" + docker run --rm \ + --volumes-from ci-worker-gpu \ + busybox find /var/lib/flowmesh-results -type f 2>/dev/null \ + | head -20 || echo "(could not inspect result volume)" + + - name: Collect logs on failure + if: failure() + run: | + docker compose -p "$PROJECT" \ + -f docker/ci.compose.yml \ + -f docker/ci.worker.gpu.yml \ + logs --no-color > /tmp/ci-gpu-logs-${{ github.run_id }}.txt 2>&1 || true + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: ci-logs-gpu-${{ github.run_id }} + path: /tmp/ci-gpu-logs-${{ github.run_id }}.txt + retention-days: 3 + + - name: Destroy workers via supervisor API + if: always() + run: | + docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec 
-T supervisor \ + curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" || true + sleep 5 + + - name: Teardown + if: always() + run: | + docker rm -f ci-worker-gpu 2>/dev/null || true + docker compose -p "$PROJECT" \ + -f docker/ci.compose.yml \ + -f docker/ci.worker.gpu.yml \ + down -v --remove-orphans + # Remove the CI GPU worker image — rebuilt next run + docker rmi ci/flowmesh_worker:latest-gpu 2>/dev/null || true + # Remove old flowmesh-builder images (keep only the current content hash) + CURRENT_HASH=$(cat \ + src/worker/docker/Dockerfile.cuda.builder \ + src/worker/requirements/requirements.gpu.txt \ + | sha256sum | cut -d' ' -f1 | head -c 12) + docker images --format "{{.Repository}}:{{.Tag}}" \ + | grep "^flowmesh-builder:" \ + | grep -v ":${CURRENT_HASH}$" \ + | xargs -r docker rmi 2>/dev/null || true + # Clean up dangling images and unused volumes + docker image prune -f + docker volume prune -f + echo "=== Disk after teardown ===" + df -h / + docker system df diff --git a/scripts/ci/setup-runner.md b/scripts/ci/setup-runner.md new file mode 100644 index 0000000..f96fd4f --- /dev/null +++ b/scripts/ci/setup-runner.md @@ -0,0 +1,171 @@ +# FlowMesh CI — Self-Hosted Runner Setup + +This guide sets up GitHub Actions self-hosted runners on the FlowMesh GPU and CPU machines. + +## Overview + +| Machine | Role | Labels | +|---------|------|--------| +| luyao3 | Integration tests (CPU) | `self-hosted,linux,luyao3` | +| luyao3 | GPU smoke tests | `self-hosted,linux,luyao3` | + +Each machine runs one runner. Multiple runners on the same machine would cause GPU memory conflicts. 
+ +--- + +## Part 1 — Prerequisites (all machines) + +### 1.1 Create a dedicated runner user + +Run as root: + +```bash +sudo useradd -m -s /bin/bash github-runner +sudo usermod -aG docker github-runner # allow Docker without sudo +``` + +### 1.2 Install Docker + +```bash +curl -fsSL https://get.docker.com | sudo bash +sudo systemctl enable --now docker +``` + +Verify: + +```bash +docker run --rm hello-world +``` + +--- + +## Part 2 — GPU machines only (RTX 5080) + +### 2.1 Install nvidia-container-toolkit + +```bash +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list + +sudo apt-get update +sudo apt-get install -y nvidia-container-toolkit +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` + +Verify: + +```bash +docker run --rm --gpus all nvidia/cuda:12.0.0-base-ubuntu22.04 nvidia-smi +``` + +--- + +## Part 3 — Install the GitHub Actions runner + +Repeat this section on **each machine** with the appropriate labels. + +### 3.1 Get a runner registration token + +In the GitHub repo: +**Settings → Actions → Runners → New self-hosted runner** + +Copy the token shown (valid for 1 hour). 
+ +### 3.2 Download and configure the runner + +Run as `github-runner` user: + +```bash +sudo -u github-runner -i # switch to runner user + +mkdir -p ~/actions-runner && cd ~/actions-runner + +# Download latest runner (check https://github.com/actions/runner/releases for latest version) +curl -sL https://github.com/actions/runner/releases/download/v2.322.0/actions-runner-linux-x64-2.322.0.tar.gz \ + -o actions-runner.tar.gz +tar xzf actions-runner.tar.gz +rm actions-runner.tar.gz +``` + +Configure — **luyao3 (CPU + GPU)** (replace YOUR_RUNNER_TOKEN with the registration token from step 3.1): + +```bash +./config.sh \ + --url https://github.com/mlsys-io/FlowMesh \ + --token YOUR_RUNNER_TOKEN \ + --name "luyao3" \ + --labels "self-hosted,linux,luyao3" \ + --work "_work" \ + --unattended +``` + +### 3.3 Install as a systemd service + +```bash +# Still as github-runner user inside ~/actions-runner +exit # back to root or sudo user + +sudo /home/github-runner/actions-runner/svc.sh install github-runner +sudo /home/github-runner/actions-runner/svc.sh start +``` + +Verify the service is running: + +```bash +sudo /home/github-runner/actions-runner/svc.sh status +# or +sudo systemctl status actions.runner.mlsys-io-FlowMesh.*.service +``` + +--- + +## Part 4 — GitHub Secrets + +Add these in **Settings → Secrets and variables → Actions**: + +| Secret | Value | Used by | +|--------|-------|---------| +| `HF_TOKEN` | HuggingFace API token | GPU worker (model downloads) | + +The CI API key (`flm-ci-00000000000000000000000000000000`) is hardcoded in the CI compose and test script — it is a fixed test credential, not a real secret. + +--- + +## Part 5 — Verify the runner appears in GitHub + +Go to **Settings → Actions → Runners** in the repo. +Each machine should show as **Idle** within a minute of starting the service. 
+ +--- + +## Maintenance + +### View runner logs + +```bash +journalctl -u "actions.runner.*" -f +``` + +### Remove a runner (requires a fresh removal token from Settings → Actions → Runners) + +```bash +cd ~/actions-runner +sudo ./svc.sh stop +sudo ./svc.sh uninstall +./config.sh remove --token YOUR_RUNNER_TOKEN +``` + +### Disk cleanup (CI build cache accumulates over time) + +Add a cron job on each runner machine: + +```bash +# As root — weekly Docker prune +echo "0 3 * * 0 root docker system prune -f --filter until=168h" \ + > /etc/cron.d/docker-prune +``` From 6600c687ee6d70b5d671314aca1c86c8e0ab8876 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sat, 2 May 2026 23:40:45 +0800 Subject: [PATCH 07/17] feat: add local CI runner script and fix template output destinations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/ci/run_local.sh: migrate from FlowMesh_dev, adapted guardian→supervisor throughout (service exec, compose override, health checks, log references). templates: fix output.destination from http to local in conditional_echo_test.yaml and ssh_noninteractive.yaml; use dev version of echo_three_node_graph.yaml. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- scripts/ci/run_local.sh | 363 +++++++++++++++++++++++++++ templates/conditional_echo_test.yaml | 3 +- templates/echo_three_node_graph.yaml | 8 - templates/ssh_noninteractive.yaml | 3 +- 4 files changed, 367 insertions(+), 10 deletions(-) create mode 100644 scripts/ci/run_local.sh diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh new file mode 100644 index 0000000..f569dcc --- /dev/null +++ b/scripts/ci/run_local.sh @@ -0,0 +1,363 @@ +#!/usr/bin/env bash +# scripts/ci/run_local.sh — Run the full FlowMesh CI pipeline locally +# +# Mirrors the GitHub Actions CI workflow end-to-end so you can test without +# pushing to GitHub. Requires: docker, docker compose v2, uv. 
+# +# Fully isolated from any running FlowMesh services: +# - Host and supervisor ports are dynamically assigned (no fixed 8000/50051) +# - Worker container name is scoped to the process PID +# - Each run gets its own Docker network via compose project name +# +# Usage: +# ./scripts/ci/run_local.sh [OPTIONS] +# +# Options: +# --gpu Run the GPU smoke test instead of the CPU integration test +# --task-yaml PATH Override the workflow YAML submitted to the host +# --timeout SEC Override E2E wait timeout (default: 120, GPU default: 300) +# --no-clean Skip the pre-run docker prune step +# --no-build Skip rebuilding the worker image (use cached) +# --keep Do not tear down services after the run +# -h, --help Show this help + +set -euo pipefail + +# ── Paths ───────────────────────────────────────────────────────────────────────────────────── +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +DOCKER_DIR="$REPO_ROOT/docker" + +# ── Defaults ──────────────────────────────────────────────────────────────────────────────────────── +PROJECT="ci-local-$$" +API_KEY="flm-ci-00000000000000000000000000000000" +GPU=false +TASK_YAML="" +TIMEOUT="" +DO_CLEAN=true +DO_BUILD=true +DO_TEARDOWN=true + +WORKER_IMAGE_CPU="ci/flowmesh_worker:latest-cpu" +WORKER_IMAGE_GPU="ci/flowmesh_worker:latest-gpu" + +# Populated in section 0; referenced in teardown. 
+WORKER_NAME="" +_WORKER_CFG="" +_COMPOSE_OVERRIDE="" +HOST_URL="http://localhost:8000" # overwritten after dc up + +# ── Argument parsing ─────────────────────────────────────────────────────────────────────────────────────── +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu) GPU=true; shift ;; + --task-yaml) TASK_YAML="$2"; shift 2 ;; + --timeout) TIMEOUT="$2"; shift 2 ;; + --no-clean) DO_CLEAN=false; shift ;; + --no-build) DO_BUILD=false; shift ;; + --keep) DO_TEARDOWN=false; shift ;; + -h|--help) sed -n '2,23p' "$0"; exit 0 ;; + *) echo "Unknown option: $1" >&2; exit 1 ;; + esac +done + +# ── Colors ────────────────────────────────────────────────────────────────────────────────────── +if [[ -t 1 ]]; then + _B='\033[0;34m' _G='\033[0;32m' _Y='\033[1;33m' _R='\033[0;31m' _N='\033[0m' +else + _B='' _G='' _Y='' _R='' _N='' +fi +log() { echo -e "${_B}[ci]${_N} $*"; } +ok() { echo -e "${_G}[ok]${_N} $*"; } +warn() { echo -e "${_Y}[warn]${_N} $*"; } +fail() { echo -e "${_R}[FAIL]${_N} $*" >&2; } + +# ── Compose helpers ──────────────────────────────────────────────────────────────────────────────────────── +COMPOSE_FILES=(-f "$DOCKER_DIR/ci.compose.yml") +if $GPU; then + COMPOSE_FILES+=(-f "$DOCKER_DIR/ci.worker.gpu.yml") +fi + +dc() { docker compose -p "$PROJECT" "${COMPOSE_FILES[@]}" "$@"; } + +# ── Teardown (trap runs on any exit) ────────────────────────────────────────────────────────────────── +_teardown() { + local code=$? + if ! $DO_TEARDOWN; then + warn "Skipping teardown (--keep). To clean up manually:" + echo " docker compose -p $PROJECT ${COMPOSE_FILES[*]} down -v --remove-orphans" + return + fi + + log "Tearing down..." + + # Always dump service logs before removal — essential for diagnosing failures. 
+ echo + log "Supervisor logs (last 40 lines):" + dc logs supervisor --tail=40 2>/dev/null || true + echo + + if [[ -n "$WORKER_NAME" ]]; then + log "Worker logs ($WORKER_NAME):" + docker logs "$WORKER_NAME" 2>&1 | tail -60 || true + echo + fi + + # Ask supervisor to stop managed workers gracefully. + dc exec -T supervisor \ + curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + -H "Authorization: Bearer $API_KEY" 2>/dev/null || true + sleep 3 + + docker rm -f "$WORKER_NAME" 2>/dev/null || true + dc down -v --remove-orphans 2>/dev/null || true + + # Worker image is intentionally kept: the next build overwrites the tag in-place, + # so there is always exactly one cached image available for --no-build runs. + docker image prune -f >/dev/null + docker volume prune -f >/dev/null + + # Clean up isolation temp files. + rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true + + if [[ $code -eq 0 ]]; then + ok "Local CI run PASSED" + else + fail "Local CI run FAILED (exit $code)" + fi +} +trap _teardown EXIT + +# ── 0. Resolve defaults ────────────────────────────────────────────────────────────────────────────────────────────────────── +if $GPU; then + WORKER_NAME="ci-worker-gpu-$$" + WORKER_IMAGE="$WORKER_IMAGE_GPU" + WORKER_DOCKERFILE="src/worker/docker/Dockerfile.cuda" + [[ -z "$TIMEOUT" ]] && TIMEOUT=300 + # If --task-yaml was given, run only that one; otherwise run the full GPU suite. 
+ if [[ -n "$TASK_YAML" ]]; then + GPU_TASK_YAMLS=("$TASK_YAML") + else + GPU_TASK_YAMLS=( + "$REPO_ROOT/templates/inference_vllm_tiny.yaml" + "$REPO_ROOT/templates/echo_three_node_graph.yaml" + "$REPO_ROOT/templates/dag_inference_example.yaml" + "$REPO_ROOT/templates/conditional_echo_test.yaml" + "$REPO_ROOT/templates/inference_hf_tiny.yaml" + "$REPO_ROOT/templates/lora_sft_llama.yaml" + "$REPO_ROOT/templates/ssh_noninteractive.yaml" + "$REPO_ROOT/templates/n8n/dag_inference.json" + ) + fi +else + WORKER_NAME="ci-worker-cpu-$$" + WORKER_IMAGE="$WORKER_IMAGE_CPU" + WORKER_DOCKERFILE="src/worker/docker/Dockerfile.cpu" + [[ -z "$TASK_YAML" ]] && TASK_YAML="$REPO_ROOT/templates/echo_local.yaml" + [[ -z "$TIMEOUT" ]] && TIMEOUT=120 +fi + +cd "$REPO_ROOT" + +# ── 0b. Create isolation artifacts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +# Worker config: project-scoped alias prevents container name clashes when a +# second local CI run or a dev worker with the same name is already running. +_WORKER_CFG="$(mktemp /tmp/ci-worker-cfg-XXXXXX.yml)" +if $GPU; then + sed "s/ci-worker-gpu/$WORKER_NAME/g" \ + "$DOCKER_DIR/ci.gpu_worker_config.yaml" > "$_WORKER_CFG" +else + cat > "$_WORKER_CFG" < "$_COMPOSE_OVERRIDE" </dev/null || true + # Tear down any stale ci-local-* compose stacks (e.g. from a disconnected SSH session). + docker ps -a --format '{{.Labels}}' \ + | grep -oP 'com\.docker\.compose\.project=ci-local-\d+' \ + | sort -u \ + | sed 's/com\.docker\.compose\.project=//' \ + | xargs -r -I{} docker compose -p {} -f "$DOCKER_DIR/ci.compose.yml" down -v --remove-orphans 2>/dev/null || true + docker image prune -f >/dev/null + docker volume prune -f >/dev/null + docker builder prune -f --keep-storage 5gb 2>/dev/null \ + || docker builder prune -f --filter "until=72h" 2>/dev/null \ + || true +fi + +# ── 2. 
Build worker image ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +if $DO_BUILD; then + log "Building worker image ($WORKER_IMAGE)..." + DOCKER_BUILDKIT=1 docker build \ + -f "$WORKER_DOCKERFILE" \ + -t "$WORKER_IMAGE" \ + . + ok "Worker image built" +else + if ! docker image inspect "$WORKER_IMAGE" >/dev/null 2>&1; then + fail "--no-build specified but image '$WORKER_IMAGE' not found locally." + fail "Run without --no-build first, or: docker build -f $WORKER_DOCKERFILE -t $WORKER_IMAGE ." + exit 1 + fi + log "Using cached worker image: $WORKER_IMAGE" +fi + +# ── 3. Build & start services ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── +# --wait blocks until every healthcheck passes. +log "Starting services (redis × 2, postgres, host, supervisor)..." +if ! DOCKER_BUILDKIT=1 dc up -d --build --wait; then + fail "Services failed to start — supervisor logs:" + dc logs supervisor --tail=60 2>/dev/null || true + exit 1 +fi +ok "All services healthy" + +# ── 4. Resolve the dynamically assigned host port ───────────────────────────────────────────────────────────────────────────────────── +# docker compose port returns 0.0.0.0:0 for 127.0.0.1-only bindings; use docker port instead. +HOST_PORT=$(docker port "$(dc ps -q host)" 8000/tcp \ + | grep '127.0.0.1:' | awk -F: '{print $NF}' | head -1) +HOST_URL="http://localhost:$HOST_PORT" +log "Host bound to $HOST_URL" + +curl -sf "$HOST_URL/healthz" >/dev/null \ + || { fail "Host not reachable at $HOST_URL"; dc logs host --tail=40; exit 1; } +ok "Host healthy at $HOST_URL" + +# ── 5. Confirm supervisor ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── +# Supervisor has start_period:15s; retry up to ~45s to let it fully start. 
+SUPERVISOR_OK=false +for i in $(seq 1 9); do + if dc exec -T supervisor curl -sf http://localhost:8001/healthz >/dev/null 2>&1; then + SUPERVISOR_OK=true; break + fi + echo " supervisor attempt $i/9" + sleep 5 +done +if ! $SUPERVISOR_OK; then + fail "Supervisor never became healthy" + dc logs supervisor --tail=40 || true + exit 1 +fi +ok "Supervisor healthy" + +# ── 6. Debug snapshot ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +echo +log "Container state:" +dc ps +echo +log "Supervisor logs (last 20 lines):" +dc logs supervisor --tail=20 +echo + +# ── 7. Wait for worker to register ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +log "Waiting for worker to register with host..." +REGISTERED=false +for i in $(seq 1 24); do + RESP=$(curl -sf \ + -H "Authorization: Bearer $API_KEY" \ + "$HOST_URL/api/v1/workers" 2>/dev/null || echo "CURL_FAILED") + if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then + REGISTERED=true + break + fi + echo " attempt $i/24 — $RESP" + sleep 5 +done + +if ! $REGISTERED; then + fail "Worker never registered. Supervisor + worker logs:" + dc logs supervisor --tail=40 || true + docker logs "$WORKER_NAME" 2>&1 | tail -40 || true + exit 1 +fi +ok "Worker registered" + +# ── 8. Run E2E smoke test(s) ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── +echo +log "Running E2E smoke test(s)..." +log " HOST=$HOST_URL" + +if $GPU; then + YAML_LIST=("${GPU_TASK_YAMLS[@]}") +else + YAML_LIST=("$TASK_YAML") +fi + +for _YAML in "${YAML_LIST[@]}"; do + log " → $(basename "$_YAML")" + FLOWMESH_HOST_URL="$HOST_URL" \ + FLOWMESH_API_KEY="$API_KEY" \ + TASK_YAML="$_YAML" \ + E2E_TIMEOUT_SEC="$TIMEOUT" \ + uv run --with pytest --with pytest-asyncio --with requests \ + pytest tests/integration/test_e2e.py -v -s +done + +# ── 9. 
Verify worker execution evidence ─────────────────────────────────────────────────────────────────────────────────────────────────────── +echo +log "Verifying worker execution evidence..." +LOG_FILE="/tmp/flowmesh-local-worker-$$.log" +docker logs "$WORKER_NAME" 2>&1 | tee "$LOG_FILE" || true + +if grep -qiE "executor|running task|dispatched|echo|inference|succeeded|TASK_SUCCEEDED|done" "$LOG_FILE"; then + ok "Worker executed and completed the task" +else + fail "No task execution evidence found in worker logs ($LOG_FILE)" + exit 1 +fi + +if $GPU; then + echo + log "GPU utilisation during test:" + nvidia-smi --query-gpu=name,memory.used,memory.total,utilization.gpu \ + --format=csv,noheader,nounits 2>/dev/null \ + || warn "nvidia-smi not available" +fi + +echo +ok "All checks passed" diff --git a/templates/conditional_echo_test.yaml b/templates/conditional_echo_test.yaml index acbc07c..6c5f8ca 100644 --- a/templates/conditional_echo_test.yaml +++ b/templates/conditional_echo_test.yaml @@ -87,6 +87,7 @@ spec: output: destination: - type: "http" + type: "local" + path: "./conditional_echo_test" artifacts: - "results.json" diff --git a/templates/echo_three_node_graph.yaml b/templates/echo_three_node_graph.yaml index 2c503be..405068b 100644 --- a/templates/echo_three_node_graph.yaml +++ b/templates/echo_three_node_graph.yaml @@ -50,11 +50,3 @@ spec: path: "result.items[0].output" - node: "echo-b" path: "result.items[0].output" - - output: - destination: - type: "http" - artifacts: - - "results.json" - - "logs" - - "artifacts" diff --git a/templates/ssh_noninteractive.yaml b/templates/ssh_noninteractive.yaml index e33e71b..8669a89 100644 --- a/templates/ssh_noninteractive.yaml +++ b/templates/ssh_noninteractive.yaml @@ -18,7 +18,8 @@ spec: MY_CUSTOM_VAR: "hello" output: destination: - type: http + type: local + path: "./ssh_noninteractive_output" artifacts: - "results.json" - "logs" From 37bef386179749f3e7a88ab80ad33721d5519df7 Mon Sep 17 00:00:00 2001 From: Qruixuan 
<154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 18:36:36 +0800 Subject: [PATCH 08/17] fix: redesign CI compose for FlowMesh's single-server architecture FlowMesh has no separate host/guardian/postgres services. A single src/server/Dockerfile exposes both HTTP API (8000) and gRPC supervisor (50051). Updated ci.compose.yml, ci.worker.gpu.yml, ci.ports.fixed.yml: - server service built from src/server/Dockerfile - redis only (no postgres) - WORKER_DOCKER_NETWORK uses ${COMPOSE_PROJECT_NAME}_ci-net interpolation - SERVER_HOST=server so spawned workers get SUPERVISOR_GRPC_TARGET=server:50051 Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- docker/ci.compose.yml | 96 ++++++++++----------------------------- docker/ci.ports.fixed.yml | 4 +- docker/ci.worker.gpu.yml | 14 ++---- 3 files changed, 30 insertions(+), 84 deletions(-) diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml index 0b3e47c..df2d298 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -1,19 +1,15 @@ # docker/ci.compose.yml — CI integration test stack (single-host, no GPU) # # Brings up a fully isolated FlowMesh environment for each CI run. -# All services live in an internal Docker network; no state persists between runs. +# FlowMesh uses a single server container (HTTP API port 8000 + gRPC +# supervisor port 50051); no separate host or database service needed. # -# Supervisor spawns the CPU worker via Docker (with Docker socket mounted), -# so the worker gets a proper token and can register correctly. +# The server spawns worker containers via Docker (socket mounted) and +# attaches them to WORKER_DOCKER_NETWORK so they can resolve "server". # # NOTE: No ports are exposed in this base file. 
Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml # - Dynamic (local dev, run_local.sh): generated at runtime -# -# Usage (from repo root): -# docker build -f src/worker/docker/Dockerfile.cpu -t ci/flowmesh_worker:latest-cpu . -# docker compose -p ci-$RUN_ID -f docker/ci.compose.yml -f docker/ci.ports.fixed.yml up -d --build -# docker compose -p ci-$RUN_ID -f docker/ci.compose.yml -f docker/ci.ports.fixed.yml down -v services: redis_control: @@ -36,92 +32,48 @@ services: retries: 10 networks: [ci-net] - postgres: - image: postgres:18-alpine - environment: - POSTGRES_USER: flowmesh - POSTGRES_PASSWORD: flowmesh - POSTGRES_DB: flowmesh - healthcheck: - test: ["CMD-SHELL", "pg_isready -U flowmesh"] - interval: 3s - timeout: 2s - retries: 10 - networks: [ci-net] - - host: - build: - context: .. - dockerfile: src/host/Dockerfile - depends_on: - redis_control: - condition: service_healthy - redis_telemetry: - condition: service_healthy - postgres: - condition: service_healthy - environment: - REDIS_URL: "redis://redis_control:6379/0" - REDIS_CONTROL_URL: "redis://redis_control:6379/0" - REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" - DATABASE_URL: "postgresql+asyncpg://flowmesh:flowmesh@postgres:5432/flowmesh" - HOST_RUN_MIGRATIONS: "true" - API_KEY_HMAC_SECRET: "ci-hmac-secret" - BOOTSTRAP_ORG_ID: "ci-org" - BOOTSTRAP_ADMIN_EXTERNAL_ID: "ci-admin" - BOOTSTRAP_ADMIN_API_KEY: "flm-ci-00000000000000000000000000000000" - ORCHESTRATOR_DISPATCH_MODE: "adaptive" - ORCHESTRATOR_WORKER_SELECTION: "first_fit" - ENABLE_ELASTIC_SCALING: "false" - LOG_LEVEL: "INFO" - healthcheck: - test: ["CMD", "curl", "-sf", "http://localhost:8000/healthz"] - interval: 5s - timeout: 3s - start_period: 20s - retries: 12 - networks: [ci-net] - - supervisor: + server: build: context: .. 
dockerfile: src/server/Dockerfile depends_on: - host: - condition: service_healthy redis_control: condition: service_healthy redis_telemetry: condition: service_healthy environment: - FLOWMESH_BASE_URL: "http://host:8000" - FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" REDIS_CONTROL_URL: "redis://redis_control:6379/0" REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" - SUPERVISOR_NAMESPACE: "ci" - SUPERVISOR_CLUSTER: "ci-cluster" - SUPERVISOR_ALIAS: "ci-supervisor" + FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" + # FLOWMESH_BASE_URL lets the server know its own HTTP address so it + # can embed the correct URL in tokens passed to spawned workers. + FLOWMESH_BASE_URL: "http://server:8000" + # SERVER_HOST tells the server its own gRPC hostname so spawned + # workers receive SUPERVISOR_GRPC_TARGET=server:50051. + SERVER_HOST: "server" + NODE_NAMESPACE: "ci" + NODE_CLUSTER: "ci-cluster" + NODE_ALIAS: "ci-server" LOG_LEVEL: "INFO" # Worker spawning via Docker + ENABLE_SUPERVISOR: "true" FLOWMESH_REGISTRY: "ci" FLOWMESH_VERSION: "latest" - WORKER_CONFIG_PATH: "/etc/supervisor/worker_config.yaml" - WORKER_EXECUTOR_IDLE_CLEANUP_SEC: "0" - # Workers are spawned on the compose network (WORKER_DOCKER_NETWORK) so - # they must reach supervisor by service name, not localhost. - SUPERVISOR_HOST: "supervisor" + WORKER_CONFIG_PATH: "/etc/flowmesh/worker_config.yaml" + # Attach spawned workers to this compose network so they can + # resolve "server" by hostname. Set via COMPOSE_PROJECT_NAME. + WORKER_DOCKER_NETWORK: "${COMPOSE_PROJECT_NAME}_ci-net" # Pass HuggingFace token through so workers can download gated models. - # Set HF_TOKEN in the runner environment (or as a GitHub Actions secret). 
HF_TOKEN: volumes: - /var/run/docker.sock:/var/run/docker.sock - - ./ci.worker_config.yaml:/etc/supervisor/worker_config.yaml:ro + - ./ci.worker_config.yaml:/etc/flowmesh/worker_config.yaml:ro healthcheck: - test: ["CMD", "curl", "-sf", "http://localhost:8001/healthz"] + test: ["CMD", "curl", "-sf", "http://localhost:8000/healthz"] interval: 5s timeout: 3s - start_period: 15s - retries: 10 + start_period: 20s + retries: 12 networks: [ci-net] networks: diff --git a/docker/ci.ports.fixed.yml b/docker/ci.ports.fixed.yml index b5ba67f..71676ea 100644 --- a/docker/ci.ports.fixed.yml +++ b/docker/ci.ports.fixed.yml @@ -13,9 +13,7 @@ # is not used by that script. services: - host: + server: ports: - "8000:8000" - supervisor: - ports: - "50051:50051" diff --git a/docker/ci.worker.gpu.yml b/docker/ci.worker.gpu.yml index 29e905c..a96335d 100644 --- a/docker/ci.worker.gpu.yml +++ b/docker/ci.worker.gpu.yml @@ -1,8 +1,8 @@ # docker/ci.worker.gpu.yml — GPU worker overlay for CI # # Overlay on top of ci.compose.yml for GPU runner (luyao3, RTX 5080). -# Supervisor spawns a GPU worker container (ci/flowmesh_worker:latest-gpu) -# using the Docker socket, same pattern as the CPU integration test. +# Overrides the worker config to use the GPU image and passes the +# compose network name so GPU workers can reach the server by hostname. # # Pre-build the GPU worker image before running compose: # docker build -f src/worker/docker/Dockerfile.cuda \ @@ -15,13 +15,9 @@ # up -d --build services: - supervisor: + server: environment: - WORKER_CONFIG_PATH: "/etc/supervisor/worker_config.yaml" - # Attach GPU workers to the compose network so they can resolve service - # hostnames (e.g. "host") when uploading results. COMPOSE_PROJECT_NAME - # must be exported before docker compose up (run_local.sh does this via - # the compose override; ci.yml sets it explicitly in the step env). 
+ WORKER_CONFIG_PATH: "/etc/flowmesh/worker_config.yaml" WORKER_DOCKER_NETWORK: "${COMPOSE_PROJECT_NAME}_ci-net" volumes: - - ./ci.gpu_worker_config.yaml:/etc/supervisor/worker_config.yaml:ro + - ./ci.gpu_worker_config.yaml:/etc/flowmesh/worker_config.yaml:ro From b20602a45732f4a49fe382064a884f1e99e51b7c Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 18:39:19 +0800 Subject: [PATCH 09/17] fix: update CI workflow and run_local.sh for single-server architecture Key changes: - Single "Wait for server" health check (port 8000) instead of separate host + supervisor - Worker registration check uses docker compose exec -T server (no exposed port needed) - E2E tests use http://server:8000 (internal compose network name) - Destroy workers via server API on port 8000 - COMPOSE_PROJECT_NAME exported so ${COMPOSE_PROJECT_NAME}_ci-net interpolation works - run_local.sh: dc() wrapper exports COMPOSE_PROJECT_NAME; single server port block in compose override; step numbering adjusted (no separate supervisor confirm step) Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 117 +++++++++++++++------------------------ scripts/ci/run_local.sh | 102 +++++++++++----------------------- 2 files changed, 77 insertions(+), 142 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e5aea4..de7b9ae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,8 +2,8 @@ name: CI — Integration & GPU Tests on: push: - branches: [main] # run on every merge to main - workflow_dispatch: # also allow manual trigger from GitHub UI + branches: [main] + workflow_dispatch: concurrency: group: ci-${{ github.ref }} @@ -13,7 +13,7 @@ env: FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" jobs: - # ── Integration test (CPU, luyaomini self-hosted runners) ────────────────────── + # ── Integration test (CPU, luyao3 self-hosted runner) ────────────────────── 
integration: name: Integration test (CPU) runs-on: [self-hosted, linux, luyao3] @@ -29,12 +29,9 @@ jobs: - name: Pre-clean stale worker containers and disk run: | docker rm -f ci-worker-cpu 2>/dev/null || true - # Remove the CI worker image so it always rebuilds fresh docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true - # Remove dangling images and unused volumes from crashed/orphaned runs docker image prune -f docker volume prune -f - # Trim build cache: keep 5 GB of recent layers, discard the rest docker builder prune -f --keep-storage 5gb 2>/dev/null \ || docker builder prune -f --filter "until=72h" 2>/dev/null \ || true @@ -54,23 +51,14 @@ jobs: docker compose -p "$PROJECT" -f docker/ci.compose.yml up -d --build env: DOCKER_BUILDKIT: "1" + COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} - - name: Wait for host to be healthy - run: | - timeout 120 bash -c ' - until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T host curl -sf http://localhost:8000/healthz; do - echo "waiting for host…" - sleep 3 - done - ' - - - name: Wait for supervisor to be healthy + - name: Wait for server to be healthy run: | timeout 120 bash -c ' until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T supervisor curl -sf http://localhost:8001/healthz; do - echo "waiting for supervisor…" + exec -T server curl -sf http://localhost:8000/healthz; do + echo "waiting for server…" sleep 3 done ' @@ -79,19 +67,22 @@ jobs: run: | echo "=== Running containers ===" docker compose -p "$PROJECT" -f docker/ci.compose.yml ps - echo "=== All Docker containers (incl. supervisor-spawned worker) ===" + echo "=== All Docker containers (incl. 
server-spawned worker) ===" docker ps -a --format "table {{.Names}}\t{{.Image}}\t{{.Status}}" | grep -E "NAME|worker|ci-worker" || true - echo "=== Supervisor logs ===" - docker compose -p "$PROJECT" -f docker/ci.compose.yml logs supervisor --tail=40 - echo "=== Worker container logs (supervisor-spawned) ===" + echo "=== Server logs ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs server --tail=40 + echo "=== Worker container logs (server-spawned) ===" docker logs ci-worker-cpu 2>&1 | tail -40 || echo "(no ci-worker-cpu container found)" - name: Wait for worker to register run: | for i in $(seq 1 24); do - RESP=$(curl -sf \ - -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ - http://localhost:8000/api/v1/workers || echo "CURL_FAILED") + RESP=$(docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T server \ + curl -sf \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ + http://localhost:8000/api/v1/workers 2>/dev/null \ + || echo "CURL_FAILED") echo "Attempt $i: $RESP" if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then echo "Worker registered!" @@ -99,15 +90,15 @@ jobs: fi sleep 5 done - echo "=== Worker never registered. Final worker logs ===" - docker compose -p "$PROJECT" -f docker/ci.compose.yml logs worker --tail=80 + echo "=== Worker never registered. 
Final server logs ===" + docker compose -p "$PROJECT" -f docker/ci.compose.yml logs server --tail=80 exit 1 - name: Run E2E smoke test (echo task) run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/echo_local.yaml" \ -e E2E_TIMEOUT_SEC="120" \ @@ -152,12 +143,12 @@ jobs: path: /tmp/ci-logs-${{ github.run_id }}.txt retention-days: 3 - - name: Destroy workers via supervisor API + - name: Destroy workers via server API if: always() run: | docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T supervisor \ - curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + exec -T server \ + curl -sf -X DELETE http://localhost:8000/api/v1/workers \ -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" || true sleep 5 @@ -166,22 +157,19 @@ jobs: run: | docker rm -f ci-worker-cpu 2>/dev/null || true docker compose -p "$PROJECT" -f docker/ci.compose.yml down -v --remove-orphans - # Remove the built CI image — it will be rebuilt next run docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true - # Clean up dangling images and unused volumes left by this run docker image prune -f docker volume prune -f echo "=== Disk after teardown ===" df -h / docker system df - # ── GPU smoke test (RTX 5080 self-hosted runners) ───────────────────────── + # ── GPU smoke test (RTX 5080 self-hosted runner) ───────────────────────── gpu-smoke: name: GPU smoke test (RTX 5080) needs: integration runs-on: [self-hosted, linux, luyao3] timeout-minutes: 90 - # One GPU job at a time per machine concurrency: group: gpu-rtx5080-${{ github.ref }} cancel-in-progress: false @@ -196,12 +184,9 @@ jobs: - name: Pre-clean stale worker containers and disk run: | docker rm -f ci-worker-gpu 2>/dev/null || true - # Remove old CI GPU worker image (rebuilt each run) docker rmi ci/flowmesh_worker:latest-gpu 2>/dev/null || true - # 
Remove dangling images and unused volumes from crashed/orphaned runs docker image prune -f docker volume prune -f - # Trim build cache but keep recent layers for faster builds docker builder prune -f --keep-storage 5gb 2>/dev/null \ || docker builder prune -f --filter "until=72h" 2>/dev/null \ || true @@ -211,8 +196,6 @@ jobs: - name: Build GPU worker builder image (cached by content hash) run: | - # Hash Dockerfile.cuda.builder + GPU requirements so we only rebuild - # when the actual inputs change. The tagged image persists on the runner. BUILDER_HASH=$(cat \ src/worker/docker/Dockerfile.cuda.builder \ src/worker/requirements/requirements.gpu.txt \ @@ -249,22 +232,12 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} - - name: Wait for host to be healthy - run: | - timeout 120 bash -c ' - until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T host curl -sf http://localhost:8000/healthz; do - echo "waiting for host…" - sleep 3 - done - ' - - - name: Wait for supervisor to be healthy + - name: Wait for server to be healthy run: | timeout 120 bash -c ' until docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T supervisor curl -sf http://localhost:8001/healthz; do - echo "waiting for supervisor…" + exec -T server curl -sf http://localhost:8000/healthz; do + echo "waiting for server…" sleep 3 done ' @@ -272,9 +245,12 @@ jobs: - name: Wait for GPU worker to register run: | for i in $(seq 1 36); do - RESP=$(curl -sf \ - -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ - http://localhost:8000/api/v1/workers || echo "CURL_FAILED") + RESP=$(docker compose -p "$PROJECT" -f docker/ci.compose.yml \ + exec -T server \ + curl -sf \ + -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" \ + http://localhost:8000/api/v1/workers 2>/dev/null \ + || echo "CURL_FAILED") echo "Attempt $i: $RESP" if echo "$RESP" | grep -qE '"worker_id"|"id":|"wkr-'; then echo "Worker registered!" 
@@ -289,7 +265,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_vllm_tiny.yaml" \ -e E2E_TIMEOUT_SEC="300" \ @@ -302,7 +278,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/echo_three_node_graph.yaml" \ -e E2E_TIMEOUT_SEC="120" \ @@ -315,7 +291,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/dag_inference_example.yaml" \ -e E2E_TIMEOUT_SEC="600" \ @@ -328,7 +304,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/conditional_echo_test.yaml" \ -e E2E_TIMEOUT_SEC="120" \ @@ -341,7 +317,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_hf_tiny.yaml" \ -e E2E_TIMEOUT_SEC="300" \ @@ -354,7 +330,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/lora_sft_llama.yaml" \ -e E2E_TIMEOUT_SEC="1200" \ @@ -367,7 +343,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ 
-e TASK_YAML="/templates/ssh_noninteractive.yaml" \ -e E2E_TIMEOUT_SEC="120" \ @@ -380,7 +356,7 @@ jobs: run: | docker run --rm \ --network "${PROJECT}_ci-net" \ - -e FLOWMESH_HOST_URL="http://host:8000" \ + -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/n8n/dag_inference.json" \ -e E2E_TIMEOUT_SEC="600" \ @@ -398,7 +374,6 @@ jobs: echo "=== Execution evidence check ===" LOG=/tmp/worker-gpu-${{ github.run_id }}.log - # Must have received and run a task if grep -qiE "executor|running task|dispatched|inference|model" "$LOG"; then echo "✓ Worker received and processed a task" else @@ -406,7 +381,6 @@ jobs: exit 1 fi - # Must show task succeeded (not just status update) if grep -qiE "succeeded|TASK_SUCCEEDED|done|completed" "$LOG"; then echo "✓ Task completed successfully in worker" else @@ -442,12 +416,12 @@ jobs: path: /tmp/ci-gpu-logs-${{ github.run_id }}.txt retention-days: 3 - - name: Destroy workers via supervisor API + - name: Destroy workers via server API if: always() run: | docker compose -p "$PROJECT" -f docker/ci.compose.yml \ - exec -T supervisor \ - curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + exec -T server \ + curl -sf -X DELETE http://localhost:8000/api/v1/workers \ -H "Authorization: Bearer flm-ci-00000000000000000000000000000000" || true sleep 5 @@ -459,9 +433,7 @@ jobs: -f docker/ci.compose.yml \ -f docker/ci.worker.gpu.yml \ down -v --remove-orphans - # Remove the CI GPU worker image — rebuilt next run docker rmi ci/flowmesh_worker:latest-gpu 2>/dev/null || true - # Remove old flowmesh-builder images (keep only the current content hash) CURRENT_HASH=$(cat \ src/worker/docker/Dockerfile.cuda.builder \ src/worker/requirements/requirements.gpu.txt \ @@ -470,7 +442,6 @@ jobs: | grep "^flowmesh-builder:" \ | grep -v ":${CURRENT_HASH}$" \ | xargs -r docker rmi 2>/dev/null || true - # Clean up dangling images and unused volumes docker image prune -f docker volume prune -f 
echo "=== Disk after teardown ===" diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index f569dcc..99d52e1 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -5,7 +5,8 @@ # pushing to GitHub. Requires: docker, docker compose v2, uv. # # Fully isolated from any running FlowMesh services: -# - Host and supervisor ports are dynamically assigned (no fixed 8000/50051) +# - Server HTTP port is dynamically assigned (no fixed 8000) +# - gRPC port 50051 is fixed (workers cannot follow a dynamic port) # - Worker container name is scoped to the process PID # - Each run gets its own Docker network via compose project name # @@ -14,7 +15,7 @@ # # Options: # --gpu Run the GPU smoke test instead of the CPU integration test -# --task-yaml PATH Override the workflow YAML submitted to the host +# --task-yaml PATH Override the workflow YAML submitted to the server # --timeout SEC Override E2E wait timeout (default: 120, GPU default: 300) # --no-clean Skip the pre-run docker prune step # --no-build Skip rebuilding the worker image (use cached) @@ -40,13 +41,12 @@ DO_TEARDOWN=true WORKER_IMAGE_CPU="ci/flowmesh_worker:latest-cpu" WORKER_IMAGE_GPU="ci/flowmesh_worker:latest-gpu" -# Populated in section 0; referenced in teardown. 
WORKER_NAME="" _WORKER_CFG="" _COMPOSE_OVERRIDE="" -HOST_URL="http://localhost:8000" # overwritten after dc up +HOST_URL="http://localhost:8000" -# ── Argument parsing ─────────────────────────────────────────────────────────────────────────────────────── +# ── Argument parsing ────────────────────────────────────────────────────────────────────────────────────── while [[ $# -gt 0 ]]; do case "$1" in --gpu) GPU=true; shift ;; @@ -77,23 +77,22 @@ if $GPU; then COMPOSE_FILES+=(-f "$DOCKER_DIR/ci.worker.gpu.yml") fi -dc() { docker compose -p "$PROJECT" "${COMPOSE_FILES[@]}" "$@"; } +dc() { COMPOSE_PROJECT_NAME="$PROJECT" docker compose -p "$PROJECT" "${COMPOSE_FILES[@]}" "$@"; } # ── Teardown (trap runs on any exit) ────────────────────────────────────────────────────────────────── _teardown() { local code=$? if ! $DO_TEARDOWN; then warn "Skipping teardown (--keep). To clean up manually:" - echo " docker compose -p $PROJECT ${COMPOSE_FILES[*]} down -v --remove-orphans" + echo " COMPOSE_PROJECT_NAME=$PROJECT docker compose -p $PROJECT ${COMPOSE_FILES[*]} down -v --remove-orphans" return fi log "Tearing down..." - # Always dump service logs before removal — essential for diagnosing failures. echo - log "Supervisor logs (last 40 lines):" - dc logs supervisor --tail=40 2>/dev/null || true + log "Server logs (last 40 lines):" + dc logs server --tail=40 2>/dev/null || true echo if [[ -n "$WORKER_NAME" ]]; then @@ -102,21 +101,16 @@ _teardown() { echo fi - # Ask supervisor to stop managed workers gracefully. 
- dc exec -T supervisor \ - curl -sf -X DELETE http://localhost:8001/api/v1/workers \ + dc exec -T server \ + curl -sf -X DELETE http://localhost:8000/api/v1/workers \ -H "Authorization: Bearer $API_KEY" 2>/dev/null || true sleep 3 docker rm -f "$WORKER_NAME" 2>/dev/null || true dc down -v --remove-orphans 2>/dev/null || true - # Worker image is intentionally kept: the next build overwrites the tag in-place, - # so there is always exactly one cached image available for --no-build runs. docker image prune -f >/dev/null docker volume prune -f >/dev/null - - # Clean up isolation temp files. rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true if [[ $code -eq 0 ]]; then @@ -133,7 +127,6 @@ if $GPU; then WORKER_IMAGE="$WORKER_IMAGE_GPU" WORKER_DOCKERFILE="src/worker/docker/Dockerfile.cuda" [[ -z "$TIMEOUT" ]] && TIMEOUT=300 - # If --task-yaml was given, run only that one; otherwise run the full GPU suite. if [[ -n "$TASK_YAML" ]]; then GPU_TASK_YAMLS=("$TASK_YAML") else @@ -159,8 +152,6 @@ fi cd "$REPO_ROOT" # ── 0b. Create isolation artifacts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── -# Worker config: project-scoped alias prevents container name clashes when a -# second local CI run or a dev worker with the same name is already running. _WORKER_CFG="$(mktemp /tmp/ci-worker-cfg-XXXXXX.yml)" if $GPU; then sed "s/ci-worker-gpu/$WORKER_NAME/g" \ @@ -179,23 +170,18 @@ workers: EOF fi -# Compose override: host port is dynamic (avoids silently hitting a production -# host on 8000). Supervisor gRPC stays on fixed 50051 — workers are spawned with -# SUPERVISOR_GRPC_TARGET=localhost:50051 and cannot follow a random port. -# If 50051 is already taken, dc up fails loudly at startup. +# Compose override: HTTP port is dynamic, gRPC port stays fixed at 50051. +# Workers receive SUPERVISOR_GRPC_TARGET=server:50051 (set via SERVER_HOST +# in ci.compose.yml) and cannot follow a dynamic port. 
_COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" </dev/null || true - # Tear down any stale ci-local-* compose stacks (e.g. from a disconnected SSH session). docker ps -a --format '{{.Labels}}' \ | grep -oP 'com\.docker\.compose\.project=ci-local-\d+' \ | sort -u \ @@ -241,61 +225,41 @@ if $DO_BUILD; then else if ! docker image inspect "$WORKER_IMAGE" >/dev/null 2>&1; then fail "--no-build specified but image '$WORKER_IMAGE' not found locally." - fail "Run without --no-build first, or: docker build -f $WORKER_DOCKERFILE -t $WORKER_IMAGE ." exit 1 fi log "Using cached worker image: $WORKER_IMAGE" fi -# ── 3. Build & start services ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── -# --wait blocks until every healthcheck passes. -log "Starting services (redis × 2, postgres, host, supervisor)..." +# ── 3. Build & start services ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +log "Starting services (redis × 2, server)..." if ! DOCKER_BUILDKIT=1 dc up -d --build --wait; then - fail "Services failed to start — supervisor logs:" - dc logs supervisor --tail=60 2>/dev/null || true + fail "Services failed to start — server logs:" + dc logs server --tail=60 2>/dev/null || true exit 1 fi ok "All services healthy" # ── 4. Resolve the dynamically assigned host port ───────────────────────────────────────────────────────────────────────────────────── -# docker compose port returns 0.0.0.0:0 for 127.0.0.1-only bindings; use docker port instead. 
-HOST_PORT=$(docker port "$(dc ps -q host)" 8000/tcp \ +HOST_PORT=$(docker port "$(dc ps -q server)" 8000/tcp \ | grep '127.0.0.1:' | awk -F: '{print $NF}' | head -1) HOST_URL="http://localhost:$HOST_PORT" -log "Host bound to $HOST_URL" +log "Server HTTP bound to $HOST_URL" curl -sf "$HOST_URL/healthz" >/dev/null \ - || { fail "Host not reachable at $HOST_URL"; dc logs host --tail=40; exit 1; } -ok "Host healthy at $HOST_URL" - -# ── 5. Confirm supervisor ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── -# Supervisor has start_period:15s; retry up to ~45s to let it fully start. -SUPERVISOR_OK=false -for i in $(seq 1 9); do - if dc exec -T supervisor curl -sf http://localhost:8001/healthz >/dev/null 2>&1; then - SUPERVISOR_OK=true; break - fi - echo " supervisor attempt $i/9" - sleep 5 -done -if ! $SUPERVISOR_OK; then - fail "Supervisor never became healthy" - dc logs supervisor --tail=40 || true - exit 1 -fi -ok "Supervisor healthy" + || { fail "Server not reachable at $HOST_URL"; dc logs server --tail=40; exit 1; } +ok "Server healthy at $HOST_URL" -# ── 6. Debug snapshot ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 5. Debug snapshot ──────────────────────────────────────────────────────────────────────────────────────────────────────────── echo log "Container state:" dc ps echo -log "Supervisor logs (last 20 lines):" -dc logs supervisor --tail=20 +log "Server logs (last 20 lines):" +dc logs server --tail=20 echo -# ── 7. Wait for worker to register ─────────────────────────────────────────────────────────────────────────────────────────────────────────── -log "Waiting for worker to register with host..." +# ── 6. Wait for worker to register ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +log "Waiting for worker to register with server..." 
REGISTERED=false for i in $(seq 1 24); do RESP=$(curl -sf \ @@ -310,14 +274,14 @@ for i in $(seq 1 24); do done if ! $REGISTERED; then - fail "Worker never registered. Supervisor + worker logs:" - dc logs supervisor --tail=40 || true + fail "Worker never registered. Server + worker logs:" + dc logs server --tail=40 || true docker logs "$WORKER_NAME" 2>&1 | tail -40 || true exit 1 fi ok "Worker registered" -# ── 8. Run E2E smoke test(s) ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 7. Run E2E smoke test(s) ──────────────────────────────────────────────────────────────────────────────────────────────────────────── echo log "Running E2E smoke test(s)..." log " HOST=$HOST_URL" @@ -338,7 +302,7 @@ for _YAML in "${YAML_LIST[@]}"; do pytest tests/integration/test_e2e.py -v -s done -# ── 9. Verify worker execution evidence ─────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 8. Verify worker execution evidence ─────────────────────────────────────────────────────────────────────────────────────────────────────── echo log "Verifying worker execution evidence..." LOG_FILE="/tmp/flowmesh-local-worker-$$.log" From 9c5733db636c0c5c06069a6b21290c92d2a06a52 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 18:46:11 +0800 Subject: [PATCH 10/17] =?UTF-8?q?fix:=20correct=20worker=20networking=20?= =?UTF-8?q?=E2=80=94=20host-mode=20workers=20need=20localhost=20URLs=20and?= =?UTF-8?q?=20exposed=20ports?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workers are spawned by the Docker adapter with network_mode: host (see supervisor/adapters/docker.py _start()). They connect to the gRPC supervisor at localhost:50051 and download results via FLOWMESH_BASE_URL. Three bugs in the previous CI setup: 1. 
WORKER_DOCKER_NETWORK env var doesn't exist in FlowMesh — removed. 2. FLOWMESH_BASE_URL was "http://server:8000" but workers on host network can't resolve "server"; changed to "http://localhost:8000". 3. CI workflow never exposed ports 8000/50051 on the host, so workers (network_mode: host) couldn't reach the server container at all; added ci.ports.fixed.yml to both build steps. 4. run_local.sh used a dynamic HTTP port, but FLOWMESH_BASE_URL in the compose is a static value set before start; changed to fixed 127.0.0.1:8000:8000 so workers can always reach http://localhost:8000. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 6 ++++- docker/ci.compose.yml | 18 +++++-------- scripts/ci/run_local.sh | 55 ++++++++++++++++++++-------------------- 3 files changed, 40 insertions(+), 39 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index de7b9ae..8bb9122 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,7 +48,10 @@ jobs: - name: Build & start services run: | - docker compose -p "$PROJECT" -f docker/ci.compose.yml up -d --build + docker compose -p "$PROJECT" \ + -f docker/ci.compose.yml \ + -f docker/ci.ports.fixed.yml \ + up -d --build env: DOCKER_BUILDKIT: "1" COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} @@ -226,6 +229,7 @@ jobs: docker compose -p "$PROJECT" \ -f docker/ci.compose.yml \ -f docker/ci.worker.gpu.yml \ + -f docker/ci.ports.fixed.yml \ up -d --build env: DOCKER_BUILDKIT: "1" diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml index df2d298..448ca31 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -4,12 +4,13 @@ # FlowMesh uses a single server container (HTTP API port 8000 + gRPC # supervisor port 50051); no separate host or database service needed. # -# The server spawns worker containers via Docker (socket mounted) and -# attaches them to WORKER_DOCKER_NETWORK so they can resolve "server". 
+# Workers are spawned by the server's Docker adapter with network_mode: host. +# They connect to gRPC at localhost:50051 and HTTP at http://localhost:8000. +# Ports 8000 and 50051 MUST therefore be bound on the Docker host machine. # # NOTE: No ports are exposed in this base file. Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml -# - Dynamic (local dev, run_local.sh): generated at runtime +# - Fixed local (run_local.sh): generated at runtime services: redis_control: @@ -45,11 +46,9 @@ services: REDIS_CONTROL_URL: "redis://redis_control:6379/0" REDIS_TELEMETRY_URL: "redis://redis_telemetry:6379/0" FLOWMESH_API_KEY: "flm-ci-00000000000000000000000000000000" - # FLOWMESH_BASE_URL lets the server know its own HTTP address so it - # can embed the correct URL in tokens passed to spawned workers. - FLOWMESH_BASE_URL: "http://server:8000" - # SERVER_HOST tells the server its own gRPC hostname so spawned - # workers receive SUPERVISOR_GRPC_TARGET=server:50051. + # Workers run with network_mode: host, so FLOWMESH_BASE_URL must be + # reachable from the Docker host (not the compose overlay network). + FLOWMESH_BASE_URL: "http://localhost:8000" SERVER_HOST: "server" NODE_NAMESPACE: "ci" NODE_CLUSTER: "ci-cluster" @@ -60,9 +59,6 @@ services: FLOWMESH_REGISTRY: "ci" FLOWMESH_VERSION: "latest" WORKER_CONFIG_PATH: "/etc/flowmesh/worker_config.yaml" - # Attach spawned workers to this compose network so they can - # resolve "server" by hostname. Set via COMPOSE_PROJECT_NAME. - WORKER_DOCKER_NETWORK: "${COMPOSE_PROJECT_NAME}_ci-net" # Pass HuggingFace token through so workers can download gated models. HF_TOKEN: volumes: diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index 99d52e1..bef117d 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -5,11 +5,15 @@ # pushing to GitHub. Requires: docker, docker compose v2, uv. 
# # Fully isolated from any running FlowMesh services: -# - Server HTTP port is dynamically assigned (no fixed 8000) +# - Server HTTP port is fixed at 8000 (workers need a known address) # - gRPC port 50051 is fixed (workers cannot follow a dynamic port) # - Worker container name is scoped to the process PID # - Each run gets its own Docker network via compose project name # +# IMPORTANT: Ports 8000 and 50051 must be free on your machine. +# Workers are spawned with network_mode: host and connect to these +# ports on localhost to reach the server container. +# # Usage: # ./scripts/ci/run_local.sh [OPTIONS] # @@ -24,11 +28,11 @@ set -euo pipefail -# ── Paths ───────────────────────────────────────────────────────────────────────────────────── +# ── Paths ───────────────────────────────────────────────────────────────────── REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" DOCKER_DIR="$REPO_ROOT/docker" -# ── Defaults ──────────────────────────────────────────────────────────────────────────────────────── +# ── Defaults ────────────────────────────────────────────────────────────────── PROJECT="ci-local-$$" API_KEY="flm-ci-00000000000000000000000000000000" GPU=false @@ -46,7 +50,7 @@ _WORKER_CFG="" _COMPOSE_OVERRIDE="" HOST_URL="http://localhost:8000" -# ── Argument parsing ────────────────────────────────────────────────────────────────────────────────────── +# ── Argument parsing ─────────────────────────────────────────────────────────── while [[ $# -gt 0 ]]; do case "$1" in --gpu) GPU=true; shift ;; @@ -55,12 +59,12 @@ while [[ $# -gt 0 ]]; do --no-clean) DO_CLEAN=false; shift ;; --no-build) DO_BUILD=false; shift ;; --keep) DO_TEARDOWN=false; shift ;; - -h|--help) sed -n '2,23p' "$0"; exit 0 ;; + -h|--help) sed -n '2,25p' "$0"; exit 0 ;; *) echo "Unknown option: $1" >&2; exit 1 ;; esac done -# ── Colors ────────────────────────────────────────────────────────────────────────────────────── +# ── Colors 
──────────────────────────────────────────────────────────────────── if [[ -t 1 ]]; then _B='\033[0;34m' _G='\033[0;32m' _Y='\033[1;33m' _R='\033[0;31m' _N='\033[0m' else @@ -71,7 +75,7 @@ ok() { echo -e "${_G}[ok]${_N} $*"; } warn() { echo -e "${_Y}[warn]${_N} $*"; } fail() { echo -e "${_R}[FAIL]${_N} $*" >&2; } -# ── Compose helpers ──────────────────────────────────────────────────────────────────────────────────────── +# ── Compose helpers ─────────────────────────────────────────────────────────── COMPOSE_FILES=(-f "$DOCKER_DIR/ci.compose.yml") if $GPU; then COMPOSE_FILES+=(-f "$DOCKER_DIR/ci.worker.gpu.yml") @@ -79,7 +83,7 @@ fi dc() { COMPOSE_PROJECT_NAME="$PROJECT" docker compose -p "$PROJECT" "${COMPOSE_FILES[@]}" "$@"; } -# ── Teardown (trap runs on any exit) ────────────────────────────────────────────────────────────────── +# ── Teardown (trap runs on any exit) ────────────────────────────────────────── _teardown() { local code=$? if ! $DO_TEARDOWN; then @@ -121,7 +125,7 @@ _teardown() { } trap _teardown EXIT -# ── 0. Resolve defaults ────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 0. Resolve defaults ─────────────────────────────────────────────────────── if $GPU; then WORKER_NAME="ci-worker-gpu-$$" WORKER_IMAGE="$WORKER_IMAGE_GPU" @@ -151,7 +155,7 @@ fi cd "$REPO_ROOT" -# ── 0b. Create isolation artifacts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 0b. Create isolation artifacts ──────────────────────────────────────────── _WORKER_CFG="$(mktemp /tmp/ci-worker-cfg-XXXXXX.yml)" if $GPU; then sed "s/ci-worker-gpu/$WORKER_NAME/g" \ @@ -170,15 +174,16 @@ workers: EOF fi -# Compose override: HTTP port is dynamic, gRPC port stays fixed at 50051. -# Workers receive SUPERVISOR_GRPC_TARGET=server:50051 (set via SERVER_HOST -# in ci.compose.yml) and cannot follow a dynamic port. 
+# Compose override: both ports are fixed so workers (network_mode: host) +# can reach localhost:8000 (HTTP) and localhost:50051 (gRPC). +# FLOWMESH_BASE_URL in ci.compose.yml is http://localhost:8000 — this +# must match the HTTP port binding below. _COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" </dev/null \ || { fail "Server not reachable at $HOST_URL"; dc logs server --tail=40; exit 1; } ok "Server healthy at $HOST_URL" -# ── 5. Debug snapshot ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 5. Debug snapshot ───────────────────────────────────────────────────────── echo log "Container state:" dc ps @@ -258,7 +259,7 @@ log "Server logs (last 20 lines):" dc logs server --tail=20 echo -# ── 6. Wait for worker to register ─────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 6. Wait for worker to register ─────────────────────────────────────────── log "Waiting for worker to register with server..." REGISTERED=false for i in $(seq 1 24); do @@ -281,7 +282,7 @@ if ! $REGISTERED; then fi ok "Worker registered" -# ── 7. Run E2E smoke test(s) ──────────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 7. Run E2E smoke test(s) ────────────────────────────────────────────────── echo log "Running E2E smoke test(s)..." log " HOST=$HOST_URL" @@ -302,7 +303,7 @@ for _YAML in "${YAML_LIST[@]}"; do pytest tests/integration/test_e2e.py -v -s done -# ── 8. Verify worker execution evidence ─────────────────────────────────────────────────────────────────────────────────────────────────────── +# ── 8. Verify worker execution evidence ────────────────────────────────────── echo log "Verifying worker execution evidence..." 
LOG_FILE="/tmp/flowmesh-local-worker-$$.log" From 62f2fca1656c239f8c19cf1c9af26b3b2010e161 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 18:55:21 +0800 Subject: [PATCH 11/17] =?UTF-8?q?feat:=20add=20tests/integration/test=5Fe2?= =?UTF-8?q?e.py=20=E2=80=94=20E2E=20smoke=20test=20for=20CI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrated from FlowMesh_dev ci/gpu-runner-setup-v2 branch unchanged: - Submits a workflow YAML to a live server and polls until DONE/FAILED - Skips automatically when FLOWMESH_HOST_URL is unset (safe for unit test runs) - Handles n8n JSON and native YAML formats - Skips (not fails) when executor package is unavailable on the worker Used by run_local.sh (step 7) and .github/workflows/ci.yml E2E steps. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- tests/integration/__init__.py | 0 tests/integration/conftest.py | 68 ++++++++++ tests/integration/test_e2e.py | 230 ++++++++++++++++++++++++++++++++++ 3 files changed, 298 insertions(+) create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/conftest.py create mode 100644 tests/integration/test_e2e.py diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 0000000..53f9ea2 --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,68 @@ +""" +Pytest configuration for FlowMesh end-to-end integration tests. + +Registers CLI options so the suite can be driven without pre-setting env vars: + + pytest tests/integration/ --host-url http://myserver:8000 --api-key flm-... + +The options are synced into environment variables during pytest_configure so +that module-level constants and the pytestmark skip-condition in test_e2e.py +pick them up at collection time (before any fixtures run). 
+""" + +import os + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + group = parser.getgroup("e2e", "FlowMesh end-to-end tests") + group.addoption( + "--host-url", + default=None, + metavar="URL", + help="FlowMesh host base URL (overrides FLOWMESH_HOST_URL env var)", + ) + group.addoption( + "--api-key", + default=None, + metavar="KEY", + help="FlowMesh API key (overrides FLOWMESH_API_KEY env var)", + ) + group.addoption( + "--task-yaml", + default=None, + metavar="PATH", + help="Path to workflow YAML to submit (overrides TASK_YAML env var)", + ) + group.addoption( + "--e2e-timeout", + type=int, + default=None, + metavar="SEC", + help="Max seconds to wait for task completion (overrides E2E_TIMEOUT_SEC)", + ) + + +def pytest_configure(config: pytest.Config) -> None: + config.addinivalue_line( + "markers", + "e2e: end-to-end integration tests that require a live FlowMesh host", + ) + + # Sync CLI options into env vars *before* test modules are collected so + # that module-level constants and pytestmark conditions in test_e2e.py see + # the right values. os.environ.setdefault is used so an explicit env var + # always takes precedence over a CLI flag. + _sync_opt(config, "--host-url", "FLOWMESH_HOST_URL") + _sync_opt(config, "--api-key", "FLOWMESH_API_KEY") + _sync_opt(config, "--task-yaml", "TASK_YAML") + if (timeout := config.getoption("--e2e-timeout")) is not None: + os.environ.setdefault("E2E_TIMEOUT_SEC", str(timeout)) + + +def _sync_opt(config: pytest.Config, opt: str, env_var: str) -> None: + """If *opt* was passed on the CLI, set *env_var* unless already present.""" + value: str | None = config.getoption(opt) + if value is not None: + os.environ.setdefault(env_var, value) diff --git a/tests/integration/test_e2e.py b/tests/integration/test_e2e.py new file mode 100644 index 0000000..5c22f0d --- /dev/null +++ b/tests/integration/test_e2e.py @@ -0,0 +1,230 @@ +""" +End-to-end integration test for FlowMesh CI. 
+ +Submits a workflow YAML to a running FlowMesh host and asserts the task +reaches DONE status within the timeout. + +Skipped automatically when FLOWMESH_HOST_URL is not set in the environment +so this file does not break the regular unit-test suite. + +Environment variables: + FLOWMESH_HOST_URL Base URL of the host (default: http://localhost:8000) + FLOWMESH_API_KEY API key for authentication + TASK_YAML Path to a workflow YAML or n8n JSON file to submit + (default: /templates/echo_local.yaml) + Files ending in .json are submitted as n8n format. + E2E_TIMEOUT_SEC Max seconds to wait for task completion (default: 120) +""" + +import os +import re +import sys +import time +from pathlib import Path +from typing import Any + +import pytest +import requests + +# Task errors that indicate the executor package is missing/broken on this +# worker rather than a genuine workflow logic failure. The test skips instead +# of failing so CI stays green while the gap is clearly surfaced. +_EXECUTOR_UNAVAILABLE_RE = re.compile( + r"not available|not installed|not importable", + re.IGNORECASE, +) + +_REPO_ROOT = Path(__file__).resolve().parent.parent.parent + +HOST_URL = os.getenv("FLOWMESH_HOST_URL", "http://localhost:8000").rstrip("/") +API_KEY = os.getenv("FLOWMESH_API_KEY", "flm-ci-00000000000000000000000000000000") +TASK_YAML = os.getenv("TASK_YAML", str(_REPO_ROOT / "templates" / "echo_local.yaml")) +TIMEOUT = int(os.getenv("E2E_TIMEOUT_SEC", "120")) +POLL_INTERVAL = 3 + +HEADERS = {"Authorization": f"Bearer {API_KEY}"} + +# Skip the whole module when no host is configured — keeps the unit-test suite +# clean. The E2E CI job always sets FLOWMESH_HOST_URL explicitly. 
+pytestmark = pytest.mark.skipif( + os.getenv("FLOWMESH_HOST_URL") is None, + reason="requires a running FlowMesh host; set FLOWMESH_HOST_URL to enable", +) + + +def _wait_for_host(timeout: int = 60) -> None: + deadline = time.time() + timeout + while time.time() < deadline: + try: + r = requests.get(f"{HOST_URL}/healthz", timeout=3) + if r.status_code == 200: + print(f"[e2e] Host is up at {HOST_URL}") + return + except requests.RequestException: + pass + time.sleep(2) + pytest.fail(f"[e2e] Host did not become healthy within {timeout}s") + + +def _submit_workflow() -> tuple[str, str]: + """Submit workflow file, return (workflow_id, first_task_id). + + Files ending in .json are submitted as n8n format (Workflow-Format: n8n). + All other files are submitted as native YAML (text/plain). + """ + try: + with open(TASK_YAML) as f: + content = f.read() + except FileNotFoundError: + pytest.fail(f"[e2e] Task YAML not found: {TASK_YAML}") + + is_n8n = Path(TASK_YAML).suffix.lower() == ".json" + fmt_label = "n8n" if is_n8n else "native" + print(f"[e2e] Submitting {fmt_label} workflow from {TASK_YAML}") + + extra_headers: dict[str, str] = {} + if is_n8n: + extra_headers["Workflow-Format"] = "n8n" + extra_headers["Content-Type"] = "application/json" + else: + extra_headers["Content-Type"] = "text/plain" + + r = requests.post( + f"{HOST_URL}/api/v1/workflows", + data=content.encode("utf-8"), + headers={**HEADERS, **extra_headers}, + timeout=10, + ) + if r.status_code not in (200, 201): + pytest.fail(f"[e2e] Workflow submission failed {r.status_code}: {r.text}") + + body: dict[str, Any] = r.json() + workflow_id: str = body["workflow_id"] + task_id: str = body["tasks"][0]["task_id"] + print(f"[e2e] Submitted workflow {workflow_id}, task {task_id}") + return workflow_id, task_id + + +def _dump_task_logs(task_id: str) -> str: + """Print task logs to stderr and return them as a single string for matching.""" + try: + r = requests.get( + f"{HOST_URL}/api/v1/tasks/{task_id}/logs", + 
headers=HEADERS, + params={"limit": 100}, + timeout=5, + ) + if r.status_code == 200: + entries = r.json().get("entries") or r.json() + print(f"[e2e] === task logs for {task_id} ===", file=sys.stderr) + messages: list[str] = [] + for entry in entries if isinstance(entries, list) else []: + print(f" {entry}", file=sys.stderr) + msg = ( + entry.get("event", {}).get("message", "") + if isinstance(entry, dict) + else str(entry) + ) + if msg: + messages.append(msg) + return " ".join(messages) + else: + print( + f"[e2e] (could not fetch task logs: {r.status_code})", + file=sys.stderr, + ) + except Exception as exc: + print(f"[e2e] (error fetching task logs: {exc})", file=sys.stderr) + return "" + + +def _poll_task(task_id: str) -> dict[str, Any]: + deadline = time.time() + TIMEOUT + last_status = None + while time.time() < deadline: + r = requests.get( + f"{HOST_URL}/api/v1/tasks/{task_id}", + headers=HEADERS, + timeout=5, + ) + if r.status_code != 200: + print( + f"[e2e] WARNING: GET task returned {r.status_code}", + file=sys.stderr, + ) + time.sleep(POLL_INTERVAL) + continue + + task: dict[str, Any] = r.json() + status = task.get("status") + if status != last_status: + print(f"[e2e] Task {task_id}: {last_status} -> {status}") + last_status = status + + if status == "DONE": + return task + if status == "FAILED": + error = task.get("error") or "" + log_text = _dump_task_logs(task_id) + if _EXECUTOR_UNAVAILABLE_RE.search(error): + pytest.skip( + f"[e2e] Executor not available on this worker: {error}" + ) + # max_attempts_exceeded means the host retried until giving up. + # Inspect logs for the root cause; skip if the executor was + # unavailable (e.g. Docker socket missing for SSH executor). 
+ if error == "max_attempts_exceeded" and _EXECUTOR_UNAVAILABLE_RE.search( + log_text + ): + pytest.skip( + f"[e2e] Executor not available (retries exhausted): " + f"{log_text[:300]}" + ) + pytest.fail(f"[e2e] Task FAILED: {error}") + + time.sleep(POLL_INTERVAL) + + pytest.fail( + f"[e2e] Task {task_id} did not complete within {TIMEOUT}s" + f" (last status: {last_status})" + ) + + +def _assert_result(task: dict[str, Any]) -> None: + task_id: str = task["task_id"] + + assert task.get("status") == "DONE", f"Expected DONE, got {task.get('status')}" + + # Check the results endpoint — executor should have written responses.json + r = requests.get( + f"{HOST_URL}/api/v1/results/{task_id}", + headers=HEADERS, + timeout=5, + ) + if r.status_code == 200: + result: dict[str, Any] = r.json() + print(f"[e2e] Result OK: status={result.get('status')} task_id={task_id}") + if result.get("payload"): + print(f"[e2e] Executor output: {str(result['payload'])[:200]}") + elif r.status_code == 404: + # Echo tasks may not write a result file — DONE is sufficient + print(f"[e2e] No result record for {task_id} — DONE is sufficient") + else: + print( + f"[e2e] WARNING: results endpoint returned {r.status_code}", + file=sys.stderr, + ) + + +def test_workflow_runs_to_done() -> None: + """Submit a workflow and verify it reaches DONE status.""" + print(f"[e2e] FlowMesh E2E smoke test -> {HOST_URL}") + print(f"[e2e] Task YAML: {TASK_YAML}") + _wait_for_host() + _, task_id = _submit_workflow() + task = _poll_task(task_id) + _assert_result(task) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__, "-v", "-s", *sys.argv[1:]])) From f3325f353c4c5c88e94fc59b13fe6148ed760860 Mon Sep 17 00:00:00 2001 From: Qruixuan <154648498+Qruixuan@users.noreply.github.com> Date: Sun, 3 May 2026 19:11:20 +0800 Subject: [PATCH 12/17] fix: use host bind-mount for worker results to avoid _VolumeInitializer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 
server's _VolumeInitializer runs busybox:1.36.1 to chown the named Docker volume to UID 10001, but if busybox isn't cached it fails silently and marks the volume as initialized anyway — so all subsequent workers also get PermissionError writing to /var/lib/flowmesh-results. Fix: set RESULTS_DIR to an absolute host path. The docker adapter skips _VolumeInitializer for absolute paths (see _ensure_volume_access). Workers receive a bind-mount of a pre-created host dir with chmod 777, which UID 10001 (appuser) can write to without any chown step. - ci.compose.yml: RESULTS_DIR=/tmp/flowmesh-ci-results - ci.yml: mkdir + chmod 777 before 'docker compose up' in both jobs, rm -rf in teardown - run_local.sh: per-PID dir /tmp/flowmesh-ci-results-$PROJECT, overridden in compose overlay; cleaned up in teardown Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 12 ++++++++++++ docker/ci.compose.yml | 8 ++++++++ scripts/ci/run_local.sh | 32 +++++++++++++++++++++----------- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8bb9122..f5441eb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,6 +39,11 @@ jobs: df -h / docker system df + - name: Create worker results directory + run: | + mkdir -p /tmp/flowmesh-ci-results + chmod 777 /tmp/flowmesh-ci-results + - name: Build worker image run: | DOCKER_BUILDKIT=1 docker build \ @@ -161,6 +166,7 @@ jobs: docker rm -f ci-worker-cpu 2>/dev/null || true docker compose -p "$PROJECT" -f docker/ci.compose.yml down -v --remove-orphans docker rmi ci/flowmesh_worker:latest-cpu 2>/dev/null || true + rm -rf /tmp/flowmesh-ci-results 2>/dev/null || true docker image prune -f docker volume prune -f echo "=== Disk after teardown ===" @@ -197,6 +203,11 @@ jobs: df -h / docker system df + - name: Create worker results directory + run: | + mkdir -p /tmp/flowmesh-ci-results + chmod 777 /tmp/flowmesh-ci-results + - name: Build GPU 
worker builder image (cached by content hash) run: | BUILDER_HASH=$(cat \ @@ -446,6 +457,7 @@ jobs: | grep "^flowmesh-builder:" \ | grep -v ":${CURRENT_HASH}$" \ | xargs -r docker rmi 2>/dev/null || true + rm -rf /tmp/flowmesh-ci-results 2>/dev/null || true docker image prune -f docker volume prune -f echo "=== Disk after teardown ===" diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml index 448ca31..539ce1f 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -8,6 +8,10 @@ # They connect to gRPC at localhost:50051 and HTTP at http://localhost:8000. # Ports 8000 and 50051 MUST therefore be bound on the Docker host machine. # +# RESULTS_DIR is set to an absolute host path so workers can write results +# without relying on the _VolumeInitializer busybox chown mechanism. +# Caller must create /tmp/flowmesh-ci-results with chmod 777 before 'up'. +# # NOTE: No ports are exposed in this base file. Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml # - Fixed local (run_local.sh): generated at runtime @@ -59,6 +63,10 @@ services: FLOWMESH_REGISTRY: "ci" FLOWMESH_VERSION: "latest" WORKER_CONFIG_PATH: "/etc/flowmesh/worker_config.yaml" + # Absolute host path for worker results (chmod 777 before 'up'). + # Using an absolute path bypasses the _VolumeInitializer busybox chown + # so workers (UID 10001) can write without depending on image pulls. + RESULTS_DIR: "/tmp/flowmesh-ci-results" # Pass HuggingFace token through so workers can download gated models. 
HF_TOKEN: volumes: diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index bef117d..841409d 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -8,7 +8,7 @@ # - Server HTTP port is fixed at 8000 (workers need a known address) # - gRPC port 50051 is fixed (workers cannot follow a dynamic port) # - Worker container name is scoped to the process PID -# - Each run gets its own Docker network via compose project name +# - Each run gets its own Docker network and results directory # # IMPORTANT: Ports 8000 and 50051 must be free on your machine. # Workers are spawned with network_mode: host and connect to these @@ -48,6 +48,7 @@ WORKER_IMAGE_GPU="ci/flowmesh_worker:latest-gpu" WORKER_NAME="" _WORKER_CFG="" _COMPOSE_OVERRIDE="" +_RESULTS_DIR="" HOST_URL="http://localhost:8000" # ── Argument parsing ─────────────────────────────────────────────────────────── @@ -116,6 +117,7 @@ _teardown() { docker image prune -f >/dev/null docker volume prune -f >/dev/null rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true + rm -rf "${_RESULTS_DIR:-}" 2>/dev/null || true if [[ $code -eq 0 ]]; then ok "Local CI run PASSED" @@ -174,14 +176,21 @@ workers: EOF fi -# Compose override: both ports are fixed so workers (network_mode: host) -# can reach localhost:8000 (HTTP) and localhost:50051 (gRPC). -# FLOWMESH_BASE_URL in ci.compose.yml is http://localhost:8000 — this -# must match the HTTP port binding below. +# Per-run results dir: absolute host path so workers (UID 10001) can write +# without depending on _VolumeInitializer / busybox chown. +_RESULTS_DIR="/tmp/flowmesh-ci-results-${PROJECT}" +mkdir -p "$_RESULTS_DIR" +chmod 777 "$_RESULTS_DIR" + +# Compose override: fixed ports + per-run RESULTS_DIR override. +# RESULTS_DIR in ci.compose.yml defaults to /tmp/flowmesh-ci-results (CI); +# here we use a PID-scoped path so parallel local runs don't collide. 
_COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" < Date: Mon, 4 May 2026 00:45:24 +0800 Subject: [PATCH 13/17] fix: persist HF model cache across CI runs via host bind-mount docker volume prune -f (in pre-clean) deleted the named volume flowmesh_server_hf_cache between runs, forcing TinyLlama to be re-downloaded every time (~50s) and causing the 300s vLLM test to time out by a few seconds. Fix: set HF_CACHE_DIR to the host's ~/.cache/huggingface so workers receive a bind mount of an absolute path. _ensure_volume_access skips _VolumeInitializer for absolute paths; models downloaded on the first run persist for every subsequent run on the same machine. - ci.compose.yml: pass HF_CACHE_DIR through from compose env - run_local.sh: resolve _HF_CACHE_DIR (host ~/.cache/huggingface), mkdir+chmod 777, inject into compose override - ci.yml: set HF_CACHE_DIR=$HOME/.cache/huggingface in project-name step; mkdir+chmod 777 in setup step; pass to docker compose env Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 28 ++++++++++++++++------------ docker/ci.compose.yml | 10 ++++++++++ scripts/ci/run_local.sh | 16 +++++++++++++--- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f5441eb..8e9692c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,7 +24,9 @@ jobs: uses: actions/checkout@v6 - name: Set project name - run: echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" + run: | + echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" + echo "HF_CACHE_DIR=$HOME/.cache/huggingface" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -39,10 +41,12 @@ jobs: df -h / docker system df - - name: Create worker results directory + - name: Create CI directories run: | mkdir -p /tmp/flowmesh-ci-results chmod 777 /tmp/flowmesh-ci-results + mkdir -p "$HOME/.cache/huggingface" + 
chmod 777 "$HOME/.cache/huggingface" - name: Build worker image run: | @@ -60,6 +64,7 @@ jobs: env: DOCKER_BUILDKIT: "1" COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} + HF_CACHE_DIR: ${{ env.HF_CACHE_DIR }} - name: Wait for server to be healthy run: | @@ -132,10 +137,7 @@ jobs: echo "" echo "=== Result files written by worker ===" - docker run --rm \ - --volumes-from ci-worker-cpu \ - busybox find /var/lib/flowmesh-results -type f 2>/dev/null \ - | head -20 || echo "(could not inspect result volume)" + ls -la /tmp/flowmesh-ci-results/ 2>/dev/null | head -20 || echo "(result dir empty or missing)" - name: Collect logs on failure if: failure() @@ -188,7 +190,9 @@ jobs: uses: actions/checkout@v6 - name: Set project name - run: echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" + run: | + echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" + echo "HF_CACHE_DIR=$HOME/.cache/huggingface" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -203,10 +207,12 @@ jobs: df -h / docker system df - - name: Create worker results directory + - name: Create CI directories run: | mkdir -p /tmp/flowmesh-ci-results chmod 777 /tmp/flowmesh-ci-results + mkdir -p "$HOME/.cache/huggingface" + chmod 777 "$HOME/.cache/huggingface" - name: Build GPU worker builder image (cached by content hash) run: | @@ -246,6 +252,7 @@ jobs: DOCKER_BUILDKIT: "1" HF_TOKEN: ${{ secrets.HF_TOKEN }} COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} + HF_CACHE_DIR: ${{ env.HF_CACHE_DIR }} - name: Wait for server to be healthy run: | @@ -410,10 +417,7 @@ jobs: echo "" echo "=== Result files written by worker ===" - docker run --rm \ - --volumes-from ci-worker-gpu \ - busybox find /var/lib/flowmesh-results -type f 2>/dev/null \ - | head -20 || echo "(could not inspect result volume)" + ls -la /tmp/flowmesh-ci-results/ 2>/dev/null | head -20 || echo "(result dir empty or missing)" - name: Collect logs on failure if: failure() diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml 
index 539ce1f..9592041 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -12,6 +12,12 @@ # without relying on the _VolumeInitializer busybox chown mechanism. # Caller must create /tmp/flowmesh-ci-results with chmod 777 before 'up'. # +# HF_CACHE_DIR should be set to the host's ~/.cache/huggingface so that +# model weights survive 'docker volume prune' between runs. When set, +# _mount_hf_cache uses a bind mount (absolute path bypasses +# _VolumeInitializer). If unset the adapter falls back to the named +# volume flowmesh_server_hf_cache (fine for one-off runs). +# # NOTE: No ports are exposed in this base file. Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml # - Fixed local (run_local.sh): generated at runtime @@ -67,6 +73,10 @@ services: # Using an absolute path bypasses the _VolumeInitializer busybox chown # so workers (UID 10001) can write without depending on image pulls. RESULTS_DIR: "/tmp/flowmesh-ci-results" + # Host HF cache dir — pass from compose runtime env so model weights + # persist across runs and survive 'docker volume prune'. + # Unset → adapter falls back to named volume flowmesh_server_hf_cache. + HF_CACHE_DIR: # Pass HuggingFace token through so workers can download gated models. HF_TOKEN: volumes: diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index 841409d..a27b8f4 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -49,6 +49,7 @@ WORKER_NAME="" _WORKER_CFG="" _COMPOSE_OVERRIDE="" _RESULTS_DIR="" +_HF_CACHE_DIR="" HOST_URL="http://localhost:8000" # ── Argument parsing ─────────────────────────────────────────────────────────── @@ -118,6 +119,7 @@ _teardown() { docker volume prune -f >/dev/null rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true rm -rf "${_RESULTS_DIR:-}" 2>/dev/null || true + # _HF_CACHE_DIR is a persistent host path — intentionally NOT deleted. 
if [[ $code -eq 0 ]]; then ok "Local CI run PASSED" @@ -182,15 +184,22 @@ _RESULTS_DIR="/tmp/flowmesh-ci-results-${PROJECT}" mkdir -p "$_RESULTS_DIR" chmod 777 "$_RESULTS_DIR" -# Compose override: fixed ports + per-run RESULTS_DIR override. -# RESULTS_DIR in ci.compose.yml defaults to /tmp/flowmesh-ci-results (CI); -# here we use a PID-scoped path so parallel local runs don't collide. +# HF model cache: bind-mount the host path so downloaded model weights survive +# 'docker volume prune' between runs. The server passes HF_CACHE_DIR to each +# spawned worker; _mount_hf_cache uses a bind-mount for absolute paths +# (bypasses _VolumeInitializer). Falls back to named volume if unset. +_HF_CACHE_DIR="${HF_CACHE_DIR:-${HOME}/.cache/huggingface}" +mkdir -p "$_HF_CACHE_DIR" +chmod 777 "$_HF_CACHE_DIR" + +# Compose override: fixed ports + per-run RESULTS_DIR + persistent HF cache. _COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" < Date: Mon, 4 May 2026 01:07:24 +0800 Subject: [PATCH 14/17] fix: revert HF cache to named volume (same as FlowMesh_dev), bump vLLM timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HF_CACHE_DIR bind-mount was reverted — using the named Docker volume flowmesh_server_hf_cache (identical to FlowMesh_dev) avoids accumulating model weights on the host between CI runs; docker volume prune cleans it up. The timeout issue is fixed by bumping the GPU E2E timeout: cold-start (model download ~50s + load ~53s + compile ~17s + CUDA graphs) takes ~250s, leaving only ~50s for inference at the old 300s limit. 
- run_local.sh: GPU default timeout 300 → 600s - ci.yml: inference_vllm_tiny E2E_TIMEOUT_SEC 300 → 600s Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 20 +++++--------------- docker/ci.compose.yml | 10 ---------- scripts/ci/run_local.sh | 20 +++++--------------- 3 files changed, 10 insertions(+), 40 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8e9692c..f2e8a01 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,9 +24,7 @@ jobs: uses: actions/checkout@v6 - name: Set project name - run: | - echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" - echo "HF_CACHE_DIR=$HOME/.cache/huggingface" >> "$GITHUB_ENV" + run: echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -41,12 +39,10 @@ jobs: df -h / docker system df - - name: Create CI directories + - name: Create worker results directory run: | mkdir -p /tmp/flowmesh-ci-results chmod 777 /tmp/flowmesh-ci-results - mkdir -p "$HOME/.cache/huggingface" - chmod 777 "$HOME/.cache/huggingface" - name: Build worker image run: | @@ -64,7 +60,6 @@ jobs: env: DOCKER_BUILDKIT: "1" COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} - HF_CACHE_DIR: ${{ env.HF_CACHE_DIR }} - name: Wait for server to be healthy run: | @@ -190,9 +185,7 @@ jobs: uses: actions/checkout@v6 - name: Set project name - run: | - echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" - echo "HF_CACHE_DIR=$HOME/.cache/huggingface" >> "$GITHUB_ENV" + run: echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -207,12 +200,10 @@ jobs: df -h / docker system df - - name: Create CI directories + - name: Create worker results directory run: | mkdir -p /tmp/flowmesh-ci-results chmod 777 /tmp/flowmesh-ci-results - mkdir -p "$HOME/.cache/huggingface" - chmod 777 "$HOME/.cache/huggingface" - name: Build GPU worker builder image (cached 
by content hash) run: | @@ -252,7 +243,6 @@ jobs: DOCKER_BUILDKIT: "1" HF_TOKEN: ${{ secrets.HF_TOKEN }} COMPOSE_PROJECT_NAME: ${{ env.PROJECT }} - HF_CACHE_DIR: ${{ env.HF_CACHE_DIR }} - name: Wait for server to be healthy run: | @@ -290,7 +280,7 @@ jobs: -e FLOWMESH_HOST_URL="http://server:8000" \ -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_vllm_tiny.yaml" \ - -e E2E_TIMEOUT_SEC="300" \ + -e E2E_TIMEOUT_SEC="600" \ -v "${{ github.workspace }}/tests:/tests:ro" \ -v "${{ github.workspace }}/templates:/templates:ro" \ python:3.11-slim \ diff --git a/docker/ci.compose.yml b/docker/ci.compose.yml index 9592041..539ce1f 100644 --- a/docker/ci.compose.yml +++ b/docker/ci.compose.yml @@ -12,12 +12,6 @@ # without relying on the _VolumeInitializer busybox chown mechanism. # Caller must create /tmp/flowmesh-ci-results with chmod 777 before 'up'. # -# HF_CACHE_DIR should be set to the host's ~/.cache/huggingface so that -# model weights survive 'docker volume prune' between runs. When set, -# _mount_hf_cache uses a bind mount (absolute path bypasses -# _VolumeInitializer). If unset the adapter falls back to the named -# volume flowmesh_server_hf_cache (fine for one-off runs). -# # NOTE: No ports are exposed in this base file. Add ports via an overlay: # - Fixed (GitHub Actions / bare docker compose): docker/ci.ports.fixed.yml # - Fixed local (run_local.sh): generated at runtime @@ -73,10 +67,6 @@ services: # Using an absolute path bypasses the _VolumeInitializer busybox chown # so workers (UID 10001) can write without depending on image pulls. RESULTS_DIR: "/tmp/flowmesh-ci-results" - # Host HF cache dir — pass from compose runtime env so model weights - # persist across runs and survive 'docker volume prune'. - # Unset → adapter falls back to named volume flowmesh_server_hf_cache. - HF_CACHE_DIR: # Pass HuggingFace token through so workers can download gated models. 
HF_TOKEN: volumes: diff --git a/scripts/ci/run_local.sh b/scripts/ci/run_local.sh index a27b8f4..3d46f9a 100644 --- a/scripts/ci/run_local.sh +++ b/scripts/ci/run_local.sh @@ -20,7 +20,7 @@ # Options: # --gpu Run the GPU smoke test instead of the CPU integration test # --task-yaml PATH Override the workflow YAML submitted to the server -# --timeout SEC Override E2E wait timeout (default: 120, GPU default: 300) +# --timeout SEC Override E2E wait timeout (default: 120, GPU default: 600) # --no-clean Skip the pre-run docker prune step # --no-build Skip rebuilding the worker image (use cached) # --keep Do not tear down services after the run @@ -49,7 +49,6 @@ WORKER_NAME="" _WORKER_CFG="" _COMPOSE_OVERRIDE="" _RESULTS_DIR="" -_HF_CACHE_DIR="" HOST_URL="http://localhost:8000" # ── Argument parsing ─────────────────────────────────────────────────────────── @@ -119,7 +118,6 @@ _teardown() { docker volume prune -f >/dev/null rm -f "${_WORKER_CFG:-}" "${_COMPOSE_OVERRIDE:-}" 2>/dev/null || true rm -rf "${_RESULTS_DIR:-}" 2>/dev/null || true - # _HF_CACHE_DIR is a persistent host path — intentionally NOT deleted. if [[ $code -eq 0 ]]; then ok "Local CI run PASSED" @@ -134,7 +132,7 @@ if $GPU; then WORKER_NAME="ci-worker-gpu-$$" WORKER_IMAGE="$WORKER_IMAGE_GPU" WORKER_DOCKERFILE="src/worker/docker/Dockerfile.cuda" - [[ -z "$TIMEOUT" ]] && TIMEOUT=300 + [[ -z "$TIMEOUT" ]] && TIMEOUT=600 if [[ -n "$TASK_YAML" ]]; then GPU_TASK_YAMLS=("$TASK_YAML") else @@ -184,22 +182,15 @@ _RESULTS_DIR="/tmp/flowmesh-ci-results-${PROJECT}" mkdir -p "$_RESULTS_DIR" chmod 777 "$_RESULTS_DIR" -# HF model cache: bind-mount the host path so downloaded model weights survive -# 'docker volume prune' between runs. The server passes HF_CACHE_DIR to each -# spawned worker; _mount_hf_cache uses a bind-mount for absolute paths -# (bypasses _VolumeInitializer). Falls back to named volume if unset. 
-_HF_CACHE_DIR="${HF_CACHE_DIR:-${HOME}/.cache/huggingface}" -mkdir -p "$_HF_CACHE_DIR" -chmod 777 "$_HF_CACHE_DIR" - -# Compose override: fixed ports + per-run RESULTS_DIR + persistent HF cache. +# Compose override: fixed ports + per-run RESULTS_DIR override. +# RESULTS_DIR in ci.compose.yml defaults to /tmp/flowmesh-ci-results (CI); +# here we use a PID-scoped path so parallel local runs don't collide. _COMPOSE_OVERRIDE="$(mktemp /tmp/ci-compose-override-XXXXXX.yml)" cat > "$_COMPOSE_OVERRIDE" < Date: Sun, 3 May 2026 17:22:39 +0000 Subject: [PATCH 15/17] fix: replace gated llama model with open Qwen model in dag_inference.json meta-llama/Llama-3.2-1B-Instruct requires HF_TOKEN; use the non-gated Qwen/Qwen2.5-0.5B-Instruct instead, matching FlowMesh_dev's fix. Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- templates/n8n/dag_inference.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/templates/n8n/dag_inference.json b/templates/n8n/dag_inference.json index cd6f75b..05faf83 100644 --- a/templates/n8n/dag_inference.json +++ b/templates/n8n/dag_inference.json @@ -17,7 +17,7 @@ }, { "parameters": { - "model": "meta-llama/Llama-3.2-1B-Instruct", + "model": "Qwen/Qwen2.5-0.5B-Instruct", "options": { "maxTokens": 128, "temperature": 1, @@ -136,7 +136,7 @@ }, { "parameters": { - "model": "meta-llama/Llama-3.2-1B-Instruct", + "model": "Qwen/Qwen2.5-0.5B-Instruct", "options": { "maxTokens": 128, "temperature": 1, @@ -236,7 +236,7 @@ }, { "parameters": { - "model": "meta-llama/Llama-3.2-1B-Instruct", + "model": "Qwen/Qwen2.5-0.5B-Instruct", "options": { "maxTokens": 128, "temperature": 1, From 344d783ac5c603b6e468e7962b4c7905f0bb474b Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 12:12:52 +0000 Subject: [PATCH 16/17] fix: harden ci.yml for zizmor pedantic audit - Pin actions/checkout and actions/upload-artifact to commit SHAs - Add persist-credentials: false to all checkout steps - Add top-level permissions: contents: read - 
Move github.workspace and github.run_id out of run: blocks into step-level env: to eliminate template-expansion injection warnings Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- .github/workflows/ci.yml | 97 +++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2e8a01..ffc9a10 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,9 @@ on: branches: [main] workflow_dispatch: +permissions: + contents: read + concurrency: group: ci-${{ github.ref }} cancel-in-progress: true @@ -21,10 +24,14 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Set project name - run: echo "PROJECT=ci-${{ github.run_id }}-integ" >> "$GITHUB_ENV" + env: + RUN_ID: ${{ github.run_id }} + run: echo "PROJECT=ci-${RUN_ID}-integ" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -103,6 +110,8 @@ jobs: exit 1 - name: Run E2E smoke test (echo task) + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -110,19 +119,21 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/echo_local.yaml" \ -e E2E_TIMEOUT_SEC="120" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: Verify CPU worker actually executed the task + env: + RUN_ID: ${{ github.run_id }} run: | echo "=== CPU worker logs (full) ===" - docker logs ci-worker-cpu 2>&1 | tee /tmp/worker-cpu-${{ github.run_id }}.log || true + docker logs ci-worker-cpu 2>&1 | tee "/tmp/worker-cpu-${RUN_ID}.log" || true echo "" echo "=== Execution evidence 
check ===" - LOG=/tmp/worker-cpu-${{ github.run_id }}.log + LOG="/tmp/worker-cpu-${RUN_ID}.log" if grep -qiE "executor|running task|dispatched|echo|succeeded|TASK_SUCCEEDED|done" "$LOG"; then echo "✓ Worker executed and completed the task" else @@ -136,13 +147,15 @@ jobs: - name: Collect logs on failure if: failure() + env: + RUN_ID: ${{ github.run_id }} run: | docker compose -p "$PROJECT" -f docker/ci.compose.yml logs --no-color \ - > /tmp/ci-logs-${{ github.run_id }}.txt 2>&1 || true + > "/tmp/ci-logs-${RUN_ID}.txt" 2>&1 || true - name: Upload logs on failure if: failure() - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: name: ci-logs-integ-${{ github.run_id }} path: /tmp/ci-logs-${{ github.run_id }}.txt @@ -182,10 +195,14 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false - name: Set project name - run: echo "PROJECT=ci-${{ github.run_id }}-gpu" >> "$GITHUB_ENV" + env: + RUN_ID: ${{ github.run_id }} + run: echo "PROJECT=ci-${RUN_ID}-gpu" >> "$GITHUB_ENV" - name: Pre-clean stale worker containers and disk run: | @@ -274,6 +291,8 @@ jobs: exit 1 - name: "E2E: vLLM inference (TinyLlama-1.1B)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -281,12 +300,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_vllm_tiny.yaml" \ -e E2E_TIMEOUT_SEC="600" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: 3-node fan-in graph DAG (echo executor)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ 
-294,12 +315,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/echo_three_node_graph.yaml" \ -e E2E_TIMEOUT_SEC="120" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: parallel DAG with synthesis (vLLM, graph_template)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -307,12 +330,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/dag_inference_example.yaml" \ -e E2E_TIMEOUT_SEC="600" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: conditional task skip (echo executor)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -320,12 +345,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/conditional_echo_test.yaml" \ -e E2E_TIMEOUT_SEC="120" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: HF Transformers inference (tiny-gpt2)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -333,12 +360,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/inference_hf_tiny.yaml" \ -e E2E_TIMEOUT_SEC="300" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - 
-v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: LoRA SFT fine-tuning (TinyLlama-1.1B, gsm8k 2%)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -346,12 +375,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/lora_sft_llama.yaml" \ -e E2E_TIMEOUT_SEC="1200" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: SSH non-interactive (python:3.12-slim container)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -359,12 +390,14 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/ssh_noninteractive.yaml" \ -e E2E_TIMEOUT_SEC="120" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q && pytest /tests/integration/test_e2e.py -v" - name: "E2E: n8n parallel DAG inference (dag_inference.json)" + env: + WORKSPACE: ${{ github.workspace }} run: | docker run --rm \ --network "${PROJECT}_ci-net" \ @@ -372,19 +405,21 @@ jobs: -e FLOWMESH_API_KEY="${FLOWMESH_API_KEY}" \ -e TASK_YAML="/templates/n8n/dag_inference.json" \ -e E2E_TIMEOUT_SEC="600" \ - -v "${{ github.workspace }}/tests:/tests:ro" \ - -v "${{ github.workspace }}/templates:/templates:ro" \ + -v "$WORKSPACE/tests:/tests:ro" \ + -v "$WORKSPACE/templates:/templates:ro" \ python:3.11-slim \ sh -c "pip install requests pytest -q 
&& pytest /tests/integration/test_e2e.py -v" - name: Verify GPU worker actually executed the task + env: + RUN_ID: ${{ github.run_id }} run: | echo "=== GPU worker logs (full) ===" - docker logs ci-worker-gpu 2>&1 | tee /tmp/worker-gpu-${{ github.run_id }}.log || true + docker logs ci-worker-gpu 2>&1 | tee "/tmp/worker-gpu-${RUN_ID}.log" || true echo "" echo "=== Execution evidence check ===" - LOG=/tmp/worker-gpu-${{ github.run_id }}.log + LOG="/tmp/worker-gpu-${RUN_ID}.log" if grep -qiE "executor|running task|dispatched|inference|model" "$LOG"; then echo "✓ Worker received and processed a task" @@ -411,15 +446,17 @@ jobs: - name: Collect logs on failure if: failure() + env: + RUN_ID: ${{ github.run_id }} run: | docker compose -p "$PROJECT" \ -f docker/ci.compose.yml \ -f docker/ci.worker.gpu.yml \ - logs --no-color > /tmp/ci-gpu-logs-${{ github.run_id }}.txt 2>&1 || true + logs --no-color > "/tmp/ci-gpu-logs-${RUN_ID}.txt" 2>&1 || true - name: Upload logs on failure if: failure() - uses: actions/upload-artifact@v7 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: name: ci-logs-gpu-${{ github.run_id }} path: /tmp/ci-gpu-logs-${{ github.run_id }}.txt From e6af60130e71722d0cc11caed48efb79fbb08d00 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 4 May 2026 12:19:53 +0000 Subject: [PATCH 17/17] style: apply isort and black fixes Signed-off-by: Qruixuan <121090450@link.cuhk.edu.cn> --- src/server/supervisor/adapters/docker.py | 2 +- src/server/utils/helpers.py | 3 ++- src/worker/executors/ssh_executor.py | 10 ++++++---- src/worker/executors/transformers_executor.py | 4 +++- tests/integration/test_e2e.py | 4 +--- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/server/supervisor/adapters/docker.py b/src/server/supervisor/adapters/docker.py index 1f5af9b..efe2fca 100644 --- a/src/server/supervisor/adapters/docker.py +++ b/src/server/supervisor/adapters/docker.py @@ -7,12 +7,12 @@ from enum import StrEnum from typing 
import Any -from docker import DockerClient from docker.errors import NotFound from docker.models.containers import Container from docker.types import DeviceRequest from pydantic import BaseModel, Field +from docker import DockerClient from shared.utils.docker import sanitize_container_name from ... import env diff --git a/src/server/utils/helpers.py b/src/server/utils/helpers.py index ca92caf..cfe6a30 100644 --- a/src/server/utils/helpers.py +++ b/src/server/utils/helpers.py @@ -10,10 +10,11 @@ from typing import Any import aiohttp -import docker import requests from redis.client import PubSub +import docker + _logger: logging.Logger | None = None _docker_client: docker.DockerClient | None = None diff --git a/src/worker/executors/ssh_executor.py b/src/worker/executors/ssh_executor.py index 1a088e2..cdd423b 100644 --- a/src/worker/executors/ssh_executor.py +++ b/src/worker/executors/ssh_executor.py @@ -50,19 +50,21 @@ ) try: - import docker - from docker import DockerClient from docker.models.containers import Container from docker.types import DeviceRequest + import docker + from docker import DockerClient + _HAS_DOCKER = True except Exception: _HAS_DOCKER = False if TYPE_CHECKING: - import docker - from docker import DockerClient from docker.models.containers import Container from docker.types import DeviceRequest + + import docker + from docker import DockerClient else: docker = None DockerClient = Any diff --git a/src/worker/executors/transformers_executor.py b/src/worker/executors/transformers_executor.py index 04fd084..b959a82 100644 --- a/src/worker/executors/transformers_executor.py +++ b/src/worker/executors/transformers_executor.py @@ -113,7 +113,9 @@ from transformers import PreTrainedModel, PreTrainedTokenizerBase except ImportError: try: - from transformers.modeling_utils import PreTrainedModel # type: ignore[assignment] + from transformers.modeling_utils import ( + PreTrainedModel, # type: ignore[assignment] + ) from 
transformers.tokenization_utils_base import ( # type: ignore[assignment] PreTrainedTokenizerBase, ) diff --git a/tests/integration/test_e2e.py b/tests/integration/test_e2e.py index 5c22f0d..6a1c8f8 100644 --- a/tests/integration/test_e2e.py +++ b/tests/integration/test_e2e.py @@ -167,9 +167,7 @@ def _poll_task(task_id: str) -> dict[str, Any]: error = task.get("error") or "" log_text = _dump_task_logs(task_id) if _EXECUTOR_UNAVAILABLE_RE.search(error): - pytest.skip( - f"[e2e] Executor not available on this worker: {error}" - ) + pytest.skip(f"[e2e] Executor not available on this worker: {error}") # max_attempts_exceeded means the host retried until giving up. # Inspect logs for the root cause; skip if the executor was # unavailable (e.g. Docker socket missing for SSH executor).