Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/security.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ jobs:
run: |
grep -v '@ git+' src/server/requirements.txt > /tmp/requirements-server-audit.txt
uvx pip-audit==2.9.0 --strict \
--ignore-vuln PYSEC-2026-161 \
-r /tmp/requirements-server-audit.txt
- name: Run pip-audit (worker CPU)
# Drop ``@ git+`` deps before auditing — not on PyPI, no CVE feed.
Expand All @@ -105,6 +106,7 @@ jobs:
--ignore-vuln GHSA-vfmq-68hx-4jfw \
--ignore-vuln GHSA-j7w6-vpvq-j3gm \
--ignore-vuln GHSA-98h9-4798-4q5v \
--ignore-vuln GHSA-7wx4-6vff-v64p \
--ignore-vuln PYSEC-2025-189 \
--ignore-vuln PYSEC-2025-190 \
--ignore-vuln PYSEC-2025-191 \
Expand Down Expand Up @@ -152,6 +154,7 @@ jobs:
--ignore-vuln GHSA-83vm-p52w-f9pw \
--ignore-vuln GHSA-j7w6-vpvq-j3gm \
--ignore-vuln GHSA-98h9-4798-4q5v \
--ignore-vuln GHSA-7wx4-6vff-v64p \
--ignore-vuln PYSEC-2025-189 \
--ignore-vuln PYSEC-2025-190 \
--ignore-vuln PYSEC-2025-191 \
Expand All @@ -176,4 +179,5 @@ jobs:
--ignore-vuln PYSEC-2024-277 \
--ignore-vuln PYSEC-2025-222 \
--ignore-vuln PYSEC-2024-274 \
--ignore-vuln PYSEC-2026-161 \
-r src/worker/requirements/requirements.gpu.txt
12 changes: 8 additions & 4 deletions cli/stack/src/flowmesh_cli_stack/assets/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -173,11 +173,15 @@ NEBULA_API_TOKEN=

# ==== External Plugins ====
# Plugins are Python packages dropped under FLOWMESH_PLUGIN_DIR
# (host-mounted to /app/plugins on the server) and selected by
# FLOWMESH_PLUGINS as a comma-separated list of top-level module
# names. Each named module must expose `install()` returning a
# `HookBindings`. Leave both empty unless you ship a plugin.
# (read-only at /app/plugins) and selected by FLOWMESH_PLUGINS as
# a comma-separated list of top-level module names. Each must
# expose `install()` returning a `HookBindings`.
# FLOWMESH_PLUGIN_DATA_DIR is writable at /app/plugin-data for
# plugin state. Leave all empty unless you ship a plugin.
FLOWMESH_PLUGIN_DIR=./plugins
# A path (`./x`, `/abs/x`) -> host bind-mount (auto-created).
# A bare name -> external Docker volume of that name.
FLOWMESH_PLUGIN_DATA_DIR=./plugin-data
FLOWMESH_PLUGINS=

# ==== Agent Executor (youtu-agent / utu) ====
Expand Down
4 changes: 4 additions & 0 deletions cli/stack/src/flowmesh_cli_stack/assets/compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ services:
- /var/run/docker.sock:/var/run/docker.sock
- ${SERVER_WORKER_CONFIG:-./configs/worker_config.yaml}:/etc/flowmesh/worker_config.yaml:ro
- ${FLOWMESH_PLUGIN_DIR:-./plugins}:/app/plugins:ro
- ${FLOWMESH_PLUGIN_DATA_DIR:-./plugin-data}:/app/plugin-data
- ${REDIS_TLS_DIR:-./secrets/tls/redis}:/etc/ssl/redis:ro
- ${SERVER_TLS_DIR:-./secrets/tls/server}:/etc/ssl/server:ro
restart: unless-stopped
Expand Down Expand Up @@ -199,3 +200,6 @@ volumes:
name: ${FLOWMESH_STACK_SLUG:-flowmesh_node}_metrics
flowmesh_server_logs:
name: ${FLOWMESH_STACK_SLUG:-flowmesh_node}_server_logs
flowmesh_plugin_data:
external: true
name: ${FLOWMESH_PLUGIN_DATA_VOLUME:-${FLOWMESH_STACK_SLUG:-flowmesh_node}_plugin_data}
18 changes: 14 additions & 4 deletions cli/stack/src/flowmesh_cli_stack/env_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,10 +530,11 @@
title="External Plugins",
description=[
"Plugins are Python packages dropped under FLOWMESH_PLUGIN_DIR ",
"(host-mounted to /app/plugins on the server) and selected by ",
"FLOWMESH_PLUGINS as a comma-separated list of top-level module ",
"names. Each named module must expose `install()` returning a ",
"`HookBindings`. Leave both empty unless you ship a plugin.",
"(read-only at /app/plugins) and selected by FLOWMESH_PLUGINS as ",
"a comma-separated list of top-level module names. Each must ",
"expose `install()` returning a `HookBindings`. ",
"FLOWMESH_PLUGIN_DATA_DIR is writable at /app/plugin-data for ",
"plugin state. Leave all empty unless you ship a plugin.",
],
vars=[
EnvVar(
Expand All @@ -543,6 +544,15 @@
use_default=True,
ensure_path="create",
),
EnvVar(
"FLOWMESH_PLUGIN_DATA_DIR",
"./plugin-data",
use_default=True,
description=[
"A path (`./x`, `/abs/x`) -> host bind-mount (auto-created).",
"A bare name -> external Docker volume of that name.",
],
),
EnvVar("FLOWMESH_PLUGINS", ""),
],
),
Expand Down
2 changes: 2 additions & 0 deletions cli/stack/src/flowmesh_cli_stack/stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from .utils import (
DEFAULT_ENV_FILE,
STACK_PATH_KEYS,
apply_plugin_data_env,
apply_stack_resource_env,
ensure_deploy_paths,
parse_node_role,
Expand All @@ -58,6 +59,7 @@ def _load(env_file: Path) -> None:
except ValueError as exc:
logging.error(str(exc))
raise typer.Exit(code=1)
apply_plugin_data_env(Path.cwd())

return DockerComposeStack(
compose_file=stack_compose_file(),
Expand Down
24 changes: 24 additions & 0 deletions cli/stack/src/flowmesh_cli_stack/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,22 @@ def apply_stack_resource_env() -> None:
os.environ[WORKER_RESULTS_DIR_ENV] = results_volume


_PLUGIN_DATA_PATH_PREFIXES = ("/", "./", "../", "~")
_PLUGIN_DATA_ALIAS = "flowmesh_plugin_data"
_PLUGIN_DATA_DEFAULT = "./plugin-data"
_PLUGIN_DATA_VOLUME_ENV = "FLOWMESH_PLUGIN_DATA_VOLUME"


def apply_plugin_data_env(base_dir: Path) -> None:
raw = os.environ.get("FLOWMESH_PLUGIN_DATA_DIR", "").strip()
if not raw or raw.startswith(_PLUGIN_DATA_PATH_PREFIXES):
resolved = resolve_path(raw, default=_PLUGIN_DATA_DEFAULT, base_dir=base_dir)
os.environ["FLOWMESH_PLUGIN_DATA_DIR"] = resolved.as_posix()
else:
os.environ[_PLUGIN_DATA_VOLUME_ENV] = raw
os.environ["FLOWMESH_PLUGIN_DATA_DIR"] = _PLUGIN_DATA_ALIAS


def stack_compose_file() -> Path:
return asset_path("flowmesh_cli_stack.assets", "compose.yml")

Expand Down Expand Up @@ -118,6 +134,14 @@ def ensure_deploy_paths(base_dir: Path) -> None:
base_dir=base_dir,
)
)
if not os.environ.get(_PLUGIN_DATA_VOLUME_ENV):
ensure_dir(
resolve_path(
os.getenv("FLOWMESH_PLUGIN_DATA_DIR", ""),
default=_PLUGIN_DATA_DEFAULT,
base_dir=base_dir,
)
)


def parse_node_role(raw: str) -> NodeRole:
Expand Down
2 changes: 2 additions & 0 deletions docs/CODE_STYLE.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ as `--ignore-vuln` flags in `.github/workflows/security.yml`.
| GHSA-w8v5-vhqr-4h9v | diskcache | (none) | upstream unmaintained, no fixed version published |
| GHSA-j7w6-vpvq-j3gm | diffusers | 0.38.0 | fix requires safetensors>=0.8.0rc0 pre-release; uv lock won't pick up pre-releases without explicit opt-in |
| GHSA-98h9-4798-4q5v | diffusers | 0.38.0 | same blocker as GHSA-j7w6-vpvq-j3gm — both fixed in 0.38.0 |
| GHSA-7wx4-6vff-v64p | diffusers | 0.38.0 | same blocker as GHSA-j7w6-vpvq-j3gm — fixed in 0.38.0 |
| PYSEC-2025-189 | torch | (none) | no fix version published |
| PYSEC-2025-190 | torch | (none) | same |
| PYSEC-2025-191 | torch | (none) | same |
Expand All @@ -141,6 +142,7 @@ as `--ignore-vuln` flags in `.github/workflows/security.yml`.
| PYSEC-2024-277 | joblib | (none) | no fix version published |
| PYSEC-2025-222 | vllm | (none) | no fix version published; held by vllm-omni 0.18 pin |
| PYSEC-2024-274 | gradio | (none) | no fix version published; vllm-omni 0.18 pins gradio==5.50 |
| PYSEC-2026-161 | starlette | 1.0.1 | gradio 5.50 caps starlette<1.0 (transitive via vllm-omni 0.18) |

When a blocker lifts (e.g. transformers 5 ↔ vllm 0.19 line stabilizes),
drop the corresponding `--ignore-vuln` flag from the workflow and the
Expand Down
1 change: 1 addition & 0 deletions docs/ENV.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ listed here is in `.env.example`.
| `ENABLE_WORKER_WATCHDOG` | `true` | Worker death detection |
| `WORKER_DEATH_GRACE_SEC` | `60` | Grace period before marking dead |
| `FLOWMESH_PLUGINS` | – | Comma-separated plugin module names |
| `FLOWMESH_PLUGIN_DATA_DIR` | `./plugin-data` | Writable mount at `/app/plugin-data` for plugin state. A path -> host bind-mount (auto-created); a bare name -> external Docker volume of that name. |
| `SERVER_CUDA_PROBE_IMAGE` | `nvidia/cuda:12.9.1-base-ubuntu24.04` | CUDA image the server runs briefly to query local GPU names/indices |
| `DOCKER_GPU_RUNTIME` | nvidia | Optional Docker runtime name for GPU probe/worker containers; leave empty unless the host requires a named runtime such as `nvidia` |
| `FLOWMESH_API_KEY` | – | Forwarded to spawned workers as their server-callback bearer |
Expand Down
17 changes: 16 additions & 1 deletion docs/PLUGINS.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,17 @@ The hooks:
tables so subsequent `PermissionChecker` calls have data to decide
on. `RESULT` ownership is inferred from the owning task; `RESULT`
permission checks are always paired with a `task_id`, and
workflow-level operations check `WORKFLOW`.
workflow-level operations check `WORKFLOW`. At startup the server
runs a reconcile sweep — after plugins load, the system principal
resolves, and the supervisor handshake completes — enumerating
every live workflow, task, worker, and node and calling
`reconcile(resources, logger)` once per registrar with the full
batch. `reconcile` is atomic per the `lumid-hooks` contract: on
failure the registrar's store is unchanged, so the server logs
the exception and moves on without risk of a partial wipe.
Persistent registrars use this call to drop records for resources
the server no longer knows about — stateless registrars implement
it as a no-op.

The shared protocols treat `kind` and `action` as plain strings —
`lumid-hooks` does not enumerate kinds. FlowMesh layers the
Expand Down Expand Up @@ -134,6 +144,11 @@ Each subdirectory of `FLOWMESH_PLUGIN_DIR` is importable as a
top-level module. The mount is read-only, so the plugin code is
treated as static deployment artifact.

For writable persistence, `FLOWMESH_PLUGIN_DATA_DIR` (default
`./plugin-data`) is mounted read-write at `/app/plugin-data`. A path
value is a host bind-mount (auto-created on `stack up`); a bare name
is an external Docker volume of that name.

This handles plugin **code** without rebuilding the server image.
When that isn't enough, build a thin overlay on top of the prebuilt
image. Two patterns, pick by need:
Expand Down
4 changes: 3 additions & 1 deletion hook/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
FlowMesh-specific plugin extension surface. Carries the pieces FlowMesh adds
on top of [`lumid-hooks`](https://github.com/mlsys-io/lumid.hooks):

- `HookBindings` — concrete dataclass with FlowMesh's six fields (the five
- `HookBindings` — runtime-checkable Protocol extending the shared one with
`supplier_resolvers`, used by the server's plugin gate.
- `BaseBindings` — frozen dataclass with FlowMesh's six fields (the five
shared from `lumid-hooks` plus `supplier_resolvers`).
- `ResourceKind` / `ResourceAction` — FlowMesh resource and action enums.
- `SupplierResolver` / `WorkerView` — supplier attribution at dispatch time.
Expand Down
2 changes: 1 addition & 1 deletion hook/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ requires-python = ">=3.12"
license = "Apache-2.0"
license-files = ["LICENSE"]
dependencies = [
"lumid-hooks>=0.1.0",
"lumid-hooks>=0.2.0",
]

[tool.setuptools]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ runtime-server = [
"flowmesh-hook",
"grpcio>=1.76.0",
"httpx>=0.28.1",
"lumid-hooks>=0.1.0",
"lumid-hooks>=0.2.0",
"protobuf>=5.29.6",
"pydantic>=2.12.3",
"python-multipart>=0.0.26",
Expand Down
2 changes: 2 additions & 0 deletions src/server/auth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
authenticate_websocket,
default_principal,
deregister_resource,
reconcile_resources,
register_resource,
require_permission,
resolve_accessible_ids,
Expand All @@ -18,6 +19,7 @@
"authenticate_websocket",
"default_principal",
"deregister_resource",
"reconcile_resources",
"register_resource",
"require_permission",
"resolve_accessible_ids",
Expand Down
26 changes: 25 additions & 1 deletion src/server/auth/security.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"""

import logging
from collections.abc import Mapping
from collections.abc import Iterable, Mapping
from typing import Any

from fastapi import HTTPException, WebSocket, WebSocketException, status
Expand Down Expand Up @@ -205,3 +205,27 @@ async def deregister_resource(
resource = ResourceRef(kind=resource_kind.value, id=resource_id)
for registrar in RESOURCE_REGISTRARS:
await registrar.deregister(principal, resource, logger)


async def reconcile_resources(
resources: Iterable[ResourceRef],
logger: logging.Logger,
) -> None:
"""Tell every registered `ResourceRegistrar` to replace its stored live
set with `resources`.

Called once during startup reconcile with every live workflow / task /
node / worker. Each registrar's `reconcile` is atomic — on failure the
registrar's store is unchanged — so a raised exception is logged and
the sweep continues with the next registrar. The failing registrar
retries next boot.
"""
refs = list(resources)
for registrar in RESOURCE_REGISTRARS:
try:
await registrar.reconcile(refs, logger)
except Exception:
logger.exception(
"ResourceRegistrar %s.reconcile failed; store left untouched.",
registrar.name,
)
39 changes: 37 additions & 2 deletions src/server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

import uvicorn
from fastapi import FastAPI
from lumid_hooks import HookBindings
from flowmesh_hook import ResourceKind
from lumid_hooks import HookBindings, ResourceRef

if __name__ == "__main__" and __package__ is None:
import sys
Expand All @@ -19,7 +20,7 @@

from shared._version import FLOWMESH_RELEASE_VERSION

from .auth import resolve_system_principal
from .auth import reconcile_resources, resolve_system_principal
from .clients import RedisClient
from .config import NodeRole, ServerConfig
from .dispatcher.factory import create_dispatcher
Expand Down Expand Up @@ -305,6 +306,35 @@ async def _load_plugins(stack: AsyncExitStack) -> None:
register(bindings)


async def _reconcile_resources() -> None:
"""Refresh registrar-tracked records for every live resource, then purge
anything the sweep didn't touch. Runs once at startup after plugins load
so registrars don't drop grants on resources that outlived their TTL.
"""
refs: list[ResourceRef] = []

# Nodes (always present).
for node in await NODE_REGISTRY.list_nodes_async():
refs.append(ResourceRef(kind=ResourceKind.NODE.value, id=node.id))

# Workers and workflows live on the root node.
if WORKER_REGISTRY is not None:
for worker in await WORKER_REGISTRY.list_workers_async():
refs.append(ResourceRef(kind=ResourceKind.WORKER.value, id=worker.id))
if WORKFLOW_REGISTRY is not None:
workflow_ids = await WORKFLOW_REGISTRY.get_workflow_ids_async()
for workflow_id in workflow_ids:
record = await WORKFLOW_REGISTRY.get_workflow_record_async(workflow_id)
if record is None:
continue
refs.append(ResourceRef(kind=ResourceKind.WORKFLOW.value, id=workflow_id))
for task_id in record.task_ids:
refs.append(ResourceRef(kind=ResourceKind.TASK.value, id=task_id))

logger.info("Startup reconcile: %d live resource(s)", len(refs))
await reconcile_resources(refs, logger)


@asynccontextmanager
async def _lifespan(_: FastAPI):
async with AsyncExitStack() as plugin_stack:
Expand Down Expand Up @@ -333,6 +363,11 @@ async def _lifespan(_: FastAPI):
if EVENT_MONITOR is not None:
EVENT_MONITOR.set_own_node(SUPERVISOR.node_id)

# --- Startup reconcile (registrar plugins) ---
# Runs after the supervisor handshake so this node is in NODE_REGISTRY
# and is included in the live batch.
await _reconcile_resources()

try:
yield
finally:
Expand Down
2 changes: 1 addition & 1 deletion src/server/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ docker==7.1.0
fastapi==0.120.3
grpcio==1.76.0
httpx==0.28.1
lumid-hooks==0.1.0
lumid-hooks==0.2.0
protobuf==5.29.6
pydantic==2.12.3
python-multipart==0.0.27
Expand Down
Loading