From 619fb97450ae459808029a641abf4e70a46fdd9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 12:27:41 -0800 Subject: [PATCH 01/26] docs: add PRD for zero-boilerplate flash run experience --- PRD.md | 341 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 PRD.md diff --git a/PRD.md b/PRD.md new file mode 100644 index 00000000..a5d9d98f --- /dev/null +++ b/PRD.md @@ -0,0 +1,341 @@ +# Flash SDK: Zero-Boilerplate Experience — Product Requirements Document + +## 1. Problem Statement + +Flash currently forces every project into a FastAPI-first model: + +- Users must create `main.py` with a `FastAPI()` instance +- HTTP routing boilerplate adds no semantic value — the routes simply call `@remote` functions +- No straightforward path for deploying a standalone QB function without wrapping it in a FastAPI app +- The "mothership" concept introduces an implicit coordinator with no clear ownership model +- `flash run` fails unless `main.py` exists with a FastAPI app, blocking the simplest use cases + +## 2. Goals + +- **Zero boilerplate**: a `@remote`-decorated function in any `.py` file is sufficient for `flash run` and `flash deploy` +- **File-system-as-namespace**: the project directory structure maps 1:1 to URL paths on the local dev server +- **Single command**: `flash run` works for all project topologies (one QB function, many files, mixed QB+LB) without any configuration +- **`flash deploy` requires no additional configuration** beyond the `@remote` declarations themselves +- **Peer endpoints**: every resource config is a first-class endpoint; no implicit coordinator + +## 3. 
Non-Goals + +- No backward compatibility with `main.py`/FastAPI-first style +- No implicit "mothership" concept; all endpoints are peers +- No changes to the QB runtime (`generic_handler.py`) or QB stub behavior +- No changes to deployed endpoint behavior (RunPod QB/LB APIs are unchanged) + +## 4. Developer Experience Specification + +### 4.1 Minimum viable QB project + +```python +# gpu_worker.py +from runpod_flash import LiveServerless, GpuGroup, remote + +gpu_config = LiveServerless(name="gpu_worker", gpus=[GpuGroup.ANY]) + +@remote(gpu_config) +async def process(input_data: dict) -> dict: + return {"result": "processed", "input": input_data} +``` + +`flash run` → `POST /gpu_worker/run` and `POST /gpu_worker/run_sync` +`flash deploy` → standalone QB endpoint at `api.runpod.ai/v2/{id}/run` + +### 4.2 LB endpoint + +```python +# api/routes.py +from runpod_flash import CpuLiveLoadBalancer, remote + +lb_config = CpuLiveLoadBalancer(name="api_routes") + +@remote(lb_config, method="POST", path="/compute") +async def compute(input_data: dict) -> dict: + return {"result": input_data} +``` + +`flash run` → `POST /api/routes/compute` +`flash deploy` → LB endpoint at `{id}.api.runpod.ai/compute` + +### 4.3 Mixed QB + LB (LB calling QB) + +```python +# api/routes.py (LB) +from runpod_flash import CpuLiveLoadBalancer, remote +from workers.gpu import heavy_compute # QB stub + +lb_config = CpuLiveLoadBalancer(name="api_routes") + +@remote(lb_config, method="POST", path="/process") +async def process_route(data: dict): + return await heavy_compute(data) # dispatches to QB endpoint + +# workers/gpu.py (QB) +from runpod_flash import LiveServerless, GpuGroup, remote + +gpu_config = LiveServerless(name="gpu_worker", gpus=[GpuGroup.ANY]) + +@remote(gpu_config) +async def heavy_compute(data: dict) -> dict: ... +``` + +## 5. URL Path Specification + +### 5.1 File prefix derivation + +The local dev server uses the project directory structure as a URL namespace. 
Each file's URL prefix is its path relative to the project root with `.py` stripped: + +``` +File Local URL prefix +────────────────────────────── ──────────────────────────── +gpu_worker.py → /gpu_worker +longruns/stage1.py → /longruns/stage1 +preprocess/first_pass.py → /preprocess/first_pass +workers/gpu/inference.py → /workers/gpu/inference +``` + +### 5.2 QB route generation + +| Condition | Routes | +|---|---| +| One `@remote` function in file | `POST {file_prefix}/run` and `POST {file_prefix}/run_sync` | +| Multiple `@remote` functions in file | `POST {file_prefix}/{fn_name}/run` and `POST {file_prefix}/{fn_name}/run_sync` | + +### 5.3 LB route generation + +| Condition | Route | +|---|---| +| `@remote(lb_config, method="POST", path="/compute")` | `POST {file_prefix}/compute` | + +The declared `path=` is appended to the file prefix. The `method=` determines the HTTP verb. + +### 5.4 QB request/response envelope + +Mirrors RunPod's API for consistency: + +``` +POST /gpu_worker/run_sync +Body: {"input": {"key": "value"}} +Response: {"id": "uuid", "status": "COMPLETED", "output": {...}} +``` + +## 6. Deployed Topology Specification + +Each unique resource config gets its own RunPod endpoint: + +| Type | Deployed URL | Example | +|---|---|---| +| QB | `https://api.runpod.ai/v2/{endpoint_id}/run` | `https://api.runpod.ai/v2/uoy3n7hkyb052a/run` | +| QB sync | `https://api.runpod.ai/v2/{endpoint_id}/run_sync` | | +| LB | `https://{endpoint_id}.api.runpod.ai/{declared_path}` | `https://rzlk6lph6gw7dk.api.runpod.ai/compute` | + +## 7. `.flash/` Folder Specification + +All generated artifacts go to `.flash/` in the project root. Auto-created, gitignored, never committed. 
+ +``` +my_project/ +├── gpu_worker.py +├── longruns/ +│ └── stage1.py +└── .flash/ + ├── server.py ← generated by flash run + └── manifest.json ← generated by flash build +``` + +- `.flash/` is added to `.gitignore` automatically on first `flash run` +- `server.py` and `manifest.json` are overwritten on each run/build; other files preserved +- The `.flash/` directory itself is never committed + +### 7.1 Dev server launch + +Uvicorn is launched with `--app-dir .flash/` so `server:app` is importable. The server inserts the project root into `sys.path` so user modules resolve: + +```bash +uvicorn server:app \ + --app-dir .flash/ \ + --reload \ + --reload-dir . \ + --reload-include "*.py" +``` + +## 8. `flash run` Behavior + +1. Scan project for all `@remote` functions (QB and LB) in any `.py` file + - Skip: `.flash/`, `__pycache__`, `*.pyc`, `__init__.py` +2. If none found: print error with usage instructions, exit 1 +3. Generate `.flash/server.py` with routes for all discovered functions +4. Add `.flash/` to `.gitignore` if not already present +5. Start uvicorn with `--reload` watching both `.flash/` and project root +6. Print startup table: local paths → resource names → types +7. Swagger UI available at `http://localhost:{port}/docs` +8. On exit (Ctrl+C or SIGTERM): deprovision all Live Serverless endpoints provisioned during this session + +### 8.1 Startup table format + +``` +Flash Dev Server http://localhost:8888 + + Local path Resource Type + ────────────────────────────────── ─────────────────── ──── + POST /gpu_worker/run gpu_worker QB + POST /gpu_worker/run_sync gpu_worker QB + POST /longruns/stage1/run longruns_stage1 QB + POST /preprocess/first_pass/compute preprocess_first_pass LB + + Visit http://localhost:8888/docs for Swagger UI +``` + +## 9. `flash build` Behavior + +1. Scan project for all `@remote` functions (QB and LB) +2. Build `.flash/manifest.json` with flat resource structure (see §10) +3. 
For LB resources: generate deployed handler files using `module_path` +4. Package build artifact + +## 10. Manifest Structure + +Resource names are derived from file paths (slashes → underscores): + +```json +{ + "version": "1.0", + "project_name": "my_project", + "resources": { + "gpu_worker": { + "resource_type": "LiveServerless", + "file_path": "gpu_worker.py", + "local_path_prefix": "/gpu_worker", + "module_path": "gpu_worker", + "functions": ["gpu_hello"], + "is_load_balanced": false, + "makes_remote_calls": false + }, + "longruns_stage1": { + "resource_type": "LiveServerless", + "file_path": "longruns/stage1.py", + "local_path_prefix": "/longruns/stage1", + "module_path": "longruns.stage1", + "functions": ["stage1_process"], + "is_load_balanced": false, + "makes_remote_calls": false + }, + "preprocess_first_pass": { + "resource_type": "CpuLiveLoadBalancer", + "file_path": "preprocess/first_pass.py", + "local_path_prefix": "/preprocess/first_pass", + "module_path": "preprocess.first_pass", + "functions": [ + {"name": "first_pass_fn", "http_method": "POST", "http_path": "/compute"} + ], + "is_load_balanced": true, + "makes_remote_calls": true + } + } +} +``` + +## 11. `.flash/server.py` Structure + +```python +"""Auto-generated Flash dev server. Do not edit — regenerated on each flash run.""" +import sys +import uuid +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from fastapi import FastAPI + +# QB imports +from gpu_worker import gpu_hello +from longruns.stage1 import stage1_process + +# LB imports +from preprocess.first_pass import first_pass_fn + +app = FastAPI( + title="Flash Dev Server", + description="Auto-generated by `flash run`. 
Visit /docs for interactive testing.", +) + +# QB: gpu_worker.py +@app.post("/gpu_worker/run", tags=["gpu_worker [QB]"]) +@app.post("/gpu_worker/run_sync", tags=["gpu_worker [QB]"]) +async def gpu_worker_run(body: dict): + result = await gpu_hello(body.get("input", body)) + return {"id": str(uuid.uuid4()), "status": "COMPLETED", "output": result} + +# QB: longruns/stage1.py +@app.post("/longruns/stage1/run", tags=["longruns/stage1 [QB]"]) +@app.post("/longruns/stage1/run_sync", tags=["longruns/stage1 [QB]"]) +async def longruns_stage1_run(body: dict): + result = await stage1_process(body.get("input", body)) + return {"id": str(uuid.uuid4()), "status": "COMPLETED", "output": result} + +# LB: preprocess/first_pass.py +@app.post("/preprocess/first_pass/compute", tags=["preprocess/first_pass [LB]"]) +async def _route_first_pass_compute(body: dict): + return await first_pass_fn(body) + +# Health +@app.get("/", tags=["health"]) +def home(): + return {"message": "Flash Dev Server", "docs": "/docs"} + +@app.get("/ping", tags=["health"]) +def ping(): + return {"status": "healthy"} +``` + +Subdirectory imports use dotted module paths: `longruns/stage1.py` → `from longruns.stage1 import fn`. + +Multi-function QB files (2+ `@remote` functions) get sub-prefixed routes: +``` +longruns/stage1.py has: stage1_preprocess, stage1_infer +→ POST /longruns/stage1/stage1_preprocess/run +→ POST /longruns/stage1/stage1_preprocess/run_sync +→ POST /longruns/stage1/stage1_infer/run +→ POST /longruns/stage1/stage1_infer/run_sync +``` + +## 12. 
Acceptance Criteria + +- [ ] A file with one `@remote(QB_config)` function and nothing else is a valid Flash project +- [ ] `flash run` produces a Swagger UI showing all routes grouped by source file +- [ ] QB routes accept `{"input": {...}}` and return `{"id": ..., "status": "COMPLETED", "output": {...}}` +- [ ] Subdirectory files produce URL prefixes matching their relative path +- [ ] Multiple `@remote` functions in one file each get their own sub-prefixed routes +- [ ] LB route handler body executes directly (not dispatched remotely) +- [ ] QB calls inside LB route handler body route to the remote QB endpoint +- [ ] `flash deploy` creates a RunPod endpoint for each resource config +- [ ] `flash build` produces `.flash/manifest.json` with `file_path`, `local_path_prefix`, `module_path` per resource +- [ ] When `flash run` exits, all Live Serverless endpoints provisioned during that session are automatically undeployed + +## 13. Edge Cases + +- **No `@remote` functions found**: Error with clear message and usage instructions +- **Multiple `@remote` functions per file (QB)**: Sub-prefixed routes `/{file_prefix}/{fn_name}/run` +- **`__init__.py` files**: Skipped — not treated as worker files +- **File path with hyphens** (e.g., `my-worker.py`): Resource name sanitized to `my_worker`, URL prefix `/my-worker` (hyphens valid in URLs, underscores in Python identifiers) +- **LB function calling another LB function**: Not supported via `@remote` — emit a warning at build time +- **`.flash/` already exists**: `server.py` and `manifest.json` overwritten; other files preserved +- **`flash deploy` with no LB endpoints**: QB-only deploy +- **Subdirectory `__init__.py`** imports needed: Generator checks and warns if missing + +## 14. 
Implementation Files + +| File | Change | +|------|--------| +| `flash/main/PRD.md` | This document | +| `src/runpod_flash/client.py` | Passthrough for LB route handlers (`__is_lb_route_handler__`) | +| `cli/commands/run.py` | Unified server generation; `--app-dir .flash/`; file-path-based route discovery | +| `cli/commands/build_utils/scanner.py` | Path utilities; `is_lb_route_handler` field; file-based resource identity | +| `cli/commands/build_utils/manifest.py` | Flat resource structure; `file_path`/`local_path_prefix`/`module_path` fields | +| `cli/commands/build_utils/lb_handler_generator.py` | Import module by `module_path`, walk `__is_lb_route_handler__`, register routes | +| `cli/commands/build.py` | Remove main.py requirement from `validate_project_structure` | +| `core/resources/serverless.py` | Inject `FLASH_MODULE_PATH` env var | +| `flash-examples/.../01_hello_world/` | Rewrite to bare minimum | +| `flash-examples/.../00_standalone_worker/` | New | +| `flash-examples/.../00_multi_resource/` | New | From db8908b0292b43046d050ce3205e8442e11ba23e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 12:29:40 -0800 Subject: [PATCH 02/26] feat(client,scanner): LB route handler passthrough and path-aware discovery LB @remote functions (with method= and path=) now return the decorated function unwrapped with __is_lb_route_handler__=True. The function body executes directly on the LB endpoint server rather than being dispatched as a remote stub. QB stubs inside the body are unaffected. Scanner gains three path utilities (file_to_url_prefix, file_to_resource_name, file_to_module_path) that convert file paths to URL prefixes, resource names, and dotted module paths respectively. RemoteFunctionMetadata gains is_lb_route_handler to distinguish LB route handlers from QB remote stubs during discovery. 
--- .../cli/commands/build_utils/scanner.py | 73 +++++- src/runpod_flash/client.py | 14 ++ .../build_utils/test_path_utilities.py | 217 ++++++++++++++++++ 3 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 tests/unit/cli/commands/build_utils/test_path_utilities.py diff --git a/src/runpod_flash/cli/commands/build_utils/scanner.py b/src/runpod_flash/cli/commands/build_utils/scanner.py index 2215ab9e..d217dcb3 100644 --- a/src/runpod_flash/cli/commands/build_utils/scanner.py +++ b/src/runpod_flash/cli/commands/build_utils/scanner.py @@ -3,6 +3,7 @@ import ast import importlib import logging +import os import re from dataclasses import dataclass, field from pathlib import Path @@ -11,6 +12,61 @@ logger = logging.getLogger(__name__) +def file_to_url_prefix(file_path: Path, project_root: Path) -> str: + """Derive the local dev server URL prefix from a source file path. + + Args: + file_path: Absolute path to the Python source file + project_root: Absolute path to the project root directory + + Returns: + URL prefix starting with "/" (e.g., /longruns/stage1) + + Example: + longruns/stage1.py → /longruns/stage1 + """ + rel = file_path.relative_to(project_root).with_suffix("") + return "/" + str(rel).replace(os.sep, "/") + + +def file_to_resource_name(file_path: Path, project_root: Path) -> str: + """Derive the manifest resource name from a source file path. + + Slashes and hyphens are replaced with underscores to produce a valid + Python identifier suitable for use as a resource name. 
+ + Args: + file_path: Absolute path to the Python source file + project_root: Absolute path to the project root directory + + Returns: + Resource name using underscores (e.g., longruns_stage1) + + Example: + longruns/stage1.py → longruns_stage1 + my-worker.py → my_worker + """ + rel = file_path.relative_to(project_root).with_suffix("") + return str(rel).replace(os.sep, "_").replace("/", "_").replace("-", "_") + + +def file_to_module_path(file_path: Path, project_root: Path) -> str: + """Derive the Python dotted module path from a source file path. + + Args: + file_path: Absolute path to the Python source file + project_root: Absolute path to the project root directory + + Returns: + Dotted module path (e.g., longruns.stage1) + + Example: + longruns/stage1.py → longruns.stage1 + """ + rel = file_path.relative_to(project_root).with_suffix("") + return str(rel).replace(os.sep, ".").replace("/", ".") + + @dataclass class RemoteFunctionMetadata: """Metadata about a @remote decorated function or class.""" @@ -35,6 +91,9 @@ class RemoteFunctionMetadata: called_remote_functions: List[str] = field( default_factory=list ) # Names of @remote functions called + is_lb_route_handler: bool = ( + False # LB @remote with method= and path= — runs directly as HTTP handler + ) class RemoteDecoratorScanner: @@ -62,7 +121,9 @@ def discover_remote_functions(self) -> List[RemoteFunctionMetadata]: rel_path = f.relative_to(self.project_dir) # Check if first part of path is in excluded_root_dirs if rel_path.parts and rel_path.parts[0] not in excluded_root_dirs: - self.py_files.append(f) + # Exclude __init__.py — not valid worker entry points + if f.name != "__init__.py": + self.py_files.append(f) except (ValueError, IndexError): # Include files that can't be made relative self.py_files.append(f) @@ -220,6 +281,15 @@ def _extract_remote_functions( {"is_load_balanced": False, "is_live_resource": False}, ) + # An LB route handler is an LB @remote function that has + # both method= and path= 
declared. Its body runs directly + # on the LB endpoint — it is NOT a remote dispatch stub. + is_lb_route_handler = ( + flags["is_load_balanced"] + and http_method is not None + and http_path is not None + ) + metadata = RemoteFunctionMetadata( function_name=node.name, module_path=module_path, @@ -235,6 +305,7 @@ def _extract_remote_functions( config_variable=self.resource_variables.get( resource_config_name ), + is_lb_route_handler=is_lb_route_handler, ) functions.append(metadata) diff --git a/src/runpod_flash/client.py b/src/runpod_flash/client.py index ed68bc30..8709cf75 100644 --- a/src/runpod_flash/client.py +++ b/src/runpod_flash/client.py @@ -159,6 +159,20 @@ def decorator(func_or_class): "system_dependencies": system_dependencies, } + # LB route handler passthrough — return the function unwrapped. + # + # When @remote is applied to an LB resource (LiveLoadBalancer, + # CpuLiveLoadBalancer, LoadBalancerSlsResource) with method= and path=, + # the decorated function IS the HTTP route handler. Its body executes + # directly on the LB endpoint server; it is not dispatched to a remote + # process. QB @remote calls inside its body still use their own stubs. + is_lb_route_handler = is_lb_resource and method is not None and path is not None + if is_lb_route_handler: + routing_config["is_lb_route_handler"] = True + func_or_class.__remote_config__ = routing_config + func_or_class.__is_lb_route_handler__ = True + return func_or_class + # Local execution mode - execute without provisioning remote servers if local: func_or_class.__remote_config__ = routing_config diff --git a/tests/unit/cli/commands/build_utils/test_path_utilities.py b/tests/unit/cli/commands/build_utils/test_path_utilities.py new file mode 100644 index 00000000..73ec3557 --- /dev/null +++ b/tests/unit/cli/commands/build_utils/test_path_utilities.py @@ -0,0 +1,217 @@ +"""TDD tests for scanner path utility functions. + +Written first (failing) per the plan's TDD requirement. 
+These test file_to_url_prefix, file_to_resource_name, file_to_module_path. +""" + +import os + + +from runpod_flash.cli.commands.build_utils.scanner import ( + file_to_module_path, + file_to_resource_name, + file_to_url_prefix, +) + + +class TestFileToUrlPrefix: + """Tests for file_to_url_prefix utility.""" + + def test_root_level_file(self, tmp_path): + """gpu_worker.py → /gpu_worker""" + f = tmp_path / "gpu_worker.py" + assert file_to_url_prefix(f, tmp_path) == "/gpu_worker" + + def test_single_subdir(self, tmp_path): + """longruns/stage1.py → /longruns/stage1""" + f = tmp_path / "longruns" / "stage1.py" + assert file_to_url_prefix(f, tmp_path) == "/longruns/stage1" + + def test_nested_subdir(self, tmp_path): + """preprocess/first_pass.py → /preprocess/first_pass""" + f = tmp_path / "preprocess" / "first_pass.py" + assert file_to_url_prefix(f, tmp_path) == "/preprocess/first_pass" + + def test_deep_nested(self, tmp_path): + """workers/gpu/inference.py → /workers/gpu/inference""" + f = tmp_path / "workers" / "gpu" / "inference.py" + assert file_to_url_prefix(f, tmp_path) == "/workers/gpu/inference" + + def test_hyphenated_filename(self, tmp_path): + """my-worker.py → /my-worker (hyphens valid in URLs)""" + f = tmp_path / "my-worker.py" + assert file_to_url_prefix(f, tmp_path) == "/my-worker" + + def test_starts_with_slash(self, tmp_path): + """Result always starts with /""" + f = tmp_path / "worker.py" + result = file_to_url_prefix(f, tmp_path) + assert result.startswith("/") + + def test_no_py_extension(self, tmp_path): + """Result does not include .py extension""" + f = tmp_path / "worker.py" + result = file_to_url_prefix(f, tmp_path) + assert ".py" not in result + + +class TestFileToResourceName: + """Tests for file_to_resource_name utility.""" + + def test_root_level_file(self, tmp_path): + """gpu_worker.py → gpu_worker""" + f = tmp_path / "gpu_worker.py" + assert file_to_resource_name(f, tmp_path) == "gpu_worker" + + def test_single_subdir(self, tmp_path): + 
"""longruns/stage1.py → longruns_stage1""" + f = tmp_path / "longruns" / "stage1.py" + assert file_to_resource_name(f, tmp_path) == "longruns_stage1" + + def test_nested_subdir(self, tmp_path): + """preprocess/first_pass.py → preprocess_first_pass""" + f = tmp_path / "preprocess" / "first_pass.py" + assert file_to_resource_name(f, tmp_path) == "preprocess_first_pass" + + def test_deep_nested(self, tmp_path): + """workers/gpu/inference.py → workers_gpu_inference""" + f = tmp_path / "workers" / "gpu" / "inference.py" + assert file_to_resource_name(f, tmp_path) == "workers_gpu_inference" + + def test_hyphenated_filename(self, tmp_path): + """my-worker.py → my_worker (hyphens replaced with underscores for Python identifiers)""" + f = tmp_path / "my-worker.py" + assert file_to_resource_name(f, tmp_path) == "my_worker" + + def test_no_py_extension(self, tmp_path): + """Result does not include .py extension""" + f = tmp_path / "worker.py" + result = file_to_resource_name(f, tmp_path) + assert ".py" not in result + + def test_no_path_separators(self, tmp_path): + """Result contains no / or os.sep characters""" + f = tmp_path / "a" / "b" / "worker.py" + result = file_to_resource_name(f, tmp_path) + assert "/" not in result + assert os.sep not in result + + +class TestFileToModulePath: + """Tests for file_to_module_path utility.""" + + def test_root_level_file(self, tmp_path): + """gpu_worker.py → gpu_worker""" + f = tmp_path / "gpu_worker.py" + assert file_to_module_path(f, tmp_path) == "gpu_worker" + + def test_single_subdir(self, tmp_path): + """longruns/stage1.py → longruns.stage1""" + f = tmp_path / "longruns" / "stage1.py" + assert file_to_module_path(f, tmp_path) == "longruns.stage1" + + def test_nested_subdir(self, tmp_path): + """preprocess/first_pass.py → preprocess.first_pass""" + f = tmp_path / "preprocess" / "first_pass.py" + assert file_to_module_path(f, tmp_path) == "preprocess.first_pass" + + def test_deep_nested(self, tmp_path): + """workers/gpu/inference.py 
→ workers.gpu.inference""" + f = tmp_path / "workers" / "gpu" / "inference.py" + assert file_to_module_path(f, tmp_path) == "workers.gpu.inference" + + def test_no_py_extension(self, tmp_path): + """Result does not include .py extension""" + f = tmp_path / "worker.py" + result = file_to_module_path(f, tmp_path) + assert ".py" not in result + + def test_uses_dots_not_slashes(self, tmp_path): + """Result uses dots as separators, not slashes""" + f = tmp_path / "a" / "b" / "worker.py" + result = file_to_module_path(f, tmp_path) + assert "." in result + assert "/" not in result + assert os.sep not in result + + +class TestIsLbRouteHandlerField: + """Tests that RemoteFunctionMetadata.is_lb_route_handler is set correctly.""" + + def test_lb_function_with_method_and_path_is_handler(self, tmp_path): + """An LB @remote function with method= and path= is marked as LB route handler.""" + from runpod_flash.cli.commands.build_utils.scanner import RemoteDecoratorScanner + + (tmp_path / "routes.py").write_text( + """ +from runpod_flash import CpuLiveLoadBalancer, remote + +lb_config = CpuLiveLoadBalancer(name="my_lb") + +@remote(lb_config, method="POST", path="/compute") +async def compute(data: dict) -> dict: + return data +""" + ) + + scanner = RemoteDecoratorScanner(tmp_path) + functions = scanner.discover_remote_functions() + + assert len(functions) == 1 + assert functions[0].is_lb_route_handler is True + + def test_qb_function_is_not_handler(self, tmp_path): + """A QB @remote function is NOT marked as LB route handler.""" + from runpod_flash.cli.commands.build_utils.scanner import RemoteDecoratorScanner + + (tmp_path / "worker.py").write_text( + """ +from runpod_flash import LiveServerless, GpuGroup, remote + +gpu_config = LiveServerless(name="gpu_worker", gpus=[GpuGroup.ANY]) + +@remote(gpu_config) +async def process(data: dict) -> dict: + return data +""" + ) + + scanner = RemoteDecoratorScanner(tmp_path) + functions = scanner.discover_remote_functions() + + assert 
len(functions) == 1 + assert functions[0].is_lb_route_handler is False + + def test_init_py_files_excluded(self, tmp_path): + """__init__.py files are excluded from scanning.""" + from runpod_flash.cli.commands.build_utils.scanner import RemoteDecoratorScanner + + (tmp_path / "__init__.py").write_text( + """ +from runpod_flash import LiveServerless, remote + +gpu_config = LiveServerless(name="gpu_worker") + +@remote(gpu_config) +async def process(data: dict) -> dict: + return data +""" + ) + (tmp_path / "worker.py").write_text( + """ +from runpod_flash import LiveServerless, GpuGroup, remote + +gpu_config = LiveServerless(name="gpu_worker", gpus=[GpuGroup.ANY]) + +@remote(gpu_config) +async def process(data: dict) -> dict: + return data +""" + ) + + scanner = RemoteDecoratorScanner(tmp_path) + functions = scanner.discover_remote_functions() + + # Only the worker.py function should be discovered, not __init__.py + assert len(functions) == 1 + assert functions[0].file_path.name == "worker.py" From e583886b169cdd1a2a2fc9a9a1446eb4a4095ffe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 12:30:59 -0800 Subject: [PATCH 03/26] refactor(manifest): remove mothership dead code, flat resource structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove _serialize_routes, _create_mothership_resource, and _create_mothership_from_explicit — all referenced unimported symbols and caused F821 lint errors. The manifest now emits a flat resources dict with file_path, local_path_prefix, and module_path per resource; no is_mothership flag. 
--- .../cli/commands/build_utils/manifest.py | 205 ++------- .../build_utils/test_manifest_mothership.py | 404 ------------------ 2 files changed, 36 insertions(+), 573 deletions(-) delete mode 100644 tests/unit/cli/commands/build_utils/test_manifest_mothership.py diff --git a/src/runpod_flash/cli/commands/build_utils/manifest.py b/src/runpod_flash/cli/commands/build_utils/manifest.py index b67ce9bd..af2a283f 100644 --- a/src/runpod_flash/cli/commands/build_utils/manifest.py +++ b/src/runpod_flash/cli/commands/build_utils/manifest.py @@ -9,45 +9,17 @@ from pathlib import Path from typing import Any, Dict, List, Optional -from runpod_flash.core.resources.constants import ( - DEFAULT_WORKERS_MAX, - DEFAULT_WORKERS_MIN, - FLASH_CPU_LB_IMAGE, - FLASH_LB_IMAGE, +from .scanner import ( + RemoteFunctionMetadata, + file_to_module_path, + file_to_url_prefix, ) -from .scanner import RemoteFunctionMetadata, detect_explicit_mothership, detect_main_app - logger = logging.getLogger(__name__) RESERVED_PATHS = ["/execute", "/ping"] -def _serialize_routes(routes: List[RemoteFunctionMetadata]) -> List[Dict[str, Any]]: - """Convert RemoteFunctionMetadata to manifest dict format. - - Args: - routes: List of route metadata objects - - Returns: - List of dicts with route information for manifest - """ - return [ - { - "name": route.function_name, - "module": route.module_path, - "is_async": route.is_async, - "is_class": route.is_class, - "is_load_balanced": route.is_load_balanced, - "is_live_resource": route.is_live_resource, - "config_variable": route.config_variable, - "http_method": route.http_method, - "http_path": route.http_path, - } - for route in routes - ] - - @dataclass class ManifestFunction: """Function entry in manifest.""" @@ -213,83 +185,13 @@ def _extract_deployment_config( return config - def _create_mothership_resource(self, main_app_config: dict) -> Dict[str, Any]: - """Create implicit mothership resource from main.py. 
- - Args: - main_app_config: Dict with 'file_path', 'app_variable', 'has_routes', 'fastapi_routes' keys - - Returns: - Dictionary representing the mothership resource for the manifest - """ - # Extract FastAPI routes if present - fastapi_routes = main_app_config.get("fastapi_routes", []) - functions_list = _serialize_routes(fastapi_routes) - - return { - "resource_type": "CpuLiveLoadBalancer", - "functions": functions_list, - "is_load_balanced": True, - "is_live_resource": True, - "is_mothership": True, - "main_file": main_app_config["file_path"].name, - "app_variable": main_app_config["app_variable"], - "imageName": FLASH_CPU_LB_IMAGE, - "workersMin": DEFAULT_WORKERS_MIN, - "workersMax": DEFAULT_WORKERS_MAX, - } - - def _create_mothership_from_explicit( - self, explicit_config: dict, search_dir: Path - ) -> Dict[str, Any]: - """Create mothership resource from explicit mothership.py configuration. - - Args: - explicit_config: Configuration dict from detect_explicit_mothership() - search_dir: Project directory + def build(self) -> Dict[str, Any]: + """Build the manifest dictionary. - Returns: - Dictionary representing the mothership resource for the manifest + Resources are keyed by resource_config_name for runtime compatibility. + Each resource entry includes file_path, local_path_prefix, and module_path + for the dev server and LB handler generator. 
""" - # Detect FastAPI app details for handler generation - main_app_config = detect_main_app(search_dir, explicit_mothership_exists=False) - - if not main_app_config: - # No FastAPI app found, use defaults - main_file = "main.py" - app_variable = "app" - fastapi_routes = [] - else: - main_file = main_app_config["file_path"].name - app_variable = main_app_config["app_variable"] - fastapi_routes = main_app_config.get("fastapi_routes", []) - - # Extract FastAPI routes into functions list - functions_list = _serialize_routes(fastapi_routes) - - # Map resource type to image name - resource_type = explicit_config.get("resource_type", "CpuLiveLoadBalancer") - if resource_type == "LiveLoadBalancer": - image_name = FLASH_LB_IMAGE # GPU load balancer - else: - image_name = FLASH_CPU_LB_IMAGE # CPU load balancer - - return { - "resource_type": resource_type, - "functions": functions_list, - "is_load_balanced": True, - "is_live_resource": True, - "is_mothership": True, - "is_explicit": True, # Flag to indicate explicit configuration - "main_file": main_file, - "app_variable": app_variable, - "imageName": image_name, - "workersMin": explicit_config.get("workersMin", DEFAULT_WORKERS_MIN), - "workersMax": explicit_config.get("workersMax", DEFAULT_WORKERS_MAX), - } - - def build(self) -> Dict[str, Any]: - """Build the manifest dictionary.""" # Group functions by resource_config_name resources: Dict[str, List[RemoteFunctionMetadata]] = {} @@ -305,6 +207,9 @@ def build(self) -> Dict[str, Any]: str, Dict[str, str] ] = {} # resource_name -> {route_key -> function_name} + # Determine project root for path derivation + project_root = self.build_dir.parent if self.build_dir else Path.cwd() + for resource_name, functions in sorted(resources.items()): # Use actual resource type from first function in group resource_type = ( @@ -315,6 +220,27 @@ def build(self) -> Dict[str, Any]: is_load_balanced = functions[0].is_load_balanced if functions else False is_live_resource = 
functions[0].is_live_resource if functions else False + # Derive path fields from the first function's source file. + # All functions in a resource share the same source file per convention. + first_file = functions[0].file_path if functions else None + file_path_str = "" + local_path_prefix = "" + resource_module_path = functions[0].module_path if functions else "" + + if first_file and first_file.exists(): + try: + file_path_str = str(first_file.relative_to(project_root)) + local_path_prefix = file_to_url_prefix(first_file, project_root) + resource_module_path = file_to_module_path(first_file, project_root) + except ValueError: + # File is outside project root — fall back to module_path + file_path_str = str(first_file) + local_path_prefix = "/" + functions[0].module_path.replace(".", "/") + elif first_file: + # File path may be relative (in test scenarios) + file_path_str = str(first_file) + local_path_prefix = "/" + functions[0].module_path.replace(".", "/") + # Validate and collect routing for LB endpoints resource_routes = {} if is_load_balanced: @@ -374,6 +300,9 @@ def build(self) -> Dict[str, Any]: resources_dict[resource_name] = { "resource_type": resource_type, + "file_path": file_path_str, + "local_path_prefix": local_path_prefix, + "module_path": resource_module_path, "functions": functions_list, "is_load_balanced": is_load_balanced, "is_live_resource": is_live_resource, @@ -395,68 +324,6 @@ def build(self) -> Dict[str, Any]: ) function_registry[f.function_name] = resource_name - # === MOTHERSHIP DETECTION (EXPLICIT THEN FALLBACK) === - search_dir = self.build_dir if self.build_dir else Path.cwd() - - # Step 1: Check for explicit mothership.py - explicit_mothership = detect_explicit_mothership(search_dir) - - if explicit_mothership: - # Use explicit configuration - logger.debug("Found explicit mothership configuration in mothership.py") - - # Check for name conflict - mothership_name = explicit_mothership.get("name", "mothership") - if mothership_name 
in resources_dict: - logger.warning( - f"Project has a @remote resource named '{mothership_name}'. " - f"Using 'mothership-entrypoint' for explicit mothership endpoint." - ) - mothership_name = "mothership-entrypoint" - - # Create mothership resource from explicit config - mothership_resource = self._create_mothership_from_explicit( - explicit_mothership, search_dir - ) - resources_dict[mothership_name] = mothership_resource - - else: - # Step 2: Fallback to auto-detection - main_app_config = detect_main_app( - search_dir, explicit_mothership_exists=False - ) - - if main_app_config and main_app_config["has_routes"]: - logger.warning( - "Auto-detected FastAPI app in main.py (no mothership.py found). " - "Consider running 'flash init' to create explicit mothership configuration." - ) - - # Check for name conflict - if "mothership" in resources_dict: - logger.warning( - "Project has a @remote resource named 'mothership'. " - "Using 'mothership-entrypoint' for auto-generated mothership endpoint." 
- ) - mothership_name = "mothership-entrypoint" - else: - mothership_name = "mothership" - - # Create mothership resource from auto-detection (legacy behavior) - mothership_resource = self._create_mothership_resource(main_app_config) - resources_dict[mothership_name] = mothership_resource - - # Extract routes from mothership resources - for resource_name, resource in resources_dict.items(): - if resource.get("is_mothership") and resource.get("functions"): - mothership_routes = {} - for func in resource["functions"]: - if func.get("http_method") and func.get("http_path"): - route_key = f"{func['http_method']} {func['http_path']}" - mothership_routes[route_key] = func["name"] - if mothership_routes: - routes_dict[resource_name] = mothership_routes - manifest = { "version": "1.0", "generated_at": datetime.now(timezone.utc) diff --git a/tests/unit/cli/commands/build_utils/test_manifest_mothership.py b/tests/unit/cli/commands/build_utils/test_manifest_mothership.py deleted file mode 100644 index 896eefdf..00000000 --- a/tests/unit/cli/commands/build_utils/test_manifest_mothership.py +++ /dev/null @@ -1,404 +0,0 @@ -"""Tests for mothership resource creation in manifest.""" - -import tempfile -from pathlib import Path -from unittest.mock import patch - -from runpod_flash.cli.commands.build_utils.manifest import ManifestBuilder -from runpod_flash.cli.commands.build_utils.scanner import RemoteFunctionMetadata -from runpod_flash.core.resources.constants import ( - FLASH_CPU_LB_IMAGE, - FLASH_LB_IMAGE, -) - - -class TestManifestMothership: - """Test mothership resource creation in manifest.""" - - def test_manifest_includes_mothership_with_main_py(self): - """Test mothership resource added to manifest when main.py detected.""" - with tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - # Create main.py with FastAPI routes - main_file = project_root / "main.py" - main_file.write_text( - """ -from fastapi import FastAPI -app = FastAPI() - -@app.get("/") 
-def root(): - return {"msg": "Hello"} -""" - ) - - # Create a simple function file - func_file = project_root / "functions.py" - func_file.write_text( - """ -from runpod_flash import remote -from runpod_flash import LiveServerless - -gpu_config = LiveServerless(name="gpu_worker") - -@remote(resource_config=gpu_config) -def process(data): - return data -""" - ) - - # Change to project directory for detection - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - return_value=project_root, - ): - builder = ManifestBuilder( - project_name="test", - remote_functions=[], - ) - manifest = builder.build() - - # Check mothership is in resources - assert "mothership" in manifest["resources"] - mothership = manifest["resources"]["mothership"] - assert mothership["is_mothership"] is True - assert mothership["main_file"] == "main.py" - assert mothership["app_variable"] == "app" - assert mothership["resource_type"] == "CpuLiveLoadBalancer" - assert mothership["imageName"] == FLASH_CPU_LB_IMAGE - - def test_manifest_skips_mothership_without_routes(self): - """Test mothership NOT added if main.py has no routes.""" - with tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - # Create main.py without routes - main_file = project_root / "main.py" - main_file.write_text( - """ -from fastapi import FastAPI -app = FastAPI() -# No routes defined -""" - ) - - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - return_value=project_root, - ): - builder = ManifestBuilder(project_name="test", remote_functions=[]) - manifest = builder.build() - - # Mothership should NOT be in resources - assert "mothership" not in manifest["resources"] - - def test_manifest_skips_mothership_without_main_py(self): - """Test mothership NOT added if no main.py exists.""" - with tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - 
return_value=project_root, - ): - builder = ManifestBuilder(project_name="test", remote_functions=[]) - manifest = builder.build() - - # Mothership should NOT be in resources - assert "mothership" not in manifest["resources"] - - def test_manifest_handles_mothership_name_conflict(self): - """Test mothership uses alternate name if conflict with @remote resource.""" - with tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - # Create main.py with routes - main_file = project_root / "main.py" - main_file.write_text( - """ -from fastapi import FastAPI -app = FastAPI() - -@app.get("/") -def root(): - return {"msg": "Hello"} -""" - ) - - # Create a remote function with name "mothership" (conflict) - func_file = project_root / "functions.py" - func_file.write_text( - """ -from runpod_flash import remote -from runpod_flash import LiveServerless - -mothership_config = LiveServerless(name="mothership") - -@remote(resource_config=mothership_config) -def process(data): - return data -""" - ) - - # Create remote function metadata with resource named "mothership" - remote_func = RemoteFunctionMetadata( - function_name="process", - module_path="functions", - resource_config_name="mothership", - resource_type="LiveServerless", - is_async=False, - is_class=False, - file_path=func_file, - ) - - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - return_value=project_root, - ): - builder = ManifestBuilder( - project_name="test", remote_functions=[remote_func] - ) - manifest = builder.build() - - # Original mothership should be in resources - assert "mothership" in manifest["resources"] - # Auto-generated mothership should use alternate name - assert "mothership-entrypoint" in manifest["resources"] - entrypoint = manifest["resources"]["mothership-entrypoint"] - assert entrypoint["is_mothership"] is True - - def test_mothership_resource_config(self): - """Test mothership resource has correct configuration.""" - with 
tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - main_file = project_root / "main.py" - main_file.write_text( - """ -from fastapi import FastAPI -app = FastAPI() - -@app.get("/") -def root(): - return {"msg": "Hello"} -""" - ) - - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - return_value=project_root, - ): - builder = ManifestBuilder(project_name="test", remote_functions=[]) - manifest = builder.build() - - mothership = manifest["resources"]["mothership"] - - # Check all expected fields - assert mothership["resource_type"] == "CpuLiveLoadBalancer" - # Functions should include the FastAPI route - assert len(mothership["functions"]) == 1 - assert mothership["functions"][0]["name"] == "root" - assert mothership["functions"][0]["http_method"] == "GET" - assert mothership["functions"][0]["http_path"] == "/" - assert mothership["is_load_balanced"] is True - assert mothership["is_live_resource"] is True - assert mothership["imageName"] == FLASH_CPU_LB_IMAGE - assert mothership["workersMin"] == 1 - assert mothership["workersMax"] == 1 - - def test_manifest_uses_explicit_mothership_config(self): - """Test explicit mothership.py config takes precedence over auto-detection.""" - with tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - # Create main.py with FastAPI routes - main_file = project_root / "main.py" - main_file.write_text( - """ -from fastapi import FastAPI -app = FastAPI() - -@app.get("/") -def root(): - return {"msg": "Hello"} -""" - ) - - # Create explicit mothership.py with custom config - mothership_file = project_root / "mothership.py" - mothership_file.write_text( - """ -from runpod_flash import CpuLiveLoadBalancer - -mothership = CpuLiveLoadBalancer( - name="my-api", - workersMin=3, - workersMax=7, -) -""" - ) - - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - return_value=project_root, - ): - builder = ManifestBuilder(project_name="test", 
remote_functions=[]) - manifest = builder.build() - - # Check explicit config is used - assert "my-api" in manifest["resources"] - mothership = manifest["resources"]["my-api"] - assert mothership["is_explicit"] is True - assert mothership["workersMin"] == 3 - assert mothership["workersMax"] == 7 - - def test_manifest_skips_auto_detect_with_explicit_config(self): - """Test auto-detection is skipped when explicit config exists.""" - with tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - # Create main.py with FastAPI routes - main_file = project_root / "main.py" - main_file.write_text( - """ -from fastapi import FastAPI -app = FastAPI() - -@app.get("/") -def root(): - return {"msg": "Hello"} -""" - ) - - # Create explicit mothership.py - mothership_file = project_root / "mothership.py" - mothership_file.write_text( - """ -from runpod_flash import CpuLiveLoadBalancer - -mothership = CpuLiveLoadBalancer( - name="explicit-mothership", - workersMin=2, - workersMax=4, -) -""" - ) - - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - return_value=project_root, - ): - builder = ManifestBuilder(project_name="test", remote_functions=[]) - manifest = builder.build() - - # Check only explicit config is in resources (not auto-detected "mothership") - assert "explicit-mothership" in manifest["resources"] - assert ( - manifest["resources"]["explicit-mothership"]["is_explicit"] is True - ) - assert "mothership" not in manifest["resources"] - - def test_manifest_handles_explicit_mothership_name_conflict(self): - """Test explicit mothership uses alternate name if conflict with @remote.""" - with tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - # Create explicit mothership.py with name that conflicts with resource - mothership_file = project_root / "mothership.py" - mothership_file.write_text( - """ -from runpod_flash import CpuLiveLoadBalancer - -mothership = CpuLiveLoadBalancer( - name="api", # Will conflict 
with @remote resource named "api" - workersMin=1, - workersMax=3, -) -""" - ) - - # Create a remote function with name "api" (conflict) - func_file = project_root / "functions.py" - func_file.write_text( - """ -from runpod_flash import remote -from runpod_flash import LiveServerless - -api_config = LiveServerless(name="api") - -@remote(resource_config=api_config) -def process(data): - return data -""" - ) - - remote_func = RemoteFunctionMetadata( - function_name="process", - module_path="functions", - resource_config_name="api", - resource_type="LiveServerless", - is_async=False, - is_class=False, - file_path=func_file, - ) - - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - return_value=project_root, - ): - builder = ManifestBuilder( - project_name="test", remote_functions=[remote_func] - ) - manifest = builder.build() - - # Original resource should be in resources - assert "api" in manifest["resources"] - # Explicit mothership should use alternate name - assert "mothership-entrypoint" in manifest["resources"] - entrypoint = manifest["resources"]["mothership-entrypoint"] - assert entrypoint["is_explicit"] is True - - def test_manifest_explicit_mothership_with_gpu_load_balancer(self): - """Test explicit GPU-based load balancer config.""" - with tempfile.TemporaryDirectory() as tmpdir: - project_root = Path(tmpdir) - - # Create explicit mothership.py with GPU load balancer - mothership_file = project_root / "mothership.py" - mothership_file.write_text( - """ -from runpod_flash import LiveLoadBalancer - -mothership = LiveLoadBalancer( - name="gpu-mothership", - workersMin=1, - workersMax=2, -) -""" - ) - - # Create main.py for FastAPI app - main_file = project_root / "main.py" - main_file.write_text( - """ -from fastapi import FastAPI -app = FastAPI() - -@app.get("/") -def root(): - return {"msg": "Hello"} -""" - ) - - with patch( - "runpod_flash.cli.commands.build_utils.manifest.Path.cwd", - return_value=project_root, - ): - builder = 
ManifestBuilder(project_name="test", remote_functions=[]) - manifest = builder.build() - - mothership = manifest["resources"]["gpu-mothership"] - assert mothership["resource_type"] == "LiveLoadBalancer" - assert mothership["imageName"] == FLASH_LB_IMAGE - assert mothership["is_explicit"] is True From 35cfa6e846b8ec96ba9b5516c231ae97bdcb4277 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 12:31:42 -0800 Subject: [PATCH 04/26] feat(run): file-system-as-namespace dev server generation flash run now scans the project for all @remote functions, generates .flash/server.py with routes derived from file paths, and starts uvicorn with --app-dir .flash/. Route convention: gpu_worker.py -> /gpu_worker/run and /gpu_worker/run_sync; subdirectory files produce matching URL prefixes. Cleanup on Ctrl+C is fixed: _cleanup_live_endpoints now reads .runpod/resources.pkl written by the uvicorn subprocess and deprovisions all live- prefixed endpoints, removing the dead in-process _SESSION_ENDPOINTS approach which never received data from the subprocess. 
--- src/runpod_flash/cli/commands/run.py | 581 +++++++++++++------ tests/integration/test_run_auto_provision.py | 337 ----------- tests/unit/cli/test_run.py | 12 +- 3 files changed, 397 insertions(+), 533 deletions(-) delete mode 100644 tests/integration/test_run_auto_provision.py diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index 051115a8..faf4f50b 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -5,16 +5,361 @@ import signal import subprocess import sys +from dataclasses import dataclass, field from pathlib import Path -from typing import Optional +from typing import List -import questionary import typer from rich.console import Console +from rich.table import Table + +from .build_utils.scanner import ( + RemoteDecoratorScanner, + file_to_module_path, + file_to_resource_name, + file_to_url_prefix, +) logger = logging.getLogger(__name__) console = Console() +# Resource state file written by ResourceManager in the uvicorn subprocess. +_RESOURCE_STATE_FILE = Path(".runpod") / "resources.pkl" + + +@dataclass +class WorkerInfo: + """Info about a discovered @remote function for dev server generation.""" + + file_path: Path + url_prefix: str # e.g. /longruns/stage1 + module_path: str # e.g. longruns.stage1 + resource_name: str # e.g. longruns_stage1 + worker_type: str # "QB" or "LB" + functions: List[str] # function names + lb_routes: List[dict] = field(default_factory=list) # [{method, path, fn_name}] + + +def _scan_project_workers(project_root: Path) -> List[WorkerInfo]: + """Scan the project for all @remote decorated functions. + + Walks all .py files (excluding .flash/, __pycache__, __init__.py) and + builds WorkerInfo for each file that contains @remote functions. + + Files with QB functions produce one WorkerInfo per file (QB type). + Files with LB functions produce one WorkerInfo per file (LB type). + A file can have both QB and LB functions (unusual but supported). 
+ + Args: + project_root: Root directory of the Flash project + + Returns: + List of WorkerInfo, one entry per discovered source file + """ + scanner = RemoteDecoratorScanner(project_root) + remote_functions = scanner.discover_remote_functions() + + # Group by file path + by_file: dict[Path, List] = {} + for func in remote_functions: + by_file.setdefault(func.file_path, []).append(func) + + workers: List[WorkerInfo] = [] + for file_path, funcs in sorted(by_file.items()): + url_prefix = file_to_url_prefix(file_path, project_root) + module_path = file_to_module_path(file_path, project_root) + resource_name = file_to_resource_name(file_path, project_root) + + qb_funcs = [f for f in funcs if not f.is_load_balanced] + lb_funcs = [f for f in funcs if f.is_load_balanced and f.is_lb_route_handler] + + if qb_funcs: + workers.append( + WorkerInfo( + file_path=file_path, + url_prefix=url_prefix, + module_path=module_path, + resource_name=resource_name, + worker_type="QB", + functions=[f.function_name for f in qb_funcs], + ) + ) + + if lb_funcs: + lb_routes = [ + { + "method": f.http_method, + "path": f.http_path, + "fn_name": f.function_name, + } + for f in lb_funcs + ] + workers.append( + WorkerInfo( + file_path=file_path, + url_prefix=url_prefix, + module_path=module_path, + resource_name=resource_name, + worker_type="LB", + functions=[f.function_name for f in lb_funcs], + lb_routes=lb_routes, + ) + ) + + return workers + + +def _ensure_gitignore(project_root: Path) -> None: + """Add .flash/ to .gitignore if not already present.""" + gitignore = project_root / ".gitignore" + entry = ".flash/" + + if gitignore.exists(): + content = gitignore.read_text(encoding="utf-8") + if entry in content: + return + # Append with a newline + if not content.endswith("\n"): + content += "\n" + gitignore.write_text(content + entry + "\n", encoding="utf-8") + else: + gitignore.write_text(entry + "\n", encoding="utf-8") + + +def _sanitize_fn_name(name: str) -> str: + """Sanitize a string for 
use as a Python function name.""" + return name.replace("/", "_").replace(".", "_").replace("-", "_") + + +def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Path: + """Generate .flash/server.py from the discovered workers. + + Args: + project_root: Root of the Flash project + workers: List of discovered worker infos + + Returns: + Path to the generated server.py + """ + flash_dir = project_root / ".flash" + flash_dir.mkdir(exist_ok=True) + + _ensure_gitignore(project_root) + + lines = [ + '"""Auto-generated Flash dev server. Do not edit — regenerated on each flash run."""', + "import sys", + "import uuid", + "from pathlib import Path", + "sys.path.insert(0, str(Path(__file__).parent.parent))", + "", + "from fastapi import FastAPI", + "", + ] + + # Collect all imports + all_imports: List[str] = [] + for worker in workers: + for fn_name in worker.functions: + all_imports.append(f"from {worker.module_path} import {fn_name}") + + if all_imports: + lines.extend(all_imports) + lines.append("") + + lines += [ + "app = FastAPI(", + ' title="Flash Dev Server",', + ' description="Auto-generated by `flash run`. 
Visit /docs for interactive testing.",', + ")", + "", + ] + + for worker in workers: + tag = f"{worker.url_prefix.lstrip('/')} [{worker.worker_type}]" + lines.append(f"# {'─' * 60}") + lines.append(f"# {worker.worker_type}: {worker.file_path.name}") + lines.append(f"# {'─' * 60}") + + if worker.worker_type == "QB": + if len(worker.functions) == 1: + fn = worker.functions[0] + handler_name = _sanitize_fn_name(f"{worker.resource_name}_run") + run_path = f"{worker.url_prefix}/run" + sync_path = f"{worker.url_prefix}/run_sync" + lines += [ + f'@app.post("{run_path}", tags=["{tag}"])', + f'@app.post("{sync_path}", tags=["{tag}"])', + f"async def {handler_name}(body: dict):", + f' result = await {fn}(body.get("input", body))', + ' return {"id": str(uuid.uuid4()), "status": "COMPLETED", "output": result}', + "", + ] + else: + for fn in worker.functions: + handler_name = _sanitize_fn_name(f"{worker.resource_name}_{fn}_run") + run_path = f"{worker.url_prefix}/{fn}/run" + sync_path = f"{worker.url_prefix}/{fn}/run_sync" + lines += [ + f'@app.post("{run_path}", tags=["{tag}"])', + f'@app.post("{sync_path}", tags=["{tag}"])', + f"async def {handler_name}(body: dict):", + f' result = await {fn}(body.get("input", body))', + ' return {"id": str(uuid.uuid4()), "status": "COMPLETED", "output": result}', + "", + ] + + elif worker.worker_type == "LB": + for route in worker.lb_routes: + method = route["method"].lower() + sub_path = route["path"].lstrip("/") + fn_name = route["fn_name"] + full_path = f"{worker.url_prefix}/{sub_path}" + handler_name = _sanitize_fn_name( + f"_route_{worker.resource_name}_{fn_name}" + ) + lines += [ + f'@app.{method}("{full_path}", tags=["{tag}"])', + f"async def {handler_name}(body: dict):", + f" return await {fn_name}(body)", + "", + ] + + # Health endpoints + lines += [ + "# Health", + '@app.get("/", tags=["health"])', + "def home():", + ' return {"message": "Flash Dev Server", "docs": "/docs"}', + "", + '@app.get("/ping", tags=["health"])', + "def 
ping():", + ' return {"status": "healthy"}', + "", + ] + + server_path = flash_dir / "server.py" + server_path.write_text("\n".join(lines), encoding="utf-8") + return server_path + + +def _print_startup_table(workers: List[WorkerInfo], host: str, port: int) -> None: + """Print the startup table showing local paths, resource names, and types.""" + console.print(f"\n[bold green]Flash Dev Server[/bold green] http://{host}:{port}") + console.print() + + table = Table(show_header=True, header_style="bold") + table.add_column("Local path", style="cyan") + table.add_column("Resource", style="white") + table.add_column("Type", style="yellow") + + for worker in workers: + if worker.worker_type == "QB": + if len(worker.functions) == 1: + table.add_row( + f"POST {worker.url_prefix}/run", + worker.resource_name, + "QB", + ) + table.add_row( + f"POST {worker.url_prefix}/run_sync", + worker.resource_name, + "QB", + ) + else: + for fn in worker.functions: + table.add_row( + f"POST {worker.url_prefix}/{fn}/run", + worker.resource_name, + "QB", + ) + table.add_row( + f"POST {worker.url_prefix}/{fn}/run_sync", + worker.resource_name, + "QB", + ) + elif worker.worker_type == "LB": + for route in worker.lb_routes: + sub_path = route["path"].lstrip("/") + full_path = f"{worker.url_prefix}/{sub_path}" + table.add_row( + f"{route['method']} {full_path}", + worker.resource_name, + "LB", + ) + + console.print(table) + console.print(f"\n Visit [bold]http://{host}:{port}/docs[/bold] for Swagger UI\n") + + +def _cleanup_live_endpoints() -> None: + """Deprovision all Live Serverless endpoints created during this session. + + Reads the resource state file written by the uvicorn subprocess, finds + all endpoints with the 'live-' name prefix, and deprovisions them. + Best-effort: errors per endpoint are logged but do not prevent cleanup + of other endpoints. 
+ """ + if not _RESOURCE_STATE_FILE.exists(): + return + + try: + import asyncio + import cloudpickle + from ...core.utils.file_lock import file_lock + + with open(_RESOURCE_STATE_FILE, "rb") as f: + with file_lock(f, exclusive=False): + data = cloudpickle.load(f) + + if isinstance(data, tuple): + resources, configs = data + else: + resources, configs = data, {} + + live_items = { + key: resource + for key, resource in resources.items() + if hasattr(resource, "name") + and resource.name + and resource.name.startswith("live-") + } + + if not live_items: + return + + async def _do_cleanup(): + for key, resource in live_items.items(): + name = getattr(resource, "name", key) + try: + success = await resource._do_undeploy() + if success: + console.print(f" Deprovisioned: {name}") + else: + logger.warning(f"Failed to deprovision: {name}") + except Exception as e: + logger.warning(f"Error deprovisioning {name}: {e}") + + asyncio.run(_do_cleanup()) + + # Remove live- entries from persisted state so they don't linger. 
+ remaining = {k: v for k, v in resources.items() if k not in live_items} + remaining_configs = {k: v for k, v in configs.items() if k not in live_items} + try: + with open(_RESOURCE_STATE_FILE, "wb") as f: + with file_lock(f, exclusive=True): + cloudpickle.dump((remaining, remaining_configs), f) + except Exception as e: + logger.warning(f"Could not update resource state after cleanup: {e}") + + except Exception as e: + logger.warning(f"Live endpoint cleanup failed: {e}") + + +def _is_reload() -> bool: + """Check if running in uvicorn reload subprocess.""" + return "UVICORN_RELOADER_PID" in os.environ + def run_command( host: str = typer.Option( @@ -33,68 +378,51 @@ def run_command( reload: bool = typer.Option( True, "--reload/--no-reload", help="Enable auto-reload" ), - auto_provision: bool = typer.Option( - False, - "--auto-provision", - help="Auto-provision deployable resources on startup", - ), ): - """Run Flash development server with uvicorn.""" + """Run Flash development server. - # Discover entry point - entry_point = discover_entry_point() - if not entry_point: - console.print("[red]Error:[/red] No entry point found") - console.print("Create main.py with a FastAPI app") - raise typer.Exit(1) + Scans the project for @remote decorated functions, generates a dev server + at .flash/server.py, and starts uvicorn with hot-reload. - # Check if entry point has FastAPI app - app_location = check_fastapi_app(entry_point) - if not app_location: - console.print(f"[red]Error:[/red] No FastAPI app found in {entry_point}") - console.print("Make sure your main.py contains: app = FastAPI()") - raise typer.Exit(1) + No main.py or FastAPI boilerplate required. Any .py file with @remote + decorated functions is a valid Flash project. 
+ """ + project_root = Path.cwd() - # Set flag for all flash run sessions to ensure both auto-provisioned - # and on-the-fly provisioned resources get the live- prefix + # Set flag for live provisioning so stubs get the live- prefix if not _is_reload(): os.environ["FLASH_IS_LIVE_PROVISIONING"] = "true" - # Auto-provision resources if flag is set and not a reload - if auto_provision and not _is_reload(): - try: - resources = _discover_resources(entry_point) + # Discover @remote functions + workers = _scan_project_workers(project_root) - if resources: - # If many resources found, ask for confirmation - if len(resources) > 5: - if not _confirm_large_provisioning(resources): - console.print("[yellow]Auto-provisioning cancelled[/yellow]\n") - else: - _provision_resources(resources) - else: - _provision_resources(resources) - except Exception as e: - logger.error("Auto-provisioning failed", exc_info=True) - console.print( - f"[yellow]Warning:[/yellow] Resource provisioning failed: {e}" - ) - console.print( - "[yellow]Note:[/yellow] Resources will be deployed on-demand when first called" - ) + if not workers: + console.print("[red]Error:[/red] No @remote functions found.") + console.print("Add @remote decorators to your functions to get started.") + console.print("\nExample:") + console.print( + " from runpod_flash import LiveServerless, remote\n" + " gpu_config = LiveServerless(name='my_worker')\n" + "\n" + " @remote(gpu_config)\n" + " async def process(input_data: dict) -> dict:\n" + " return {'result': input_data}" + ) + raise typer.Exit(1) - console.print("\n[green]Starting Flash Server[/green]") - console.print(f"Entry point: [bold]{app_location}[/bold]") - console.print(f"Server: [bold]http://{host}:{port}[/bold]") - console.print(f"Auto-reload: [bold]{'enabled' if reload else 'disabled'}[/bold]") - console.print("\nPress CTRL+C to stop\n") + # Generate .flash/server.py + _generate_flash_server(project_root, workers) - # Build uvicorn command + 
_print_startup_table(workers, host, port) + + # Build uvicorn command using --app-dir so server:app is importable cmd = [ sys.executable, "-m", "uvicorn", - app_location, + "server:app", + "--app-dir", + ".flash", "--host", host, "--port", @@ -104,13 +432,16 @@ def run_command( ] if reload: - cmd.append("--reload") + cmd += [ + "--reload", + "--reload-dir", + ".", + "--reload-include", + "*.py", + ] - # Run uvicorn with proper process group handling process = None try: - # Create new process group to ensure all child processes can be killed together - # On Unix systems, use process group; on Windows, CREATE_NEW_PROCESS_GROUP if sys.platform == "win32": process = subprocess.Popen( cmd, creationflags=subprocess.CREATE_NEW_PROCESS_GROUP @@ -118,27 +449,21 @@ def run_command( else: process = subprocess.Popen(cmd, preexec_fn=os.setsid) - # Wait for process to complete process.wait() except KeyboardInterrupt: - console.print("\n[yellow]Stopping server and cleaning up processes...[/yellow]") + console.print("\n[yellow]Stopping server and cleaning up...[/yellow]") - # Kill the entire process group to ensure all child processes are terminated if process: try: if sys.platform == "win32": - # Windows: terminate the process process.terminate() else: - # Unix: kill entire process group os.killpg(os.getpgid(process.pid), signal.SIGTERM) - # Wait briefly for graceful shutdown try: process.wait(timeout=2) except subprocess.TimeoutExpired: - # Force kill if didn't terminate gracefully if sys.platform == "win32": process.kill() else: @@ -146,9 +471,9 @@ def run_command( process.wait() except (ProcessLookupError, OSError): - # Process already terminated pass + _cleanup_live_endpoints() console.print("[green]Server stopped[/green]") raise typer.Exit(0) @@ -162,135 +487,5 @@ def run_command( os.killpg(os.getpgid(process.pid), signal.SIGTERM) except (ProcessLookupError, OSError): pass + _cleanup_live_endpoints() raise typer.Exit(1) - - -def discover_entry_point() -> Optional[str]: - 
"""Discover the main entry point file.""" - candidates = ["main.py", "app.py", "server.py"] - - for candidate in candidates: - if Path(candidate).exists(): - return candidate - - return None - - -def check_fastapi_app(entry_point: str) -> Optional[str]: - """ - Check if entry point has a FastAPI app and return the app location. - - Returns: - App location in format "module:app" or None - """ - try: - # Read the file - content = Path(entry_point).read_text() - - # Check for FastAPI app - if "app = FastAPI(" in content or "app=FastAPI(" in content: - # Extract module name from file path - module = entry_point.replace(".py", "").replace("/", ".") - return f"{module}:app" - - return None - - except Exception: - return None - - -def _is_reload() -> bool: - """Check if running in uvicorn reload subprocess. - - Returns: - True if running in a reload subprocess - """ - return "UVICORN_RELOADER_PID" in os.environ - - -def _discover_resources(entry_point: str): - """Discover deployable resources in entry point. - - Args: - entry_point: Path to entry point file - - Returns: - List of discovered DeployableResource instances - """ - from ...core.discovery import ResourceDiscovery - - try: - discovery = ResourceDiscovery(entry_point, max_depth=2) - resources = discovery.discover() - - # Debug: Log what was discovered - if resources: - console.print(f"\n[dim]Discovered {len(resources)} resource(s):[/dim]") - for res in resources: - res_name = getattr(res, "name", "Unknown") - res_type = res.__class__.__name__ - console.print(f" [dim]• {res_name} ({res_type})[/dim]") - console.print() - - return resources - except Exception as e: - console.print(f"[yellow]Warning:[/yellow] Resource discovery failed: {e}") - return [] - - -def _confirm_large_provisioning(resources) -> bool: - """Show resources and prompt user for confirmation. 
- - Args: - resources: List of resources to provision - - Returns: - True if user confirms, False otherwise - """ - try: - console.print( - f"\n[yellow]Found {len(resources)} resources to provision:[/yellow]" - ) - - for resource in resources: - name = getattr(resource, "name", "Unknown") - resource_type = resource.__class__.__name__ - console.print(f" • {name} ({resource_type})") - - console.print() - - confirmed = questionary.confirm( - "This may take several minutes. Do you want to proceed?" - ).ask() - - return confirmed if confirmed is not None else False - - except (KeyboardInterrupt, EOFError): - console.print("\n[yellow]Cancelled[/yellow]") - return False - except Exception as e: - console.print(f"[yellow]Warning:[/yellow] Confirmation failed: {e}") - return False - - -def _provision_resources(resources): - """Provision resources and wait for completion. - - Args: - resources: List of resources to provision - """ - import asyncio - from ...core.deployment import DeploymentOrchestrator - - try: - console.print(f"\n[bold]Provisioning {len(resources)} resource(s)...[/bold]") - orchestrator = DeploymentOrchestrator(max_concurrent=3) - - # Run provisioning with progress shown - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - loop.run_until_complete(orchestrator.deploy_all(resources, show_progress=True)) - loop.close() - - except Exception as e: - console.print(f"[yellow]Warning:[/yellow] Provisioning failed: {e}") diff --git a/tests/integration/test_run_auto_provision.py b/tests/integration/test_run_auto_provision.py deleted file mode 100644 index 9478f442..00000000 --- a/tests/integration/test_run_auto_provision.py +++ /dev/null @@ -1,337 +0,0 @@ -"""Integration tests for flash run --auto-provision command.""" - -import pytest -from unittest.mock import patch, MagicMock -from textwrap import dedent -from typer.testing import CliRunner - -from runpod_flash.cli.main import app - -runner = CliRunner() - - -class TestRunAutoProvision: - """Test 
flash run --auto-provision integration.""" - - @pytest.fixture - def temp_project(self, tmp_path): - """Create temporary Flash project for testing.""" - # Create main.py with FastAPI app - main_file = tmp_path / "main.py" - main_file.write_text( - dedent( - """ - from fastapi import FastAPI - from runpod_flash.client import remote - from runpod_flash.core.resources.serverless import ServerlessResource - - app = FastAPI() - - gpu_config = ServerlessResource( - name="test-gpu", - gpuCount=1, - workersMax=3, - workersMin=0, - flashboot=False, - ) - - @remote(resource_config=gpu_config) - async def gpu_task(): - return "result" - - @app.get("/") - def root(): - return {"message": "Hello"} - """ - ) - ) - - return tmp_path - - @pytest.fixture - def temp_project_many_resources(self, tmp_path): - """Create temporary project with many resources (> 5).""" - main_file = tmp_path / "main.py" - main_file.write_text( - dedent( - """ - from fastapi import FastAPI - from runpod_flash.client import remote - from runpod_flash.core.resources.serverless import ServerlessResource - - app = FastAPI() - - # Create 6 resources to trigger confirmation - configs = [ - ServerlessResource( - name=f"endpoint-{i}", - gpuCount=1, - workersMax=3, - workersMin=0, - flashboot=False, - ) - for i in range(6) - ] - - @remote(resource_config=configs[0]) - async def task1(): pass - - @remote(resource_config=configs[1]) - async def task2(): pass - - @remote(resource_config=configs[2]) - async def task3(): pass - - @remote(resource_config=configs[3]) - async def task4(): pass - - @remote(resource_config=configs[4]) - async def task5(): pass - - @remote(resource_config=configs[5]) - async def task6(): pass - - @app.get("/") - def root(): - return {"message": "Hello"} - """ - ) - ) - - return tmp_path - - def test_run_without_auto_provision(self, temp_project, monkeypatch): - """Test that flash run without --auto-provision doesn't deploy resources.""" - monkeypatch.chdir(temp_project) - - # Mock subprocess 
to prevent actual uvicorn start - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level process group operations to prevent hanging - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - - with patch("runpod_flash.cli.commands.run.os.killpg"): - # Mock discovery to track if it was called - with patch( - "runpod_flash.cli.commands.run._discover_resources" - ) as mock_discover: - runner.invoke(app, ["run"]) - - # Discovery should not be called - mock_discover.assert_not_called() - - def test_run_with_auto_provision_single_resource(self, temp_project, monkeypatch): - """Test flash run --auto-provision with single resource.""" - monkeypatch.chdir(temp_project) - - # Mock subprocess to prevent actual uvicorn start - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level process group operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - - with patch("runpod_flash.cli.commands.run.os.killpg"): - # Mock deployment orchestrator - with patch( - "runpod_flash.cli.commands.run._provision_resources" - ) as mock_provision: - runner.invoke(app, ["run", "--auto-provision"]) - - # Provisioning should be called - mock_provision.assert_called_once() - - def test_run_with_auto_provision_skips_reload(self, temp_project, monkeypatch): - """Test that auto-provision is skipped on reload.""" - monkeypatch.chdir(temp_project) - - # Simulate reload environment - monkeypatch.setenv("UVICORN_RELOADER_PID", "12345") - - # Mock subprocess to prevent actual uvicorn start - with 
patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level process group operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - - with patch("runpod_flash.cli.commands.run.os.killpg"): - # Mock provisioning - with patch( - "runpod_flash.cli.commands.run._provision_resources" - ) as mock_provision: - runner.invoke(app, ["run", "--auto-provision"]) - - # Provisioning should NOT be called on reload - mock_provision.assert_not_called() - - def test_run_with_auto_provision_many_resources_confirmed( - self, temp_project, monkeypatch - ): - """Test auto-provision with > 5 resources and user confirmation.""" - monkeypatch.chdir(temp_project) - - # Create 6 mock resources - mock_resources = [MagicMock(name=f"endpoint-{i}") for i in range(6)] - - # Mock subprocess to prevent actual uvicorn start - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level process group operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - - with patch("runpod_flash.cli.commands.run.os.killpg"): - # Mock discovery to return > 5 resources - with patch( - "runpod_flash.cli.commands.run._discover_resources" - ) as mock_discover: - mock_discover.return_value = mock_resources - - # Mock questionary to simulate user confirmation - with patch( - "runpod_flash.cli.commands.run.questionary.confirm" - ) as mock_confirm: - mock_confirm.return_value.ask.return_value = True - - with patch( - "runpod_flash.cli.commands.run._provision_resources" - ) as mock_provision: - runner.invoke(app, ["run", 
"--auto-provision"]) - - # Should prompt for confirmation - mock_confirm.assert_called_once() - - # Should provision after confirmation - mock_provision.assert_called_once() - - def test_run_with_auto_provision_many_resources_cancelled( - self, temp_project, monkeypatch - ): - """Test auto-provision with > 5 resources and user cancellation.""" - monkeypatch.chdir(temp_project) - - # Create 6 mock resources - mock_resources = [MagicMock(name=f"endpoint-{i}") for i in range(6)] - - # Mock subprocess to prevent actual uvicorn start - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level process group operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - - with patch("runpod_flash.cli.commands.run.os.killpg"): - # Mock discovery to return > 5 resources - with patch( - "runpod_flash.cli.commands.run._discover_resources" - ) as mock_discover: - mock_discover.return_value = mock_resources - - # Mock questionary to simulate user cancellation - with patch( - "runpod_flash.cli.commands.run.questionary.confirm" - ) as mock_confirm: - mock_confirm.return_value.ask.return_value = False - - with patch( - "runpod_flash.cli.commands.run._provision_resources" - ) as mock_provision: - runner.invoke(app, ["run", "--auto-provision"]) - - # Should prompt for confirmation - mock_confirm.assert_called_once() - - # Should NOT provision after cancellation - mock_provision.assert_not_called() - - def test_run_auto_provision_discovery_error(self, temp_project, monkeypatch): - """Test that run handles discovery errors gracefully.""" - monkeypatch.chdir(temp_project) - - # Mock subprocess to prevent actual uvicorn start - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - 
mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level process group operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - - with patch("runpod_flash.cli.commands.run.os.killpg"): - # Mock discovery to raise exception - with patch( - "runpod_flash.cli.commands.run._discover_resources" - ) as mock_discover: - mock_discover.return_value = [] - - runner.invoke(app, ["run", "--auto-provision"]) - - # Server should still start despite discovery error - mock_popen.assert_called_once() - - def test_run_auto_provision_no_resources_found(self, tmp_path, monkeypatch): - """Test auto-provision when no resources are found.""" - monkeypatch.chdir(tmp_path) - - # Create main.py without any @remote decorators - main_file = tmp_path / "main.py" - main_file.write_text( - dedent( - """ - from fastapi import FastAPI - - app = FastAPI() - - @app.get("/") - def root(): - return {"message": "Hello"} - """ - ) - ) - - # Mock subprocess to prevent actual uvicorn start - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level process group operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - - with patch("runpod_flash.cli.commands.run.os.killpg"): - with patch( - "runpod_flash.cli.commands.run._provision_resources" - ) as mock_provision: - runner.invoke(app, ["run", "--auto-provision"]) - - # Provisioning should not be called - mock_provision.assert_not_called() - - # Server should still start - mock_popen.assert_called_once() diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index a652aa75..cf7eb5fd 100644 --- a/tests/unit/cli/test_run.py +++ 
b/tests/unit/cli/test_run.py @@ -15,9 +15,15 @@ def runner(): @pytest.fixture def temp_fastapi_app(tmp_path): - """Create minimal FastAPI app for testing.""" - main_file = tmp_path / "main.py" - main_file.write_text("from fastapi import FastAPI\napp = FastAPI()") + """Create minimal Flash project with @remote function for testing.""" + worker_file = tmp_path / "worker.py" + worker_file.write_text( + "from runpod_flash import LiveServerless, remote\n" + "gpu_config = LiveServerless(name='test_worker')\n" + "@remote(gpu_config)\n" + "async def process(data: dict) -> dict:\n" + " return data\n" + ) return tmp_path From 5776af66a8197289c0f3f78814d31228788f7d89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 12:32:30 -0800 Subject: [PATCH 05/26] feat(build,lb_handler_generator): invoke LB handler generator, rglob project validation LBHandlerGenerator is now called from run_build() for all is_load_balanced resources, wiring the build pipeline to the new module_path-based handler generation. validate_project_structure switches from glob to rglob so projects with files only in subdirectories (e.g. 00_multi_resource) are not incorrectly rejected. lb_handler_generator loses the mothership reconciliation lifespan (StateManagerClient, reconcile_children) in favour of a clean startup/shutdown lifespan. 
--- src/runpod_flash/cli/commands/build.py | 25 ++++----- .../build_utils/lb_handler_generator.py | 54 +------------------ 2 files changed, 11 insertions(+), 68 deletions(-) diff --git a/src/runpod_flash/cli/commands/build.py b/src/runpod_flash/cli/commands/build.py index 00b152ff..6b325de0 100644 --- a/src/runpod_flash/cli/commands/build.py +++ b/src/runpod_flash/cli/commands/build.py @@ -23,6 +23,7 @@ from runpod_flash.core.resources.constants import MAX_TARBALL_SIZE_MB from ..utils.ignore import get_file_tree, load_ignore_patterns +from .build_utils.lb_handler_generator import LBHandlerGenerator from .build_utils.manifest import ManifestBuilder from .build_utils.scanner import RemoteDecoratorScanner @@ -239,6 +240,9 @@ def run_build( manifest_path = build_dir / "flash_manifest.json" manifest_path.write_text(json.dumps(manifest, indent=2)) + lb_generator = LBHandlerGenerator(manifest, build_dir) + lb_generator.generate_handlers() + flash_dir = project_dir / ".flash" deployment_manifest_path = flash_dir / "flash_manifest.json" shutil.copy2(manifest_path, deployment_manifest_path) @@ -425,28 +429,19 @@ def validate_project_structure(project_dir: Path) -> bool: """ Validate that directory is a Flash project. + A Flash project is any directory containing Python files. The + RemoteDecoratorScanner validates that @remote functions exist. 
+ Args: project_dir: Directory to validate Returns: True if valid Flash project """ - main_py = project_dir / "main.py" - - if not main_py.exists(): - console.print(f"[red]Error:[/red] main.py not found in {project_dir}") + py_files = list(project_dir.rglob("*.py")) + if not py_files: + console.print(f"[red]Error:[/red] No Python files found in {project_dir}") return False - - # Check if main.py has FastAPI app - try: - content = main_py.read_text(encoding="utf-8") - if "FastAPI" not in content: - console.print( - "[yellow]Warning:[/yellow] main.py does not appear to have a FastAPI app" - ) - except Exception: - pass - return True diff --git a/src/runpod_flash/cli/commands/build_utils/lb_handler_generator.py b/src/runpod_flash/cli/commands/build_utils/lb_handler_generator.py index dcd0845d..a0d28601 100644 --- a/src/runpod_flash/cli/commands/build_utils/lb_handler_generator.py +++ b/src/runpod_flash/cli/commands/build_utils/lb_handler_generator.py @@ -21,13 +21,10 @@ - Real-time communication patterns """ -import asyncio import logging from contextlib import asynccontextmanager -from pathlib import Path -from typing import Optional -from fastapi import FastAPI, Request +from fastapi import FastAPI from runpod_flash.runtime.lb_handler import create_lb_handler logger = logging.getLogger(__name__) @@ -45,57 +42,8 @@ @asynccontextmanager async def lifespan(app: FastAPI): """Handle application startup and shutdown.""" - # Startup logger.info("Starting {resource_name} endpoint") - - # Check if this is the mothership and run reconciliation - # Note: Resources are now provisioned upfront by the CLI during deployment. - # This background task runs reconciliation on mothership startup to ensure - # all resources are still deployed and in sync with the manifest. 
- try: - from runpod_flash.runtime.mothership_provisioner import ( - is_mothership, - reconcile_children, - get_mothership_url, - ) - from runpod_flash.runtime.state_manager_client import StateManagerClient - - if is_mothership(): - logger.info("=" * 60) - logger.info("Mothership detected - Starting reconciliation task") - logger.info("Resources are provisioned upfront by the CLI") - logger.info("This task ensures all resources remain in sync") - logger.info("=" * 60) - try: - mothership_url = get_mothership_url() - logger.info(f"Mothership URL: {{mothership_url}}") - - # Initialize State Manager client for reconciliation - state_client = StateManagerClient() - - # Spawn background reconciliation task (non-blocking) - # This will verify all resources from manifest are deployed - manifest_path = Path(__file__).parent / "flash_manifest.json" - task = asyncio.create_task( - reconcile_children(manifest_path, mothership_url, state_client) - ) - # Add error callback to catch and log background task exceptions - task.add_done_callback( - lambda t: logger.error(f"Reconciliation task failed: {{t.exception()}}") - if t.exception() - else None - ) - - except Exception as e: - logger.error(f"Failed to start reconciliation task: {{e}}") - # Don't fail startup - continue serving traffic - - except ImportError: - logger.debug("Mothership provisioning modules not available") - yield - - # Shutdown logger.info("Shutting down {resource_name} endpoint") From 4fb94b574012d1d2c84d931ddfa2e987d3ac2dea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 12:33:32 -0800 Subject: [PATCH 06/26] fix(serverless): resolve flash run runtime bugs is_deployed skips the health check when FLASH_IS_LIVE_PROVISIONING=true. Newly created endpoints can fail RunPod's health API for a few seconds after creation (propagation delay), causing get_or_deploy_resource to trigger a spurious re-deploy on the second request (e.g. /run_sync immediately after /run). 
_payload_exclude now excludes template when templateId is already set. After first deployment _do_deploy sets templateId on the config object while the set_serverless_template validator has already set template at construction time. Sending both fields in the same payload causes RunPod to return 'You can only provide one of templateId or template.' Also adds _get_module_path helper and injects FLASH_MODULE_PATH into LB endpoint environment at deploy time so the deployed handler can import the correct user module. --- src/runpod_flash/core/resources/serverless.py | 60 ++++++++++++++++++- tests/unit/resources/test_serverless.py | 44 ++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/runpod_flash/core/resources/serverless.py b/src/runpod_flash/core/resources/serverless.py index d2a0f7b0..27d811a4 100644 --- a/src/runpod_flash/core/resources/serverless.py +++ b/src/runpod_flash/core/resources/serverless.py @@ -474,6 +474,14 @@ def is_deployed(self) -> bool: if not self.id: return False + # During flash run, skip the health check. Newly-created endpoints + # can fail health checks due to RunPod propagation delay — the + # endpoint exists but the health API hasn't registered it yet. + # Trusting the cached ID is correct here; actual failures surface + # on the first real run/run_sync call. + if os.getenv("FLASH_IS_LIVE_PROVISIONING", "").lower() == "true": + return True + response = self.endpoint.health() return response is not None except Exception as e: @@ -484,6 +492,10 @@ def _payload_exclude(self) -> Set[str]: # flashEnvironmentId is input-only but must be sent when provided exclude_fields = set(self._input_only or set()) exclude_fields.discard("flashEnvironmentId") + # When templateId is already set, exclude template from the payload. + # RunPod rejects requests that contain both fields simultaneously. 
+ if self.templateId: + exclude_fields.add("template") return exclude_fields @staticmethod @@ -564,12 +576,45 @@ def _check_makes_remote_calls(self) -> bool: ) return True # Safe default on error + def _get_module_path(self) -> Optional[str]: + """Get module_path from build manifest for this resource. + + Returns: + Dotted module path (e.g., 'preprocess.first_pass'), or None if not found. + """ + try: + manifest_path = Path.cwd() / "flash_manifest.json" + if not manifest_path.exists(): + manifest_path = Path("/flash_manifest.json") + if not manifest_path.exists(): + return None + + with open(manifest_path) as f: + manifest_data = json.load(f) + + resources = manifest_data.get("resources", {}) + + lookup_name = self.name + if lookup_name.endswith("-fb"): + lookup_name = lookup_name[:-3] + if lookup_name.startswith(LIVE_PREFIX): + lookup_name = lookup_name[len(LIVE_PREFIX) :] + + resource_config = resources.get(lookup_name) + if not resource_config: + return None + + return resource_config.get("module_path") + + except Exception: + return None + async def _do_deploy(self) -> "DeployableResource": """ Deploys the serverless resource using the provided configuration. - For queue-based endpoints that make remote calls, injects RUNPOD_API_KEY - into environment variables if not already set. + For queue-based endpoints that make remote calls, injects RUNPOD_API_KEY. + For load-balanced endpoints, injects FLASH_MODULE_PATH. Returns a DeployableResource object. 
""" @@ -604,6 +649,17 @@ async def _do_deploy(self) -> "DeployableResource": self.env = env_dict + # Inject module path for load-balanced endpoints + elif self.type == ServerlessType.LB: + env_dict = self.env or {} + + module_path = self._get_module_path() + if module_path and "FLASH_MODULE_PATH" not in env_dict: + env_dict["FLASH_MODULE_PATH"] = module_path + log.info(f"{self.name}: Injected FLASH_MODULE_PATH={module_path}") + + self.env = env_dict + # Ensure network volume is deployed first await self._ensure_network_volume_deployed() diff --git a/tests/unit/resources/test_serverless.py b/tests/unit/resources/test_serverless.py index b5e3cbbe..124eb136 100644 --- a/tests/unit/resources/test_serverless.py +++ b/tests/unit/resources/test_serverless.py @@ -467,6 +467,32 @@ def test_is_deployed_false_when_no_id(self): assert serverless.is_deployed() is False + def test_is_deployed_skips_health_check_during_live_provisioning(self, monkeypatch): + """During flash run, is_deployed returns True based on ID alone.""" + monkeypatch.setenv("FLASH_IS_LIVE_PROVISIONING", "true") + serverless = ServerlessResource(name="test") + serverless.id = "ep-live-123" + + # health() must NOT be called — no mock needed, any call would raise + assert serverless.is_deployed() is True + + def test_is_deployed_uses_health_check_outside_live_provisioning(self, monkeypatch): + """Outside flash run, is_deployed falls back to health check.""" + monkeypatch.delenv("FLASH_IS_LIVE_PROVISIONING", raising=False) + serverless = ServerlessResource(name="test") + serverless.id = "ep-123" + + mock_endpoint = MagicMock() + mock_endpoint.health.return_value = {"workers": {}} + + with patch.object( + type(serverless), + "endpoint", + new_callable=lambda: property(lambda self: mock_endpoint), + ): + assert serverless.is_deployed() is True + mock_endpoint.health.assert_called_once() + @pytest.mark.asyncio async def test_deploy_already_deployed(self): """Test deploy returns early when already deployed.""" @@ 
-938,6 +964,24 @@ def test_is_deployed_with_exception(self): assert result is False + def test_payload_exclude_adds_template_when_template_id_set(self): + """_payload_exclude excludes template field when templateId is already set.""" + serverless = ServerlessResource(name="test") + serverless.templateId = "tmpl-123" + + excluded = serverless._payload_exclude() + + assert "template" in excluded + + def test_payload_exclude_does_not_exclude_template_without_template_id(self): + """_payload_exclude does not exclude template when templateId is absent.""" + serverless = ServerlessResource(name="test") + serverless.templateId = None + + excluded = serverless._payload_exclude() + + assert "template" not in excluded + def test_reverse_sync_from_backend_response(self): """Test reverse sync when receiving backend response with gpuIds.""" # This tests the lines 173-176 which convert gpuIds back to gpus list From 8a0e026ab7f2b1e6fbb78dc530f8d7383ac7a43e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 12:54:59 -0800 Subject: [PATCH 07/26] fix(run): hot-reload regenerates server.py on route changes Parent process watches project .py files via watchfiles and regenerates .flash/server.py on change. Uvicorn now watches only .flash/server.py instead of the whole project, so it reloads exactly once per change with the updated routes visible. - Add _watch_and_regenerate() background thread using watchfiles - Change --reload-dir from '.' 
to '.flash', --reload-include to 'server.py' - Start watcher thread when reload=True, stop on KeyboardInterrupt/Exception - Add TestRunCommandHotReload and TestWatchAndRegenerate test classes --- src/runpod_flash/cli/commands/run.py | 52 ++++++- tests/unit/cli/test_run.py | 202 +++++++++++++++++++++++++++ 2 files changed, 252 insertions(+), 2 deletions(-) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index faf4f50b..86fceb59 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -5,6 +5,7 @@ import signal import subprocess import sys +import threading from dataclasses import dataclass, field from pathlib import Path from typing import List @@ -12,6 +13,8 @@ import typer from rich.console import Console from rich.table import Table +from watchfiles import DefaultFilter as _WatchfilesDefaultFilter +from watchfiles import watch as _watchfiles_watch from .build_utils.scanner import ( RemoteDecoratorScanner, @@ -361,6 +364,33 @@ def _is_reload() -> bool: return "UVICORN_RELOADER_PID" in os.environ +def _watch_and_regenerate(project_root: Path, stop_event: threading.Event) -> None: + """Watch project .py files and regenerate server.py when they change. + + Ignores .flash/ to avoid reacting to our own writes. Runs until + stop_event is set. 
+ """ + watch_filter = _WatchfilesDefaultFilter(ignore_paths=[str(project_root / ".flash")]) + + try: + for changes in _watchfiles_watch( + project_root, + watch_filter=watch_filter, + stop_event=stop_event, + ): + py_changed = [p for _, p in changes if p.endswith(".py")] + if not py_changed: + continue + try: + workers = _scan_project_workers(project_root) + _generate_flash_server(project_root, workers) + logger.debug("server.py regenerated (%d changed)", len(py_changed)) + except Exception as e: + logger.warning("Failed to regenerate server.py: %s", e) + except Exception: + pass # stop_event was set or watchfiles unavailable — both are fine + + def run_command( host: str = typer.Option( "localhost", @@ -435,11 +465,19 @@ def run_command( cmd += [ "--reload", "--reload-dir", - ".", + ".flash", "--reload-include", - "*.py", + "server.py", ] + stop_event = threading.Event() + watcher_thread = threading.Thread( + target=_watch_and_regenerate, + args=(project_root, stop_event), + daemon=True, + name="flash-watcher", + ) + process = None try: if sys.platform == "win32": @@ -449,11 +487,17 @@ def run_command( else: process = subprocess.Popen(cmd, preexec_fn=os.setsid) + if reload: + watcher_thread.start() + process.wait() except KeyboardInterrupt: console.print("\n[yellow]Stopping server and cleaning up...[/yellow]") + stop_event.set() + watcher_thread.join(timeout=2) + if process: try: if sys.platform == "win32": @@ -479,6 +523,10 @@ def run_command( except Exception as e: console.print(f"[red]Error:[/red] {e}") + + stop_event.set() + watcher_thread.join(timeout=2) + if process: try: if sys.platform == "win32": diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index cf7eb5fd..1e0c549a 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -30,6 +30,12 @@ def temp_fastapi_app(tmp_path): class TestRunCommandEnvironmentVariables: """Test flash run command environment variable support.""" + @pytest.fixture(autouse=True) + def 
patch_watcher(self): + """Prevent the background watcher thread from blocking tests.""" + with patch("runpod_flash.cli.commands.run._watch_and_regenerate"): + yield + def test_port_from_environment_variable( self, runner, temp_fastapi_app, monkeypatch ): @@ -221,3 +227,199 @@ def test_short_port_flag_overrides_environment( assert "--port" in call_args port_index = call_args.index("--port") assert call_args[port_index + 1] == "7000" + + +class TestRunCommandHotReload: + """Test flash run hot-reload behavior.""" + + @pytest.fixture(autouse=True) + def patch_watcher(self): + """Prevent the background watcher thread from blocking tests.""" + with patch("runpod_flash.cli.commands.run._watch_and_regenerate"): + yield + + def _invoke_run(self, runner, monkeypatch, temp_fastapi_app, extra_args=None): + """Helper: invoke flash run and return the Popen call args.""" + monkeypatch.chdir(temp_fastapi_app) + monkeypatch.delenv("FLASH_PORT", raising=False) + monkeypatch.delenv("FLASH_HOST", raising=False) + + with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: + mock_process = MagicMock() + mock_process.pid = 12345 + mock_process.wait.side_effect = KeyboardInterrupt() + mock_popen.return_value = mock_process + + with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): + with patch("runpod_flash.cli.commands.run.os.killpg"): + runner.invoke(app, ["run"] + (extra_args or [])) + + return mock_popen.call_args[0][0] + + def test_reload_watches_flash_server_py( + self, runner, temp_fastapi_app, monkeypatch + ): + """Uvicorn watches .flash/server.py, not the whole project.""" + cmd = self._invoke_run(runner, monkeypatch, temp_fastapi_app) + + assert "--reload" in cmd + assert "--reload-dir" in cmd + reload_dir_index = cmd.index("--reload-dir") + assert cmd[reload_dir_index + 1] == ".flash" + + assert "--reload-include" in cmd + reload_include_index = cmd.index("--reload-include") + assert cmd[reload_include_index + 1] == "server.py" + + def 
test_reload_does_not_watch_project_root( + self, runner, temp_fastapi_app, monkeypatch + ): + """Uvicorn reload-dir must not be '.' to prevent double-reload.""" + cmd = self._invoke_run(runner, monkeypatch, temp_fastapi_app) + + reload_dir_index = cmd.index("--reload-dir") + assert cmd[reload_dir_index + 1] != "." + + def test_no_reload_skips_watcher_thread( + self, runner, temp_fastapi_app, monkeypatch + ): + """--no-reload: neither uvicorn reload args nor watcher thread started.""" + monkeypatch.chdir(temp_fastapi_app) + + with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: + mock_process = MagicMock() + mock_process.pid = 12345 + mock_process.wait.side_effect = KeyboardInterrupt() + mock_popen.return_value = mock_process + + with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): + with patch("runpod_flash.cli.commands.run.os.killpg"): + with patch( + "runpod_flash.cli.commands.run.threading.Thread" + ) as mock_thread_cls: + mock_thread = MagicMock() + mock_thread_cls.return_value = mock_thread + + runner.invoke(app, ["run", "--no-reload"]) + + cmd = mock_popen.call_args[0][0] + assert "--reload" not in cmd + mock_thread.start.assert_not_called() + + def test_watcher_thread_started_on_reload( + self, runner, temp_fastapi_app, monkeypatch, patch_watcher + ): + """When reload=True, the background watcher thread is started.""" + monkeypatch.chdir(temp_fastapi_app) + + with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: + mock_process = MagicMock() + mock_process.pid = 12345 + mock_process.wait.side_effect = KeyboardInterrupt() + mock_popen.return_value = mock_process + + with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): + with patch("runpod_flash.cli.commands.run.os.killpg"): + with patch( + "runpod_flash.cli.commands.run.threading.Thread" + ) as mock_thread_cls: + mock_thread = MagicMock() + mock_thread_cls.return_value = mock_thread + + runner.invoke(app, ["run"]) + + 
mock_thread.start.assert_called_once() + + def test_watcher_thread_stopped_on_keyboard_interrupt( + self, runner, temp_fastapi_app, monkeypatch + ): + """KeyboardInterrupt sets stop_event and joins the watcher thread.""" + monkeypatch.chdir(temp_fastapi_app) + + with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: + mock_process = MagicMock() + mock_process.pid = 12345 + mock_process.wait.side_effect = KeyboardInterrupt() + mock_popen.return_value = mock_process + + with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): + with patch("runpod_flash.cli.commands.run.os.killpg"): + with patch( + "runpod_flash.cli.commands.run.threading.Thread" + ) as mock_thread_cls: + mock_thread = MagicMock() + mock_thread_cls.return_value = mock_thread + with patch( + "runpod_flash.cli.commands.run.threading.Event" + ) as mock_event_cls: + mock_stop = MagicMock() + mock_event_cls.return_value = mock_stop + + runner.invoke(app, ["run"]) + + mock_stop.set.assert_called_once() + mock_thread.join.assert_called_once_with(timeout=2) + + +class TestWatchAndRegenerate: + """Unit tests for the _watch_and_regenerate background function.""" + + def test_regenerates_server_py_on_py_file_change(self, tmp_path): + """When a .py file changes, server.py is regenerated.""" + import threading + from runpod_flash.cli.commands.run import _watch_and_regenerate + + stop = threading.Event() + + with patch( + "runpod_flash.cli.commands.run._scan_project_workers", return_value=[] + ) as mock_scan: + with patch( + "runpod_flash.cli.commands.run._generate_flash_server" + ) as mock_gen: + with patch( + "runpod_flash.cli.commands.run._watchfiles_watch" + ) as mock_watch: + # Yield one batch of changes then stop + mock_watch.return_value = iter([{(1, "/path/to/worker.py")}]) + stop.set() # ensures the loop exits after one iteration + _watch_and_regenerate(tmp_path, stop) + + mock_scan.assert_called_once_with(tmp_path) + mock_gen.assert_called_once() + + def 
test_ignores_non_py_changes(self, tmp_path): + """Changes to non-.py files do not trigger regeneration.""" + import threading + from runpod_flash.cli.commands.run import _watch_and_regenerate + + stop = threading.Event() + + with patch("runpod_flash.cli.commands.run._scan_project_workers") as mock_scan: + with patch( + "runpod_flash.cli.commands.run._generate_flash_server" + ) as mock_gen: + with patch( + "runpod_flash.cli.commands.run._watchfiles_watch" + ) as mock_watch: + mock_watch.return_value = iter([{(1, "/path/to/README.md")}]) + _watch_and_regenerate(tmp_path, stop) + + mock_scan.assert_not_called() + mock_gen.assert_not_called() + + def test_scan_error_does_not_crash_watcher(self, tmp_path): + """If regeneration raises, the watcher logs a warning and continues.""" + import threading + from runpod_flash.cli.commands.run import _watch_and_regenerate + + stop = threading.Event() + + with patch( + "runpod_flash.cli.commands.run._scan_project_workers", + side_effect=RuntimeError("scan failed"), + ): + with patch("runpod_flash.cli.commands.run._watchfiles_watch") as mock_watch: + mock_watch.return_value = iter([{(1, "/path/to/worker.py")}]) + # Should not raise + _watch_and_regenerate(tmp_path, stop) From e40930d2b923df44c171efba4571fe19d115d9a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 13:12:52 -0800 Subject: [PATCH 08/26] fix(run): suppress watchfiles debug logs from flash run output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit watchfiles emits DEBUG-level messages ("all changes filtered out", "rust notify timeout") that are correct behavior but should not be visible to users. Silence the watchfiles logger at WARNING in _watch_and_regenerate() — scoped to that namespace only. 
--- src/runpod_flash/cli/commands/run.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index 86fceb59..57221c1a 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -370,6 +370,9 @@ def _watch_and_regenerate(project_root: Path, stop_event: threading.Event) -> No Ignores .flash/ to avoid reacting to our own writes. Runs until stop_event is set. """ + # Suppress watchfiles' internal debug chatter (filter hits, rust timeouts). + logging.getLogger("watchfiles").setLevel(logging.WARNING) + watch_filter = _WatchfilesDefaultFilter(ignore_paths=[str(project_root / ".flash")]) try: From 22ccee5c3bbead8f94a1473423e9f84657197279 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 14:25:49 -0800 Subject: [PATCH 09/26] fix(run): omit body param from GET/HEAD route handlers FastAPI treats `body: dict` as a required JSON body. GET/HEAD routes must be zero-arg so Swagger UI and browsers do not attempt to send a body, which triggers a fetch TypeError. Split the LB route code generator in _generate_flash_server() on method: get/head emit no-arg handlers; all other methods keep body: dict. 
--- src/runpod_flash/cli/commands/run.py | 20 ++++++++++----- tests/unit/cli/test_run.py | 38 ++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index 57221c1a..e8732ad9 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -221,12 +221,20 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat handler_name = _sanitize_fn_name( f"_route_{worker.resource_name}_{fn_name}" ) - lines += [ - f'@app.{method}("{full_path}", tags=["{tag}"])', - f"async def {handler_name}(body: dict):", - f" return await {fn_name}(body)", - "", - ] + if method in ("get", "head"): + lines += [ + f'@app.{method}("{full_path}", tags=["{tag}"])', + f"async def {handler_name}():", + f" return await {fn_name}()", + "", + ] + else: + lines += [ + f'@app.{method}("{full_path}", tags=["{tag}"])', + f"async def {handler_name}(body: dict):", + f" return await {fn_name}(body)", + "", + ] # Health endpoints lines += [ diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index 1e0c549a..7fae486a 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -1,10 +1,12 @@ """Unit tests for run CLI command.""" import pytest +from pathlib import Path from unittest.mock import patch, MagicMock from typer.testing import CliRunner from runpod_flash.cli.main import app +from runpod_flash.cli.commands.run import WorkerInfo, _generate_flash_server @pytest.fixture @@ -423,3 +425,39 @@ def test_scan_error_does_not_crash_watcher(self, tmp_path): mock_watch.return_value = iter([{(1, "/path/to/worker.py")}]) # Should not raise _watch_and_regenerate(tmp_path, stop) + + +class TestGenerateFlashServer: + """Test _generate_flash_server() route code generation.""" + + def _make_lb_worker(self, tmp_path: Path, method: str) -> WorkerInfo: + return WorkerInfo( + file_path=tmp_path / "api.py", + url_prefix="/api", 
+ module_path="api", + resource_name="api", + worker_type="LB", + functions=["list_routes"], + lb_routes=[ + {"method": method, "path": "/routes/list", "fn_name": "list_routes"} + ], + ) + + def test_get_route_has_no_body_param(self, tmp_path): + """GET handler must omit body: dict to satisfy FastAPI/browser constraints.""" + worker = self._make_lb_worker(tmp_path, "GET") + server_path = _generate_flash_server(tmp_path, [worker]) + content = server_path.read_text() + + # The GET handler must be zero-arg + assert "async def _route_api_list_routes():" in content + # No body parameter on any GET handler + assert "body: dict" not in content + + def test_post_route_keeps_body_param(self, tmp_path): + """POST handler must include body: dict for JSON request body.""" + worker = self._make_lb_worker(tmp_path, "POST") + server_path = _generate_flash_server(tmp_path, [worker]) + content = server_path.read_text() + + assert "async def _route_api_list_routes(body: dict):" in content From adeb9c353784096f508b68d9b693d152c8a44b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 18:40:02 -0800 Subject: [PATCH 10/26] feat(run): proxy LB routes to deployed endpoints, restore --auto-provision LB route handlers were executing locally in the dev server process instead of forwarding to the deployed LB endpoint. The @remote decorator returns LB handlers unwrapped (passthrough) because in a deployed pod the body IS the HTTP handler, but in flash run there is no deployed pod. 
Changes: - Add _run_server_helpers.py with lb_proxy() that uses ResourceManager.get_or_deploy_resource() for on-demand provisioning and get_authenticated_httpx_client() for auth headers - Generate proxy handlers for all LB routes (any HTTP method) that forward requests to the deployed endpoint transparently - Import resource config variables (not function bodies) for LB workers so the actual DeployableResource object is passed to lb_proxy - Restore --auto-provision flag dropped in 35cfa6e, using existing ResourceDiscovery and DeploymentOrchestrator to provision all endpoints upfront and eliminate cold-start latency - Replace TestGenerateFlashServer tests with proxy-aware assertions --- .../cli/commands/_run_server_helpers.py | 84 +++++++++++ src/runpod_flash/cli/commands/run.py | 140 +++++++++++++++--- tests/unit/cli/test_run.py | 64 +++++--- 3 files changed, 249 insertions(+), 39 deletions(-) create mode 100644 src/runpod_flash/cli/commands/_run_server_helpers.py diff --git a/src/runpod_flash/cli/commands/_run_server_helpers.py b/src/runpod_flash/cli/commands/_run_server_helpers.py new file mode 100644 index 00000000..44b6d5d3 --- /dev/null +++ b/src/runpod_flash/cli/commands/_run_server_helpers.py @@ -0,0 +1,84 @@ +"""Helpers for the flash run dev server — loaded inside the generated server.py.""" + +import httpx +from fastapi import HTTPException, Request +from fastapi.responses import Response + +from runpod_flash.core.resources.base import DeployableResource +from runpod_flash.core.resources.resource_manager import ResourceManager +from runpod_flash.core.utils.http import get_authenticated_httpx_client + +_resource_manager = ResourceManager() + + +async def lb_proxy( + resource_config: DeployableResource, path_prefix: str, request: Request +) -> Response: + """Transparent HTTP proxy from flash run dev server to deployed LB endpoint. 
+ + Uses ResourceManager.get_or_deploy_resource() to resolve the endpoint, + which handles provisioning, name prefixing, and caching automatically. + + Args: + resource_config: The resource config object (e.g. LiveLoadBalancer instance) + path_prefix: URL prefix used by the dev server (e.g. "/api") — stripped before proxying + request: The incoming FastAPI request to forward + + Returns: + FastAPI Response with upstream status code and body + + Raises: + HTTPException 503: Endpoint not deployed or has no ID + HTTPException 504: Upstream request timed out + HTTPException 502: Connection error reaching the upstream endpoint + """ + try: + deployed = await _resource_manager.get_or_deploy_resource(resource_config) + endpoint_url = deployed.endpoint_url + except ValueError as e: + raise HTTPException( + status_code=503, + detail=f"Endpoint '{resource_config.name}' not available: {e}", + ) + except Exception as e: + raise HTTPException( + status_code=503, + detail=f"Failed to provision '{resource_config.name}': {e}", + ) + + target_path = request.url.path + if path_prefix and target_path.startswith(path_prefix): + target_path = target_path[len(path_prefix) :] + if not target_path: + target_path = "/" + + target_url = endpoint_url.rstrip("/") + target_path + if request.url.query: + target_url += "?" 
+ request.url.query + + body = await request.body() + skip_headers = {"host", "content-length", "transfer-encoding", "connection"} + headers = { + k: v for k, v in request.headers.items() if k.lower() not in skip_headers + } + + try: + async with get_authenticated_httpx_client(timeout=30.0) as client: + resp = await client.request( + request.method, target_url, content=body, headers=headers + ) + return Response( + content=resp.content, + status_code=resp.status_code, + media_type=resp.headers.get("content-type"), + ) + except httpx.TimeoutException: + raise HTTPException( + status_code=504, + detail=f"Timeout proxying to '{resource_config.name}'.", + ) + except httpx.RequestError as e: + raise HTTPException( + status_code=502, + detail=f"Connection error proxying to '{resource_config.name}': {e}", + ) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index e8732ad9..f6392990 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -94,6 +94,7 @@ def _scan_project_workers(project_root: Path) -> List[WorkerInfo]: "method": f.http_method, "path": f.http_path, "fn_name": f.function_name, + "config_variable": f.config_variable, } for f in lb_funcs ] @@ -149,6 +150,8 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat _ensure_gitignore(project_root) + has_lb_workers = any(w.worker_type == "LB" for w in workers) + lines = [ '"""Auto-generated Flash dev server. 
Do not edit — regenerated on each flash run."""', "import sys", @@ -156,15 +159,36 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat "from pathlib import Path", "sys.path.insert(0, str(Path(__file__).parent.parent))", "", - "from fastapi import FastAPI", - "", ] - # Collect all imports + if has_lb_workers: + lines += [ + "from fastapi import FastAPI, Request", + "from runpod_flash.cli.commands._run_server_helpers import lb_proxy as _lb_proxy", + "", + ] + else: + lines += [ + "from fastapi import FastAPI", + "", + ] + + # Collect imports — QB functions are called directly, LB config variables are + # passed to lb_proxy for on-demand provisioning via ResourceManager. all_imports: List[str] = [] for worker in workers: - for fn_name in worker.functions: - all_imports.append(f"from {worker.module_path} import {fn_name}") + if worker.worker_type == "QB": + for fn_name in worker.functions: + all_imports.append(f"from {worker.module_path} import {fn_name}") + elif worker.worker_type == "LB": + # Import the resource config variable (e.g. 
"api" from api = LiveLoadBalancer(...)) + config_vars = { + r["config_variable"] + for r in worker.lb_routes + if r.get("config_variable") + } + for var in sorted(config_vars): + all_imports.append(f"from {worker.module_path} import {var}") if all_imports: lines.extend(all_imports) @@ -217,24 +241,17 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat method = route["method"].lower() sub_path = route["path"].lstrip("/") fn_name = route["fn_name"] + config_var = route["config_variable"] full_path = f"{worker.url_prefix}/{sub_path}" handler_name = _sanitize_fn_name( f"_route_{worker.resource_name}_{fn_name}" ) - if method in ("get", "head"): - lines += [ - f'@app.{method}("{full_path}", tags=["{tag}"])', - f"async def {handler_name}():", - f" return await {fn_name}()", - "", - ] - else: - lines += [ - f'@app.{method}("{full_path}", tags=["{tag}"])', - f"async def {handler_name}(body: dict):", - f" return await {fn_name}(body)", - "", - ] + lines += [ + f'@app.{method}("{full_path}", tags=["{tag}"])', + f"async def {handler_name}(request: Request):", + f" return await _lb_proxy({config_var}, {worker.url_prefix!r}, request)", + "", + ] # Health endpoints lines += [ @@ -402,6 +419,73 @@ def _watch_and_regenerate(project_root: Path, stop_event: threading.Event) -> No pass # stop_event was set or watchfiles unavailable — both are fine +def _discover_resources(project_root: Path): + """Discover deployable resources in project files. + + Uses ResourceDiscovery to find all DeployableResource instances by + parsing @remote decorators and importing the referenced config variables. 
+ + Args: + project_root: Root directory of the Flash project + + Returns: + List of discovered DeployableResource instances + """ + from ...core.discovery import ResourceDiscovery + + py_files = sorted( + p + for p in project_root.rglob("*.py") + if not any( + skip in p.parts + for skip in (".flash", ".venv", "venv", "__pycache__", ".git") + ) + ) + + resources = [] + for py_file in py_files: + try: + discovery = ResourceDiscovery(str(py_file), max_depth=0) + resources.extend(discovery.discover()) + except Exception as e: + logger.debug("Discovery failed for %s: %s", py_file, e) + + if resources: + console.print(f"\n[dim]Discovered {len(resources)} resource(s):[/dim]") + for res in resources: + res_name = getattr(res, "name", "Unknown") + res_type = res.__class__.__name__ + console.print(f" [dim]- {res_name} ({res_type})[/dim]") + console.print() + + return resources + + +def _provision_resources(resources) -> None: + """Provision resources in parallel and wait for completion. + + Args: + resources: List of DeployableResource instances to provision + """ + import asyncio + + from ...core.deployment import DeploymentOrchestrator + + try: + console.print(f"[bold]Provisioning {len(resources)} resource(s)...[/bold]") + orchestrator = DeploymentOrchestrator(max_concurrent=3) + + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.run_until_complete(orchestrator.deploy_all(resources, show_progress=True)) + loop.close() + except Exception as e: + console.print(f"[yellow]Warning:[/yellow] Provisioning failed: {e}") + console.print( + "[dim]Resources will be provisioned on-demand at first request.[/dim]" + ) + + def run_command( host: str = typer.Option( "localhost", @@ -419,6 +503,11 @@ def run_command( reload: bool = typer.Option( True, "--reload/--no-reload", help="Enable auto-reload" ), + auto_provision: bool = typer.Option( + False, + "--auto-provision", + help="Auto-provision all endpoints on startup (eliminates cold-start on first request)", + ), 
): """Run Flash development server. @@ -434,6 +523,19 @@ def run_command( if not _is_reload(): os.environ["FLASH_IS_LIVE_PROVISIONING"] = "true" + # Auto-provision all endpoints upfront (eliminates cold-start) + if auto_provision and not _is_reload(): + try: + resources = _discover_resources(project_root) + if resources: + _provision_resources(resources) + except Exception as e: + logger.error("Auto-provisioning failed", exc_info=True) + console.print(f"[yellow]Warning:[/yellow] Auto-provisioning failed: {e}") + console.print( + "[dim]Resources will be provisioned on-demand at first request.[/dim]" + ) + # Discover @remote functions workers = _scan_project_workers(project_root) diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index 7fae486a..2152b526 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -430,7 +430,7 @@ def test_scan_error_does_not_crash_watcher(self, tmp_path): class TestGenerateFlashServer: """Test _generate_flash_server() route code generation.""" - def _make_lb_worker(self, tmp_path: Path, method: str) -> WorkerInfo: + def _make_lb_worker(self, tmp_path: Path, method: str = "GET") -> WorkerInfo: return WorkerInfo( file_path=tmp_path / "api.py", url_prefix="/api", @@ -439,25 +439,49 @@ def _make_lb_worker(self, tmp_path: Path, method: str) -> WorkerInfo: worker_type="LB", functions=["list_routes"], lb_routes=[ - {"method": method, "path": "/routes/list", "fn_name": "list_routes"} + { + "method": method, + "path": "/routes/list", + "fn_name": "list_routes", + "config_variable": "api_config", + } ], ) - def test_get_route_has_no_body_param(self, tmp_path): - """GET handler must omit body: dict to satisfy FastAPI/browser constraints.""" - worker = self._make_lb_worker(tmp_path, "GET") - server_path = _generate_flash_server(tmp_path, [worker]) - content = server_path.read_text() - - # The GET handler must be zero-arg - assert "async def _route_api_list_routes():" in content - # No body parameter on any GET 
handler - assert "body: dict" not in content - - def test_post_route_keeps_body_param(self, tmp_path): - """POST handler must include body: dict for JSON request body.""" - worker = self._make_lb_worker(tmp_path, "POST") - server_path = _generate_flash_server(tmp_path, [worker]) - content = server_path.read_text() - - assert "async def _route_api_list_routes(body: dict):" in content + def test_lb_route_generates_proxy_handler(self, tmp_path): + """All LB routes (any method) generate a proxy handler, not a local call.""" + for method in ("GET", "POST", "DELETE", "PUT", "PATCH"): + worker = self._make_lb_worker(tmp_path, method) + content = _generate_flash_server(tmp_path, [worker]).read_text() + assert "async def _route_api_list_routes(request: Request):" in content + assert "_lb_proxy(" in content + assert "body: dict" not in content + + def test_lb_config_variable_passed_to_proxy(self, tmp_path): + """The resource config variable is passed to lb_proxy, not a string name.""" + worker = self._make_lb_worker(tmp_path) + content = _generate_flash_server(tmp_path, [worker]).read_text() + # Config variable is passed as a Python reference, not a quoted string + assert "_lb_proxy(api_config," in content + assert "from api import api_config" in content + + def test_lb_proxy_import_present_when_lb_routes_exist(self, tmp_path): + """server.py imports _lb_proxy when there are LB workers.""" + worker = self._make_lb_worker(tmp_path) + content = _generate_flash_server(tmp_path, [worker]).read_text() + assert "_lb_proxy" in content + assert "lb_proxy" in content + + def test_qb_function_still_imported_directly(self, tmp_path): + """QB workers still import and call functions directly.""" + worker = WorkerInfo( + file_path=tmp_path / "worker.py", + url_prefix="/worker", + module_path="worker", + resource_name="worker", + worker_type="QB", + functions=["process"], + ) + content = _generate_flash_server(tmp_path, [worker]).read_text() + assert "from worker import process" in content 
+ assert "await process(" in content From be2e65eefa58ff5e8fc50e3148fac6d59079d9db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 18:50:42 -0800 Subject: [PATCH 11/26] fix(run): add project root to sys.path during resource discovery ResourceDiscovery._import_module() uses importlib to execute each file, but cross-module imports (e.g. "from longruns.stage1 import ...") fail when the project root isn't on sys.path. This caused --auto-provision to silently skip LB endpoints whose files import from sibling packages. --- src/runpod_flash/cli/commands/run.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index f6392990..18509abe 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -442,13 +442,24 @@ def _discover_resources(project_root: Path): ) ) + # Add project root to sys.path so cross-module imports resolve + # (e.g. api/routes.py doing "from longruns.stage1 import stage1_process"). 
+ root_str = str(project_root) + added_to_path = root_str not in sys.path + if added_to_path: + sys.path.insert(0, root_str) + resources = [] - for py_file in py_files: - try: - discovery = ResourceDiscovery(str(py_file), max_depth=0) - resources.extend(discovery.discover()) - except Exception as e: - logger.debug("Discovery failed for %s: %s", py_file, e) + try: + for py_file in py_files: + try: + discovery = ResourceDiscovery(str(py_file), max_depth=0) + resources.extend(discovery.discover()) + except Exception as e: + logger.debug("Discovery failed for %s: %s", py_file, e) + finally: + if added_to_path: + sys.path.remove(root_str) if resources: console.print(f"\n[dim]Discovered {len(resources)} resource(s):[/dim]") From 417f9ec0de7b2ffbe9c8946dce01eb214c4e306c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 18:54:56 -0800 Subject: [PATCH 12/26] feat(run): show resource count and elapsed time during cleanup Cleanup on server stop now prints a summary line with undeployed count and wall-clock duration, matching the provisioning output format. 
--- src/runpod_flash/cli/commands/run.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index 18509abe..f4d88b2d 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -356,19 +356,30 @@ def _cleanup_live_endpoints() -> None: if not live_items: return + import time + async def _do_cleanup(): + undeployed = 0 for key, resource in live_items.items(): name = getattr(resource, "name", key) try: success = await resource._do_undeploy() if success: console.print(f" Deprovisioned: {name}") + undeployed += 1 else: logger.warning(f"Failed to deprovision: {name}") except Exception as e: logger.warning(f"Error deprovisioning {name}: {e}") + return undeployed - asyncio.run(_do_cleanup()) + t0 = time.monotonic() + undeployed = asyncio.run(_do_cleanup()) + elapsed = time.monotonic() - t0 + console.print( + f" Cleanup completed: {undeployed}/{len(live_items)} " + f"resource(s) undeployed in {elapsed:.1f}s" + ) # Remove live- entries from persisted state so they don't linger. remaining = {k: v for k, v in resources.items() if k not in live_items} From e264ab18c2c1a415406f62d2166a46ac4ebf6e44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Feb 2026 21:37:52 -0800 Subject: [PATCH 13/26] fix(run): route LB calls through LoadBalancerSlsStub instead of HTTP proxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace lb_proxy (transparent HTTP forwarding) with lb_execute which uses LoadBalancerSlsStub's /execute dispatch path. This fixes 404s on CpuLiveLoadBalancer resources where the remote container has no user routes — only the /execute endpoint that accepts serialized function code. 
- Change isinstance check from LiveLoadBalancer to LiveServerlessMixin so all live resource types (including CpuLiveLoadBalancer) use /execute - Add explicit CpuLiveLoadBalancer singledispatch registration in registry - Generate server.py imports for both config var and function reference - Clean up redundant URL debug logs in resource_manager --- .../cli/commands/_run_server_helpers.py | 86 ++++++------------- src/runpod_flash/cli/commands/run.py | 10 ++- src/runpod_flash/core/api/runpod.py | 4 +- .../resources/load_balancer_sls_resource.py | 4 +- .../core/resources/resource_manager.py | 4 - src/runpod_flash/stubs/load_balancer_sls.py | 6 +- src/runpod_flash/stubs/registry.py | 26 ++++++ tests/unit/cli/test_run.py | 22 ++--- 8 files changed, 74 insertions(+), 88 deletions(-) diff --git a/src/runpod_flash/cli/commands/_run_server_helpers.py b/src/runpod_flash/cli/commands/_run_server_helpers.py index 44b6d5d3..8de16732 100644 --- a/src/runpod_flash/cli/commands/_run_server_helpers.py +++ b/src/runpod_flash/cli/commands/_run_server_helpers.py @@ -1,84 +1,46 @@ """Helpers for the flash run dev server — loaded inside the generated server.py.""" -import httpx from fastapi import HTTPException, Request -from fastapi.responses import Response -from runpod_flash.core.resources.base import DeployableResource from runpod_flash.core.resources.resource_manager import ResourceManager -from runpod_flash.core.utils.http import get_authenticated_httpx_client +from runpod_flash.stubs.load_balancer_sls import LoadBalancerSlsStub _resource_manager = ResourceManager() -async def lb_proxy( - resource_config: DeployableResource, path_prefix: str, request: Request -) -> Response: - """Transparent HTTP proxy from flash run dev server to deployed LB endpoint. +async def lb_execute(resource_config, func, request: Request): + """Execute LB function on deployed endpoint via LoadBalancerSlsStub. 
- Uses ResourceManager.get_or_deploy_resource() to resolve the endpoint, - which handles provisioning, name prefixing, and caching automatically. - - Args: - resource_config: The resource config object (e.g. LiveLoadBalancer instance) - path_prefix: URL prefix used by the dev server (e.g. "/api") — stripped before proxying - request: The incoming FastAPI request to forward - - Returns: - FastAPI Response with upstream status code and body - - Raises: - HTTPException 503: Endpoint not deployed or has no ID - HTTPException 504: Upstream request timed out - HTTPException 502: Connection error reaching the upstream endpoint + Uses the same /execute dispatch path that works on main — provisions + the endpoint, serializes the function via cloudpickle, and POSTs to + /execute on the deployed container. """ try: deployed = await _resource_manager.get_or_deploy_resource(resource_config) - endpoint_url = deployed.endpoint_url - except ValueError as e: - raise HTTPException( - status_code=503, - detail=f"Endpoint '{resource_config.name}' not available: {e}", - ) except Exception as e: raise HTTPException( status_code=503, detail=f"Failed to provision '{resource_config.name}': {e}", ) - target_path = request.url.path - if path_prefix and target_path.startswith(path_prefix): - target_path = target_path[len(path_prefix) :] - if not target_path: - target_path = "/" - - target_url = endpoint_url.rstrip("/") + target_path - if request.url.query: - target_url += "?" 
+ request.url.query + stub = LoadBalancerSlsStub(deployed) - body = await request.body() - skip_headers = {"host", "content-length", "transfer-encoding", "connection"} - headers = { - k: v for k, v in request.headers.items() if k.lower() not in skip_headers - } + # Parse HTTP request into function kwargs + if request.method in ("POST", "PUT", "PATCH"): + try: + kwargs = await request.json() + if not isinstance(kwargs, dict): + kwargs = {"input": kwargs} + except Exception: + kwargs = {} + else: + kwargs = dict(request.query_params) try: - async with get_authenticated_httpx_client(timeout=30.0) as client: - resp = await client.request( - request.method, target_url, content=body, headers=headers - ) - return Response( - content=resp.content, - status_code=resp.status_code, - media_type=resp.headers.get("content-type"), - ) - except httpx.TimeoutException: - raise HTTPException( - status_code=504, - detail=f"Timeout proxying to '{resource_config.name}'.", - ) - except httpx.RequestError as e: - raise HTTPException( - status_code=502, - detail=f"Connection error proxying to '{resource_config.name}': {e}", - ) + return await stub(func, None, None, False, **kwargs) + except TimeoutError as e: + raise HTTPException(status_code=504, detail=str(e)) + except ConnectionError as e: + raise HTTPException(status_code=502, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index f4d88b2d..1c5b117c 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -164,7 +164,7 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat if has_lb_workers: lines += [ "from fastapi import FastAPI, Request", - "from runpod_flash.cli.commands._run_server_helpers import lb_proxy as _lb_proxy", + "from runpod_flash.cli.commands._run_server_helpers import lb_execute as _lb_execute", "", ] else: @@ -173,8 
+173,8 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat "", ] - # Collect imports — QB functions are called directly, LB config variables are - # passed to lb_proxy for on-demand provisioning via ResourceManager. + # Collect imports — QB functions are called directly, LB config variables and + # functions are passed to lb_execute for dispatch via LoadBalancerSlsStub. all_imports: List[str] = [] for worker in workers: if worker.worker_type == "QB": @@ -189,6 +189,8 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat } for var in sorted(config_vars): all_imports.append(f"from {worker.module_path} import {var}") + for fn_name in worker.functions: + all_imports.append(f"from {worker.module_path} import {fn_name}") if all_imports: lines.extend(all_imports) @@ -249,7 +251,7 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat lines += [ f'@app.{method}("{full_path}", tags=["{tag}"])', f"async def {handler_name}(request: Request):", - f" return await _lb_proxy({config_var}, {worker.url_prefix!r}, request)", + f" return await _lb_execute({config_var}, {fn_name}, request)", "", ] diff --git a/src/runpod_flash/core/api/runpod.py b/src/runpod_flash/core/api/runpod.py index bc30219a..478428b8 100644 --- a/src/runpod_flash/core/api/runpod.py +++ b/src/runpod_flash/core/api/runpod.py @@ -202,7 +202,7 @@ async def save_endpoint(self, input_data: Dict[str, Any]) -> Dict[str, Any]: variables = {"input": input_data} - log.debug(f"Saving endpoint with GraphQL: {input_data.get('name', 'unnamed')}") + log.debug(f"GraphQL saveEndpoint: {input_data.get('name', 'unnamed')}") result = await self._execute_graphql(mutation, variables) @@ -211,7 +211,7 @@ async def save_endpoint(self, input_data: Dict[str, Any]) -> Dict[str, Any]: endpoint_data = result["saveEndpoint"] log.debug( - f"Saved endpoint: {endpoint_data.get('id', 'unknown')} - {endpoint_data.get('name', 'unnamed')}" + f"GraphQL 
response: {endpoint_data.get('id', 'unknown')} ({endpoint_data.get('name', 'unnamed')})" ) return endpoint_data diff --git a/src/runpod_flash/core/resources/load_balancer_sls_resource.py b/src/runpod_flash/core/resources/load_balancer_sls_resource.py index eb664ed0..df84d622 100644 --- a/src/runpod_flash/core/resources/load_balancer_sls_resource.py +++ b/src/runpod_flash/core/resources/load_balancer_sls_resource.py @@ -259,10 +259,10 @@ async def _do_deploy(self) -> "LoadBalancerSlsResource": self.env["FLASH_IS_MOTHERSHIP"] = "true" # Call parent deploy (creates endpoint via RunPod API) - log.debug(f"Deploying LB endpoint {self.name}...") + log.info(f"Deploying LB endpoint: {self.name}") deployed = await super()._do_deploy() - log.debug(f"LB endpoint {self.name} ({deployed.id}) deployed successfully") + log.info(f"Deployed: {self.name} ({deployed.url})") return deployed except Exception as e: diff --git a/src/runpod_flash/core/resources/resource_manager.py b/src/runpod_flash/core/resources/resource_manager.py index 0cd18f51..82eebe2b 100644 --- a/src/runpod_flash/core/resources/resource_manager.py +++ b/src/runpod_flash/core/resources/resource_manager.py @@ -245,7 +245,6 @@ async def get_or_deploy_resource( deployed_resource = await self._deploy_with_error_context( config ) - log.debug(f"URL: {deployed_resource.url}") self._add_resource(resource_key, deployed_resource) return deployed_resource except Exception: @@ -278,7 +277,6 @@ async def get_or_deploy_resource( deployed_resource = await self._deploy_with_error_context( config ) - log.debug(f"URL: {deployed_resource.url}") self._add_resource(resource_key, deployed_resource) return deployed_resource except Exception: @@ -292,13 +290,11 @@ async def get_or_deploy_resource( raise # Config unchanged, reuse existing - log.info(f"URL: {existing.url}") return existing # No existing resource, deploy new one try: deployed_resource = await self._deploy_with_error_context(config) - log.debug(f"URL: {deployed_resource.url}") 
self._add_resource(resource_key, deployed_resource) return deployed_resource except Exception: diff --git a/src/runpod_flash/stubs/load_balancer_sls.py b/src/runpod_flash/stubs/load_balancer_sls.py index d30a9ce5..f44bccbb 100644 --- a/src/runpod_flash/stubs/load_balancer_sls.py +++ b/src/runpod_flash/stubs/load_balancer_sls.py @@ -75,10 +75,10 @@ def _should_use_execute_endpoint(self, func: Callable[..., Any]) -> bool: Returns: True if /execute should be used, False if user route should be used """ - from ..core.resources.live_serverless import LiveLoadBalancer + from ..core.resources.live_serverless import LiveServerlessMixin - # Always use /execute for LiveLoadBalancer (local development) - if isinstance(self.server, LiveLoadBalancer): + # Always use /execute for live resources (local development) + if isinstance(self.server, LiveServerlessMixin): log.debug(f"Using /execute endpoint for LiveLoadBalancer: {func.__name__}") return True diff --git a/src/runpod_flash/stubs/registry.py b/src/runpod_flash/stubs/registry.py index 674e0085..bbea9243 100644 --- a/src/runpod_flash/stubs/registry.py +++ b/src/runpod_flash/stubs/registry.py @@ -3,6 +3,7 @@ from functools import singledispatch from ..core.resources import ( + CpuLiveLoadBalancer, CpuLiveServerless, CpuServerlessEndpoint, LiveLoadBalancer, @@ -209,3 +210,28 @@ async def stubbed_resource( ) return stubbed_resource + + +@stub_resource.register(CpuLiveLoadBalancer) +def _(resource, **extra): + """Create stub for CpuLiveLoadBalancer (HTTP-based execution, local testing).""" + stub = LoadBalancerSlsStub(resource) + + async def stubbed_resource( + func, + dependencies, + system_dependencies, + accelerate_downloads, + *args, + **kwargs, + ) -> dict: + return await stub( + func, + dependencies, + system_dependencies, + accelerate_downloads, + *args, + **kwargs, + ) + + return stubbed_resource diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index 2152b526..8e92ff0c 100644 --- 
a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -448,29 +448,29 @@ def _make_lb_worker(self, tmp_path: Path, method: str = "GET") -> WorkerInfo: ], ) - def test_lb_route_generates_proxy_handler(self, tmp_path): - """All LB routes (any method) generate a proxy handler, not a local call.""" + def test_lb_route_generates_execute_handler(self, tmp_path): + """All LB routes (any method) generate a stub-based execute handler.""" for method in ("GET", "POST", "DELETE", "PUT", "PATCH"): worker = self._make_lb_worker(tmp_path, method) content = _generate_flash_server(tmp_path, [worker]).read_text() assert "async def _route_api_list_routes(request: Request):" in content - assert "_lb_proxy(" in content + assert "_lb_execute(" in content assert "body: dict" not in content - def test_lb_config_variable_passed_to_proxy(self, tmp_path): - """The resource config variable is passed to lb_proxy, not a string name.""" + def test_lb_config_and_function_passed_to_execute(self, tmp_path): + """Both config variable and function are passed to lb_execute.""" worker = self._make_lb_worker(tmp_path) content = _generate_flash_server(tmp_path, [worker]).read_text() - # Config variable is passed as a Python reference, not a quoted string - assert "_lb_proxy(api_config," in content + assert "_lb_execute(api_config, list_routes, request)" in content assert "from api import api_config" in content + assert "from api import list_routes" in content - def test_lb_proxy_import_present_when_lb_routes_exist(self, tmp_path): - """server.py imports _lb_proxy when there are LB workers.""" + def test_lb_execute_import_present_when_lb_routes_exist(self, tmp_path): + """server.py imports _lb_execute when there are LB workers.""" worker = self._make_lb_worker(tmp_path) content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "_lb_proxy" in content - assert "lb_proxy" in content + assert "_lb_execute" in content + assert "lb_execute" in content def 
test_qb_function_still_imported_directly(self, tmp_path): """QB workers still import and call functions directly.""" From 91dcc155190a1f3b8739e12172744bbb4606474b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 19 Feb 2026 08:18:20 -0800 Subject: [PATCH 14/26] fix(run): revert LB to remote dispatch, remove QB /run route MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore lb_execute to dispatch through LoadBalancerSlsStub instead of calling functions locally — LB resources require Live Serverless containers and cannot execute on a local machine. Keep _map_body_to_params and body: dict signatures for OpenAPI/Swagger compatibility while dispatching remotely via the stub's /execute path. Remove /run from generated QB routes, retaining only /run_sync since the dev server executes synchronously. --- .../cli/commands/_run_server_helpers.py | 50 +++++++---- src/runpod_flash/cli/commands/run.py | 41 +++++---- tests/unit/cli/test_run.py | 85 +++++++++++++++++-- 3 files changed, 128 insertions(+), 48 deletions(-) diff --git a/src/runpod_flash/cli/commands/_run_server_helpers.py b/src/runpod_flash/cli/commands/_run_server_helpers.py index 8de16732..70391bbd 100644 --- a/src/runpod_flash/cli/commands/_run_server_helpers.py +++ b/src/runpod_flash/cli/commands/_run_server_helpers.py @@ -1,6 +1,8 @@ """Helpers for the flash run dev server — loaded inside the generated server.py.""" -from fastapi import HTTPException, Request +import inspect + +from fastapi import HTTPException from runpod_flash.core.resources.resource_manager import ResourceManager from runpod_flash.stubs.load_balancer_sls import LoadBalancerSlsStub @@ -8,12 +10,36 @@ _resource_manager = ResourceManager() -async def lb_execute(resource_config, func, request: Request): - """Execute LB function on deployed endpoint via LoadBalancerSlsStub. +def _map_body_to_params(func, body): + """Map an HTTP request body to function parameters. 
- Uses the same /execute dispatch path that works on main — provisions - the endpoint, serializes the function via cloudpickle, and POSTs to - /execute on the deployed container. + If the body is a dict whose keys match the function's parameter names, + spread it as kwargs. Otherwise pass the whole body as the value of the + first parameter (mirrors how FastAPI maps a JSON body to a single param). + """ + sig = inspect.signature(func) + param_names = set(sig.parameters.keys()) + + if isinstance(body, dict) and body.keys() <= param_names: + return body + + first_param = next(iter(sig.parameters), None) + if first_param is None: + return {} + return {first_param: body} + + +async def lb_execute(resource_config, func, body: dict): + """Dispatch an LB route to the deployed endpoint via LoadBalancerSlsStub. + + Provisions the endpoint via ResourceManager, maps the HTTP body to + function kwargs, then dispatches through the stub's /execute path + which serializes the function via cloudpickle to the remote container. + + Args: + resource_config: The resource config object (e.g. LiveLoadBalancer instance). + func: The @remote LB route handler function. + body: Parsed request body (from FastAPI's automatic JSON parsing). 
""" try: deployed = await _resource_manager.get_or_deploy_resource(resource_config) @@ -24,17 +50,7 @@ async def lb_execute(resource_config, func, request: Request): ) stub = LoadBalancerSlsStub(deployed) - - # Parse HTTP request into function kwargs - if request.method in ("POST", "PUT", "PATCH"): - try: - kwargs = await request.json() - if not isinstance(kwargs, dict): - kwargs = {"input": kwargs} - except Exception: - kwargs = {} - else: - kwargs = dict(request.query_params) + kwargs = _map_body_to_params(func, body) try: return await stub(func, None, None, False, **kwargs) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index 1c5b117c..8bc94a22 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -213,11 +213,9 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat if worker.worker_type == "QB": if len(worker.functions) == 1: fn = worker.functions[0] - handler_name = _sanitize_fn_name(f"{worker.resource_name}_run") - run_path = f"{worker.url_prefix}/run" + handler_name = _sanitize_fn_name(f"{worker.resource_name}_run_sync") sync_path = f"{worker.url_prefix}/run_sync" lines += [ - f'@app.post("{run_path}", tags=["{tag}"])', f'@app.post("{sync_path}", tags=["{tag}"])', f"async def {handler_name}(body: dict):", f' result = await {fn}(body.get("input", body))', @@ -226,11 +224,11 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat ] else: for fn in worker.functions: - handler_name = _sanitize_fn_name(f"{worker.resource_name}_{fn}_run") - run_path = f"{worker.url_prefix}/{fn}/run" + handler_name = _sanitize_fn_name( + f"{worker.resource_name}_{fn}_run_sync" + ) sync_path = f"{worker.url_prefix}/{fn}/run_sync" lines += [ - f'@app.post("{run_path}", tags=["{tag}"])', f'@app.post("{sync_path}", tags=["{tag}"])', f"async def {handler_name}(body: dict):", f' result = await {fn}(body.get("input", body))', @@ -248,12 +246,21 @@ 
def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat handler_name = _sanitize_fn_name( f"_route_{worker.resource_name}_{fn_name}" ) - lines += [ - f'@app.{method}("{full_path}", tags=["{tag}"])', - f"async def {handler_name}(request: Request):", - f" return await _lb_execute({config_var}, {fn_name}, request)", - "", - ] + has_body = method in ("post", "put", "patch", "delete") + if has_body: + lines += [ + f'@app.{method}("{full_path}", tags=["{tag}"])', + f"async def {handler_name}(body: dict):", + f" return await _lb_execute({config_var}, {fn_name}, body)", + "", + ] + else: + lines += [ + f'@app.{method}("{full_path}", tags=["{tag}"])', + f"async def {handler_name}(request: Request):", + f" return await _lb_execute({config_var}, {fn_name}, dict(request.query_params))", + "", + ] # Health endpoints lines += [ @@ -286,11 +293,6 @@ def _print_startup_table(workers: List[WorkerInfo], host: str, port: int) -> Non for worker in workers: if worker.worker_type == "QB": if len(worker.functions) == 1: - table.add_row( - f"POST {worker.url_prefix}/run", - worker.resource_name, - "QB", - ) table.add_row( f"POST {worker.url_prefix}/run_sync", worker.resource_name, @@ -298,11 +300,6 @@ def _print_startup_table(workers: List[WorkerInfo], host: str, port: int) -> Non ) else: for fn in worker.functions: - table.add_row( - f"POST {worker.url_prefix}/{fn}/run", - worker.resource_name, - "QB", - ) table.add_row( f"POST {worker.url_prefix}/{fn}/run_sync", worker.resource_name, diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index 8e92ff0c..d13abd12 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -448,20 +448,28 @@ def _make_lb_worker(self, tmp_path: Path, method: str = "GET") -> WorkerInfo: ], ) - def test_lb_route_generates_execute_handler(self, tmp_path): - """All LB routes (any method) generate a stub-based execute handler.""" - for method in ("GET", "POST", "DELETE", "PUT", "PATCH"): + def 
test_post_lb_route_generates_body_param(self, tmp_path): + """POST/PUT/PATCH/DELETE LB routes use body: dict for OpenAPI docs.""" + for method in ("POST", "PUT", "PATCH", "DELETE"): worker = self._make_lb_worker(tmp_path, method) content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "async def _route_api_list_routes(request: Request):" in content - assert "_lb_execute(" in content - assert "body: dict" not in content + assert "async def _route_api_list_routes(body: dict):" in content + assert "_lb_execute(api_config, list_routes, body)" in content - def test_lb_config_and_function_passed_to_execute(self, tmp_path): - """Both config variable and function are passed to lb_execute.""" + def test_get_lb_route_uses_query_params(self, tmp_path): + """GET LB routes pass query params as a dict.""" + worker = self._make_lb_worker(tmp_path, "GET") + content = _generate_flash_server(tmp_path, [worker]).read_text() + assert "async def _route_api_list_routes(request: Request):" in content + assert ( + "_lb_execute(api_config, list_routes, dict(request.query_params))" + in content + ) + + def test_lb_config_var_and_function_imported(self, tmp_path): + """LB config vars and functions are both imported for remote dispatch.""" worker = self._make_lb_worker(tmp_path) content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "_lb_execute(api_config, list_routes, request)" in content assert "from api import api_config" in content assert "from api import list_routes" in content @@ -485,3 +493,62 @@ def test_qb_function_still_imported_directly(self, tmp_path): content = _generate_flash_server(tmp_path, [worker]).read_text() assert "from worker import process" in content assert "await process(" in content + + +class TestMapBodyToParams: + """Tests for _map_body_to_params — maps HTTP body to function arguments.""" + + def test_body_keys_match_params_spreads_as_kwargs(self): + from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params 
+ + def process(name: str, value: int): + pass + + result = _map_body_to_params(process, {"name": "test", "value": 42}) + assert result == {"name": "test", "value": 42} + + def test_body_keys_mismatch_wraps_in_first_param(self): + from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params + + def run_pipeline(input_data: dict): + pass + + body = {"text": "hello", "mode": "fast"} + result = _map_body_to_params(run_pipeline, body) + assert result == {"input_data": {"text": "hello", "mode": "fast"}} + + def test_non_dict_body_wraps_in_first_param(self): + from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params + + def run_pipeline(input_data): + pass + + result = _map_body_to_params(run_pipeline, [1, 2, 3]) + assert result == {"input_data": [1, 2, 3]} + + def test_no_params_returns_empty(self): + from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params + + def no_args(): + pass + + result = _map_body_to_params(no_args, {"key": "val"}) + assert result == {} + + def test_partial_key_match_wraps_in_first_param(self): + from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params + + def process(name: str, value: int): + pass + + result = _map_body_to_params(process, {"name": "test", "extra": "bad"}) + assert result == {"name": {"name": "test", "extra": "bad"}} + + def test_empty_dict_body_spreads_as_empty_kwargs(self): + from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params + + def run_pipeline(input_data: dict): + pass + + result = _map_body_to_params(run_pipeline, {}) + assert result == {} From d15b51e8ead31345427b8242118e511a8aaf7f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 19 Feb 2026 12:26:44 -0800 Subject: [PATCH 15/26] refactor(init): simplify skeleton to flat worker files for flash run Replace the old multi-directory skeleton (main.py, mothership.py, workers/) with three flat files: gpu_worker.py, cpu_worker.py, and 
lb_worker.py. flash run auto-discovers @remote functions so the FastAPI boilerplate and router structure are no longer needed. - Remove main.py, mothership.py, workers/, .ruff_cache from skeleton - Add gpu_worker.py (QB GPU), cpu_worker.py (QB CPU), lb_worker.py (LB) - Simplify pyproject.toml deps (drop fastapi/uvicorn) - Add .flash/ to .gitignore - Rewrite README with uv setup, QB/LB examples, GpuType reference - Update init command panel output and next steps - Add Ctrl+C cleanup hint to flash run startup output - Update skeleton tests for new file structure --- src/runpod_flash/cli/commands/init.py | 12 +- src/runpod_flash/cli/commands/run.py | 5 +- .../cli/utils/skeleton_template/.gitignore | 1 + .../cli/utils/skeleton_template/README.md | 304 +++++++----------- .../cli/utils/skeleton_template/cpu_worker.py | 17 + .../cli/utils/skeleton_template/gpu_worker.py | 27 ++ .../cli/utils/skeleton_template/lb_worker.py | 24 ++ .../cli/utils/skeleton_template/main.py | 44 --- .../cli/utils/skeleton_template/mothership.py | 55 ---- .../utils/skeleton_template/pyproject.toml | 47 +-- .../skeleton_template/workers/__init__.py | 0 .../skeleton_template/workers/cpu/__init__.py | 19 -- .../skeleton_template/workers/cpu/endpoint.py | 36 --- .../skeleton_template/workers/gpu/__init__.py | 19 -- .../skeleton_template/workers/gpu/endpoint.py | 61 ---- tests/unit/test_skeleton.py | 51 ++- 16 files changed, 217 insertions(+), 505 deletions(-) create mode 100644 src/runpod_flash/cli/utils/skeleton_template/cpu_worker.py create mode 100644 src/runpod_flash/cli/utils/skeleton_template/gpu_worker.py create mode 100644 src/runpod_flash/cli/utils/skeleton_template/lb_worker.py delete mode 100644 src/runpod_flash/cli/utils/skeleton_template/main.py delete mode 100644 src/runpod_flash/cli/utils/skeleton_template/mothership.py delete mode 100644 src/runpod_flash/cli/utils/skeleton_template/workers/__init__.py delete mode 100644 
src/runpod_flash/cli/utils/skeleton_template/workers/cpu/__init__.py delete mode 100644 src/runpod_flash/cli/utils/skeleton_template/workers/cpu/endpoint.py delete mode 100644 src/runpod_flash/cli/utils/skeleton_template/workers/gpu/__init__.py delete mode 100644 src/runpod_flash/cli/utils/skeleton_template/workers/gpu/endpoint.py diff --git a/src/runpod_flash/cli/commands/init.py b/src/runpod_flash/cli/commands/init.py index f684e537..eabd7583 100644 --- a/src/runpod_flash/cli/commands/init.py +++ b/src/runpod_flash/cli/commands/init.py @@ -81,12 +81,10 @@ def init_command( panel_content += "Project structure:\n" panel_content += f" {actual_project_name}/\n" - panel_content += " ├── main.py # Flash Server (FastAPI)\n" - panel_content += " ├── mothership.py # Mothership endpoint config\n" - panel_content += " ├── pyproject.toml # Python project config\n" - panel_content += " ├── workers/\n" - panel_content += " │ ├── gpu/ # GPU worker\n" - panel_content += " │ └── cpu/ # CPU worker\n" + panel_content += " ├── gpu_worker.py # GPU serverless worker\n" + panel_content += " ├── cpu_worker.py # CPU serverless worker\n" + panel_content += " ├── lb_worker.py # CPU load-balanced API\n" + panel_content += " ├── pyproject.toml\n" panel_content += " ├── .env.example\n" panel_content += " ├── requirements.txt\n" panel_content += " └── README.md\n" @@ -105,8 +103,6 @@ def init_command( steps_table.add_row(f"{step_num}.", f"cd {actual_project_name}") step_num += 1 - steps_table.add_row(f"{step_num}.", "Review and customize mothership.py (optional)") - step_num += 1 steps_table.add_row(f"{step_num}.", "pip install -r requirements.txt") step_num += 1 steps_table.add_row(f"{step_num}.", "cp .env.example .env") diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index 8bc94a22..b3000a20 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -316,7 +316,10 @@ def _print_startup_table(workers: 
List[WorkerInfo], host: str, port: int) -> Non ) console.print(table) - console.print(f"\n Visit [bold]http://{host}:{port}/docs[/bold] for Swagger UI\n") + console.print(f"\n Visit [bold]http://{host}:{port}/docs[/bold] for Swagger UI") + console.print( + " Press [bold]Ctrl+C[/bold] to stop — provisioned endpoints are cleaned up automatically\n" + ) def _cleanup_live_endpoints() -> None: diff --git a/src/runpod_flash/cli/utils/skeleton_template/.gitignore b/src/runpod_flash/cli/utils/skeleton_template/.gitignore index f0673581..0e3b93d7 100644 --- a/src/runpod_flash/cli/utils/skeleton_template/.gitignore +++ b/src/runpod_flash/cli/utils/skeleton_template/.gitignore @@ -36,6 +36,7 @@ wheels/ .env.local # Flash +.flash/ .runpod/ dist/ diff --git a/src/runpod_flash/cli/utils/skeleton_template/README.md b/src/runpod_flash/cli/utils/skeleton_template/README.md index 6c4801e5..f30adf00 100644 --- a/src/runpod_flash/cli/utils/skeleton_template/README.md +++ b/src/runpod_flash/cli/utils/skeleton_template/README.md @@ -1,243 +1,176 @@ # {{project_name}} -Flash application demonstrating distributed GPU and CPU computing on Runpod's serverless infrastructure. - -## About This Template - -This project was generated using `flash init`. The `{{project_name}}` placeholder is automatically replaced with your actual project name during initialization. +Runpod Flash application with GPU and CPU workers on Runpod serverless infrastructure. ## Quick Start -### 1. Install Dependencies +Install [uv](https://docs.astral.sh/uv/getting-started/installation/) (recommended Python package manager): ```bash -pip install -r requirements.txt +curl -LsSf https://astral.sh/uv/install.sh | sh ``` -### 2. 
Configure Environment - -Create `.env` file: +Set up the project: ```bash -RUNPOD_API_KEY=your_api_key_here +uv venv && source .venv/bin/activate +uv sync +cp .env.example .env # Add your RUNPOD_API_KEY +flash run ``` -Get your API key from [Runpod Settings](https://www.runpod.io/console/user/settings). - -### 3. Run Locally +Or with pip: ```bash -# Standard run +python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +cp .env.example .env # Add your RUNPOD_API_KEY flash run - -# Faster development: pre-provision endpoints (eliminates cold-start delays) -flash run --auto-provision ``` -Server starts at **http://localhost:8000** +Server starts at **http://localhost:8888**. Visit **http://localhost:8888/docs** for interactive Swagger UI. -With `--auto-provision`, all serverless endpoints deploy before testing begins. This is much faster for development because endpoints are cached and reused across server restarts. Subsequent runs skip deployment and start immediately. +Use `flash run --auto-provision` to pre-deploy all endpoints on startup, eliminating cold-start delays on first request. Provisioned endpoints are cached and reused across restarts. -### 4. Test the API +When you stop the server with Ctrl+C, all endpoints provisioned during the session are automatically cleaned up. -```bash -# Health check -curl http://localhost:8000/ping +Get your API key from [Runpod Settings](https://www.runpod.io/console/user/settings). +Learn more about it from our [Documentation](https://docs.runpod.io/get-started/api-keys). 
+ +## Test the API -# GPU worker -curl -X POST http://localhost:8000/gpu/hello \ +```bash +# Queue-based GPU worker +curl -X POST http://localhost:8888/gpu_worker/run_sync \ -H "Content-Type: application/json" \ -d '{"message": "Hello GPU!"}' -# CPU worker -curl -X POST http://localhost:8000/cpu/hello \ +# Queue-based CPU worker +curl -X POST http://localhost:8888/cpu_worker/run_sync \ -H "Content-Type: application/json" \ -d '{"message": "Hello CPU!"}' -``` - -Visit **http://localhost:8000/docs** for interactive API documentation. -## What This Demonstrates - -### GPU Worker (`workers/gpu/`) -Simple GPU-based serverless function: -- Remote execution with `@remote` decorator -- GPU resource configuration -- Automatic scaling (0-3 workers) -- No external dependencies required - -```python -@remote( - resource_config=LiveServerless( - name="gpu_worker", - gpus=[GpuGroup.ADA_24], # RTX 4090 - workersMin=0, - workersMax=3, - ) -) -async def gpu_hello(input_data: dict) -> dict: - # Your GPU code here - return {"status": "success", "message": "Hello from GPU!"} -``` - -### CPU Worker (`workers/cpu/`) -Simple CPU-based serverless function: -- CPU-only execution (no GPU overhead) -- CpuLiveServerless configuration -- Efficient for API endpoints -- Automatic scaling (0-5 workers) +# Load-balanced HTTP endpoint +curl -X POST http://localhost:8888/lb_worker/process \ + -H "Content-Type: application/json" \ + -d '{"input": "test"}' -```python -@remote( - resource_config=CpuLiveServerless( - name="cpu_worker", - instanceIds=[CpuInstanceType.CPU3G_2_8], # 2 vCPU, 8GB RAM - workersMin=0, - workersMax=5, - ) -) -async def cpu_hello(input_data: dict) -> dict: - # Your CPU code here - return {"status": "success", "message": "Hello from CPU!"} +# Load-balanced health check +curl http://localhost:8888/lb_worker/health ``` ## Project Structure ``` {{project_name}}/ -├── main.py # FastAPI application -├── workers/ -│ ├── gpu/ # GPU worker -│ │ ├── __init__.py # FastAPI router -│ │ └── 
endpoint.py # @remote decorated function -│ └── cpu/ # CPU worker -│ ├── __init__.py # FastAPI router -│ └── endpoint.py # @remote decorated function -├── .env # Environment variables -├── requirements.txt # Dependencies -└── README.md # This file +├── gpu_worker.py # GPU serverless worker (queue-based) +├── cpu_worker.py # CPU serverless worker (queue-based) +├── lb_worker.py # CPU load-balanced HTTP endpoint +├── .env.example # Environment variable template +├── requirements.txt # Python dependencies +└── README.md ``` -## Key Concepts - -### Remote Execution -The `@remote` decorator transparently executes functions on serverless infrastructure: -- Code runs locally during development -- Automatically deploys to Runpod when configured -- Handles serialization, dependencies, and resource management - -### Resource Scaling -Both workers scale to zero when idle to minimize costs: -- **idleTimeout**: Seconds before scaling down (default: 60) -- **workersMin**: 0 = completely scales to zero -- **workersMax**: Maximum concurrent workers - -### GPU Types -Available GPU options for `LiveServerless`: -- `GpuGroup.ADA_24` - RTX 4090 (24GB) -- `GpuGroup.ADA_48_PRO` - RTX 6000 Ada, L40 (48GB) -- `GpuGroup.AMPERE_80` - A100 (80GB) -- `GpuGroup.ANY` - Any available GPU - -### CPU Types -Available CPU options for `CpuLiveServerless`: -- `CpuInstanceType.CPU3G_2_8` - 2 vCPU, 8GB RAM (General Purpose) -- `CpuInstanceType.CPU3C_4_8` - 4 vCPU, 8GB RAM (Compute Optimized) -- `CpuInstanceType.CPU5G_4_16` - 4 vCPU, 16GB RAM (Latest Gen) -- `CpuInstanceType.ANY` - Any available GPU - -## Development Workflow - -### Test Workers Locally -```bash -# Test GPU worker -python -m workers.gpu.endpoint +## Worker Types -# Test CPU worker -python -m workers.cpu.endpoint -``` +### Queue-Based (QB) Workers -### Run the Application -```bash -flash run -``` - -### Deploy to Production -```bash -# Build and deploy in one step -flash deploy - -# Or deploy to a specific environment -flash deploy --env 
production -``` - -## Adding New Workers - -### Add a GPU Worker +QB workers process jobs from a queue. Each call to `/run_sync` sends a job and waits +for the result. Use QB for compute-heavy tasks that may take seconds to minutes. -1. Create `workers/my_worker/endpoint.py`: +**gpu_worker.py** — GPU serverless function: ```python -from runpod_flash import remote, LiveServerless +from runpod_flash import GpuType, LiveServerless, remote -config = LiveServerless(name="my_worker") +gpu_config = LiveServerless( + name="gpu_worker", + gpus=[GpuType.ANY], +) -@remote(resource_config=config, dependencies=["torch"]) -async def my_function(data: dict) -> dict: +@remote(resource_config=gpu_config, dependencies=["torch"]) +async def gpu_hello(input_data: dict) -> dict: import torch - # Your code here - return {"result": "success"} + gpu_available = torch.cuda.is_available() + gpu_name = torch.cuda.get_device_name(0) if gpu_available else "No GPU detected" + return {"message": gpu_name} ``` -2. Create `workers/my_worker/__init__.py`: +**cpu_worker.py** — CPU serverless function: ```python -from fastapi import APIRouter -from .endpoint import my_function +from runpod_flash import CpuLiveServerless, remote -router = APIRouter() +cpu_config = CpuLiveServerless(name="cpu_worker") -@router.post("/process") -async def handler(data: dict): - return await my_function(data) +@remote(resource_config=cpu_config) +async def cpu_hello(input_data: dict) -> dict: + return {"message": "Hello from CPU!", **input_data} ``` -3. Add to `main.py`: -```python -from workers.my_worker import router as my_router -app.include_router(my_router, prefix="/my_worker") -``` +### Load-Balanced (LB) Workers -### Add a CPU Worker +LB workers expose standard HTTP endpoints (GET, POST, etc.) behind a load balancer. +Use LB for low-latency API endpoints that need horizontal scaling. 
-Same pattern but use `CpuLiveServerless`: +**lb_worker.py** — HTTP endpoints on a load-balanced container: ```python -from runpod_flash import remote, CpuLiveServerless, CpuInstanceType +from runpod_flash import CpuLiveLoadBalancer, remote -config = CpuLiveServerless( - name="my_cpu_worker", - instanceIds=[CpuInstanceType.CPU3G_2_8] +api_config = CpuLiveLoadBalancer( + name="lb_worker", + workersMin=1, ) -@remote(resource_config=config, dependencies=["requests"]) -async def fetch_data(url: str) -> dict: - import requests - return requests.get(url).json() +@remote(resource_config=api_config, method="POST", path="/process") +async def process(input_data: dict) -> dict: + return {"status": "success", "echo": input_data} + +@remote(resource_config=api_config, method="GET", path="/health") +async def health() -> dict: + return {"status": "healthy"} ``` -## Adding Dependencies +## Adding New Workers + +Create a new `.py` file with a `@remote` function. `flash run` auto-discovers all +`@remote` functions in the project. -Specify dependencies in the `@remote` decorator: ```python -@remote( - resource_config=config, - dependencies=["torch>=2.0.0", "transformers"], # Python packages - system_dependencies=["ffmpeg"] # System packages -) -async def my_function(data: dict) -> dict: - # Dependencies are automatically installed - import torch - import transformers +# my_worker.py +from runpod_flash import LiveServerless, GpuType, remote + +config = LiveServerless(name="my_worker", gpus=[GpuType.NVIDIA_GEFORCE_RTX_4090]) + +@remote(resource_config=config, dependencies=["transformers"]) +async def predict(input_data: dict) -> dict: + from transformers import pipeline + pipe = pipeline("sentiment-analysis") + return pipe(input_data["text"])[0] ``` +Then run `flash run` — the new worker appears automatically. 
+ +## GPU Types + +| Config | Hardware | VRAM | +|--------|----------|------| +| `GpuType.ANY` | Any available GPU | varies | +| `GpuType.NVIDIA_GEFORCE_RTX_4090` | RTX 4090 | 24 GB | +| `GpuType.NVIDIA_GEFORCE_RTX_5090` | RTX 5090 | 32 GB | +| `GpuType.NVIDIA_RTX_6000_ADA_GENERATION` | RTX 6000 Ada | 48 GB | +| `GpuType.NVIDIA_L4` | L4 | 24 GB | +| `GpuType.NVIDIA_A100_80GB_PCIe` | A100 PCIe | 80 GB | +| `GpuType.NVIDIA_A100_SXM4_80GB` | A100 SXM4 | 80 GB | +| `GpuType.NVIDIA_H100_80GB_HBM3` | H100 | 80 GB | +| `GpuType.NVIDIA_H200` | H200 | 141 GB | + +## CPU Types + +| Config | vCPU | RAM | +|--------|------|-----| +| `CpuInstanceType.CPU3G_2_8` | 2 | 8 GB | +| `CpuInstanceType.CPU3C_4_8` | 4 | 8 GB | +| `CpuInstanceType.CPU5G_4_16` | 4 | 16 GB | + ## Environment Variables ```bash @@ -245,16 +178,13 @@ async def my_function(data: dict) -> dict: RUNPOD_API_KEY=your_api_key # Optional -FLASH_HOST=localhost # Host to bind the server to (default: localhost) -FLASH_PORT=8888 # Port to bind the server to (default: 8888) -LOG_LEVEL=INFO # Logging level (default: INFO) +FLASH_HOST=localhost # Server host (default: localhost) +FLASH_PORT=8888 # Server port (default: 8888) +LOG_LEVEL=INFO # Logging level (default: INFO) ``` -## Next Steps +## Deploy -- Add your ML models or processing logic -- Configure GPU/CPU resources based on your needs -- Add authentication to your endpoints -- Implement error handling and retries -- Add monitoring and logging -- Deploy to production with `flash deploy` +```bash +flash deploy +``` diff --git a/src/runpod_flash/cli/utils/skeleton_template/cpu_worker.py b/src/runpod_flash/cli/utils/skeleton_template/cpu_worker.py new file mode 100644 index 00000000..aee4b5a3 --- /dev/null +++ b/src/runpod_flash/cli/utils/skeleton_template/cpu_worker.py @@ -0,0 +1,17 @@ +from runpod_flash import CpuLiveServerless, remote + +cpu_config = CpuLiveServerless(name="cpu_worker") + + +@remote(resource_config=cpu_config) +async def cpu_hello(input_data: dict) 
-> dict: + """CPU worker — lightweight processing without GPU.""" + import platform + from datetime import datetime + + return { + "message": input_data.get("message", "Hello from CPU worker!"), + "timestamp": datetime.now().isoformat(), + "platform": platform.system(), + "python_version": platform.python_version(), + } diff --git a/src/runpod_flash/cli/utils/skeleton_template/gpu_worker.py b/src/runpod_flash/cli/utils/skeleton_template/gpu_worker.py new file mode 100644 index 00000000..d787e1e9 --- /dev/null +++ b/src/runpod_flash/cli/utils/skeleton_template/gpu_worker.py @@ -0,0 +1,27 @@ +from runpod_flash import GpuType, LiveServerless, remote + +gpu_config = LiveServerless( + name="gpu_worker", + gpus=[GpuType.ANY], +) + + +@remote(resource_config=gpu_config, dependencies=["torch"]) +async def gpu_hello(input_data: dict) -> dict: + """GPU worker — detects available GPU hardware.""" + import platform + + try: + import torch + + gpu_available = torch.cuda.is_available() + gpu_name = torch.cuda.get_device_name(0) if gpu_available else "No GPU detected" + except Exception as e: + gpu_available = False + gpu_name = f"Error: {e}" + + return { + "message": input_data.get("message", "Hello from GPU worker!"), + "gpu": {"available": gpu_available, "name": gpu_name}, + "python_version": platform.python_version(), + } diff --git a/src/runpod_flash/cli/utils/skeleton_template/lb_worker.py b/src/runpod_flash/cli/utils/skeleton_template/lb_worker.py new file mode 100644 index 00000000..1b40ed0c --- /dev/null +++ b/src/runpod_flash/cli/utils/skeleton_template/lb_worker.py @@ -0,0 +1,24 @@ +from runpod_flash import CpuLiveLoadBalancer, remote + +api_config = CpuLiveLoadBalancer( + name="lb_worker", + workersMin=1, +) + + +@remote(resource_config=api_config, method="POST", path="/process") +async def process(input_data: dict) -> dict: + """Process input data on a load-balanced CPU endpoint.""" + from datetime import datetime + + return { + "status": "success", + "echo": 
input_data, + "timestamp": datetime.now().isoformat(), + } + + +@remote(resource_config=api_config, method="GET", path="/health") +async def health() -> dict: + """Health check for the load-balanced endpoint.""" + return {"status": "healthy"} diff --git a/src/runpod_flash/cli/utils/skeleton_template/main.py b/src/runpod_flash/cli/utils/skeleton_template/main.py deleted file mode 100644 index ad3ce717..00000000 --- a/src/runpod_flash/cli/utils/skeleton_template/main.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging -import os - -from fastapi import FastAPI - -from workers.cpu import cpu_router -from workers.gpu import gpu_router - -logger = logging.getLogger(__name__) - - -app = FastAPI( - title="Flash Application", - description="Distributed GPU and CPU computing with Runpod Flash", - version="0.1.0", -) - -# Include routers -app.include_router(gpu_router, prefix="/gpu", tags=["GPU Workers"]) -app.include_router(cpu_router, prefix="/cpu", tags=["CPU Workers"]) - - -@app.get("/") -def home(): - return { - "message": "Flash Application", - "docs": "/docs", - "endpoints": {"gpu_hello": "/gpu/hello", "cpu_hello": "/cpu/hello"}, - } - - -@app.get("/ping") -def ping(): - return {"status": "healthy"} - - -if __name__ == "__main__": - import uvicorn - - host = os.getenv("FLASH_HOST", "localhost") - port = int(os.getenv("FLASH_PORT", 8888)) - logger.info(f"Starting Flash server on {host}:{port}") - - uvicorn.run(app, host=host, port=port) diff --git a/src/runpod_flash/cli/utils/skeleton_template/mothership.py b/src/runpod_flash/cli/utils/skeleton_template/mothership.py deleted file mode 100644 index 85779bfc..00000000 --- a/src/runpod_flash/cli/utils/skeleton_template/mothership.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Mothership Endpoint Configuration - -The mothership endpoint serves your FastAPI application routes. -It is automatically deployed as a CPU-optimized load-balanced endpoint. 
- -To customize this configuration: -- Modify worker scaling: change workersMin and workersMax values -- Use GPU load balancer: import LiveLoadBalancer instead of CpuLiveLoadBalancer -- Change endpoint name: update the 'name' parameter - -To disable mothership deployment: -- Delete this file, or -- Comment out the 'mothership' variable below - -Documentation: https://docs.runpod.io/flash/mothership -""" - -from runpod_flash import CpuLiveLoadBalancer - -# Mothership endpoint configuration -# This serves your FastAPI app routes from main.py -mothership = CpuLiveLoadBalancer( - name="mothership", - workersMin=1, - workersMax=1, -) - -# Examples of customization: - -# Increase scaling for high traffic -# mothership = CpuLiveLoadBalancer( -# name="mothership", -# workersMin=2, -# workersMax=10, -# ) - -# Use GPU-based load balancer instead of CPU -# (requires importing LiveLoadBalancer) -# from runpod_flash import LiveLoadBalancer -# mothership = LiveLoadBalancer( -# name="mothership", -# gpus=[GpuGroup.ANY], -# ) - -# Custom endpoint name -# mothership = CpuLiveLoadBalancer( -# name="my-api-gateway", -# workersMin=1, -# workersMax=1, -# ) - -# To disable mothership: -# - Delete this entire file, or -# - Comment out the 'mothership' variable above diff --git a/src/runpod_flash/cli/utils/skeleton_template/pyproject.toml b/src/runpod_flash/cli/utils/skeleton_template/pyproject.toml index 7987ad22..a58ae558 100644 --- a/src/runpod_flash/cli/utils/skeleton_template/pyproject.toml +++ b/src/runpod_flash/cli/utils/skeleton_template/pyproject.toml @@ -5,54 +5,9 @@ build-backend = "setuptools.build_meta" [project] name = "{{project_name}}" version = "0.1.0" -description = "Flash serverless application" +description = "Runpod Flash Serverless Application" readme = "README.md" requires-python = ">=3.11" dependencies = [ "runpod-flash", - "fastapi>=0.104.0", - "uvicorn>=0.24.0", -] - -[project.optional-dependencies] -dev = [ - "pytest>=7.0", - "pytest-asyncio>=0.21", - 
"pytest-cov>=4.0", - "ruff>=0.1", - "mypy>=1.0", -] - -[tool.ruff] -line-length = 100 -target-version = "py311" - -[tool.ruff.lint] -select = ["E", "F", "I", "N", "W"] -ignore = ["E501"] - -[tool.pytest.ini_options] -testpaths = ["tests"] -python_files = ["test_*.py", "*_test.py"] -python_classes = ["Test*"] -python_functions = ["test_*"] -asyncio_mode = "auto" - -[tool.mypy] -python_version = "3.11" -warn_return_any = false -warn_unused_configs = true -disallow_untyped_defs = false - -[tool.coverage.run] -source = ["src"] -omit = ["*/tests/*"] - -[tool.coverage.report] -exclude_lines = [ - "pragma: no cover", - "def __repr__", - "raise AssertionError", - "raise NotImplementedError", - "if __name__ == .__main__.:", ] diff --git a/src/runpod_flash/cli/utils/skeleton_template/workers/__init__.py b/src/runpod_flash/cli/utils/skeleton_template/workers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/runpod_flash/cli/utils/skeleton_template/workers/cpu/__init__.py b/src/runpod_flash/cli/utils/skeleton_template/workers/cpu/__init__.py deleted file mode 100644 index aef10a1a..00000000 --- a/src/runpod_flash/cli/utils/skeleton_template/workers/cpu/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from fastapi import APIRouter -from pydantic import BaseModel - -from .endpoint import cpu_hello - -cpu_router = APIRouter() - - -class MessageRequest(BaseModel): - """Request model for CPU worker.""" - - message: str = "Hello from CPU!" 
- - -@cpu_router.post("/hello") -async def hello(request: MessageRequest): - """Simple CPU worker endpoint.""" - result = await cpu_hello({"message": request.message}) - return result diff --git a/src/runpod_flash/cli/utils/skeleton_template/workers/cpu/endpoint.py b/src/runpod_flash/cli/utils/skeleton_template/workers/cpu/endpoint.py deleted file mode 100644 index 8161e5a7..00000000 --- a/src/runpod_flash/cli/utils/skeleton_template/workers/cpu/endpoint.py +++ /dev/null @@ -1,36 +0,0 @@ -from runpod_flash import CpuLiveServerless, remote - -cpu_config = CpuLiveServerless( - name="cpu_worker", - workersMin=0, - workersMax=1, - idleTimeout=60, -) - - -@remote(resource_config=cpu_config) -async def cpu_hello(input_data: dict) -> dict: - """Simple CPU worker example.""" - import platform - from datetime import datetime - - message = input_data.get("message", "Hello from CPU worker!") - - return { - "status": "success", - "message": message, - "worker_type": "CPU", - "timestamp": datetime.now().isoformat(), - "platform": platform.system(), - "python_version": platform.python_version(), - } - - -# Test locally with: python -m workers.cpu.endpoint -if __name__ == "__main__": - import asyncio - - test_payload = {"message": "Testing CPU worker"} - print(f"Testing CPU worker with payload: {test_payload}") - result = asyncio.run(cpu_hello(test_payload)) - print(f"Result: {result}") diff --git a/src/runpod_flash/cli/utils/skeleton_template/workers/gpu/__init__.py b/src/runpod_flash/cli/utils/skeleton_template/workers/gpu/__init__.py deleted file mode 100644 index a6a3caad..00000000 --- a/src/runpod_flash/cli/utils/skeleton_template/workers/gpu/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from fastapi import APIRouter -from pydantic import BaseModel - -from .endpoint import gpu_hello - -gpu_router = APIRouter() - - -class MessageRequest(BaseModel): - """Request model for GPU worker.""" - - message: str = "Hello from GPU!" 
- - -@gpu_router.post("/hello") -async def hello(request: MessageRequest): - """Simple GPU worker endpoint.""" - result = await gpu_hello({"message": request.message}) - return result diff --git a/src/runpod_flash/cli/utils/skeleton_template/workers/gpu/endpoint.py b/src/runpod_flash/cli/utils/skeleton_template/workers/gpu/endpoint.py deleted file mode 100644 index f3c4466c..00000000 --- a/src/runpod_flash/cli/utils/skeleton_template/workers/gpu/endpoint.py +++ /dev/null @@ -1,61 +0,0 @@ -from runpod_flash import GpuGroup, LiveServerless, remote - -gpu_config = LiveServerless( - name="gpu_worker", - gpus=[GpuGroup.ANY], - workersMin=0, - workersMax=1, - idleTimeout=60, -) - - -@remote(resource_config=gpu_config, dependencies=["torch"]) -async def gpu_hello(input_data: dict) -> dict: - """Simple GPU worker example with GPU detection.""" - import platform - from datetime import datetime - - try: - import torch - - gpu_available = torch.cuda.is_available() - if gpu_available: - gpu_name = torch.cuda.get_device_name(0) - gpu_count = torch.cuda.device_count() - gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3) - else: - gpu_name = "No GPU detected" - gpu_count = 0 - gpu_memory = 0 - except Exception as e: - gpu_available = False - gpu_name = f"Error detecting GPU: {str(e)}" - gpu_count = 0 - gpu_memory = 0 - - message = input_data.get("message", "Hello from GPU worker!") - - return { - "status": "success", - "message": message, - "worker_type": "GPU", - "gpu_info": { - "available": gpu_available, - "name": gpu_name, - "count": gpu_count, - "memory_gb": round(gpu_memory, 2) if gpu_memory else 0, - }, - "timestamp": datetime.now().isoformat(), - "platform": platform.system(), - "python_version": platform.python_version(), - } - - -# Test locally with: python -m workers.gpu.endpoint -if __name__ == "__main__": - import asyncio - - test_payload = {"message": "Testing GPU worker"} - print(f"Testing GPU worker with payload: {test_payload}") - result = 
asyncio.run(gpu_hello(test_payload)) - print(f"Result: {result}") diff --git a/tests/unit/test_skeleton.py b/tests/unit/test_skeleton.py index 0c3962b4..ea788bd6 100644 --- a/tests/unit/test_skeleton.py +++ b/tests/unit/test_skeleton.py @@ -85,13 +85,13 @@ def test_detect_no_conflicts_empty_directory(self, tmp_path): def test_detect_conflict_with_existing_file(self, tmp_path): """Test that existing files are detected as conflicts.""" # Create a file that exists in the template - (tmp_path / "main.py").write_text("# existing file") + (tmp_path / "gpu_worker.py").write_text("# existing file") conflicts = detect_file_conflicts(tmp_path) - # Should detect main.py as a conflict + # Should detect gpu_worker.py as a conflict conflict_names = [str(c) for c in conflicts] - assert "main.py" in conflict_names + assert "gpu_worker.py" in conflict_names def test_detect_conflict_with_hidden_file(self, tmp_path): """Test that existing hidden files are detected as conflicts.""" @@ -138,7 +138,9 @@ def test_create_skeleton_in_empty_directory(self, tmp_path): assert len(created_files) > 0 # Check that key files exist - assert (tmp_path / "main.py").exists() + assert (tmp_path / "gpu_worker.py").exists() + assert (tmp_path / "cpu_worker.py").exists() + assert (tmp_path / "lb_worker.py").exists() assert (tmp_path / "README.md").exists() assert (tmp_path / "requirements.txt").exists() @@ -147,13 +149,6 @@ def test_create_skeleton_in_empty_directory(self, tmp_path): assert (tmp_path / ".gitignore").exists() assert (tmp_path / ".flashignore").exists() - # Check that workers directory structure exists - assert (tmp_path / "workers").is_dir() - assert (tmp_path / "workers" / "cpu").is_dir() - assert (tmp_path / "workers" / "gpu").is_dir() - assert (tmp_path / "workers" / "cpu" / "__init__.py").exists() - assert (tmp_path / "workers" / "gpu" / "__init__.py").exists() - def test_create_skeleton_with_project_name_substitution(self, tmp_path): """Test that {{project_name}} placeholder is 
replaced.""" project_dir = tmp_path / "my_test_project" @@ -169,14 +164,14 @@ def test_create_skeleton_with_project_name_substitution(self, tmp_path): def test_create_skeleton_skips_existing_files_without_force(self, tmp_path): """Test that existing files are not overwritten without force flag.""" # Create an existing file with specific content - existing_content = "# This is my custom main.py" - (tmp_path / "main.py").write_text(existing_content) + existing_content = "# This is my custom gpu_worker.py" + (tmp_path / "gpu_worker.py").write_text(existing_content) # Create skeleton without force create_project_skeleton(tmp_path, force=False) # Existing file should not be overwritten - assert (tmp_path / "main.py").read_text() == existing_content + assert (tmp_path / "gpu_worker.py").read_text() == existing_content # But other files should be created assert (tmp_path / ".env.example").exists() @@ -184,16 +179,16 @@ def test_create_skeleton_skips_existing_files_without_force(self, tmp_path): def test_create_skeleton_overwrites_with_force(self, tmp_path): """Test that existing files are overwritten with force=True.""" # Create an existing file - existing_content = "# This is my custom main.py" - (tmp_path / "main.py").write_text(existing_content) + existing_content = "# This is my custom gpu_worker.py" + (tmp_path / "gpu_worker.py").write_text(existing_content) # Create skeleton with force create_project_skeleton(tmp_path, force=True) # Existing file should be overwritten - new_content = (tmp_path / "main.py").read_text() + new_content = (tmp_path / "gpu_worker.py").read_text() assert new_content != existing_content - assert "# This is my custom main.py" not in new_content + assert "# This is my custom gpu_worker.py" not in new_content def test_create_skeleton_ignores_pycache(self, tmp_path): """Test that __pycache__ directories are not copied.""" @@ -225,7 +220,7 @@ def test_create_skeleton_creates_parent_directories(self, tmp_path): # All parent directories should 
exist
         assert project_dir.exists()
-        assert (project_dir / "main.py").exists()
+        assert (project_dir / "gpu_worker.py").exists()
 
     def test_create_skeleton_returns_created_files_list(self, tmp_path):
         """Test that function returns list of created files."""
@@ -236,14 +231,14 @@ def test_create_skeleton_creates_parent_directories(self, tmp_path):
         assert all(isinstance(f, str) for f in created_files)
 
         # Should contain expected files
-        assert "main.py" in created_files
+        assert "gpu_worker.py" in created_files
         assert ".env.example" in created_files
         assert "README.md" in created_files
 
     def test_create_skeleton_handles_readonly_files_gracefully(self, tmp_path):
         """Test handling of read-only files during creation."""
         # Create a read-only file
-        readonly_file = tmp_path / "main.py"
+        readonly_file = tmp_path / "gpu_worker.py"
         readonly_file.write_text("# readonly")
         readonly_file.chmod(0o444)
 
@@ -287,7 +282,9 @@ def test_full_init_workflow_in_place(self, tmp_path):
 
         # Verify all expected files exist
         expected_files = [
-            "main.py",
+            "gpu_worker.py",
+            "cpu_worker.py",
+            "lb_worker.py",
             "README.md",
             "requirements.txt",
             ".env.example",
@@ -297,14 +294,10 @@ def test_full_init_workflow_in_place(self, tmp_path):
 
         for filename in expected_files:
             assert (tmp_path / filename).exists(), f"{filename} should exist"
 
-        # Verify workers structure
-        assert (tmp_path / "workers" / "cpu" / "endpoint.py").exists()
-        assert (tmp_path / "workers" / "gpu" / "endpoint.py").exists()
-
     def test_full_init_workflow_with_conflicts(self, tmp_path):
         """Test complete workflow when conflicts exist."""
         # Create some existing files
-        (tmp_path / "main.py").write_text("# my custom main")
+        (tmp_path / "gpu_worker.py").write_text("# my custom worker")
         (tmp_path / ".env.example").write_text("MY_VAR=123")
 
         # Detect conflicts
@@ -312,14 +305,14 @@ def test_full_init_workflow_with_conflicts(self, tmp_path):
         assert len(conflicts) == 2
         conflict_names = [str(c) for c in conflicts]
-        assert "main.py" in conflict_names
+        
assert "gpu_worker.py" in conflict_names assert ".env.example" in conflict_names # Create skeleton without force (should preserve existing) create_project_skeleton(tmp_path, force=False) # Check that existing files were preserved - assert (tmp_path / "main.py").read_text() == "# my custom main" + assert (tmp_path / "gpu_worker.py").read_text() == "# my custom worker" assert (tmp_path / ".env.example").read_text() == "MY_VAR=123" # But new files should be created From ef24356ccd2687f21d0cd9b40579fa93d4d9068b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 19 Feb 2026 15:45:28 -0800 Subject: [PATCH 16/26] fix(run): handle numeric-prefixed directories in server codegen Directory names starting with digits (e.g. 01_getting_started/) produce invalid Python when used in import statements and function names. - Add _flash_import helper to generated server.py that uses importlib.import_module() with scoped sys.path so sibling imports (e.g. `from cpu_worker import ...`) resolve to the correct directory - Prefix generated function names with '_' when they start with a digit - Scope sys.path per-import to prevent name collisions when multiple directories contain files with the same name (e.g. cpu_worker.py) --- src/runpod_flash/cli/commands/run.py | 88 +++++++++- tests/unit/cli/test_run.py | 251 ++++++++++++++++++++++++++- 2 files changed, 332 insertions(+), 7 deletions(-) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index b3000a20..bf3bbd74 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -131,8 +131,53 @@ def _ensure_gitignore(project_root: Path) -> None: def _sanitize_fn_name(name: str) -> str: - """Sanitize a string for use as a Python function name.""" - return name.replace("/", "_").replace(".", "_").replace("-", "_") + """Sanitize a string for use as a Python function name. 
+ + Replaces non-identifier characters with underscores and prepends '_' + if the result starts with a digit (Python identifiers cannot start + with digits). + """ + result = name.replace("/", "_").replace(".", "_").replace("-", "_") + if result and result[0].isdigit(): + result = "_" + result + return result + + +def _has_numeric_module_segments(module_path: str) -> bool: + """Check if any segment in a dotted module path starts with a digit. + + Python identifiers cannot start with digits, so ``from 01_foo import bar`` + is a SyntaxError. Callers should use ``importlib.import_module()`` instead. + """ + return any(seg and seg[0].isdigit() for seg in module_path.split(".")) + + +def _module_parent_subdir(module_path: str) -> str | None: + """Return the parent sub-directory for a dotted module path, or None for top-level. + + Example: ``01_getting_started.03_mixed.pipeline`` → ``01_getting_started/03_mixed`` + """ + parts = module_path.rsplit(".", 1) + if len(parts) == 1: + return None + return parts[0].replace(".", "/") + + +def _make_import_line(module_path: str, name: str) -> str: + """Build an import statement for *name* from *module_path*. + + Uses a regular ``from … import …`` when the module path is a valid + Python identifier chain. Falls back to ``_flash_import()`` (a generated + helper in server.py) when any segment starts with a digit. The helper + temporarily scopes ``sys.path`` so sibling imports in the target module + resolve to the correct directory. 
+ """ + if _has_numeric_module_segments(module_path): + subdir = _module_parent_subdir(module_path) + if subdir: + return f'{name} = _flash_import("{module_path}", "{name}", "{subdir}")' + return f'{name} = _flash_import("{module_path}", "{name}")' + return f"from {module_path} import {name}" def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Path: @@ -157,10 +202,41 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat "import sys", "import uuid", "from pathlib import Path", - "sys.path.insert(0, str(Path(__file__).parent.parent))", + "_project_root = Path(__file__).parent.parent", + "sys.path.insert(0, str(_project_root))", "", ] + # When modules live in directories with numeric prefixes (e.g. 01_hello/), + # we cannot use ``from … import …`` — Python identifiers cannot start with + # digits. Instead we emit a small ``_flash_import`` helper that uses + # ``importlib.import_module()`` *and* temporarily scopes ``sys.path`` so + # that sibling imports inside the loaded module (e.g. ``from cpu_worker + # import …``) resolve to the correct directory rather than a same-named + # file from a different example subdirectory. 
+ needs_importlib = any(_has_numeric_module_segments(w.module_path) for w in workers) + + if needs_importlib: + lines += [ + "import importlib as _importlib", + "", + "", + "def _flash_import(module_path, name, subdir=None):", + ' """Import *name* from *module_path* with scoped sys.path for sibling imports."""', + " _path = str(_project_root / subdir) if subdir else None", + " if _path:", + " sys.path.insert(0, _path)", + " try:", + " return getattr(_importlib.import_module(module_path), name)", + " finally:", + " if _path:", + " try:", + " sys.path.remove(_path)", + " except ValueError:", + " pass", + "", + ] + if has_lb_workers: lines += [ "from fastapi import FastAPI, Request", @@ -179,7 +255,7 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat for worker in workers: if worker.worker_type == "QB": for fn_name in worker.functions: - all_imports.append(f"from {worker.module_path} import {fn_name}") + all_imports.append(_make_import_line(worker.module_path, fn_name)) elif worker.worker_type == "LB": # Import the resource config variable (e.g. 
"api" from api = LiveLoadBalancer(...)) config_vars = { @@ -188,9 +264,9 @@ def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Pat if r.get("config_variable") } for var in sorted(config_vars): - all_imports.append(f"from {worker.module_path} import {var}") + all_imports.append(_make_import_line(worker.module_path, var)) for fn_name in worker.functions: - all_imports.append(f"from {worker.module_path} import {fn_name}") + all_imports.append(_make_import_line(worker.module_path, fn_name)) if all_imports: lines.extend(all_imports) diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index d13abd12..6014e9a3 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -6,7 +6,14 @@ from typer.testing import CliRunner from runpod_flash.cli.main import app -from runpod_flash.cli.commands.run import WorkerInfo, _generate_flash_server +from runpod_flash.cli.commands.run import ( + WorkerInfo, + _generate_flash_server, + _has_numeric_module_segments, + _make_import_line, + _module_parent_subdir, + _sanitize_fn_name, +) @pytest.fixture @@ -495,6 +502,248 @@ def test_qb_function_still_imported_directly(self, tmp_path): assert "await process(" in content +class TestSanitizeFnName: + """Test _sanitize_fn_name handles leading-digit identifiers.""" + + def test_normal_name_unchanged(self): + assert _sanitize_fn_name("worker_run_sync") == "worker_run_sync" + + def test_leading_digit_gets_underscore_prefix(self): + assert _sanitize_fn_name("01_hello_run_sync") == "_01_hello_run_sync" + + def test_slashes_replaced(self): + assert _sanitize_fn_name("a/b/c") == "a_b_c" + + def test_dots_and_hyphens_replaced(self): + assert _sanitize_fn_name("a.b-c") == "a_b_c" + + def test_numeric_after_slash(self): + assert _sanitize_fn_name("01_foo/02_bar") == "_01_foo_02_bar" + + +class TestHasNumericModuleSegments: + """Test _has_numeric_module_segments detects digit-prefixed segments.""" + + def test_normal_module_path(self): + assert 
_has_numeric_module_segments("worker") is False + + def test_dotted_normal(self): + assert _has_numeric_module_segments("longruns.stage1") is False + + def test_leading_digit_first_segment(self): + assert _has_numeric_module_segments("01_hello.worker") is True + + def test_leading_digit_nested_segment(self): + assert _has_numeric_module_segments("getting_started.01_hello.worker") is True + + def test_digit_in_middle_not_leading(self): + assert _has_numeric_module_segments("stage1.worker") is False + + +class TestModuleParentSubdir: + """Test _module_parent_subdir extracts parent directory from dotted path.""" + + def test_top_level_returns_none(self): + assert _module_parent_subdir("worker") is None + + def test_single_parent(self): + assert _module_parent_subdir("01_hello.gpu_worker") == "01_hello" + + def test_nested_parent(self): + assert ( + _module_parent_subdir("01_getting_started.03_mixed.pipeline") + == "01_getting_started/03_mixed" + ) + + +class TestMakeImportLine: + """Test _make_import_line generates correct import syntax.""" + + def test_normal_module_uses_from_import(self): + result = _make_import_line("worker", "process") + assert result == "from worker import process" + + def test_numeric_module_uses_flash_import(self): + result = _make_import_line("01_hello.gpu_worker", "gpu_hello") + assert ( + result + == 'gpu_hello = _flash_import("01_hello.gpu_worker", "gpu_hello", "01_hello")' + ) + + def test_nested_numeric_includes_full_subdir(self): + result = _make_import_line( + "01_getting_started.01_hello.gpu_worker", "gpu_hello" + ) + assert '"01_getting_started/01_hello"' in result + + def test_top_level_numeric_module_no_subdir(self): + result = _make_import_line("01_worker", "process") + assert result == 'process = _flash_import("01_worker", "process")' + + +class TestGenerateFlashServerNumericDirs: + """Test _generate_flash_server with numeric-prefixed directory names.""" + + def test_qb_numeric_dir_uses_flash_import(self, tmp_path): + """QB 
workers in numeric dirs use _flash_import with scoped sys.path.""" + worker = WorkerInfo( + file_path=tmp_path / "01_hello" / "gpu_worker.py", + url_prefix="/01_hello/gpu_worker", + module_path="01_hello.gpu_worker", + resource_name="01_hello_gpu_worker", + worker_type="QB", + functions=["gpu_hello"], + ) + content = _generate_flash_server(tmp_path, [worker]).read_text() + + # Must NOT contain invalid 'from 01_hello...' import + assert "from 01_hello" not in content + # Must have _flash_import helper and importlib + assert "import importlib as _importlib" in content + assert "def _flash_import(" in content + assert ( + '_flash_import("01_hello.gpu_worker", "gpu_hello", "01_hello")' in content + ) + + def test_qb_numeric_dir_function_name_prefixed(self, tmp_path): + """QB handler function names starting with digits get '_' prefix.""" + worker = WorkerInfo( + file_path=tmp_path / "01_hello" / "gpu_worker.py", + url_prefix="/01_hello/gpu_worker", + module_path="01_hello.gpu_worker", + resource_name="01_hello_gpu_worker", + worker_type="QB", + functions=["gpu_hello"], + ) + content = _generate_flash_server(tmp_path, [worker]).read_text() + + # Function name must start with '_', not a digit + assert "async def _01_hello_gpu_worker_run_sync(body: dict):" in content + + def test_lb_numeric_dir_uses_flash_import(self, tmp_path): + """LB workers in numeric dirs use _flash_import for config and function imports.""" + worker = WorkerInfo( + file_path=tmp_path / "03_advanced" / "05_lb" / "cpu_lb.py", + url_prefix="/03_advanced/05_lb/cpu_lb", + module_path="03_advanced.05_lb.cpu_lb", + resource_name="03_advanced_05_lb_cpu_lb", + worker_type="LB", + functions=["validate_data"], + lb_routes=[ + { + "method": "POST", + "path": "/validate", + "fn_name": "validate_data", + "config_variable": "cpu_config", + } + ], + ) + content = _generate_flash_server(tmp_path, [worker]).read_text() + + assert "from 03_advanced" not in content + assert ( + '_flash_import("03_advanced.05_lb.cpu_lb", 
"cpu_config", "03_advanced/05_lb")' + in content + ) + assert ( + '_flash_import("03_advanced.05_lb.cpu_lb", "validate_data", "03_advanced/05_lb")' + in content + ) + + def test_mixed_numeric_and_normal_dirs(self, tmp_path): + """Normal modules use 'from' imports, numeric modules use _flash_import.""" + normal_worker = WorkerInfo( + file_path=tmp_path / "worker.py", + url_prefix="/worker", + module_path="worker", + resource_name="worker", + worker_type="QB", + functions=["process"], + ) + numeric_worker = WorkerInfo( + file_path=tmp_path / "01_hello" / "gpu_worker.py", + url_prefix="/01_hello/gpu_worker", + module_path="01_hello.gpu_worker", + resource_name="01_hello_gpu_worker", + worker_type="QB", + functions=["gpu_hello"], + ) + content = _generate_flash_server( + tmp_path, [normal_worker, numeric_worker] + ).read_text() + + # Normal worker uses standard import + assert "from worker import process" in content + # Numeric worker uses scoped _flash_import + assert ( + '_flash_import("01_hello.gpu_worker", "gpu_hello", "01_hello")' in content + ) + + def test_no_importlib_when_all_normal_dirs(self, tmp_path): + """importlib and _flash_import are not emitted when no numeric dirs exist.""" + worker = WorkerInfo( + file_path=tmp_path / "worker.py", + url_prefix="/worker", + module_path="worker", + resource_name="worker", + worker_type="QB", + functions=["process"], + ) + content = _generate_flash_server(tmp_path, [worker]).read_text() + assert "importlib" not in content + assert "_flash_import" not in content + + def test_scoped_import_includes_subdir(self, tmp_path): + """_flash_import calls pass the subdirectory for sibling import scoping.""" + worker = WorkerInfo( + file_path=tmp_path / "01_getting_started" / "03_mixed" / "pipeline.py", + url_prefix="/01_getting_started/03_mixed/pipeline", + module_path="01_getting_started.03_mixed.pipeline", + resource_name="01_getting_started_03_mixed_pipeline", + worker_type="LB", + functions=["classify"], + lb_routes=[ + { + 
"method": "POST", + "path": "/classify", + "fn_name": "classify", + "config_variable": "pipeline_config", + } + ], + ) + content = _generate_flash_server(tmp_path, [worker]).read_text() + + # Must scope to correct subdirectory, not add all dirs to sys.path + assert '"01_getting_started/03_mixed"' in content + # No global sys.path additions for subdirs — only the project root + # line at the top and the one inside _flash_import helper body + lines = content.split("\n") + global_sys_path_lines = [ + line + for line in lines + if "sys.path.insert" in line and not line.startswith(" ") + ] + assert len(global_sys_path_lines) == 1 + + def test_generated_server_is_valid_python(self, tmp_path): + """Generated server.py with numeric dirs must be parseable Python.""" + worker = WorkerInfo( + file_path=tmp_path / "01_getting_started" / "01_hello" / "gpu_worker.py", + url_prefix="/01_getting_started/01_hello/gpu_worker", + module_path="01_getting_started.01_hello.gpu_worker", + resource_name="01_getting_started_01_hello_gpu_worker", + worker_type="QB", + functions=["gpu_hello"], + ) + server_path = _generate_flash_server(tmp_path, [worker]) + content = server_path.read_text() + + # Must parse without SyntaxError + import ast + + ast.parse(content) + + class TestMapBodyToParams: """Tests for _map_body_to_params — maps HTTP body to function arguments.""" From ff7f0ebb8c85522544fa16c97aa66eebd7c15bea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 19 Feb 2026 16:37:14 -0800 Subject: [PATCH 17/26] fix(ci): update validate-wheel.sh for flat skeleton template The skeleton template was replaced with flat worker files (cpu_worker.py, gpu_worker.py, lb_worker.py, pyproject.toml) but the wheel validation script still expected the old multi-directory structure (main.py, workers/**). This caused the Build Package CI check to fail. 
--- scripts/validate-wheel.sh | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/scripts/validate-wheel.sh b/scripts/validate-wheel.sh index 7e4dd517..a38db25f 100755 --- a/scripts/validate-wheel.sh +++ b/scripts/validate-wheel.sh @@ -21,14 +21,12 @@ REQUIRED_TEMPLATE_FILES=( "runpod_flash/cli/utils/skeleton_template/.env.example" "runpod_flash/cli/utils/skeleton_template/.gitignore" "runpod_flash/cli/utils/skeleton_template/.flashignore" - "runpod_flash/cli/utils/skeleton_template/main.py" + "runpod_flash/cli/utils/skeleton_template/cpu_worker.py" + "runpod_flash/cli/utils/skeleton_template/gpu_worker.py" + "runpod_flash/cli/utils/skeleton_template/lb_worker.py" + "runpod_flash/cli/utils/skeleton_template/pyproject.toml" "runpod_flash/cli/utils/skeleton_template/README.md" "runpod_flash/cli/utils/skeleton_template/requirements.txt" - "runpod_flash/cli/utils/skeleton_template/workers/__init__.py" - "runpod_flash/cli/utils/skeleton_template/workers/cpu/__init__.py" - "runpod_flash/cli/utils/skeleton_template/workers/cpu/endpoint.py" - "runpod_flash/cli/utils/skeleton_template/workers/gpu/__init__.py" - "runpod_flash/cli/utils/skeleton_template/workers/gpu/endpoint.py" ) MISSING_IN_WHEEL=0 @@ -77,7 +75,7 @@ flash init test_project > /dev/null 2>&1 # Verify critical files exist echo "" echo "Verifying created files..." 
-REQUIRED_FILES=(".env.example" ".gitignore" ".flashignore" "main.py" "README.md" "requirements.txt") +REQUIRED_FILES=(".env.example" ".gitignore" ".flashignore" "cpu_worker.py" "gpu_worker.py" "lb_worker.py" "pyproject.toml" "README.md" "requirements.txt") MISSING_IN_OUTPUT=0 for file in "${REQUIRED_FILES[@]}"; do @@ -94,15 +92,6 @@ for file in "${REQUIRED_FILES[@]}"; do fi done -# Verify workers directory structure -if [ -d "test_project/workers/cpu" ] && [ -d "test_project/workers/gpu" ]; then - echo "[OK] workers/cpu/" - echo "[OK] workers/gpu/" -else - echo "[MISSING] workers directory structure" - MISSING_IN_OUTPUT=$((MISSING_IN_OUTPUT + 1)) -fi - # Cleanup deactivate cd - > /dev/null From 89b45d719db8624e6baf0f2f0f83a2de4be2bd48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 19 Feb 2026 17:13:14 -0800 Subject: [PATCH 18/26] fix: address PR 208 review feedback - Guard watcher_thread.join() with is_alive() check for --no-reload - Wrap watchfiles import in try/except for missing dependency - Fix debug log to show actual type instead of hardcoded class name - Fix invalid dict addition in skeleton README example - Fix PRD spec to match actual /run_sync-only behavior --- PRD.md | 2 +- src/runpod_flash/cli/commands/run.py | 24 +++++++++++++++---- .../cli/utils/skeleton_template/README.md | 2 +- src/runpod_flash/stubs/load_balancer_sls.py | 6 ++++- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/PRD.md b/PRD.md index a5d9d98f..2df30adc 100644 --- a/PRD.md +++ b/PRD.md @@ -40,7 +40,7 @@ async def process(input_data: dict) -> dict: return {"result": "processed", "input": input_data} ``` -`flash run` → `POST /gpu_worker/run` and `POST /gpu_worker/run_sync` +`flash run` → `POST /gpu_worker/run_sync` `flash deploy` → standalone QB endpoint at `api.runpod.ai/v2/{id}/run` ### 4.2 LB endpoint diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index bf3bbd74..da96af7e 100644 --- 
a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -13,8 +13,22 @@ import typer from rich.console import Console from rich.table import Table -from watchfiles import DefaultFilter as _WatchfilesDefaultFilter -from watchfiles import watch as _watchfiles_watch + +try: + from watchfiles import DefaultFilter as _WatchfilesDefaultFilter + from watchfiles import watch as _watchfiles_watch +except ModuleNotFoundError: + + def _watchfiles_watch(*_a, **_kw): # type: ignore[misc] + raise ModuleNotFoundError( + "watchfiles is required for flash run --reload. " + "Install it with: pip install watchfiles" + ) + + class _WatchfilesDefaultFilter: # type: ignore[no-redef] + def __init__(self, **_kw): + pass + from .build_utils.scanner import ( RemoteDecoratorScanner, @@ -709,7 +723,8 @@ def run_command( console.print("\n[yellow]Stopping server and cleaning up...[/yellow]") stop_event.set() - watcher_thread.join(timeout=2) + if watcher_thread.is_alive(): + watcher_thread.join(timeout=2) if process: try: @@ -738,7 +753,8 @@ def run_command( console.print(f"[red]Error:[/red] {e}") stop_event.set() - watcher_thread.join(timeout=2) + if watcher_thread.is_alive(): + watcher_thread.join(timeout=2) if process: try: diff --git a/src/runpod_flash/cli/utils/skeleton_template/README.md b/src/runpod_flash/cli/utils/skeleton_template/README.md index f30adf00..328a8ab3 100644 --- a/src/runpod_flash/cli/utils/skeleton_template/README.md +++ b/src/runpod_flash/cli/utils/skeleton_template/README.md @@ -103,7 +103,7 @@ cpu_config = CpuLiveServerless(name="cpu_worker") @remote(resource_config=cpu_config) async def cpu_hello(input_data: dict = {}) -> dict: - return {"message": "Hello from CPU!"} + input_data + return {"message": "Hello from CPU!", **input_data} ``` ### Load-Balanced (LB) Workers diff --git a/src/runpod_flash/stubs/load_balancer_sls.py b/src/runpod_flash/stubs/load_balancer_sls.py index f44bccbb..d08a0c5a 100644 --- 
a/src/runpod_flash/stubs/load_balancer_sls.py +++ b/src/runpod_flash/stubs/load_balancer_sls.py @@ -79,7 +79,11 @@ def _should_use_execute_endpoint(self, func: Callable[..., Any]) -> bool: # Always use /execute for live resources (local development) if isinstance(self.server, LiveServerlessMixin): - log.debug(f"Using /execute endpoint for LiveLoadBalancer: {func.__name__}") + log.debug( + "Using /execute endpoint for live resource %s (type=%s)", + func.__name__, + type(self.server).__name__, + ) return True # Check if function has routing metadata From d168633595cd25090b7493726598d3cbd1eca71b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 19 Feb 2026 17:14:51 -0800 Subject: [PATCH 19/26] docs: remove coordinator/hub-spoke terminology, all endpoints are peers The previous doc round incorrectly replaced "mothership" with "coordinator", implying a hub-and-spoke architecture. Flash endpoints are equal peers that discover each other via State Manager GraphQL. - Replace hub-and-spoke diagrams with flat peer topology - Remove "coordinator endpoint", "child endpoint", "primary endpoint" - Reconciliation happens at deploy time via CLI, not at boot - Scope FLASH_IS_MOTHERSHIP as legacy env var, not architectural concept - Merge endpoint boot phases into single peer-based flow --- README.md | 43 ++--- docs/Cross_Endpoint_Routing.md | 14 +- docs/Deployment_Architecture.md | 53 +++--- docs/Flash_Deploy_Guide.md | 214 +++++++--------------- docs/Load_Balancer_Endpoints.md | 29 ++- src/runpod_flash/cli/docs/README.md | 18 +- src/runpod_flash/cli/docs/flash-app.md | 3 +- src/runpod_flash/cli/docs/flash-build.md | 6 +- src/runpod_flash/cli/docs/flash-deploy.md | 33 ++-- src/runpod_flash/cli/docs/flash-env.md | 3 +- src/runpod_flash/cli/docs/flash-init.md | 16 +- src/runpod_flash/cli/docs/flash-run.md | 30 +-- 12 files changed, 178 insertions(+), 284 deletions(-) diff --git a/README.md b/README.md index 8b9c3ea9..cce420ce 100644 --- a/README.md +++ 
b/README.md @@ -141,7 +141,7 @@ These endpoints use the same Python `@remote` decorators [demonstrated above](#g ### Step 1: Initialize a new project -Use the `flash init` command to generate a structured project template with a preconfigured FastAPI application entry point. +Use the `flash init` command to generate a project template with example worker files. Run this command to initialize a new project directory: @@ -162,30 +162,24 @@ This is the structure of the project template created by `flash init`: ```txt my_project/ -├── main.py # FastAPI application entry point -├── workers/ -│ ├── gpu/ # GPU worker example -│ │ ├── __init__.py # FastAPI router -│ │ └── endpoint.py # GPU script @remote decorated function -│ └── cpu/ # CPU worker example -│ ├── __init__.py # FastAPI router -│ └── endpoint.py # CPU script with @remote decorated function -├── .env # Environment variable template +├── gpu_worker.py # GPU worker with @remote function +├── cpu_worker.py # CPU worker with @remote function +├── .env # Environment variable template ├── .gitignore # Git ignore patterns ├── .flashignore # Flash deployment ignore patterns -├── requirements.txt # Python dependencies +├── pyproject.toml # Python dependencies (uv/pip compatible) └── README.md # Project documentation ``` This template includes: -- A FastAPI application entry point and routers. +- Example worker files with `@remote` decorated functions. - Templates for Python dependencies, `.env`, `.gitignore`, etc. -- Flash scripts (`endpoint.py`) for both GPU and CPU workers, which include: +- Each worker file contains: - Pre-configured worker scaling limits using the `LiveServerless()` object. - A `@remote` decorated function that returns a response from a worker. -When you start the FastAPI server, it creates API endpoints at `/gpu/hello` and `/cpu/hello`, which call the remote function described in their respective `endpoint.py` files. 
+When you run `flash run`, it auto-discovers all `@remote` functions and generates a local development server at `.flash/server.py`. Queue-based workers are exposed at `/{file_prefix}/run_sync` (e.g., `/gpu_worker/run_sync`). ### Step 3: Install Python dependencies @@ -195,9 +189,11 @@ After initializing the project, navigate into the project directory: cd my_project ``` -Install required dependencies: +Install required dependencies using uv (recommended) or pip: ```bash +uv sync # recommended +# or pip install -r requirements.txt ``` @@ -232,7 +228,7 @@ flash run Open a new terminal tab or window and test your GPU API using cURL: ```bash -curl -X POST http://localhost:8888/gpu/hello \ +curl -X POST http://localhost:8888/gpu_worker/run_sync \ -H "Content-Type: application/json" \ -d '{"message": "Hello from the GPU!"}' ``` @@ -257,19 +253,18 @@ Besides starting the API server, `flash run` also starts an interactive API expl To run remote functions in the explorer: -1. Expand one of the functions under **GPU Workers** or **CPU Workers**. -2. Click **Try it out** and then **Execute** +1. Expand one of the available endpoints (e.g., `/gpu_worker/run_sync`). +2. Click **Try it out** and then **Execute**. You'll get a response from your workers right in the explorer. ### Step 7: Customize your API -To customize your API endpoint and functionality: +To customize your API: -1. Add/edit remote functions in your `endpoint.py` files. -2. Test the scripts individually by running `python endpoint.py`. -3. Configure your FastAPI routers by editing the `__init__.py` files. -4. Add any new endpoints to your `main.py` file. +1. Create new `.py` files with `@remote` decorated functions. +2. Test the scripts individually by running `python your_worker.py`. +3. Run `flash run` to auto-discover all `@remote` functions and serve them. 
## CLI Reference @@ -541,7 +536,7 @@ After `flash build` completes: - `.flash/artifact.tar.gz`: Deployment package - `.flash/flash_manifest.json`: Service discovery configuration -For information on load-balanced endpoints (required for Mothership and HTTP services), see [docs/Load_Balancer_Endpoints.md](docs/Load_Balancer_Endpoints.md). +For information on load-balanced endpoints (required for HTTP services), see [docs/Load_Balancer_Endpoints.md](docs/Load_Balancer_Endpoints.md). #### Troubleshooting Build Issues diff --git a/docs/Cross_Endpoint_Routing.md b/docs/Cross_Endpoint_Routing.md index aa851705..800f6101 100644 --- a/docs/Cross_Endpoint_Routing.md +++ b/docs/Cross_Endpoint_Routing.md @@ -342,7 +342,7 @@ graph TD B -->|"load service configuration"| C["ServiceRegistry"] C -->|"if not cached"| D["ManifestClient"] - D -->|"query mothership API"| E["Manifest
Endpoint URLs"] + D -->|"query State Manager API"| E["Manifest
Endpoint URLs"] E -->|"cache result
TTL 300s"| C C -->|"lookup in manifest
flash_manifest.json"| F{"Routing
Decision"} @@ -465,7 +465,7 @@ class ServiceRegistry: Environment Variables (for local vs remote detection): RUNPOD_API_KEY: API key for State Manager GraphQL access (peer-to-peer). - FLASH_RESOURCE_NAME: Resource config name for this endpoint (child endpoints). + FLASH_RESOURCE_NAME: Resource config name for this endpoint (worker endpoints). Identifies which resource config this endpoint represents in the manifest. RUNPOD_ENDPOINT_ID: Endpoint ID (used as fallback for identification). """ @@ -473,7 +473,7 @@ class ServiceRegistry: self._state_manager_client = state_manager_client or StateManagerClient() self._endpoint_registry = {} # Cached endpoint URLs self._endpoint_registry_lock = asyncio.Lock() - # Child endpoints use FLASH_RESOURCE_NAME to identify which resource they represent + # Worker endpoints use FLASH_RESOURCE_NAME to identify which resource they represent # Falls back to RUNPOD_ENDPOINT_ID if not set self._current_endpoint = os.getenv("FLASH_RESOURCE_NAME") or os.getenv( "RUNPOD_ENDPOINT_ID" @@ -531,7 +531,7 @@ class ServiceRegistry: **Location**: `src/runpod_flash/runtime/state_manager_client.py` -GraphQL client for State Manager manifest persistence (used by mothership auto-provisioning): +GraphQL client for State Manager manifest persistence (used by endpoint auto-provisioning): ```python class StateManagerClient: @@ -815,7 +815,7 @@ class JsonSerializer: #### Adding New Manifest Backends -To support directories other than mothership: +To support alternative manifest backends: 1. Create client class with `get_manifest()` method: ```python @@ -983,7 +983,7 @@ manifest = await client.get_persisted_manifest(mothership_id) Cross-endpoint routing uses a **peer-to-peer architecture** where all endpoints query State Manager directly for service discovery. This eliminates single points of failure and simplifies the system architecture compared to previous hub-and-spoke models. 
-**Key Difference**: No mothership endpoint exposing a `/manifest` HTTP endpoint. Instead, all endpoints use `StateManagerClient` to query the Runpod GraphQL API directly. +**Key Difference**: No dedicated endpoint exposing a `/manifest` HTTP endpoint. Instead, all endpoints use `StateManagerClient` to query the Runpod GraphQL API directly. ### Architecture @@ -1034,7 +1034,7 @@ export RUNPOD_ENDPOINT_ID=gpu-endpoint-123 - **Caching**: 300-second TTL cache to minimize API calls - **Retry Logic**: Exponential backoff on failures (default 3 attempts) - **Thread-Safe**: Uses `asyncio.Lock` for concurrent operations -- **Auto-Provisioning**: Used by mothership provisioner to update resource state +- **Auto-Provisioning**: Used by endpoint provisioner to update resource state ## Key Implementation Highlights diff --git a/docs/Deployment_Architecture.md b/docs/Deployment_Architecture.md index cc395aaf..c5ab7733 100644 --- a/docs/Deployment_Architecture.md +++ b/docs/Deployment_Architecture.md @@ -1,7 +1,7 @@ # Flash App Deployment Architecture Specification ## Overview -A deployed Flash App consists of a Mothership coordinator and distributed Child Endpoints, where functions are partitioned across endpoints. The system uses a manifest-driven approach to route requests and coordinate execution across the distributed topology. +A deployed Flash App consists of peer endpoints, where functions are partitioned across endpoints. The system uses a manifest-driven approach to route requests and coordinate execution across the distributed topology. ## Build and Deploy Flow @@ -11,33 +11,31 @@ graph TD B -->|"Write"| C["flash_manifest.json"] B -->|"Archive"| D["artifact.tar.gz"] - D -->|"flash deploy"| E["Push Archive +
Provision Resources"] + D -->|"flash deploy"| E["Push Archive +
Load Manifest"] - E -->|"CLI provisions
upfront"| F["Child Endpoints
Deployed"] - - G["🎯 Mothership
Endpoint"] -->|"Load from
.flash/"| H["Load Local
Manifest"] - - H --> I["reconcile_children()"] + E --> I["Reconcile:
Compute Diff"] I --> J["Categorize:
New, Changed,
Removed, Unchanged"] - J --> K["Verify NEW
Endpoints"] - J --> L["Verify CHANGED
Endpoints"] - J --> M["Verify REMOVED
Endpoints"] + J --> K["Provision NEW
Endpoints"] + J --> L["Update CHANGED
Endpoints"] + J --> M["Remove DELETED
Endpoints"] J --> N["Skip UNCHANGED
Endpoints"] - K -->|"Healthy?"| O["Update State"] - L -->|"Healthy?"| O - M -->|"Decommissioned?"| O + K -->|"Deployed"| O["Update State"] + L -->|"Updated"| O + M -->|"Decommissioned"| O O --> P["Persist to State Manager"] - P --> Q["🚀 Reconciliation
Complete"] + P --> Q["🚀 Deploy
Complete"] + + Q -.->|"Endpoints boot"| F["Peer Endpoints
Running"] F -.->|"Peer-to-peer
Service Discovery"| R["Query State Manager
GraphQL API"] style A fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff - style G fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style E fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff style I fill:#f57c00,stroke:#bf360c,stroke-width:3px,color:#fff style K fill:#f57c00,stroke:#bf360c,stroke-width:3px,color:#fff style L fill:#f57c00,stroke:#bf360c,stroke-width:3px,color:#fff @@ -51,9 +49,9 @@ graph TD ```mermaid graph TD - A["Request arrives at
Mothership for funcA"] -->|"Consult manifest"| B{"Function
Location?"} + A["Request arrives at
Endpoint for funcA"] -->|"Consult manifest"| B{"Function
Location?"} - B -->|"Local to Mothership"| C["Execute locally"] + B -->|"Local to Endpoint"| C["Execute locally"] B -->|"On Endpoint1"| D["Route request to
Endpoint1 with payload"] D --> E["Endpoint1 receives
Endpoint1>funcA"] @@ -69,7 +67,7 @@ graph TD L --> J J --> M["funcA completes
with all results"] - M --> N["Response back
to Mothership"] + M --> N["Response back
to Endpoint"] N --> O["Return to client"] style A fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff @@ -82,7 +80,7 @@ graph TD ```mermaid graph LR - subgraph Mothership["🎯 Mothership
(Coordinator)"] + subgraph CoordinatorNode["🎯 Manifest Store"] MF["Manifest Store
Function Map"] end @@ -105,7 +103,7 @@ graph LR E1F1 -.->|"Local execution"| E1F2 E1F1 -.->|"Remote call"| E2F1 - style Mothership fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style CoordinatorNode fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff style EP1 fill:#f57c00,stroke:#bf360c,stroke-width:3px,color:#fff style EP2 fill:#f57c00,stroke:#bf360c,stroke-width:3px,color:#fff style MF fill:#1565c0,stroke:#0d47a1,stroke-width:2px,color:#fff @@ -118,8 +116,8 @@ graph LR - **Smart Routing**: System automatically determines if execution is local (in-process) or remote (inter-endpoint) - **Deployed Mode**: Unlike Live mode, endpoints are aware they're in distributed deployment with explicit role assignments - **Transparent Execution**: Functions can call other functions without knowing deployment topology; manifest handles routing -- **State Synchronization**: Mothership maintains single source of truth, synced with GQL State Manager -- **Reconciliation**: On each boot, Mothership reconciles local manifest with persisted state to deploy/update/undeploy resources +- **State Synchronization**: State Manager maintains the source of truth; endpoints sync via GraphQL +- **Reconciliation**: The CLI reconciles the manifest with persisted state during `flash deploy` - **Peer-to-Peer Discovery**: All endpoints query State Manager GraphQL API directly for service discovery ## Actual Manifest Structure @@ -285,12 +283,9 @@ Each reconciliation action updates State Manager: ## Environment Variables -### Mothership -- `FLASH_IS_MOTHERSHIP=true` - Identifies this endpoint as mothership -- `RUNPOD_API_KEY` - For State Manager authentication -- `FLASH_MANIFEST_PATH` - Optional explicit path to manifest - -### Child Endpoints +### All Endpoints - `RUNPOD_API_KEY` - For State Manager GraphQL access (peer-to-peer service discovery) - `FLASH_RESOURCE_NAME` - Which resource config this endpoint represents -- `RUNPOD_ENDPOINT_ID` - This child's endpoint ID +- 
`RUNPOD_ENDPOINT_ID` - This endpoint's ID (set by Runpod) +- `FLASH_MANIFEST_PATH` - Optional explicit path to manifest +- `FLASH_IS_MOTHERSHIP` - Legacy env var; not used in production `flash deploy` diff --git a/docs/Flash_Deploy_Guide.md b/docs/Flash_Deploy_Guide.md index 234e5f33..2a88898a 100644 --- a/docs/Flash_Deploy_Guide.md +++ b/docs/Flash_Deploy_Guide.md @@ -30,12 +30,7 @@ graph TB subgraph Cloud["Runpod Cloud"] S3["S3 Storage
artifact.tar.gz"] - subgraph Mothership["Mothership Endpoint
(FLASH_IS_MOTHERSHIP=true)"] - MothershipReconciler["MothershipsProvisioner
Reconcile Children"] - MothershipState["State Sync
to State Manager"] - end - - subgraph ChildEndpoints["Child Endpoints
(Resource Configs)"] + subgraph Endpoints["Peer Endpoints
(one per resource config)"] Handler1["GPU Handler
@remote functions"] Handler2["CPU Handler
@remote functions"] StateQuery["Service Registry
Query State Manager"] @@ -47,22 +42,19 @@ graph TB Developer -->|flash build| Build Build -->|archive| S3 Developer -->|flash deploy --env| S3 - CLI -->|provision upfront
before activation| ChildEndpoints - Mothership -->|reconcile_children
on boot| ChildEndpoints - MothershipReconciler -->|update state| Database - ChildEndpoints -->|query manifest
peer-to-peer| Database - Developer -->|call @remote| ChildEndpoints - - style Mothership fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff - style ChildEndpoints fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + CLI -->|provision all endpoints| Endpoints + Endpoints -->|query manifest
peer-to-peer| Database + Developer -->|call @remote| Endpoints + + style Endpoints fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff style Build fill:#f57c00,stroke:#e65100,stroke-width:3px,color:#fff ``` ### Key Concepts -**Mothership**: The orchestration endpoint responsible for deployment, resource provisioning, and manifest distribution. Created via `flash env create `. +**Endpoints**: All deployed endpoints are peers. The CLI provisions them upfront during `flash deploy`. Each endpoint loads the manifest from its `.flash/` directory and queries State Manager for peer discovery. -**Child Endpoints**: Worker endpoints that execute `@remote` functions. One per resource config (e.g., `gpu_config`, `cpu_config`). +**Worker Endpoints**: Endpoints that execute `@remote` functions. One per resource config (e.g., `gpu_config`, `cpu_config`). **Manifest**: JSON document describing all deployed functions, their resource configs, routing rules, and metadata. Built at compile-time, distributed to all endpoints. @@ -76,7 +68,7 @@ graph TB ### flash env create -Create a new deployment environment (mothership). +Create a new deployment environment. ```bash flash env create [--app ] @@ -91,7 +83,7 @@ flash env create [--app ] **What it does:** 1. Creates a FlashApp in Runpod (if first environment for the app) 2. Creates FlashEnvironment with the specified name -3. Provisions a mothership serverless endpoint +3. Provisions serverless endpoints **Example:** ```bash @@ -277,93 +269,37 @@ sequenceDiagram **Upload Process** (`src/runpod_flash/cli/commands/deploy.py:197-224`): 1. Archive uploaded to Runpod's built-in S3 storage 2. URL generated with temporary access -3. URL passed to mothership endpoint creation +3. 
URL passed to endpoint creation **Key Files:** - `src/runpod_flash/cli/commands/deploy.py` - Deploy CLI commands --- -### Phase 3: Mothership Boot & Reconciliation - -The mothership runs on each boot to perform reconcile_children() - reconciling desired state (manifest) with current state (local resources). Note: All resources are provisioned upfront by the CLI before environment activation. - -```mermaid -sequenceDiagram - Runpod->>Mothership: Boot endpoint - Mothership->>Mothership: Initialize runtime - Mothership->>ManifestFetcher: Load manifest from .flash/ - ManifestFetcher->>ManifestFetcher: Read flash_manifest.json - Mothership->>MothershipsProvisioner: Execute reconcile_children() - MothershipsProvisioner->>StateManager: Fetch persisted state - StateManager->>GraphQL: Query persisted manifest - GraphQL->>StateManager: Return persisted manifest - MothershipsProvisioner->>MothershipsProvisioner: Compute diff:
new, changed, removed - MothershipsProvisioner->>StateManager: Update state after
reconciliation - StateManager->>GraphQL: Mutation:
updateFlashBuildManifest - MothershipsProvisioner->>Mothership: Reconciliation complete -``` - -**Key Components:** - -**MothershipsProvisioner** (`src/runpod_flash/runtime/mothership_provisioner.py`): -- `is_mothership()`: Check if endpoint is mothership (FLASH_IS_MOTHERSHIP=true) -- `reconcile_children()`: Compute diff between desired and current state -- Verifies child endpoints are deployed and healthy -- Updates State Manager with reconciliation results - -**ResourceManager** (`src/runpod_flash/core/resources/resource_manager.py`): -- Singleton pattern (global resource registry) -- Stores state in `.runpod/resources.pkl` with file locking -- Tracks config hashes for drift detection (hash comparison) -- Provisioned upfront by CLI before environment activation -- Auto-migrates legacy resources - -**StateManagerClient** (`src/runpod_flash/runtime/state_manager_client.py`): -- GraphQL client for persisting manifest state -- Read-modify-write pattern for updates (3 GQL roundtrips) -- Thread-safe with asyncio.Lock for concurrent updates -- Retries with exponential backoff (3 attempts) - -**Reconciliation Logic**: -1. **Fetch persisted manifest**: Query State Manager for previous reconciliation state -2. **Compare with current manifest**: Detect new, changed, and removed resources -3. **Verify new resources**: Check that new endpoints are deployed and healthy -4. **Verify changed resources**: Check if hash differs, verify endpoint health -5. **Verify removed resources**: Check that deleted endpoints are decommissioned -6. 
**Persist new state**: Update State Manager with current reconciliation results - -**Key Files:** -- `src/runpod_flash/runtime/mothership_provisioner.py` - Reconciliation logic -- `src/runpod_flash/core/resources/resource_manager.py` - Resource provisioning -- `src/runpod_flash/runtime/state_manager_client.py` - State persistence - ---- - -### Phase 4: Child Endpoint Initialization +### Phase 3: Endpoint Boot & Service Discovery -Each child endpoint boots independently and prepares for function execution. +Each endpoint boots independently. Endpoints that make cross-endpoint calls (i.e., call `@remote` functions deployed on a different resource config) query State Manager to discover peer endpoint URLs. Endpoints that only execute local functions do not need State Manager access. ```mermaid sequenceDiagram - Runpod->>Child: Boot with handler_gpu_config.py - Child->>Child: Initialize runtime - Child->>ManifestFetcher: Load manifest from .flash/ + Runpod->>Endpoint: Boot with handler + Endpoint->>Endpoint: Initialize runtime + Endpoint->>ManifestFetcher: Load manifest from .flash/ ManifestFetcher->>ManifestFetcher: Check cache
(TTL: 300s) alt Cache expired - ManifestFetcher->>StateManager: Query GraphQL API
State Manager + ManifestFetcher->>StateManager: Query GraphQL API StateManager->>ManifestFetcher: Return manifest else Cache valid ManifestFetcher->>ManifestFetcher: Return cached end - ManifestFetcher->>Child: Manifest loaded - Child->>ServiceRegistry: Load manifest + ManifestFetcher->>Endpoint: Manifest loaded + Endpoint->>ServiceRegistry: Load manifest ServiceRegistry->>ServiceRegistry: Build function_registry ServiceRegistry->>ServiceRegistry: Build resource_mapping - Child->>StateManager: Query State Manager
peer-to-peer discovery - StateManager->>Child: Return peer endpoints - Child->>ServiceRegistry: Cache endpoint URLs - Child->>Ready: Ready to execute functions + Endpoint->>StateManager: Query State Manager
peer-to-peer discovery + StateManager->>Endpoint: Return peer endpoints + Endpoint->>ServiceRegistry: Cache endpoint URLs + Endpoint->>Ready: Ready to execute functions ``` **ManifestFetcher** (`src/runpod_flash/runtime/manifest_fetcher.py`): @@ -394,7 +330,7 @@ sequenceDiagram --- -### Phase 5: Runtime Function Execution +### Phase 4: Runtime Function Execution When client calls `@remote function`: @@ -530,7 +466,7 @@ The manifest is the contract between build-time and runtime. It defines all depl ### Runtime: Distribution & Caching -**Mothership Side** - `ManifestFetcher`: +**Endpoint Side** - `ManifestFetcher`: 1. **Check cache**: Is manifest cached and TTL valid? - Cache TTL: 300 seconds (configurable) @@ -547,7 +483,7 @@ The manifest is the contract between build-time and runtime. It defines all depl **Code Reference**: `src/runpod_flash/runtime/manifest_fetcher.py:47-118` -**Child Endpoint Side** - `ServiceRegistry`: +**Worker Endpoint Side** - `ServiceRegistry`: 1. **Load manifest**: From local file - Searches multiple locations (cwd, module dir, etc) @@ -558,7 +494,7 @@ The manifest is the contract between build-time and runtime. It defines all depl 3. **Query State Manager**: Get endpoint URLs via GraphQL - Queries Runpod State Manager GraphQL API directly - - Returns: Resource endpoints for all deployed child endpoints + - Returns: Resource endpoints for all deployed worker endpoints - Retries with exponential backoff 4. **Cache endpoints**: Store for routing decisions @@ -608,7 +544,7 @@ Write: Mutation updateFlashBuildManifest ## Resource Provisioning -Resources are dynamically provisioned by the mothership during boot, based on the manifest. +Resources are provisioned by the CLI during `flash deploy`, based on the manifest. 
### ResourceManager: Local State @@ -646,7 +582,7 @@ Resources are dynamically provisioned by the mothership during boot, based on th ### Deployment Orchestration -**MothershipsProvisioner** reconciles manifest with local state: +The **MothershipsProvisioner** reconciles the manifest with the endpoint's local state: ```python # 1. Load manifest from flash_manifest.json @@ -870,21 +806,15 @@ graph TB Archive["Archive Builder
(tar.gz)"] end - subgraph Upload["Upload"] + subgraph Deploy["Deploy (CLI)"] S3["S3 Storage"] + Provisioner["ResourceManager
(provision endpoints)"] + StateMgr["StateManagerClient
(persist state)"] end - subgraph MothershipBoot["Mothership Boot"] - Fetcher["ManifestFetcher
(cache + GQL)"] - MProvisioner["MothershipsProvisioner
(reconciliation)"] - ResMgr["ResourceManager
(state)"] - StateMgr["StateManagerClient
(persistence)"] - end - - subgraph ChildBoot["Child Endpoint Boot"] - ChildFetcher["ManifestFetcher
(local file)"] + subgraph EndpointBoot["Endpoint Boot"] + Fetcher["ManifestFetcher
(local file + GQL)"] Registry["ServiceRegistry
(function mapping)"] - ManifestC["ManifestClient
(query mothership)"] end subgraph Runtime["Runtime Execution"] @@ -896,20 +826,16 @@ graph TB Scanner --> ManifestB ManifestB --> Archive Archive --> S3 - S3 --> Fetcher - Fetcher --> MProvisioner - MProvisioner --> ResMgr - ResMgr --> StateMgr - StateMgr -->|update| S3 - ChildFetcher --> Registry - ManifestC -->|query| Fetcher - Registry --> ManifestC + S3 --> Provisioner + Provisioner --> StateMgr + Fetcher --> Registry + Registry -->|query State Manager
peer-to-peer| StateMgr Handler --> Serial Serial --> Exec style Build fill:#f57c00,stroke:#e65100,stroke-width:3px,color:#fff - style MothershipBoot fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff - style ChildBoot fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff + style Deploy fill:#1976d2,stroke:#0d47a1,stroke-width:3px,color:#fff + style EndpointBoot fill:#388e3c,stroke:#1b5e20,stroke-width:3px,color:#fff style Runtime fill:#7b1fa2,stroke:#4a148c,stroke-width:3px,color:#fff ``` @@ -921,14 +847,12 @@ graph TB graph LR A["Build Time
ManifestBuilder"] -->|Generate| B["flash_manifest.json
(embedded in archive)"] B -->|Upload| C["S3
(artifact.tar.gz)"] - C -->|Provision upfront
before activation| D["Child Endpoints
(deployed)"] + C -->|CLI provisions
endpoints| D["Endpoints
(deployed)"] D -->|Extract from
.flash/ directory| E["LocalManifest
(from archive)"] - Mothership -->|Load from
.flash/| E E -->|Build registry| F["ServiceRegistry
(function mapping)"] F -->|Query State Manager
peer-to-peer| G["StateManager
(GraphQL API)"] G -->|Return endpoints| F F -->|Route calls| H["Handler
(execute)"] - Mothership -->|reconcile_children
on boot| D style A fill:#f57c00,stroke:#e65100,stroke-width:2px,color:#fff style B fill:#ff6f00,stroke:#e65100,stroke-width:2px,color:#fff @@ -938,7 +862,6 @@ graph LR style F fill:#388e3c,stroke:#1b5e20,stroke-width:2px,color:#fff style G fill:#0d47a1,stroke:#051c66,stroke-width:2px,color:#fff style H fill:#388e3c,stroke:#1b5e20,stroke-width:2px,color:#fff - style Mothership fill:#1976d2,stroke:#0d47a1,stroke-width:2px,color:#fff ``` --- @@ -947,7 +870,7 @@ graph LR ```mermaid graph LR - A["Mothership Boots"] -->|Load manifest| B["Desired State"] + A["CLI: flash deploy"] -->|Load manifest| B["Desired State"] B -->|Fetch persisted| C["Current State"] C -->|Compute diff| D{"Reconciliation"} D -->|new| E["Create Resource"] @@ -959,7 +882,7 @@ graph LR D -->|removed| J["Delete Resource"] J -->|Decommission| K["Deleted"] K -->|Remove state| G - G -->|On next boot| C + G -->|On next deploy| C style A fill:#1976d2,stroke:#0d47a1,stroke-width:2px,color:#fff style B fill:#1976d2,stroke:#0d47a1,stroke-width:2px,color:#fff @@ -974,38 +897,31 @@ graph LR ## Environment Variables Reference -### Mothership Configuration - -**FLASH_IS_MOTHERSHIP** (Required on mothership) -- Value: `"true"` -- Enables mothership auto-provisioning logic -- Triggers manifest reconciliation on boot - -**RUNPOD_ENDPOINT_ID** (Required on mothership) -- Runpod serverless endpoint ID -- Used to construct mothership URL: `https://{RUNPOD_ENDPOINT_ID}.api.runpod.ai` -- Set automatically by Runpod platform +### All Endpoints -**RUNPOD_API_KEY** (Required for State Manager) +**RUNPOD_API_KEY** (Required) - Runpod API authentication token - Used by StateManagerClient for GraphQL queries -- Enables manifest persistence +- Enables peer-to-peer service discovery and manifest persistence -### Child Endpoint Configuration - -**FLASH_RESOURCE_NAME** (Required on child endpoints) +**FLASH_RESOURCE_NAME** (Required) - Resource config name (e.g., "gpu_config", "cpu_config") - Identifies which resource config this 
endpoint represents - Used by ServiceRegistry for local vs remote detection -**RUNPOD_API_KEY** (Required for peer-to-peer discovery) -- API key for State Manager GraphQL access -- Enables endpoints to query manifest peer-to-peer -- Used by all endpoints for service discovery +**RUNPOD_ENDPOINT_ID** (Set by Runpod) +- Runpod serverless endpoint ID +- Used to construct endpoint URL: `https://{RUNPOD_ENDPOINT_ID}.api.runpod.ai` +- Set automatically by Runpod platform **FLASH_MANIFEST_PATH** (Optional) - Override default manifest file location - If not set, searches: cwd, module dir, parent dirs + +**FLASH_IS_MOTHERSHIP** (Legacy) +- Value: `"true"` +- Triggers `reconcile_children()` on boot +- Not used in production `flash deploy` (CLI handles reconciliation) - Useful for testing or non-standard layouts ### Runtime Configuration @@ -1048,7 +964,7 @@ Flash Deploy uses a dual-layer state system for reliability and consistency. ### Remote State: Runpod State Manager (GraphQL API) -**Purpose**: Persist deployment state across mothership boots +**Purpose**: Persist deployment state across endpoint boots **Data Model**: ```graphql @@ -1092,7 +1008,7 @@ async with state_manager_lock: ``` **Reconciliation**: -On mothership boot: +On deploy: 1. Load local manifest from .flash/ (desired state) 2. Fetch persisted manifest from State Manager (previous reconciliation state) 3. Compare → detect new, changed, removed resources @@ -1117,9 +1033,9 @@ flash build --preview 1. Builds your project (creates archive, manifest) 2. Creates a Docker network for inter-container communication 3. Starts one Docker container per resource config: - - Mothership container (orchestrator) + - Application container - All worker containers (GPU, CPU, etc.) -4. Exposes mothership on `localhost:8000` +4. Exposes application on `localhost:8000` 5. All containers communicate via Docker DNS 6. 
Auto-cleanup on exit (Ctrl+C) @@ -1140,7 +1056,7 @@ For testing complete deployment flow locally: # Build project flash build -# Start local mothership simulator +# Start local endpoint simulator docker run -it \ -e FLASH_IS_MOTHERSHIP=true \ -e RUNPOD_API_KEY=$RUNPOD_API_KEY \ @@ -1189,7 +1105,7 @@ logging.getLogger("runpod_flash.runtime.service_registry").setLevel(logging.DEBU |------|---------| | `src/runpod_flash/cli/commands/deploy.py` | Deploy environment management commands | | `src/runpod_flash/cli/commands/build.py` | Build packaging and archive creation | -| `src/runpod_flash/cli/commands/test_mothership.py` | Local mothership testing | +| `src/runpod_flash/cli/commands/test_mothership.py` | Local endpoint testing | ### Build System @@ -1235,7 +1151,7 @@ logging.getLogger("runpod_flash.runtime.service_registry").setLevel(logging.DEBU ## Common Issues & Solutions -### Issue: Manifest not found on child endpoint +### Issue: Manifest not found on worker endpoint **Cause**: flash_manifest.json not included in archive or not found at runtime @@ -1255,13 +1171,13 @@ logging.getLogger("runpod_flash.runtime.service_registry").setLevel(logging.DEBU ### Issue: Remote function calls fail with endpoint not found -**Cause**: ServiceRegistry unable to query mothership or manifest outdated +**Cause**: ServiceRegistry unable to query State Manager or manifest outdated **Solution**: 1. Verify `RUNPOD_API_KEY` environment variable is set 2. Check State Manager GraphQL API is accessible 3. Verify manifest includes the resource config: `grep resource_name flash_manifest.json` -4. Check that child endpoints are deployed and healthy +4. 
Check that worker endpoints are deployed and healthy ### Issue: Manifest cache staleness diff --git a/docs/Load_Balancer_Endpoints.md b/docs/Load_Balancer_Endpoints.md index 091e1893..77ba38bc 100644 --- a/docs/Load_Balancer_Endpoints.md +++ b/docs/Load_Balancer_Endpoints.md @@ -4,7 +4,7 @@ The `LoadBalancerSlsResource` class enables provisioning and management of Runpod load-balanced serverless endpoints. Unlike queue-based endpoints that process requests sequentially, load-balanced endpoints expose HTTP servers directly to clients, enabling REST APIs, webhooks, and real-time communication patterns. -This resource type is used for specialized endpoints like the Mothership. Cross-endpoint service discovery now uses State Manager GraphQL API (peer-to-peer) rather than HTTP endpoints. +This resource type is used for specialized endpoints like entry-point endpoints. Cross-endpoint service discovery now uses State Manager GraphQL API (peer-to-peer) rather than HTTP endpoints. ## Design Context @@ -35,10 +35,10 @@ Load-balanced endpoints require different provisioning and health check logic th ### Why This Matters -The Mothership coordinates resource deployment and reconciliation. This requires: -- Peer-to-peer service discovery via State Manager GraphQL API (not HTTP-based) -- Ability to expose custom endpoints (HTTP routes like `/ping`, user-defined routes) -- Health checking to verify children are ready before routing traffic +Load-balanced endpoints expose HTTP servers directly to clients. 
This enables: +- Custom HTTP routes (user-defined REST endpoints, `/ping` for health checks) +- Direct request routing to workers (lower latency than queue-based) +- Health check polling to verify workers are ready before routing traffic ## Architecture @@ -147,9 +147,9 @@ This document focuses on the `LoadBalancerSlsResource` class implementation and from runpod_flash import LoadBalancerSlsResource # Create a load-balanced endpoint -mothership = LoadBalancerSlsResource( - name="mothership", - imageName="my-mothership-app:latest", +api_endpoint = LoadBalancerSlsResource( + name="api-endpoint", + imageName="my-api-app:latest", workersMin=1, workersMax=3, env={ @@ -159,7 +159,7 @@ mothership = LoadBalancerSlsResource( ) # Deploy endpoint (returns immediately) -deployed = await mothership.deploy() +deployed = await api_endpoint.deploy() # Endpoint is now deployed (may still be initializing) print(f"Endpoint ID: {deployed.id}") @@ -246,7 +246,7 @@ except ValueError as e: ```python try: endpoint = LoadBalancerSlsResource( - name="mothership", + name="api-endpoint", imageName="my-image:latest", ) deployed = await endpoint.deploy() @@ -294,10 +294,10 @@ If you need to verify the endpoint is ready before routing traffic: ```python # Deploy returns immediately -mothership = await LoadBalancerSlsResource(name="my-lb", ...).deploy() +endpoint = await LoadBalancerSlsResource(name="my-lb", ...).deploy() # Optional: Wait for endpoint to become healthy -healthy = await mothership._wait_for_health(max_retries=10, retry_interval=5) +healthy = await endpoint._wait_for_health(max_retries=10, retry_interval=5) if not healthy: print("Warning: Endpoint deployed but not yet healthy") ``` @@ -319,7 +319,7 @@ Default health check configuration: | Scalability | Per-function | Per-worker | | Health checks | Runpod SDK | `/ping` endpoint | | Use cases | Batch processing | APIs, webhooks, real-time | -| Suitable for | Workers | Mothership, services | +| Suitable for | Workers | APIs, services 
| ## Implementation Details @@ -411,7 +411,6 @@ endpoint = LoadBalancerSlsResource( ## Next Steps -- **Mothership integration**: Use LoadBalancerSlsResource for Mothership endpoints +- **Entry-point integration**: Use LoadBalancerSlsResource for entry-point endpoints - **Upfront provisioning**: CLI provisions all resources before environment activation -- **Reconciliation**: Mothership performs reconcile_children() on boot - **Cross-endpoint routing**: Route requests using State Manager GraphQL API (peer-to-peer) diff --git a/src/runpod_flash/cli/docs/README.md b/src/runpod_flash/cli/docs/README.md index a9a70853..1a1b4dfe 100644 --- a/src/runpod_flash/cli/docs/README.md +++ b/src/runpod_flash/cli/docs/README.md @@ -15,7 +15,7 @@ Create a new project, navigate to it, and install dependencies: ```bash flash init my-project cd my-project -pip install -r requirements.txt +uv sync # or: pip install -r requirements.txt ``` Add your Runpod API key to `.env`: @@ -295,16 +295,10 @@ Default location: `.flash/logs/activity.log` ``` my-project/ -├── main.py # Flash Server (FastAPI) -├── workers/ -│ ├── gpu/ # GPU worker -│ │ ├── __init__.py -│ │ └── endpoint.py -│ └── cpu/ # CPU worker -│ ├── __init__.py -│ └── endpoint.py +├── gpu_worker.py # GPU worker with @remote function +├── cpu_worker.py # CPU worker with @remote function ├── .env -├── requirements.txt +├── pyproject.toml # Python dependencies (uv/pip compatible) └── README.md ``` @@ -322,12 +316,12 @@ RUNPOD_API_KEY=your_api_key_here curl http://localhost:8888/ping # Call GPU worker -curl -X POST http://localhost:8888/gpu/hello \ +curl -X POST http://localhost:8888/gpu_worker/run_sync \ -H "Content-Type: application/json" \ -d '{"message": "Hello GPU!"}' # Call CPU worker -curl -X POST http://localhost:8888/cpu/hello \ +curl -X POST http://localhost:8888/cpu_worker/run_sync \ -H "Content-Type: application/json" \ -d '{"message": "Hello CPU!"}' ``` diff --git a/src/runpod_flash/cli/docs/flash-app.md 
b/src/runpod_flash/cli/docs/flash-app.md index 3abc29a2..00cecaff 100644 --- a/src/runpod_flash/cli/docs/flash-app.md +++ b/src/runpod_flash/cli/docs/flash-app.md @@ -444,8 +444,7 @@ flash deploy --app my-project ``` Or ensure you're in a valid Flash project directory with: -- `main.py` with Flash server -- `workers/` directory +- Python files containing `@remote` decorated functions - Proper project structure ### Multiple Apps With Same Name diff --git a/src/runpod_flash/cli/docs/flash-build.md b/src/runpod_flash/cli/docs/flash-build.md index 120fe60e..deb0e633 100644 --- a/src/runpod_flash/cli/docs/flash-build.md +++ b/src/runpod_flash/cli/docs/flash-build.md @@ -108,9 +108,9 @@ Launch a local Docker-based test environment immediately after building. This al 1. Builds your project (creates archive, manifest) 2. Creates a Docker network for inter-container communication 3. Starts one Docker container per resource config: - - Mothership container (orchestrator) + - Application container - All worker containers (GPU, CPU, etc.) -4. Exposes the mothership on `localhost:8000` +4. Exposes the application on `localhost:8888` 5. All containers communicate via Docker DNS 6. 
On shutdown (Ctrl+C), automatically stops and removes all containers @@ -192,7 +192,7 @@ Successful build displays: ### Build fails with "functions not found" -Ensure your project has `@remote` decorated functions in `workers/` directory: +Ensure your project has `@remote` decorated functions in your `.py` files: ```python from runpod_flash import remote, LiveServerless diff --git a/src/runpod_flash/cli/docs/flash-deploy.md b/src/runpod_flash/cli/docs/flash-deploy.md index 504ad874..d0fcb6a7 100644 --- a/src/runpod_flash/cli/docs/flash-deploy.md +++ b/src/runpod_flash/cli/docs/flash-deploy.md @@ -27,25 +27,25 @@ The `flash deploy` command is the primary way to get your Flash application runn ## Architecture: Fully Deployed to Runpod -With `flash deploy`, your **entire application** runs on Runpod Serverless—both your FastAPI app (the "orchestrator") and all `@remote` worker functions: +With `flash deploy`, your **entire application** runs on Runpod Serverless—all `@remote` functions deploy as peer serverless endpoints: ``` ┌─────────────────────────────────────────────────────────────────┐ │ RUNPOD SERVERLESS │ │ │ -│ ┌─────────────────────────────────────┐ │ -│ │ MOTHERSHIP ENDPOINT │ │ -│ │ (your FastAPI app from main.py) │ │ -│ │ - Your HTTP routes │ │ -│ │ - Orchestrates @remote calls │───────────┐ │ -│ │ - Public URL for users │ │ │ -│ └─────────────────────────────────────┘ │ │ -│ │ internal │ -│ ▼ │ +│ All endpoints deployed as peers, using manifest for discovery │ +│ │ │ ┌─────────────────────────┐ ┌─────────────────────────┐ │ │ │ gpu-worker │ │ cpu-worker │ │ │ │ (your @remote function) │ │ (your @remote function) │ │ │ └─────────────────────────┘ └─────────────────────────┘ │ +│ │ +│ ┌─────────────────────────┐ │ +│ │ lb-worker │ │ +│ │ (load-balanced endpoint)│ │ +│ └─────────────────────────┘ │ +│ │ +│ Service discovery: flash_manifest.json + State Manager GraphQL │ └─────────────────────────────────────────────────────────────────┘ ▲ │ HTTPS 
(authenticated) @@ -56,9 +56,8 @@ With `flash deploy`, your **entire application** runs on Runpod Serverless—bot ``` **Key points:** -- **Your FastAPI app runs on Runpod** as the "mothership" endpoint -- **`@remote` functions run on Runpod** as separate worker endpoints -- **Users call the mothership URL** directly (e.g., `https://xyz123.api.runpod.ai/api/hello`) +- **All `@remote` functions run on Runpod** as serverless endpoints +- **Users call endpoint URLs** directly (e.g., `https://xyz123.api.runpod.ai/api/hello`) - **No `live-` prefix** on endpoint names (these are production endpoints) - **No hot reload:** code changes require a new deployment @@ -68,7 +67,7 @@ This is different from `flash run`, where your FastAPI app runs locally on your | Aspect | `flash run` | `flash deploy` | |--------|-------------|----------------| -| **FastAPI app runs on** | Your machine (localhost) | Runpod Serverless (mothership) | +| **App runs on** | Your machine (localhost) | Runpod Serverless | | **`@remote` functions run on** | Runpod Serverless | Runpod Serverless | | **Endpoint naming** | `live-` prefix (e.g., `live-gpu-worker`) | No prefix (e.g., `gpu-worker`) | | **Hot reload** | Yes | No | @@ -183,9 +182,9 @@ Builds your project and launches a local Docker-based test environment instead o 1. Builds your project (creates the archive and manifest) 2. Creates a Docker network for inter-container communication 3. Starts one Docker container per resource config: - - Mothership container (orchestrator) + - Application container - All worker containers (GPU, CPU, etc.) -4. Exposes the mothership on `localhost:8000` +4. Exposes the application on `localhost:8000` 5. All containers communicate via Docker DNS 6. On shutdown (Ctrl+C), automatically stops and removes all containers @@ -350,7 +349,7 @@ Next Steps: variable... 2. Call Your Functions - Your mothership is deployed at: + Your application is deployed at: https://api-xxxxx.runpod.net 3. 
Available Routes diff --git a/src/runpod_flash/cli/docs/flash-env.md b/src/runpod_flash/cli/docs/flash-env.md index c3f87744..81ce7993 100644 --- a/src/runpod_flash/cli/docs/flash-env.md +++ b/src/runpod_flash/cli/docs/flash-env.md @@ -464,8 +464,7 @@ flash env delete **Problem**: Command requires `--app` flag even when in project directory **Solution**: Ensure you're in a Flash project directory with: -- `main.py` with Flash server -- `workers/` directory +- Python files containing `@remote` decorated functions - `.env` file with `RUNPOD_API_KEY` Or specify app explicitly: diff --git a/src/runpod_flash/cli/docs/flash-init.md b/src/runpod_flash/cli/docs/flash-init.md index 082b619a..19c32f13 100644 --- a/src/runpod_flash/cli/docs/flash-init.md +++ b/src/runpod_flash/cli/docs/flash-init.md @@ -4,7 +4,7 @@ Create a new Flash project with a ready-to-use template structure. ## Overview -The `flash init` command scaffolds a new Flash project with everything you need to get started: a main server (mothership), example GPU and CPU workers, and the directory structure that Flash expects. It's the fastest way to go from zero to a working distributed application. +The `flash init` command scaffolds a new Flash project with everything you need to get started: example GPU and CPU worker files with `@remote` functions and the project structure that Flash expects. It's the fastest way to go from zero to a working distributed application. > **Note:** This command only creates **local files**. It doesn't interact with Runpod or create any cloud resources. Cloud resources (apps, environments, endpoints) are created later when you run `flash deploy`. 
@@ -51,16 +51,10 @@ flash init my-project --force ``` my-project/ -├── main.py # Flash Server (FastAPI) -├── workers/ -│ ├── gpu/ # GPU worker example -│ │ ├── __init__.py -│ │ └── endpoint.py -│ └── cpu/ # CPU worker example -│ ├── __init__.py -│ └── endpoint.py +├── gpu_worker.py # GPU worker with @remote function +├── cpu_worker.py # CPU worker with @remote function ├── .env -├── requirements.txt +├── pyproject.toml # Python dependencies (uv/pip compatible) └── README.md ``` @@ -68,7 +62,7 @@ my-project/ ```bash cd my-project -pip install -r requirements.txt # or use your preferred environment manager +uv sync # or: pip install -r requirements.txt # Add RUNPOD_API_KEY to .env flash run ``` diff --git a/src/runpod_flash/cli/docs/flash-run.md b/src/runpod_flash/cli/docs/flash-run.md index 0b9cfd73..70976d6c 100644 --- a/src/runpod_flash/cli/docs/flash-run.md +++ b/src/runpod_flash/cli/docs/flash-run.md @@ -4,7 +4,7 @@ Start the Flash development server for testing/debugging/development. ## Overview -The `flash run` command starts a local development server that hosts your FastAPI app on your machine while deploying `@remote` functions to Runpod Serverless. This hybrid architecture lets you rapidly iterate on your application with hot-reload while testing real GPU/CPU workloads in the cloud. +The `flash run` command starts a local development server that auto-discovers your `@remote` functions and serves them on your machine while deploying them to Runpod Serverless. This hybrid architecture lets you rapidly iterate on your application with hot-reload while testing real GPU/CPU workloads in the cloud. Use `flash run` when you want to skip the build step and test/develop/debug your remote functions rapidly before deploying your full application with `flash deploy`. (See [Flash Deploy](./flash-deploy.md) for details.) 
@@ -16,10 +16,10 @@ With `flash run`, your system runs in a **hybrid architecture**: ┌─────────────────────────────────────────────────────────────────┐ │ YOUR MACHINE (localhost:8888) │ │ ┌─────────────────────────────────────┐ │ -│ │ FastAPI App (main.py) │ │ -│ │ - Your HTTP routes │ │ -│ │ - Orchestrates @remote calls │─────────┐ │ -│ │ - Hot-reload enabled │ │ │ +│ │ Auto-generated server │ │ +│ │ (.flash/server.py) │ │ +│ │ - Discovers @remote functions │─────────┐ │ +│ │ - Hot-reload via watchfiles │ │ │ │ └─────────────────────────────────────┘ │ │ └──────────────────────────────────────────────────│──────────────┘ │ HTTPS @@ -34,10 +34,11 @@ With `flash run`, your system runs in a **hybrid architecture**: ``` **Key points:** -- **Your FastAPI app runs locally** on your machine (uvicorn at `localhost:8888`) +- **`flash run` auto-discovers `@remote` functions** and generates `.flash/server.py` +- **Queue-based (QB) routes execute locally** at `/{file_prefix}/run_sync` +- **Load-balanced (LB) routes dispatch remotely** via `LoadBalancerSlsStub` - **`@remote` functions run on Runpod** as serverless endpoints -- **Your machine is the orchestrator** that calls remote endpoints when you invoke `@remote` functions -- **Hot reload works** because your app code is local—changes are picked up instantly +- **Hot reload** watches for `.py` file changes via watchfiles - **Endpoints are prefixed with `live-`** to distinguish development endpoints from production (e.g., `gpu-worker` becomes `live-gpu-worker`) This is different from `flash deploy`, where **everything** (including your FastAPI app) runs on Runpod. See [flash deploy](./flash-deploy.md) for the fully-deployed architecture. @@ -73,9 +74,9 @@ flash run --host 0.0.0.0 --port 8000 ## What It Does -1. Discovers `main.py` (or `app.py`, `server.py`) -2. Checks for FastAPI app -3. Starts uvicorn server with hot reload +1. Scans project files for `@remote` decorated functions +2. 
Generates `.flash/server.py` with QB and LB routes +3. Starts uvicorn server with hot-reload via watchfiles 4. GPU workers use LiveServerless (no packaging needed) ### How It Works @@ -84,8 +85,11 @@ When you call a `@remote` function using `flash run`, Flash deploys a **Serverle ``` flash run │ + ├── Scans project for @remote functions + ├── Generates .flash/server.py ├── Starts local server (e.g. localhost:8888) - │ └── Hosts your FastAPI mothership + │ ├── QB routes: /{file_prefix}/run_sync (local execution) + │ └── LB routes: /{file_prefix}/{path} (remote dispatch) │ └── On @remote function call: └── Deploys a Serverless endpoint (if not cached) @@ -106,7 +110,7 @@ Auto-provisioning discovers and deploys Serverless endpoints before the Flash de ### How It Works -1. **Resource Discovery**: Scans your FastAPI app for `@remote` decorated functions +1. **Resource Discovery**: Scans project files for `@remote` decorated functions 2. **Parallel Deployment**: Deploys resources concurrently (up to 3 at a time) 3. **Confirmation**: Asks for confirmation if deploying more than 5 endpoints 4. **Caching**: Stores deployed resources in `.runpod/resources.pkl` for reuse across runs From dd22a7212a4c0e0567cf151f83b737eb6f0b793f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Fri, 20 Feb 2026 02:17:47 -0800 Subject: [PATCH 20/26] docs: remove remaining mothership references from documentation Replace mothership_id with flash_environment_id in StateManagerClient examples, remove MothershipsProvisioner from prose, drop legacy FLASH_IS_MOTHERSHIP env var sections, remove outdated Docker testing section, and delete obsolete VERIFICATION.md. 
--- VERIFICATION.md | 303 -------------------------------- docs/Cross_Endpoint_Routing.md | 9 +- docs/Deployment_Architecture.md | 1 - docs/Flash_Deploy_Guide.md | 35 +--- 4 files changed, 9 insertions(+), 339 deletions(-) delete mode 100644 VERIFICATION.md diff --git a/VERIFICATION.md b/VERIFICATION.md deleted file mode 100644 index 02def0ec..00000000 --- a/VERIFICATION.md +++ /dev/null @@ -1,303 +0,0 @@ -# Docker Image Constants Fix - Verification Guide - -This document provides step-by-step instructions for verifying the Docker image constant configuration fix. - -## Overview - -**Commit**: `1f3a6fd` - "refactor(resources): centralize docker image configuration" - -The fix centralizes all Docker image references into constants that support environment variable overrides. This eliminates hardcoded image names and enables flexible configuration for local development, testing, and production deployment. - -## Quick Start - -### Run All Tests - -```bash -cd /Users/deanquinanola/Github/python/runpod-flash - -# Run the verification script -uv run python3 scripts/test-image-constants.py -``` - -Expected output: -``` -✓ 20/20 tests passed -✓ ALL TESTS PASSED - -The Docker image configuration fix is working correctly: - ✓ Constants are properly centralized - ✓ Manifest builder uses constants - ✓ LiveServerless classes use constants - ✓ Environment variables override constants - ✓ No hardcoded values remain -``` - -## Individual Test Scenarios - -### Test 1: Constants Are Defined - -```bash -uv run python3 << 'EOF' -import sys -sys.path.insert(0, 'src') - -from runpod_flash.core.resources.constants import ( - FLASH_IMAGE_TAG, - FLASH_GPU_IMAGE, - FLASH_CPU_IMAGE, - FLASH_LB_IMAGE, - FLASH_CPU_LB_IMAGE, - DEFAULT_WORKERS_MIN, - DEFAULT_WORKERS_MAX, -) - -print(f"FLASH_IMAGE_TAG: {FLASH_IMAGE_TAG}") -print(f"FLASH_GPU_IMAGE: {FLASH_GPU_IMAGE}") -print(f"FLASH_CPU_IMAGE: {FLASH_CPU_IMAGE}") -print(f"FLASH_LB_IMAGE: {FLASH_LB_IMAGE}") -print(f"FLASH_CPU_LB_IMAGE: 
{FLASH_CPU_LB_IMAGE}") -print(f"DEFAULT_WORKERS_MIN: {DEFAULT_WORKERS_MIN}") -print(f"DEFAULT_WORKERS_MAX: {DEFAULT_WORKERS_MAX}") -EOF -``` - -### Test 2: Environment Variable Override (FLASH_IMAGE_TAG=local) - -```bash -FLASH_IMAGE_TAG=local uv run python3 << 'EOF' -import sys -sys.path.insert(0, 'src') - -from runpod_flash.core.resources.constants import ( - FLASH_IMAGE_TAG, - FLASH_GPU_IMAGE, - FLASH_LB_IMAGE, - FLASH_CPU_LB_IMAGE, -) - -print(f"With FLASH_IMAGE_TAG={FLASH_IMAGE_TAG}:") -print(f" FLASH_GPU_IMAGE: {FLASH_GPU_IMAGE}") -print(f" FLASH_LB_IMAGE: {FLASH_LB_IMAGE}") -print(f" FLASH_CPU_LB_IMAGE: {FLASH_CPU_LB_IMAGE}") - -assert ":local" in FLASH_GPU_IMAGE -assert ":local" in FLASH_LB_IMAGE -assert ":local" in FLASH_CPU_LB_IMAGE -print("✓ All images use :local tag") -EOF -``` - -### Test 3: Individual Image Override - -```bash -FLASH_CPU_LB_IMAGE=custom/lb-cpu:v1 uv run python3 << 'EOF' -import sys -sys.path.insert(0, 'src') - -from runpod_flash.core.resources.constants import FLASH_CPU_LB_IMAGE - -print(f"FLASH_CPU_LB_IMAGE: {FLASH_CPU_LB_IMAGE}") -assert FLASH_CPU_LB_IMAGE == "custom/lb-cpu:v1" -print("✓ Custom override works") -EOF -``` - -### Test 4: Manifest Builder Uses Constants - -```bash -uv run python3 << 'EOF' -import sys -sys.path.insert(0, 'src') - -from pathlib import Path -from runpod_flash.cli.commands.build_utils.manifest import ManifestBuilder -from runpod_flash.core.resources.constants import ( - FLASH_CPU_LB_IMAGE, - DEFAULT_WORKERS_MIN, - DEFAULT_WORKERS_MAX, -) - -builder = ManifestBuilder(project_name="test", remote_functions=[]) -mothership = builder._create_mothership_resource({ - "file_path": Path("main.py"), - "app_variable": "app" -}) - -print(f"Mothership configuration:") -print(f" imageName: {mothership['imageName']} (expected: {FLASH_CPU_LB_IMAGE})") -print(f" workersMin: {mothership['workersMin']} (expected: {DEFAULT_WORKERS_MIN})") -print(f" workersMax: {mothership['workersMax']} (expected: {DEFAULT_WORKERS_MAX})") - 
-assert mothership['imageName'] == FLASH_CPU_LB_IMAGE -assert mothership['workersMin'] == DEFAULT_WORKERS_MIN -assert mothership['workersMax'] == DEFAULT_WORKERS_MAX - -print("✓ Manifest builder uses constants correctly") -EOF -``` - -### Test 5: LiveServerless Uses Constants - -```bash -uv run python3 << 'EOF' -import sys -sys.path.insert(0, 'src') - -from runpod_flash import LiveServerless, LiveLoadBalancer, CpuLiveLoadBalancer -from runpod_flash.core.resources.constants import ( - FLASH_GPU_IMAGE, - FLASH_LB_IMAGE, - FLASH_CPU_LB_IMAGE, -) - -gpu_ls = LiveServerless(name="test-gpu") -gpu_lb = LiveLoadBalancer(name="test-gpu-lb") -cpu_lb = CpuLiveLoadBalancer(name="test-cpu-lb") - -print(f"Resource image configuration:") -print(f" LiveServerless: {gpu_ls.imageName} (expected: {FLASH_GPU_IMAGE})") -print(f" LiveLoadBalancer: {gpu_lb.imageName} (expected: {FLASH_LB_IMAGE})") -print(f" CpuLiveLoadBalancer: {cpu_lb.imageName} (expected: {FLASH_CPU_LB_IMAGE})") - -assert gpu_ls.imageName == FLASH_GPU_IMAGE -assert gpu_lb.imageName == FLASH_LB_IMAGE -assert cpu_lb.imageName == FLASH_CPU_LB_IMAGE - -print("✓ All LiveServerless classes use correct image constants") -EOF -``` - -### Test 6: No Hardcoded Values Remain - -```bash -# Verify no hardcoded image names in manifest.py -grep -n "runpod/runpod-flash-lb" src/runpod_flash/cli/commands/build_utils/manifest.py || echo "✓ No hardcoded images found" - -# Verify constants are imported -grep "FLASH_CPU_LB_IMAGE\|FLASH_LB_IMAGE\|DEFAULT_WORKERS" src/runpod_flash/cli/commands/build_utils/manifest.py -``` - -### Test 7: Unit Tests Pass - -```bash -# Run manifest mothership tests -uv run pytest tests/unit/cli/commands/build_utils/test_manifest_mothership.py -v - -# Run all tests -uv run pytest --tb=short -``` - -## Test Coverage - -The verification tests cover: - -1. **Constants Definition** (✓ 7 tests) - - All 7 constants properly defined - - Default values correct - - Support environment variable overrides - -2. 
**Manifest Builder Integration** (✓ 3 tests) - - `_create_mothership_resource()` uses constants - - `_create_mothership_from_explicit()` uses constants - - Worker count constants used correctly - -3. **LiveServerless Integration** (✓ 3 tests) - - `LiveServerless` uses `FLASH_GPU_IMAGE` - - `LiveLoadBalancer` uses `FLASH_LB_IMAGE` - - `CpuLiveLoadBalancer` uses `FLASH_CPU_LB_IMAGE` - -4. **Environment Variable Overrides** (✓ 1 test) - - `FLASH_IMAGE_TAG=dev` works correctly - - Individual image overrides work - -5. **Code Quality** (✓ 6 tests) - - No hardcoded image names remain - - Constants are properly imported - - Code follows project patterns - -## Environment Variables - -### Global Override: FLASH_IMAGE_TAG - -Affects all images at once: - -```bash -export FLASH_IMAGE_TAG=local -# or -export FLASH_IMAGE_TAG=dev -# or -export FLASH_IMAGE_TAG=staging -``` - -### Individual Overrides - -Override specific images: - -```bash -export FLASH_GPU_IMAGE=my-registry/runpod-flash:custom -export FLASH_CPU_IMAGE=my-registry/runpod-flash-cpu:custom -export FLASH_LB_IMAGE=my-registry/runpod-flash-lb:custom -export FLASH_CPU_LB_IMAGE=my-registry/runpod-flash-lb-cpu:custom -``` - -## Files Modified - -- `src/runpod_flash/cli/commands/build_utils/manifest.py` - Uses constants -- `src/runpod_flash/cli/commands/test_mothership.py` - Uses constants -- `src/runpod_flash/core/resources/constants.py` - Centralizes constants -- `src/runpod_flash/core/resources/live_serverless.py` - Imports from constants -- `tests/unit/cli/commands/build_utils/test_manifest_mothership.py` - Updated tests - -## Related Documentation - -- **Commit**: `1f3a6fd` - Full diff of changes -- **CLAUDE.md**: Project development guidelines -- **README**: Project overview - -## Future Verification - -To re-run this verification after future changes: - -```bash -cd /Users/deanquinanola/Github/python/runpod-flash -uv run python3 scripts/test-image-constants.py -``` - -This script can be retained indefinitely and 
re-run to ensure the fix remains intact. - -## Troubleshooting - -### Test Fails with "Module not found" - -Make sure you're running from the runpod-flash directory: -```bash -cd /Users/deanquinanola/Github/python/runpod-flash -``` - -### Constants Have Unexpected Values - -Check if environment variables are set: -```bash -echo $FLASH_IMAGE_TAG -echo $FLASH_CPU_LB_IMAGE -``` - -Unset them if they're interfering: -```bash -unset FLASH_IMAGE_TAG FLASH_CPU_LB_IMAGE FLASH_LB_IMAGE -``` - -### Manifest Not Using Constants - -Verify imports in manifest.py: -```bash -grep "from runpod_flash.core.resources.constants import" src/runpod_flash/cli/commands/build_utils/manifest.py -``` - -## Summary - -✅ All hardcoded image names have been eliminated -✅ Constants are centralized with environment variable support -✅ All tests pass (856 passed, 68.74% coverage) -✅ Backward compatible (defaults unchanged) -✅ Ready for production deployment diff --git a/docs/Cross_Endpoint_Routing.md b/docs/Cross_Endpoint_Routing.md index 800f6101..b9318c9b 100644 --- a/docs/Cross_Endpoint_Routing.md +++ b/docs/Cross_Endpoint_Routing.md @@ -542,10 +542,13 @@ class StateManagerClient: """ async def get_persisted_manifest( - self, mothership_id: str + self, flash_environment_id: str ) -> Optional[Dict[str, Any]]: """Fetch persisted manifest from State Manager. + Args: + flash_environment_id: ID of the Flash environment. + Returns: Manifest dict or None if not found (first boot). 
@@ -556,7 +559,7 @@ class StateManagerClient: async def update_resource_state( self, - mothership_id: str, + flash_environment_id: str, resource_name: str, resource_data: Dict[str, Any], ) -> None: @@ -974,7 +977,7 @@ print(f"RUNPOD_ENDPOINT_ID: {os.getenv('RUNPOD_ENDPOINT_ID')}") # Check state manager client directly client = StateManagerClient() -manifest = await client.get_persisted_manifest(mothership_id) +manifest = await client.get_persisted_manifest(flash_environment_id) ``` ## Peer-to-Peer Architecture with StateManagerClient diff --git a/docs/Deployment_Architecture.md b/docs/Deployment_Architecture.md index c5ab7733..c483debf 100644 --- a/docs/Deployment_Architecture.md +++ b/docs/Deployment_Architecture.md @@ -288,4 +288,3 @@ Each reconciliation action updates State Manager: - `FLASH_RESOURCE_NAME` - Which resource config this endpoint represents - `RUNPOD_ENDPOINT_ID` - This endpoint's ID (set by Runpod) - `FLASH_MANIFEST_PATH` - Optional explicit path to manifest -- `FLASH_IS_MOTHERSHIP` - Legacy env var; not used in production `flash deploy` diff --git a/docs/Flash_Deploy_Guide.md b/docs/Flash_Deploy_Guide.md index 2a88898a..1cd39780 100644 --- a/docs/Flash_Deploy_Guide.md +++ b/docs/Flash_Deploy_Guide.md @@ -582,14 +582,14 @@ Resources are provisioned by the CLI during `flash deploy`, based on the manifes ### Deployment Orchestration -The **MothershipsProvisioner** reconciles the manifest with the endpoint's local state: +The reconciler reconciles the manifest with the endpoint's local state: ```python # 1. Load manifest from flash_manifest.json manifest = load_manifest() # 2. Fetch persisted state from State Manager -persisted = await StateManagerClient.get_persisted_manifest(mothership_id) +persisted = await StateManagerClient.get_persisted_manifest(flash_environment_id) # 3. Compute diff diff = compute_manifest_diff(manifest, persisted) @@ -610,7 +610,7 @@ for resource_config in diff.removed: delete_resource(resource_config) # 7. 
Persist new state -await StateManagerClient.update_resource_state(mothership_id, resources) +await StateManagerClient.update_resource_state(flash_environment_id, resources) ``` **Parallel Deployment**: @@ -622,8 +622,6 @@ await StateManagerClient.update_resource_state(mothership_id, resources) - If hashes differ: Resource has been modified, trigger update - Prevents unnecessary updates when resource unchanged -**Code Reference**: `src/runpod_flash/runtime/mothership_provisioner.py:1-150` - --- ## Remote Execution @@ -918,12 +916,6 @@ graph LR - Override default manifest file location - If not set, searches: cwd, module dir, parent dirs -**FLASH_IS_MOTHERSHIP** (Legacy) -- Value: `"true"` -- Triggers `reconcile_children()` on boot -- Not used in production `flash deploy` (CLI handles reconciliation) -- Useful for testing or non-standard layouts - ### Runtime Configuration **RUNPOD_ENDPOINT_ID** (Set by Runpod) @@ -1048,25 +1040,6 @@ flash build --preview **Code Reference**: `src/runpod_flash/cli/commands/preview.py` -### Local Docker Testing - -For testing complete deployment flow locally: - -```bash -# Build project -flash build - -# Start local endpoint simulator -docker run -it \ - -e FLASH_IS_MOTHERSHIP=true \ - -e RUNPOD_API_KEY=$RUNPOD_API_KEY \ - -v $(pwd)/.flash:/workspace/.flash \ - runpod-flash:latest - -# Run provisioner -python -m runpod_flash.runtime.mothership_provisioner -``` - ### Debugging Tips **Enable Debug Logging**: @@ -1105,7 +1078,6 @@ logging.getLogger("runpod_flash.runtime.service_registry").setLevel(logging.DEBU |------|---------| | `src/runpod_flash/cli/commands/deploy.py` | Deploy environment management commands | | `src/runpod_flash/cli/commands/build.py` | Build packaging and archive creation | -| `src/runpod_flash/cli/commands/test_mothership.py` | Local endpoint testing | ### Build System @@ -1128,7 +1100,6 @@ logging.getLogger("runpod_flash.runtime.service_registry").setLevel(logging.DEBU |------|---------| | 
`src/runpod_flash/runtime/manifest_fetcher.py` | Manifest loading from local .flash/ directory | | `src/runpod_flash/runtime/state_manager_client.py` | GraphQL client for peer-to-peer service discovery | -| `src/runpod_flash/runtime/mothership_provisioner.py` | Auto-provisioning logic | ### Runtime: Execution From 80da273983e84b616a3fd9ac7c3a4a2f29f0a798 Mon Sep 17 00:00:00 2001 From: zeke <40004347+KAJdev@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:19:22 -0800 Subject: [PATCH 21/26] refactor: replace flash run codegen with programmatic flash dev server --- src/runpod_flash/cli/commands/_dev_server.py | 214 ++++ .../cli/commands/_run_server_helpers.py | 2 +- src/runpod_flash/cli/commands/init.py | 2 +- src/runpod_flash/cli/commands/run.py | 351 +----- .../cli/commands/test_mothership.py | 4 +- src/runpod_flash/cli/main.py | 2 +- src/runpod_flash/core/discovery.py | 2 +- .../core/resources/live_serverless.py | 8 +- src/runpod_flash/core/resources/serverless.py | 2 +- tests/conftest.py | 2 +- tests/unit/cli/test_run.py | 1002 +++++------------ tests/unit/resources/test_serverless.py | 4 +- 12 files changed, 557 insertions(+), 1038 deletions(-) create mode 100644 src/runpod_flash/cli/commands/_dev_server.py diff --git a/src/runpod_flash/cli/commands/_dev_server.py b/src/runpod_flash/cli/commands/_dev_server.py new file mode 100644 index 00000000..6378f3fd --- /dev/null +++ b/src/runpod_flash/cli/commands/_dev_server.py @@ -0,0 +1,214 @@ +"""Programmatic FastAPI dev server for flash dev. + +Instead of generating a server.py file (codegen), this module builds the +FastAPI app in-memory. User modules are imported directly, so tracebacks +point to the original source files. 
+""" + +import importlib +import os +import sys +import uuid +from pathlib import Path +from typing import TYPE_CHECKING, Callable, List, Optional + +from fastapi import FastAPI, Request + +if TYPE_CHECKING: + from .run import WorkerInfo + + +def create_app( + project_root: Optional[Path] = None, + workers: Optional[List["WorkerInfo"]] = None, +) -> FastAPI: + """Factory function for the Flash dev server. + + When called by uvicorn via ``--factory``, both parameters are None and + the function reads ``FLASH_PROJECT_ROOT`` from the environment and + scans for workers itself. Tests can pass both directly. + """ + if project_root is None: + project_root = Path(os.environ.get("FLASH_PROJECT_ROOT", os.getcwd())) + + root_str = str(project_root) + if root_str not in sys.path: + sys.path.insert(0, root_str) + + if workers is None: + from .run import _scan_project_workers + + workers = _scan_project_workers(project_root) + + app = FastAPI( + title="Flash Dev Server", + description="Built by `flash dev`. Visit /docs for interactive testing.", + ) + + for worker in workers: + _register_worker_routes(app, worker, project_root) + + @app.get("/", tags=["health"]) + def home(): + return {"message": "Flash Dev Server", "docs": "/docs"} + + @app.get("/ping", tags=["health"]) + def ping(): + return {"status": "healthy"} + + return app + + +def _import_from_module(module_path: str, name: str, project_root: Path): + """Import *name* from *module_path*, handling numeric-prefix directories. + + When a dotted module path contains segments starting with a digit + (e.g. ``01_hello.gpu_worker``), Python's ``from`` syntax cannot be + used. This function uses ``importlib.import_module`` and temporarily + scopes ``sys.path`` so that sibling imports within the target module + resolve to the correct subdirectory. 
+ """ + has_numeric = any( + seg and seg[0].isdigit() for seg in module_path.split(".") + ) + + if has_numeric: + parts = module_path.rsplit(".", 1) + if len(parts) > 1: + subdir = str(project_root / parts[0].replace(".", os.sep)) + sys.path.insert(0, subdir) + try: + mod = importlib.import_module(module_path) + finally: + try: + sys.path.remove(subdir) + except ValueError: + pass + else: + mod = importlib.import_module(module_path) + else: + mod = importlib.import_module(module_path) + + return getattr(mod, name) + + +def _register_worker_routes( + app: FastAPI, worker: "WorkerInfo", project_root: Path +) -> None: + """Register FastAPI routes for a single discovered worker.""" + tag = f"{worker.url_prefix.lstrip('/')} [{worker.worker_type}]" + + if worker.worker_type == "QB": + _register_qb_routes(app, worker, project_root, tag) + elif worker.worker_type == "LB": + _register_lb_routes(app, worker, project_root, tag) + + +def _register_qb_routes( + app: FastAPI, worker: "WorkerInfo", project_root: Path, tag: str +) -> None: + """Register queue-based (QB) routes. + + Single-function workers get one ``/run_sync`` endpoint. + Multi-function workers get ``//run_sync`` for each function. 
+ """ + if len(worker.functions) == 1: + fn_name = worker.functions[0] + fn = _import_from_module(worker.module_path, fn_name, project_root) + path = f"{worker.url_prefix}/run_sync" + + async def qb_handler(body: dict, _fn=fn): + result = await _fn(body.get("input", body)) + return { + "id": str(uuid.uuid4()), + "status": "COMPLETED", + "output": result, + } + + qb_handler.__name__ = f"{worker.resource_name}_run_sync" + app.add_api_route(path, qb_handler, methods=["POST"], tags=[tag]) + else: + for fn_name in worker.functions: + fn = _import_from_module(worker.module_path, fn_name, project_root) + path = f"{worker.url_prefix}/{fn_name}/run_sync" + + async def qb_handler(body: dict, _fn=fn): + result = await _fn(body.get("input", body)) + return { + "id": str(uuid.uuid4()), + "status": "COMPLETED", + "output": result, + } + + qb_handler.__name__ = f"{worker.resource_name}_{fn_name}_run_sync" + app.add_api_route(path, qb_handler, methods=["POST"], tags=[tag]) + + +def _register_lb_routes( + app: FastAPI, + worker: "WorkerInfo", + project_root: Path, + tag: str, + executor: Optional[Callable] = None, +) -> None: + """Register load-balanced (LB) routes. + + Each LB route is dispatched through *executor* (defaults to + ``lb_execute`` from ``_run_server_helpers``). Tests can pass a + substitute to avoid hitting real infrastructure. 
+ """ + if executor is None: + from ._run_server_helpers import lb_execute + + executor = lb_execute + + # import config variables (deduplicated) + config_vars: dict = {} + for route in worker.lb_routes: + var_name = route.get("config_variable") + if var_name and var_name not in config_vars: + config_vars[var_name] = _import_from_module( + worker.module_path, var_name, project_root + ) + + for route in worker.lb_routes: + method = route["method"] + sub_path = route["path"].lstrip("/") + fn_name = route["fn_name"] + config_var_name = route["config_variable"] + full_path = f"{worker.url_prefix}/{sub_path}" + + fn = _import_from_module(worker.module_path, fn_name, project_root) + config = config_vars.get(config_var_name) + + has_body = method.upper() in ("POST", "PUT", "PATCH", "DELETE") + if has_body: + + async def lb_body_handler( + body: dict, _config=config, _fn=fn, _exec=executor + ): + return await _exec(_config, _fn, body) + + lb_body_handler.__name__ = f"_route_{worker.resource_name}_{fn_name}" + app.add_api_route( + full_path, + lb_body_handler, + methods=[method.upper()], + tags=[tag], + ) + else: + + async def lb_query_handler( + request: Request, _config=config, _fn=fn, _exec=executor + ): + return await _exec( + _config, _fn, dict(request.query_params) + ) + + lb_query_handler.__name__ = f"_route_{worker.resource_name}_{fn_name}" + app.add_api_route( + full_path, + lb_query_handler, + methods=[method.upper()], + tags=[tag], + ) diff --git a/src/runpod_flash/cli/commands/_run_server_helpers.py b/src/runpod_flash/cli/commands/_run_server_helpers.py index 70391bbd..75827561 100644 --- a/src/runpod_flash/cli/commands/_run_server_helpers.py +++ b/src/runpod_flash/cli/commands/_run_server_helpers.py @@ -1,4 +1,4 @@ -"""Helpers for the flash run dev server — loaded inside the generated server.py.""" +"""Helpers for the flash dev server.""" import inspect diff --git a/src/runpod_flash/cli/commands/init.py b/src/runpod_flash/cli/commands/init.py index 
eabd7583..b2db9b4e 100644 --- a/src/runpod_flash/cli/commands/init.py +++ b/src/runpod_flash/cli/commands/init.py @@ -109,7 +109,7 @@ def init_command( step_num += 1 steps_table.add_row(f"{step_num}.", "Add your RUNPOD_API_KEY to .env") step_num += 1 - steps_table.add_row(f"{step_num}.", "flash run") + steps_table.add_row(f"{step_num}.", "flash dev") console.print(steps_table) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index da96af7e..37436824 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -1,11 +1,10 @@ -"""Run Flash development server.""" +"""Flash dev server command.""" import logging import os import signal import subprocess import sys -import threading from dataclasses import dataclass, field from pathlib import Path from typing import List @@ -14,22 +13,6 @@ from rich.console import Console from rich.table import Table -try: - from watchfiles import DefaultFilter as _WatchfilesDefaultFilter - from watchfiles import watch as _watchfiles_watch -except ModuleNotFoundError: - - def _watchfiles_watch(*_a, **_kw): # type: ignore[misc] - raise ModuleNotFoundError( - "watchfiles is required for flash run --reload. " - "Install it with: pip install watchfiles" - ) - - class _WatchfilesDefaultFilter: # type: ignore[no-redef] - def __init__(self, **_kw): - pass - - from .build_utils.scanner import ( RemoteDecoratorScanner, file_to_module_path, @@ -40,7 +23,7 @@ def __init__(self, **_kw): logger = logging.getLogger(__name__) console = Console() -# Resource state file written by ResourceManager in the uvicorn subprocess. 
+# resource state file written by ResourceManager in the uvicorn subprocess _RESOURCE_STATE_FILE = Path(".runpod") / "resources.pkl" @@ -76,7 +59,7 @@ def _scan_project_workers(project_root: Path) -> List[WorkerInfo]: scanner = RemoteDecoratorScanner(project_root) remote_functions = scanner.discover_remote_functions() - # Group by file path + # group by file path by_file: dict[Path, List] = {} for func in remote_functions: by_file.setdefault(func.file_path, []).append(func) @@ -127,249 +110,6 @@ def _scan_project_workers(project_root: Path) -> List[WorkerInfo]: return workers -def _ensure_gitignore(project_root: Path) -> None: - """Add .flash/ to .gitignore if not already present.""" - gitignore = project_root / ".gitignore" - entry = ".flash/" - - if gitignore.exists(): - content = gitignore.read_text(encoding="utf-8") - if entry in content: - return - # Append with a newline - if not content.endswith("\n"): - content += "\n" - gitignore.write_text(content + entry + "\n", encoding="utf-8") - else: - gitignore.write_text(entry + "\n", encoding="utf-8") - - -def _sanitize_fn_name(name: str) -> str: - """Sanitize a string for use as a Python function name. - - Replaces non-identifier characters with underscores and prepends '_' - if the result starts with a digit (Python identifiers cannot start - with digits). - """ - result = name.replace("/", "_").replace(".", "_").replace("-", "_") - if result and result[0].isdigit(): - result = "_" + result - return result - - -def _has_numeric_module_segments(module_path: str) -> bool: - """Check if any segment in a dotted module path starts with a digit. - - Python identifiers cannot start with digits, so ``from 01_foo import bar`` - is a SyntaxError. Callers should use ``importlib.import_module()`` instead. 
- """ - return any(seg and seg[0].isdigit() for seg in module_path.split(".")) - - -def _module_parent_subdir(module_path: str) -> str | None: - """Return the parent sub-directory for a dotted module path, or None for top-level. - - Example: ``01_getting_started.03_mixed.pipeline`` → ``01_getting_started/03_mixed`` - """ - parts = module_path.rsplit(".", 1) - if len(parts) == 1: - return None - return parts[0].replace(".", "/") - - -def _make_import_line(module_path: str, name: str) -> str: - """Build an import statement for *name* from *module_path*. - - Uses a regular ``from … import …`` when the module path is a valid - Python identifier chain. Falls back to ``_flash_import()`` (a generated - helper in server.py) when any segment starts with a digit. The helper - temporarily scopes ``sys.path`` so sibling imports in the target module - resolve to the correct directory. - """ - if _has_numeric_module_segments(module_path): - subdir = _module_parent_subdir(module_path) - if subdir: - return f'{name} = _flash_import("{module_path}", "{name}", "{subdir}")' - return f'{name} = _flash_import("{module_path}", "{name}")' - return f"from {module_path} import {name}" - - -def _generate_flash_server(project_root: Path, workers: List[WorkerInfo]) -> Path: - """Generate .flash/server.py from the discovered workers. - - Args: - project_root: Root of the Flash project - workers: List of discovered worker infos - - Returns: - Path to the generated server.py - """ - flash_dir = project_root / ".flash" - flash_dir.mkdir(exist_ok=True) - - _ensure_gitignore(project_root) - - has_lb_workers = any(w.worker_type == "LB" for w in workers) - - lines = [ - '"""Auto-generated Flash dev server. Do not edit — regenerated on each flash run."""', - "import sys", - "import uuid", - "from pathlib import Path", - "_project_root = Path(__file__).parent.parent", - "sys.path.insert(0, str(_project_root))", - "", - ] - - # When modules live in directories with numeric prefixes (e.g. 
01_hello/), - # we cannot use ``from … import …`` — Python identifiers cannot start with - # digits. Instead we emit a small ``_flash_import`` helper that uses - # ``importlib.import_module()`` *and* temporarily scopes ``sys.path`` so - # that sibling imports inside the loaded module (e.g. ``from cpu_worker - # import …``) resolve to the correct directory rather than a same-named - # file from a different example subdirectory. - needs_importlib = any(_has_numeric_module_segments(w.module_path) for w in workers) - - if needs_importlib: - lines += [ - "import importlib as _importlib", - "", - "", - "def _flash_import(module_path, name, subdir=None):", - ' """Import *name* from *module_path* with scoped sys.path for sibling imports."""', - " _path = str(_project_root / subdir) if subdir else None", - " if _path:", - " sys.path.insert(0, _path)", - " try:", - " return getattr(_importlib.import_module(module_path), name)", - " finally:", - " if _path:", - " try:", - " sys.path.remove(_path)", - " except ValueError:", - " pass", - "", - ] - - if has_lb_workers: - lines += [ - "from fastapi import FastAPI, Request", - "from runpod_flash.cli.commands._run_server_helpers import lb_execute as _lb_execute", - "", - ] - else: - lines += [ - "from fastapi import FastAPI", - "", - ] - - # Collect imports — QB functions are called directly, LB config variables and - # functions are passed to lb_execute for dispatch via LoadBalancerSlsStub. - all_imports: List[str] = [] - for worker in workers: - if worker.worker_type == "QB": - for fn_name in worker.functions: - all_imports.append(_make_import_line(worker.module_path, fn_name)) - elif worker.worker_type == "LB": - # Import the resource config variable (e.g. 
"api" from api = LiveLoadBalancer(...)) - config_vars = { - r["config_variable"] - for r in worker.lb_routes - if r.get("config_variable") - } - for var in sorted(config_vars): - all_imports.append(_make_import_line(worker.module_path, var)) - for fn_name in worker.functions: - all_imports.append(_make_import_line(worker.module_path, fn_name)) - - if all_imports: - lines.extend(all_imports) - lines.append("") - - lines += [ - "app = FastAPI(", - ' title="Flash Dev Server",', - ' description="Auto-generated by `flash run`. Visit /docs for interactive testing.",', - ")", - "", - ] - - for worker in workers: - tag = f"{worker.url_prefix.lstrip('/')} [{worker.worker_type}]" - lines.append(f"# {'─' * 60}") - lines.append(f"# {worker.worker_type}: {worker.file_path.name}") - lines.append(f"# {'─' * 60}") - - if worker.worker_type == "QB": - if len(worker.functions) == 1: - fn = worker.functions[0] - handler_name = _sanitize_fn_name(f"{worker.resource_name}_run_sync") - sync_path = f"{worker.url_prefix}/run_sync" - lines += [ - f'@app.post("{sync_path}", tags=["{tag}"])', - f"async def {handler_name}(body: dict):", - f' result = await {fn}(body.get("input", body))', - ' return {"id": str(uuid.uuid4()), "status": "COMPLETED", "output": result}', - "", - ] - else: - for fn in worker.functions: - handler_name = _sanitize_fn_name( - f"{worker.resource_name}_{fn}_run_sync" - ) - sync_path = f"{worker.url_prefix}/{fn}/run_sync" - lines += [ - f'@app.post("{sync_path}", tags=["{tag}"])', - f"async def {handler_name}(body: dict):", - f' result = await {fn}(body.get("input", body))', - ' return {"id": str(uuid.uuid4()), "status": "COMPLETED", "output": result}', - "", - ] - - elif worker.worker_type == "LB": - for route in worker.lb_routes: - method = route["method"].lower() - sub_path = route["path"].lstrip("/") - fn_name = route["fn_name"] - config_var = route["config_variable"] - full_path = f"{worker.url_prefix}/{sub_path}" - handler_name = _sanitize_fn_name( - 
f"_route_{worker.resource_name}_{fn_name}" - ) - has_body = method in ("post", "put", "patch", "delete") - if has_body: - lines += [ - f'@app.{method}("{full_path}", tags=["{tag}"])', - f"async def {handler_name}(body: dict):", - f" return await _lb_execute({config_var}, {fn_name}, body)", - "", - ] - else: - lines += [ - f'@app.{method}("{full_path}", tags=["{tag}"])', - f"async def {handler_name}(request: Request):", - f" return await _lb_execute({config_var}, {fn_name}, dict(request.query_params))", - "", - ] - - # Health endpoints - lines += [ - "# Health", - '@app.get("/", tags=["health"])', - "def home():", - ' return {"message": "Flash Dev Server", "docs": "/docs"}', - "", - '@app.get("/ping", tags=["health"])', - "def ping():", - ' return {"status": "healthy"}', - "", - ] - - server_path = flash_dir / "server.py" - server_path.write_text("\n".join(lines), encoding="utf-8") - return server_path - - def _print_startup_table(workers: List[WorkerInfo], host: str, port: int) -> None: """Print the startup table showing local paths, resource names, and types.""" console.print(f"\n[bold green]Flash Dev Server[/bold green] http://{host}:{port}") @@ -473,7 +213,7 @@ async def _do_cleanup(): f"resource(s) undeployed in {elapsed:.1f}s" ) - # Remove live- entries from persisted state so they don't linger. + # remove live- entries from persisted state so they don't linger remaining = {k: v for k, v in resources.items() if k not in live_items} remaining_configs = {k: v for k, v in configs.items() if k not in live_items} try: @@ -492,36 +232,6 @@ def _is_reload() -> bool: return "UVICORN_RELOADER_PID" in os.environ -def _watch_and_regenerate(project_root: Path, stop_event: threading.Event) -> None: - """Watch project .py files and regenerate server.py when they change. - - Ignores .flash/ to avoid reacting to our own writes. Runs until - stop_event is set. - """ - # Suppress watchfiles' internal debug chatter (filter hits, rust timeouts). 
- logging.getLogger("watchfiles").setLevel(logging.WARNING) - - watch_filter = _WatchfilesDefaultFilter(ignore_paths=[str(project_root / ".flash")]) - - try: - for changes in _watchfiles_watch( - project_root, - watch_filter=watch_filter, - stop_event=stop_event, - ): - py_changed = [p for _, p in changes if p.endswith(".py")] - if not py_changed: - continue - try: - workers = _scan_project_workers(project_root) - _generate_flash_server(project_root, workers) - logger.debug("server.py regenerated (%d changed)", len(py_changed)) - except Exception as e: - logger.warning("Failed to regenerate server.py: %s", e) - except Exception: - pass # stop_event was set or watchfiles unavailable — both are fine - - def _discover_resources(project_root: Path): """Discover deployable resources in project files. @@ -545,8 +255,7 @@ def _discover_resources(project_root: Path): ) ) - # Add project root to sys.path so cross-module imports resolve - # (e.g. api/routes.py doing "from longruns.stage1 import stage1_process"). + # add project root to sys.path so cross-module imports resolve root_str = str(project_root) added_to_path = root_str not in sys.path if added_to_path: @@ -623,21 +332,22 @@ def run_command( help="Auto-provision all endpoints on startup (eliminates cold-start on first request)", ), ): - """Run Flash development server. + """Start Flash development server. - Scans the project for @remote decorated functions, generates a dev server - at .flash/server.py, and starts uvicorn with hot-reload. + Scans the project for @remote decorated functions and starts a FastAPI + dev server via uvicorn. The app is built programmatically (no codegen) + so tracebacks point directly to your source files. No main.py or FastAPI boilerplate required. Any .py file with @remote decorated functions is a valid Flash project. 
""" project_root = Path.cwd() - # Set flag for live provisioning so stubs get the live- prefix + # set flag for live provisioning so stubs get the live- prefix if not _is_reload(): os.environ["FLASH_IS_LIVE_PROVISIONING"] = "true" - # Auto-provision all endpoints upfront (eliminates cold-start) + # auto-provision all endpoints upfront (eliminates cold-start) if auto_provision and not _is_reload(): try: resources = _discover_resources(project_root) @@ -650,14 +360,14 @@ def run_command( "[dim]Resources will be provisioned on-demand at first request.[/dim]" ) - # Discover @remote functions + # discover @remote functions for the startup table workers = _scan_project_workers(project_root) if not workers: console.print("[red]Error:[/red] No @remote functions found.") console.print("Add @remote decorators to your functions to get started.") - console.print("\nExample:") console.print( + "\nExample:\n" " from runpod_flash import LiveServerless, remote\n" " gpu_config = LiveServerless(name='my_worker')\n" "\n" @@ -667,19 +377,17 @@ def run_command( ) raise typer.Exit(1) - # Generate .flash/server.py - _generate_flash_server(project_root, workers) - _print_startup_table(workers, host, port) - # Build uvicorn command using --app-dir so server:app is importable + # tell the factory function where the project lives + os.environ["FLASH_PROJECT_ROOT"] = str(project_root) + cmd = [ sys.executable, "-m", "uvicorn", - "server:app", - "--app-dir", - ".flash", + "--factory", + "runpod_flash.cli.commands._dev_server:create_app", "--host", host, "--port", @@ -692,19 +400,9 @@ def run_command( cmd += [ "--reload", "--reload-dir", - ".flash", - "--reload-include", - "server.py", + str(project_root), ] - stop_event = threading.Event() - watcher_thread = threading.Thread( - target=_watch_and_regenerate, - args=(project_root, stop_event), - daemon=True, - name="flash-watcher", - ) - process = None try: if sys.platform == "win32": @@ -714,18 +412,11 @@ def run_command( else: process = 
subprocess.Popen(cmd, preexec_fn=os.setsid) - if reload: - watcher_thread.start() - process.wait() except KeyboardInterrupt: console.print("\n[yellow]Stopping server and cleaning up...[/yellow]") - stop_event.set() - if watcher_thread.is_alive(): - watcher_thread.join(timeout=2) - if process: try: if sys.platform == "win32": @@ -752,10 +443,6 @@ def run_command( except Exception as e: console.print(f"[red]Error:[/red] {e}") - stop_event.set() - if watcher_thread.is_alive(): - watcher_thread.join(timeout=2) - if process: try: if sys.platform == "win32": diff --git a/src/runpod_flash/cli/commands/test_mothership.py b/src/runpod_flash/cli/commands/test_mothership.py index 4e7b66bd..246401e5 100644 --- a/src/runpod_flash/cli/commands/test_mothership.py +++ b/src/runpod_flash/cli/commands/test_mothership.py @@ -278,7 +278,7 @@ def _create_entrypoint_script(build_dir: str) -> None: """Create entrypoint.sh script for Docker container. This script handles signal trapping and cleanup on shutdown. - It runs manifest-based provisioning then flash run (without --auto-provision + It runs manifest-based provisioning then flash dev (without --auto-provision to avoid duplicate discovery from bundled dependencies). 
""" build_path = Path(build_dir) @@ -369,7 +369,7 @@ def _display_test_objectives() -> None: def _display_config(build_dir: str, image: str, port: int, endpoint_id: str) -> None: """Display test configuration.""" config_text = f"""[bold]Build directory:[/bold] {build_dir} -[bold]Command:[/bold] flash run +[bold]Command:[/bold] flash dev [bold]Docker image:[/bold] {image} [bold]Endpoint ID:[/bold] {endpoint_id} [bold]Port:[/bold] http://localhost:{port}""" diff --git a/src/runpod_flash/cli/main.py b/src/runpod_flash/cli/main.py index 405a8f61..a5d9f46b 100644 --- a/src/runpod_flash/cli/main.py +++ b/src/runpod_flash/cli/main.py @@ -36,7 +36,7 @@ def get_version() -> str: # command: flash app.command("init")(init.init_command) -app.command("run")(run.run_command) +app.command("dev")(run.run_command) app.command("build")(build.build_command) app.command("deploy")(deploy.deploy_command) # app.command("report")(resource.report_command) diff --git a/src/runpod_flash/core/discovery.py b/src/runpod_flash/core/discovery.py index 06c5d57e..88fa0b86 100644 --- a/src/runpod_flash/core/discovery.py +++ b/src/runpod_flash/core/discovery.py @@ -1,4 +1,4 @@ -"""Resource discovery for auto-provisioning during flash run startup.""" +"""Resource discovery for auto-provisioning during flash dev startup.""" import ast import importlib.util diff --git a/src/runpod_flash/core/resources/live_serverless.py b/src/runpod_flash/core/resources/live_serverless.py index 8ae0b3a5..228e4673 100644 --- a/src/runpod_flash/core/resources/live_serverless.py +++ b/src/runpod_flash/core/resources/live_serverless.py @@ -74,7 +74,7 @@ class LiveLoadBalancer(LiveServerlessMixin, LoadBalancerSlsResource): Features: - Locks to Flash LB image (flash-lb) - Direct HTTP execution (not queue-based) - - Local development with flash run + - Local development with flash dev - Same @remote decorator pattern as LoadBalancerSlsResource Usage: @@ -92,7 +92,7 @@ async def process_data(x: int, y: int): Local Development 
Flow: 1. Create LiveLoadBalancer with routing 2. Decorate functions with @remote(lb_resource, method=..., path=...) - 3. Run with `flash run` to start local endpoint + 3. Run with `flash dev` to start local endpoint 4. Call functions directly in tests or scripts 5. Deploy to production with `flash build` and `flash deploy` @@ -124,7 +124,7 @@ class CpuLiveLoadBalancer(LiveServerlessMixin, CpuLoadBalancerSlsResource): - Locks to CPU Flash LB image (flash-lb-cpu) - CPU instance support with automatic disk sizing - Direct HTTP execution (not queue-based) - - Local development with flash run + - Local development with flash dev - Same @remote decorator pattern as CpuLoadBalancerSlsResource Usage: @@ -142,7 +142,7 @@ async def process_data(x: int, y: int): Local Development Flow: 1. Create CpuLiveLoadBalancer with routing 2. Decorate functions with @remote(lb_resource, method=..., path=...) - 3. Run with `flash run` to start local endpoint + 3. Run with `flash dev` to start local endpoint 4. Call functions directly in tests or scripts 5. Deploy to production with `flash build` and `flash deploy` """ diff --git a/src/runpod_flash/core/resources/serverless.py b/src/runpod_flash/core/resources/serverless.py index 27d811a4..bdacfcef 100644 --- a/src/runpod_flash/core/resources/serverless.py +++ b/src/runpod_flash/core/resources/serverless.py @@ -474,7 +474,7 @@ def is_deployed(self) -> bool: if not self.id: return False - # During flash run, skip the health check. Newly-created endpoints + # During flash dev, skip the health check. Newly-created endpoints # can fail health checks due to RunPod propagation delay — the # endpoint exists but the health API hasn't registered it yet. 
# Trusting the cached ID is correct here; actual failures surface diff --git a/tests/conftest.py b/tests/conftest.py index 7641d521..85766c5d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -283,7 +283,7 @@ def isolate_resource_state_file( def clear_live_provisioning_env(monkeypatch: pytest.MonkeyPatch): """Clear FLASH_IS_LIVE_PROVISIONING env var between tests. - This fixture ensures that the flag set by `flash run` command + This fixture ensures that the flag set by `flash dev` command doesn't leak into unit tests. It's autouse so it runs for all tests. Args: diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index 6014e9a3..2b38d6dd 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -1,30 +1,33 @@ -"""Unit tests for run CLI command.""" +"""Unit tests for run CLI command and programmatic dev server.""" -import pytest +import os +import sys from pathlib import Path -from unittest.mock import patch, MagicMock +from unittest.mock import MagicMock, patch + +import pytest +from fastapi import FastAPI +from fastapi.testclient import TestClient from typer.testing import CliRunner -from runpod_flash.cli.main import app -from runpod_flash.cli.commands.run import ( - WorkerInfo, - _generate_flash_server, - _has_numeric_module_segments, - _make_import_line, - _module_parent_subdir, - _sanitize_fn_name, +from runpod_flash.cli.commands._dev_server import ( + _import_from_module, + _register_lb_routes, + _register_qb_routes, + create_app, ) +from runpod_flash.cli.commands.run import WorkerInfo +from runpod_flash.cli.main import app @pytest.fixture def runner(): - """Create CLI test runner.""" return CliRunner() @pytest.fixture -def temp_fastapi_app(tmp_path): - """Create minimal Flash project with @remote function for testing.""" +def temp_project(tmp_path): + """Create a minimal Flash project with a @remote function.""" worker_file = tmp_path / "worker.py" worker_file.write_text( "from runpod_flash import LiveServerless, 
remote\n" @@ -36,768 +39,383 @@ def temp_fastapi_app(tmp_path): return tmp_path -class TestRunCommandEnvironmentVariables: - """Test flash run command environment variable support.""" - - @pytest.fixture(autouse=True) - def patch_watcher(self): - """Prevent the background watcher thread from blocking tests.""" - with patch("runpod_flash.cli.commands.run._watch_and_regenerate"): - yield - - def test_port_from_environment_variable( - self, runner, temp_fastapi_app, monkeypatch - ): - """Test that FLASH_PORT environment variable is respected.""" - monkeypatch.chdir(temp_fastapi_app) - monkeypatch.setenv("FLASH_PORT", "8080") - - # Mock subprocess to capture command and prevent actual server start - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level process group operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - with patch("runpod_flash.cli.commands.run.os.killpg"): - runner.invoke(app, ["run"]) - - # Verify port 8080 was used in uvicorn command - call_args = mock_popen.call_args[0][0] - assert "--port" in call_args - port_index = call_args.index("--port") - assert call_args[port_index + 1] == "8080" - - def test_host_from_environment_variable( - self, runner, temp_fastapi_app, monkeypatch - ): - """Test that FLASH_HOST environment variable is respected.""" - monkeypatch.chdir(temp_fastapi_app) - monkeypatch.setenv("FLASH_HOST", "0.0.0.0") - - # Mock subprocess to capture command - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as 
mock_getpgid: - mock_getpgid.return_value = 12345 - with patch("runpod_flash.cli.commands.run.os.killpg"): - runner.invoke(app, ["run"]) - - # Verify host 0.0.0.0 was used - call_args = mock_popen.call_args[0][0] - assert "--host" in call_args - host_index = call_args.index("--host") - assert call_args[host_index + 1] == "0.0.0.0" - - def test_cli_flag_overrides_environment_variable( - self, runner, temp_fastapi_app, monkeypatch - ): - """Test that --port flag overrides FLASH_PORT environment variable.""" - monkeypatch.chdir(temp_fastapi_app) - monkeypatch.setenv("FLASH_PORT", "8080") - - # Mock subprocess to capture command - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - with patch("runpod_flash.cli.commands.run.os.killpg"): - # Use --port flag to override env var - runner.invoke(app, ["run", "--port", "9000"]) - - # Verify port 9000 was used (flag overrides env) - call_args = mock_popen.call_args[0][0] - assert "--port" in call_args - port_index = call_args.index("--port") - assert call_args[port_index + 1] == "9000" - - def test_default_port_when_no_env_or_flag( - self, runner, temp_fastapi_app, monkeypatch - ): - """Test that default port 8888 is used when no env var or flag.""" - monkeypatch.chdir(temp_fastapi_app) - # Ensure FLASH_PORT is not set - monkeypatch.delenv("FLASH_PORT", raising=False) - - # Mock subprocess to capture command - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level operations - with 
patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - with patch("runpod_flash.cli.commands.run.os.killpg"): - runner.invoke(app, ["run"]) - - # Verify default port 8888 was used - call_args = mock_popen.call_args[0][0] - assert "--port" in call_args - port_index = call_args.index("--port") - assert call_args[port_index + 1] == "8888" - - def test_default_host_when_no_env_or_flag( - self, runner, temp_fastapi_app, monkeypatch - ): - """Test that default host localhost is used when no env var or flag.""" - monkeypatch.chdir(temp_fastapi_app) - # Ensure FLASH_HOST is not set - monkeypatch.delenv("FLASH_HOST", raising=False) - - # Mock subprocess to capture command - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - with patch("runpod_flash.cli.commands.run.os.killpg"): - runner.invoke(app, ["run"]) - - # Verify default host localhost was used - call_args = mock_popen.call_args[0][0] - assert "--host" in call_args - host_index = call_args.index("--host") - assert call_args[host_index + 1] == "localhost" - - def test_both_host_and_port_from_environment( - self, runner, temp_fastapi_app, monkeypatch - ): - """Test that both FLASH_HOST and FLASH_PORT environment variables work together.""" - monkeypatch.chdir(temp_fastapi_app) - monkeypatch.setenv("FLASH_HOST", "0.0.0.0") - monkeypatch.setenv("FLASH_PORT", "3000") - - # Mock subprocess to capture command - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level 
operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - with patch("runpod_flash.cli.commands.run.os.killpg"): - runner.invoke(app, ["run"]) - - # Verify both host and port were used - call_args = mock_popen.call_args[0][0] - - assert "--host" in call_args - host_index = call_args.index("--host") - assert call_args[host_index + 1] == "0.0.0.0" - - assert "--port" in call_args - port_index = call_args.index("--port") - assert call_args[port_index + 1] == "3000" - - def test_short_port_flag_overrides_environment( - self, runner, temp_fastapi_app, monkeypatch - ): - """Test that -p short flag also overrides FLASH_PORT environment variable.""" - monkeypatch.chdir(temp_fastapi_app) - monkeypatch.setenv("FLASH_PORT", "8080") - - # Mock subprocess to capture command - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - # Mock OS-level operations - with patch("runpod_flash.cli.commands.run.os.getpgid") as mock_getpgid: - mock_getpgid.return_value = 12345 - with patch("runpod_flash.cli.commands.run.os.killpg"): - # Use -p short flag - runner.invoke(app, ["run", "-p", "7000"]) - - # Verify port 7000 was used (short flag overrides env) - call_args = mock_popen.call_args[0][0] - assert "--port" in call_args - port_index = call_args.index("--port") - assert call_args[port_index + 1] == "7000" - - -class TestRunCommandHotReload: - """Test flash run hot-reload behavior.""" - - @pytest.fixture(autouse=True) - def patch_watcher(self): - """Prevent the background watcher thread from blocking tests.""" - with patch("runpod_flash.cli.commands.run._watch_and_regenerate"): - yield - - def _invoke_run(self, runner, monkeypatch, temp_fastapi_app, extra_args=None): - """Helper: invoke flash run and return the Popen call args.""" - 
monkeypatch.chdir(temp_fastapi_app) - monkeypatch.delenv("FLASH_PORT", raising=False) - monkeypatch.delenv("FLASH_HOST", raising=False) - - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): - with patch("runpod_flash.cli.commands.run.os.killpg"): - runner.invoke(app, ["run"] + (extra_args or [])) - - return mock_popen.call_args[0][0] - - def test_reload_watches_flash_server_py( - self, runner, temp_fastapi_app, monkeypatch - ): - """Uvicorn watches .flash/server.py, not the whole project.""" - cmd = self._invoke_run(runner, monkeypatch, temp_fastapi_app) - - assert "--reload" in cmd - assert "--reload-dir" in cmd - reload_dir_index = cmd.index("--reload-dir") - assert cmd[reload_dir_index + 1] == ".flash" - - assert "--reload-include" in cmd - reload_include_index = cmd.index("--reload-include") - assert cmd[reload_include_index + 1] == "server.py" - - def test_reload_does_not_watch_project_root( - self, runner, temp_fastapi_app, monkeypatch - ): - """Uvicorn reload-dir must not be '.' to prevent double-reload.""" - cmd = self._invoke_run(runner, monkeypatch, temp_fastapi_app) - - reload_dir_index = cmd.index("--reload-dir") - assert cmd[reload_dir_index + 1] != "." 
- - def test_no_reload_skips_watcher_thread( - self, runner, temp_fastapi_app, monkeypatch - ): - """--no-reload: neither uvicorn reload args nor watcher thread started.""" - monkeypatch.chdir(temp_fastapi_app) - - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): - with patch("runpod_flash.cli.commands.run.os.killpg"): - with patch( - "runpod_flash.cli.commands.run.threading.Thread" - ) as mock_thread_cls: - mock_thread = MagicMock() - mock_thread_cls.return_value = mock_thread - - runner.invoke(app, ["run", "--no-reload"]) - - cmd = mock_popen.call_args[0][0] - assert "--reload" not in cmd - mock_thread.start.assert_not_called() - - def test_watcher_thread_started_on_reload( - self, runner, temp_fastapi_app, monkeypatch, patch_watcher - ): - """When reload=True, the background watcher thread is started.""" - monkeypatch.chdir(temp_fastapi_app) - - with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): - with patch("runpod_flash.cli.commands.run.os.killpg"): - with patch( - "runpod_flash.cli.commands.run.threading.Thread" - ) as mock_thread_cls: - mock_thread = MagicMock() - mock_thread_cls.return_value = mock_thread - - runner.invoke(app, ["run"]) - - mock_thread.start.assert_called_once() - - def test_watcher_thread_stopped_on_keyboard_interrupt( - self, runner, temp_fastapi_app, monkeypatch - ): - """KeyboardInterrupt sets stop_event and joins the watcher thread.""" - monkeypatch.chdir(temp_fastapi_app) - - with 
patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: - mock_process = MagicMock() - mock_process.pid = 12345 - mock_process.wait.side_effect = KeyboardInterrupt() - mock_popen.return_value = mock_process - - with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): - with patch("runpod_flash.cli.commands.run.os.killpg"): - with patch( - "runpod_flash.cli.commands.run.threading.Thread" - ) as mock_thread_cls: - mock_thread = MagicMock() - mock_thread_cls.return_value = mock_thread - with patch( - "runpod_flash.cli.commands.run.threading.Event" - ) as mock_event_cls: - mock_stop = MagicMock() - mock_event_cls.return_value = mock_stop - - runner.invoke(app, ["run"]) - - mock_stop.set.assert_called_once() - mock_thread.join.assert_called_once_with(timeout=2) - - -class TestWatchAndRegenerate: - """Unit tests for the _watch_and_regenerate background function.""" - - def test_regenerates_server_py_on_py_file_change(self, tmp_path): - """When a .py file changes, server.py is regenerated.""" - import threading - from runpod_flash.cli.commands.run import _watch_and_regenerate - - stop = threading.Event() - - with patch( - "runpod_flash.cli.commands.run._scan_project_workers", return_value=[] - ) as mock_scan: - with patch( - "runpod_flash.cli.commands.run._generate_flash_server" - ) as mock_gen: - with patch( - "runpod_flash.cli.commands.run._watchfiles_watch" - ) as mock_watch: - # Yield one batch of changes then stop - mock_watch.return_value = iter([{(1, "/path/to/worker.py")}]) - stop.set() # ensures the loop exits after one iteration - _watch_and_regenerate(tmp_path, stop) - - mock_scan.assert_called_once_with(tmp_path) - mock_gen.assert_called_once() - - def test_ignores_non_py_changes(self, tmp_path): - """Changes to non-.py files do not trigger regeneration.""" - import threading - from runpod_flash.cli.commands.run import _watch_and_regenerate - - stop = threading.Event() - - with 
patch("runpod_flash.cli.commands.run._scan_project_workers") as mock_scan: - with patch( - "runpod_flash.cli.commands.run._generate_flash_server" - ) as mock_gen: - with patch( - "runpod_flash.cli.commands.run._watchfiles_watch" - ) as mock_watch: - mock_watch.return_value = iter([{(1, "/path/to/README.md")}]) - _watch_and_regenerate(tmp_path, stop) - - mock_scan.assert_not_called() - mock_gen.assert_not_called() - - def test_scan_error_does_not_crash_watcher(self, tmp_path): - """If regeneration raises, the watcher logs a warning and continues.""" - import threading - from runpod_flash.cli.commands.run import _watch_and_regenerate - - stop = threading.Event() - - with patch( - "runpod_flash.cli.commands.run._scan_project_workers", - side_effect=RuntimeError("scan failed"), - ): - with patch("runpod_flash.cli.commands.run._watchfiles_watch") as mock_watch: - mock_watch.return_value = iter([{(1, "/path/to/worker.py")}]) - # Should not raise - _watch_and_regenerate(tmp_path, stop) - - -class TestGenerateFlashServer: - """Test _generate_flash_server() route code generation.""" - - def _make_lb_worker(self, tmp_path: Path, method: str = "GET") -> WorkerInfo: - return WorkerInfo( - file_path=tmp_path / "api.py", - url_prefix="/api", - module_path="api", - resource_name="api", - worker_type="LB", - functions=["list_routes"], - lb_routes=[ - { - "method": method, - "path": "/routes/list", - "fn_name": "list_routes", - "config_variable": "api_config", - } - ], - ) - - def test_post_lb_route_generates_body_param(self, tmp_path): - """POST/PUT/PATCH/DELETE LB routes use body: dict for OpenAPI docs.""" - for method in ("POST", "PUT", "PATCH", "DELETE"): - worker = self._make_lb_worker(tmp_path, method) - content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "async def _route_api_list_routes(body: dict):" in content - assert "_lb_execute(api_config, list_routes, body)" in content - - def test_get_lb_route_uses_query_params(self, tmp_path): - """GET LB 
routes pass query params as a dict.""" - worker = self._make_lb_worker(tmp_path, "GET") - content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "async def _route_api_list_routes(request: Request):" in content - assert ( - "_lb_execute(api_config, list_routes, dict(request.query_params))" - in content - ) - - def test_lb_config_var_and_function_imported(self, tmp_path): - """LB config vars and functions are both imported for remote dispatch.""" - worker = self._make_lb_worker(tmp_path) - content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "from api import api_config" in content - assert "from api import list_routes" in content - - def test_lb_execute_import_present_when_lb_routes_exist(self, tmp_path): - """server.py imports _lb_execute when there are LB workers.""" - worker = self._make_lb_worker(tmp_path) - content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "_lb_execute" in content - assert "lb_execute" in content - - def test_qb_function_still_imported_directly(self, tmp_path): - """QB workers still import and call functions directly.""" - worker = WorkerInfo( - file_path=tmp_path / "worker.py", - url_prefix="/worker", - module_path="worker", - resource_name="worker", - worker_type="QB", - functions=["process"], - ) - content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "from worker import process" in content - assert "await process(" in content +def _run_cli(runner, project_dir, extra_args=None): + """Invoke ``flash dev`` with subprocess mocked and return the Popen command.""" + with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: + mock_process = MagicMock() + mock_process.pid = 12345 + mock_process.wait.side_effect = KeyboardInterrupt() + mock_popen.return_value = mock_process + with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): + with patch("runpod_flash.cli.commands.run.os.killpg"): + old_cwd = os.getcwd() + try: + 
os.chdir(project_dir) + runner.invoke(app, ["dev"] + (extra_args or [])) + finally: + os.chdir(old_cwd) -class TestSanitizeFnName: - """Test _sanitize_fn_name handles leading-digit identifiers.""" + return mock_popen.call_args[0][0] - def test_normal_name_unchanged(self): - assert _sanitize_fn_name("worker_run_sync") == "worker_run_sync" - def test_leading_digit_gets_underscore_prefix(self): - assert _sanitize_fn_name("01_hello_run_sync") == "_01_hello_run_sync" +# --------------------------------------------------------------------------- +# CLI: uvicorn command construction +# --------------------------------------------------------------------------- - def test_slashes_replaced(self): - assert _sanitize_fn_name("a/b/c") == "a_b_c" - def test_dots_and_hyphens_replaced(self): - assert _sanitize_fn_name("a.b-c") == "a_b_c" +class TestRunCommandFlags: + """Test that run_command builds the correct uvicorn command.""" - def test_numeric_after_slash(self): - assert _sanitize_fn_name("01_foo/02_bar") == "_01_foo_02_bar" + def test_uses_factory_flag(self, runner, temp_project): + cmd = _run_cli(runner, temp_project) + assert "--factory" in cmd + idx = cmd.index("--factory") + assert cmd[idx + 1] == "runpod_flash.cli.commands._dev_server:create_app" + def test_no_flash_dir_created(self, runner, temp_project): + _run_cli(runner, temp_project) + assert not (temp_project / ".flash").exists() -class TestHasNumericModuleSegments: - """Test _has_numeric_module_segments detects digit-prefixed segments.""" + def test_default_host_and_port(self, runner, temp_project): + cmd = _run_cli(runner, temp_project) + assert cmd[cmd.index("--host") + 1] == "localhost" + assert cmd[cmd.index("--port") + 1] == "8888" - def test_normal_module_path(self): - assert _has_numeric_module_segments("worker") is False + def test_custom_port_flag(self, runner, temp_project): + cmd = _run_cli(runner, temp_project, ["--port", "9000"]) + assert cmd[cmd.index("--port") + 1] == "9000" - def 
test_dotted_normal(self): - assert _has_numeric_module_segments("longruns.stage1") is False + def test_custom_host_flag(self, runner, temp_project): + cmd = _run_cli(runner, temp_project, ["--host", "0.0.0.0"]) + assert cmd[cmd.index("--host") + 1] == "0.0.0.0" - def test_leading_digit_first_segment(self): - assert _has_numeric_module_segments("01_hello.worker") is True + def test_short_port_flag(self, runner, temp_project): + cmd = _run_cli(runner, temp_project, ["-p", "7000"]) + assert cmd[cmd.index("--port") + 1] == "7000" - def test_leading_digit_nested_segment(self): - assert _has_numeric_module_segments("getting_started.01_hello.worker") is True - - def test_digit_in_middle_not_leading(self): - assert _has_numeric_module_segments("stage1.worker") is False - - -class TestModuleParentSubdir: - """Test _module_parent_subdir extracts parent directory from dotted path.""" + def test_reload_watches_project_root(self, runner, temp_project): + cmd = _run_cli(runner, temp_project) + assert "--reload" in cmd + idx = cmd.index("--reload-dir") + assert cmd[idx + 1] == str(temp_project) - def test_top_level_returns_none(self): - assert _module_parent_subdir("worker") is None + def test_no_reload_flag(self, runner, temp_project): + cmd = _run_cli(runner, temp_project, ["--no-reload"]) + assert "--reload" not in cmd + assert "--reload-dir" not in cmd - def test_single_parent(self): - assert _module_parent_subdir("01_hello.gpu_worker") == "01_hello" + def test_sets_project_root_env_var(self, runner, temp_project): + _run_cli(runner, temp_project) + assert os.environ.get("FLASH_PROJECT_ROOT") == str(temp_project) - def test_nested_parent(self): - assert ( - _module_parent_subdir("01_getting_started.03_mixed.pipeline") - == "01_getting_started/03_mixed" - ) +# --------------------------------------------------------------------------- +# create_app factory +# --------------------------------------------------------------------------- -class TestMakeImportLine: - """Test 
_make_import_line generates correct import syntax.""" - def test_normal_module_uses_from_import(self): - result = _make_import_line("worker", "process") - assert result == "from worker import process" +class TestCreateApp: + """Test the programmatic create_app factory.""" - def test_numeric_module_uses_flash_import(self): - result = _make_import_line("01_hello.gpu_worker", "gpu_hello") - assert ( - result - == 'gpu_hello = _flash_import("01_hello.gpu_worker", "gpu_hello", "01_hello")' - ) + def test_returns_fastapi_instance(self, tmp_path): + result = create_app(project_root=tmp_path, workers=[]) + assert isinstance(result, FastAPI) - def test_nested_numeric_includes_full_subdir(self): - result = _make_import_line( - "01_getting_started.01_hello.gpu_worker", "gpu_hello" - ) - assert '"01_getting_started/01_hello"' in result + def test_health_endpoints(self, tmp_path): + test_app = create_app(project_root=tmp_path, workers=[]) + client = TestClient(test_app) - def test_top_level_numeric_module_no_subdir(self): - result = _make_import_line("01_worker", "process") - assert result == 'process = _flash_import("01_worker", "process")' + resp = client.get("/") + assert resp.status_code == 200 + assert resp.json()["docs"] == "/docs" + resp = client.get("/ping") + assert resp.status_code == 200 + assert resp.json()["status"] == "healthy" -class TestGenerateFlashServerNumericDirs: - """Test _generate_flash_server with numeric-prefixed directory names.""" + def test_registers_qb_worker_routes(self, tmp_path): + mod = tmp_path / "worker.py" + mod.write_text("async def process(data):\n return {'echo': data}\n") - def test_qb_numeric_dir_uses_flash_import(self, tmp_path): - """QB workers in numeric dirs use _flash_import with scoped sys.path.""" worker = WorkerInfo( - file_path=tmp_path / "01_hello" / "gpu_worker.py", - url_prefix="/01_hello/gpu_worker", - module_path="01_hello.gpu_worker", - resource_name="01_hello_gpu_worker", + file_path=mod, + url_prefix="/worker", + 
module_path="worker", + resource_name="worker", worker_type="QB", - functions=["gpu_hello"], - ) - content = _generate_flash_server(tmp_path, [worker]).read_text() - - # Must NOT contain invalid 'from 01_hello...' import - assert "from 01_hello" not in content - # Must have _flash_import helper and importlib - assert "import importlib as _importlib" in content - assert "def _flash_import(" in content - assert ( - '_flash_import("01_hello.gpu_worker", "gpu_hello", "01_hello")' in content + functions=["process"], ) + sys.path.insert(0, str(tmp_path)) + try: + test_app = create_app(project_root=tmp_path, workers=[worker]) + client = TestClient(test_app) + resp = client.post("/worker/run_sync", json={"input": "hello"}) + assert resp.status_code == 200 + assert resp.json()["output"] == {"echo": "hello"} + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop("worker", None) - def test_qb_numeric_dir_function_name_prefixed(self, tmp_path): - """QB handler function names starting with digits get '_' prefix.""" - worker = WorkerInfo( - file_path=tmp_path / "01_hello" / "gpu_worker.py", - url_prefix="/01_hello/gpu_worker", - module_path="01_hello.gpu_worker", - resource_name="01_hello_gpu_worker", - worker_type="QB", - functions=["gpu_hello"], - ) - content = _generate_flash_server(tmp_path, [worker]).read_text() - # Function name must start with '_', not a digit - assert "async def _01_hello_gpu_worker_run_sync(body: dict):" in content +# --------------------------------------------------------------------------- +# QB routes +# --------------------------------------------------------------------------- - def test_lb_numeric_dir_uses_flash_import(self, tmp_path): - """LB workers in numeric dirs use _flash_import for config and function imports.""" - worker = WorkerInfo( - file_path=tmp_path / "03_advanced" / "05_lb" / "cpu_lb.py", - url_prefix="/03_advanced/05_lb/cpu_lb", - module_path="03_advanced.05_lb.cpu_lb", - resource_name="03_advanced_05_lb_cpu_lb", - 
worker_type="LB", - functions=["validate_data"], - lb_routes=[ - { - "method": "POST", - "path": "/validate", - "fn_name": "validate_data", - "config_variable": "cpu_config", - } - ], - ) - content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "from 03_advanced" not in content - assert ( - '_flash_import("03_advanced.05_lb.cpu_lb", "cpu_config", "03_advanced/05_lb")' - in content - ) - assert ( - '_flash_import("03_advanced.05_lb.cpu_lb", "validate_data", "03_advanced/05_lb")' - in content - ) +class TestRegisterQBRoutes: + """Test QB route registration and invocation.""" + + def test_single_function_run_sync(self, tmp_path): + mod = tmp_path / "worker.py" + mod.write_text("async def process(data):\n return {'echo': data}\n") - def test_mixed_numeric_and_normal_dirs(self, tmp_path): - """Normal modules use 'from' imports, numeric modules use _flash_import.""" - normal_worker = WorkerInfo( - file_path=tmp_path / "worker.py", + worker = WorkerInfo( + file_path=mod, url_prefix="/worker", module_path="worker", resource_name="worker", worker_type="QB", functions=["process"], ) - numeric_worker = WorkerInfo( - file_path=tmp_path / "01_hello" / "gpu_worker.py", - url_prefix="/01_hello/gpu_worker", - module_path="01_hello.gpu_worker", - resource_name="01_hello_gpu_worker", - worker_type="QB", - functions=["gpu_hello"], - ) - content = _generate_flash_server( - tmp_path, [normal_worker, numeric_worker] - ).read_text() - - # Normal worker uses standard import - assert "from worker import process" in content - # Numeric worker uses scoped _flash_import - assert ( - '_flash_import("01_hello.gpu_worker", "gpu_hello", "01_hello")' in content + sys.path.insert(0, str(tmp_path)) + try: + test_app = FastAPI() + _register_qb_routes(test_app, worker, tmp_path, "test [QB]") + client = TestClient(test_app) + resp = client.post("/worker/run_sync", json={"input": {"k": "v"}}) + body = resp.json() + assert resp.status_code == 200 + assert body["status"] == "COMPLETED" 
+ assert body["output"] == {"echo": {"k": "v"}} + assert "id" in body + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop("worker", None) + + def test_multi_function_routes(self, tmp_path): + mod = tmp_path / "multi.py" + mod.write_text( + "async def alpha(d):\n return 'a'\n" + "async def beta(d):\n return 'b'\n" ) - - def test_no_importlib_when_all_normal_dirs(self, tmp_path): - """importlib and _flash_import are not emitted when no numeric dirs exist.""" worker = WorkerInfo( - file_path=tmp_path / "worker.py", - url_prefix="/worker", - module_path="worker", - resource_name="worker", + file_path=mod, + url_prefix="/multi", + module_path="multi", + resource_name="multi", worker_type="QB", - functions=["process"], + functions=["alpha", "beta"], + ) + sys.path.insert(0, str(tmp_path)) + try: + test_app = FastAPI() + _register_qb_routes(test_app, worker, tmp_path, "test [QB]") + client = TestClient(test_app) + assert client.post("/multi/alpha/run_sync", json={"input": {}}).json()["output"] == "a" + assert client.post("/multi/beta/run_sync", json={"input": {}}).json()["output"] == "b" + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop("multi", None) + + +# --------------------------------------------------------------------------- +# LB routes +# --------------------------------------------------------------------------- + + +class TestRegisterLBRoutes: + """Test LB route registration using an injected executor.""" + + def _write_lb_module(self, tmp_path, name, config_var, fn_name): + mod = tmp_path / f"{name}.py" + mod.write_text( + f"{config_var} = 'fake_config'\n" + f"async def {fn_name}(d):\n return d\n" ) - content = _generate_flash_server(tmp_path, [worker]).read_text() - assert "importlib" not in content - assert "_flash_import" not in content - def test_scoped_import_includes_subdir(self, tmp_path): - """_flash_import calls pass the subdirectory for sibling import scoping.""" - worker = WorkerInfo( - file_path=tmp_path / 
"01_getting_started" / "03_mixed" / "pipeline.py", - url_prefix="/01_getting_started/03_mixed/pipeline", - module_path="01_getting_started.03_mixed.pipeline", - resource_name="01_getting_started_03_mixed_pipeline", + def _make_lb_worker(self, tmp_path, name, config_var, fn_name, method, path): + return WorkerInfo( + file_path=tmp_path / f"{name}.py", + url_prefix=f"/{name}", + module_path=name, + resource_name=name, worker_type="LB", - functions=["classify"], + functions=[fn_name], lb_routes=[ { - "method": "POST", - "path": "/classify", - "fn_name": "classify", - "config_variable": "pipeline_config", + "method": method, + "path": path, + "fn_name": fn_name, + "config_variable": config_var, } ], ) - content = _generate_flash_server(tmp_path, [worker]).read_text() - - # Must scope to correct subdirectory, not add all dirs to sys.path - assert '"01_getting_started/03_mixed"' in content - # No global sys.path additions for subdirs — only the project root - # line at the top and the one inside _flash_import helper body - lines = content.split("\n") - global_sys_path_lines = [ - line - for line in lines - if "sys.path.insert" in line and not line.startswith(" ") - ] - assert len(global_sys_path_lines) == 1 - - def test_generated_server_is_valid_python(self, tmp_path): - """Generated server.py with numeric dirs must be parseable Python.""" - worker = WorkerInfo( - file_path=tmp_path / "01_getting_started" / "01_hello" / "gpu_worker.py", - url_prefix="/01_getting_started/01_hello/gpu_worker", - module_path="01_getting_started.01_hello.gpu_worker", - resource_name="01_getting_started_01_hello_gpu_worker", - worker_type="QB", - functions=["gpu_hello"], - ) - server_path = _generate_flash_server(tmp_path, [worker]) - content = server_path.read_text() - # Must parse without SyntaxError - import ast - - ast.parse(content) + def test_post_route_passes_body(self, tmp_path): + """POST LB routes forward the request body to the executor.""" + self._write_lb_module(tmp_path, "api", 
"api_config", "handle") + worker = self._make_lb_worker(tmp_path, "api", "api_config", "handle", "POST", "/do") + captured = {} + + async def fake_executor(config, fn, body): + captured["config"] = config + captured["body"] = body + return {"ok": True} + + sys.path.insert(0, str(tmp_path)) + try: + test_app = FastAPI() + _register_lb_routes(test_app, worker, tmp_path, "lb", executor=fake_executor) + client = TestClient(test_app) + resp = client.post("/api/do", json={"key": "val"}) + assert resp.status_code == 200 + assert captured["config"] == "fake_config" + assert captured["body"] == {"key": "val"} + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop("api", None) + + def test_get_route_passes_query_params(self, tmp_path): + """GET LB routes forward query params as a dict.""" + self._write_lb_module(tmp_path, "search", "search_cfg", "find") + worker = self._make_lb_worker(tmp_path, "search", "search_cfg", "find", "GET", "/query") + captured = {} + + async def fake_executor(config, fn, body): + captured["body"] = body + return {"ok": True} + + sys.path.insert(0, str(tmp_path)) + try: + test_app = FastAPI() + _register_lb_routes(test_app, worker, tmp_path, "lb", executor=fake_executor) + client = TestClient(test_app) + resp = client.get("/search/query?q=test&limit=10") + assert resp.status_code == 200 + assert captured["body"] == {"q": "test", "limit": "10"} + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop("search", None) + + def test_all_body_methods(self, tmp_path): + """POST/PUT/PATCH/DELETE all register as body-accepting routes.""" + for method in ("POST", "PUT", "PATCH", "DELETE"): + mod_name = f"mod_{method.lower()}" + self._write_lb_module(tmp_path, mod_name, "cfg", "handler") + worker = self._make_lb_worker(tmp_path, mod_name, "cfg", "handler", method, "/ep") + + async def noop_executor(config, fn, body): + return {"ok": True} + + sys.path.insert(0, str(tmp_path)) + try: + test_app = FastAPI() + _register_lb_routes(test_app, worker, 
tmp_path, "lb", executor=noop_executor) + route = next( + r for r in test_app.routes + if hasattr(r, "path") and r.path == f"/{mod_name}/ep" + ) + assert method in route.methods + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop(mod_name, None) + + +# --------------------------------------------------------------------------- +# _import_from_module +# --------------------------------------------------------------------------- + + +class TestImportFromModule: + """Test module importing with standard and numeric-prefix paths.""" + + def test_standard_module(self, tmp_path): + (tmp_path / "mymod.py").write_text("MY_VAR = 42\n") + sys.path.insert(0, str(tmp_path)) + try: + assert _import_from_module("mymod", "MY_VAR", tmp_path) == 42 + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop("mymod", None) + + def test_numeric_prefix_module(self, tmp_path): + subdir = tmp_path / "01_hello" + subdir.mkdir() + (subdir / "__init__.py").write_text("") + (subdir / "gpu_worker.py").write_text("VALUE = 'hello'\n") + sys.path.insert(0, str(tmp_path)) + try: + assert _import_from_module("01_hello.gpu_worker", "VALUE", tmp_path) == "hello" + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop("01_hello.gpu_worker", None) + sys.modules.pop("01_hello", None) + + def test_top_level_numeric_module(self, tmp_path): + (tmp_path / "01_worker.py").write_text("RESULT = 'ok'\n") + sys.path.insert(0, str(tmp_path)) + try: + assert _import_from_module("01_worker", "RESULT", tmp_path) == "ok" + finally: + sys.path.remove(str(tmp_path)) + sys.modules.pop("01_worker", None) + + +# --------------------------------------------------------------------------- +# _map_body_to_params +# --------------------------------------------------------------------------- class TestMapBodyToParams: - """Tests for _map_body_to_params — maps HTTP body to function arguments.""" + """Tests for _map_body_to_params.""" - def test_body_keys_match_params_spreads_as_kwargs(self): + def 
test_matching_keys_spread_as_kwargs(self): from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params def process(name: str, value: int): pass - result = _map_body_to_params(process, {"name": "test", "value": 42}) - assert result == {"name": "test", "value": 42} + assert _map_body_to_params(process, {"name": "t", "value": 1}) == {"name": "t", "value": 1} - def test_body_keys_mismatch_wraps_in_first_param(self): + def test_mismatched_keys_wrap_in_first_param(self): from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params - def run_pipeline(input_data: dict): + def run(input_data: dict): pass - body = {"text": "hello", "mode": "fast"} - result = _map_body_to_params(run_pipeline, body) - assert result == {"input_data": {"text": "hello", "mode": "fast"}} + assert _map_body_to_params(run, {"a": 1}) == {"input_data": {"a": 1}} - def test_non_dict_body_wraps_in_first_param(self): + def test_non_dict_wraps_in_first_param(self): from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params - def run_pipeline(input_data): + def run(input_data): pass - result = _map_body_to_params(run_pipeline, [1, 2, 3]) - assert result == {"input_data": [1, 2, 3]} + assert _map_body_to_params(run, [1, 2]) == {"input_data": [1, 2]} def test_no_params_returns_empty(self): from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params - def no_args(): + def noop(): pass - result = _map_body_to_params(no_args, {"key": "val"}) - assert result == {} + assert _map_body_to_params(noop, {"k": "v"}) == {} - def test_partial_key_match_wraps_in_first_param(self): + def test_partial_match_wraps_in_first_param(self): from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params def process(name: str, value: int): pass - result = _map_body_to_params(process, {"name": "test", "extra": "bad"}) - assert result == {"name": {"name": "test", "extra": "bad"}} + assert _map_body_to_params(process, {"name": "t", "extra": "x"}) 
== { + "name": {"name": "t", "extra": "x"} + } - def test_empty_dict_body_spreads_as_empty_kwargs(self): + def test_empty_dict_spreads_as_empty(self): from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params - def run_pipeline(input_data: dict): + def run(input_data: dict): pass - result = _map_body_to_params(run_pipeline, {}) - assert result == {} + assert _map_body_to_params(run, {}) == {} diff --git a/tests/unit/resources/test_serverless.py b/tests/unit/resources/test_serverless.py index 124eb136..1fa2f3c9 100644 --- a/tests/unit/resources/test_serverless.py +++ b/tests/unit/resources/test_serverless.py @@ -468,7 +468,7 @@ def test_is_deployed_false_when_no_id(self): assert serverless.is_deployed() is False def test_is_deployed_skips_health_check_during_live_provisioning(self, monkeypatch): - """During flash run, is_deployed returns True based on ID alone.""" + """During flash dev, is_deployed returns True based on ID alone.""" monkeypatch.setenv("FLASH_IS_LIVE_PROVISIONING", "true") serverless = ServerlessResource(name="test") serverless.id = "ep-live-123" @@ -477,7 +477,7 @@ def test_is_deployed_skips_health_check_during_live_provisioning(self, monkeypat assert serverless.is_deployed() is True def test_is_deployed_uses_health_check_outside_live_provisioning(self, monkeypatch): - """Outside flash run, is_deployed falls back to health check.""" + """Outside flash dev, is_deployed falls back to health check.""" monkeypatch.delenv("FLASH_IS_LIVE_PROVISIONING", raising=False) serverless = ServerlessResource(name="test") serverless.id = "ep-123" From 5013185bf4cfd67e5d15bfd33d697f446e7a1ac7 Mon Sep 17 00:00:00 2001 From: zeke <40004347+KAJdev@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:27:23 -0800 Subject: [PATCH 22/26] format --- src/runpod_flash/cli/commands/_dev_server.py | 8 +-- tests/unit/cli/test_run.py | 54 ++++++++++++++------ 2 files changed, 41 insertions(+), 21 deletions(-) diff --git 
a/src/runpod_flash/cli/commands/_dev_server.py b/src/runpod_flash/cli/commands/_dev_server.py index 6378f3fd..5242fcf5 100644 --- a/src/runpod_flash/cli/commands/_dev_server.py +++ b/src/runpod_flash/cli/commands/_dev_server.py @@ -68,9 +68,7 @@ def _import_from_module(module_path: str, name: str, project_root: Path): scopes ``sys.path`` so that sibling imports within the target module resolve to the correct subdirectory. """ - has_numeric = any( - seg and seg[0].isdigit() for seg in module_path.split(".") - ) + has_numeric = any(seg and seg[0].isdigit() for seg in module_path.split(".")) if has_numeric: parts = module_path.rsplit(".", 1) @@ -201,9 +199,7 @@ async def lb_body_handler( async def lb_query_handler( request: Request, _config=config, _fn=fn, _exec=executor ): - return await _exec( - _config, _fn, dict(request.query_params) - ) + return await _exec(_config, _fn, dict(request.query_params)) lb_query_handler.__name__ = f"_route_{worker.resource_name}_{fn_name}" app.add_api_route( diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index 2b38d6dd..231e8e6e 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -196,8 +196,7 @@ def test_single_function_run_sync(self, tmp_path): def test_multi_function_routes(self, tmp_path): mod = tmp_path / "multi.py" mod.write_text( - "async def alpha(d):\n return 'a'\n" - "async def beta(d):\n return 'b'\n" + "async def alpha(d):\n return 'a'\nasync def beta(d):\n return 'b'\n" ) worker = WorkerInfo( file_path=mod, @@ -212,8 +211,16 @@ def test_multi_function_routes(self, tmp_path): test_app = FastAPI() _register_qb_routes(test_app, worker, tmp_path, "test [QB]") client = TestClient(test_app) - assert client.post("/multi/alpha/run_sync", json={"input": {}}).json()["output"] == "a" - assert client.post("/multi/beta/run_sync", json={"input": {}}).json()["output"] == "b" + assert ( + client.post("/multi/alpha/run_sync", json={"input": {}}).json()[ + "output" + ] + == "a" + ) + assert ( + 
client.post("/multi/beta/run_sync", json={"input": {}}).json()["output"] + == "b" + ) finally: sys.path.remove(str(tmp_path)) sys.modules.pop("multi", None) @@ -230,8 +237,7 @@ class TestRegisterLBRoutes: def _write_lb_module(self, tmp_path, name, config_var, fn_name): mod = tmp_path / f"{name}.py" mod.write_text( - f"{config_var} = 'fake_config'\n" - f"async def {fn_name}(d):\n return d\n" + f"{config_var} = 'fake_config'\nasync def {fn_name}(d):\n return d\n" ) def _make_lb_worker(self, tmp_path, name, config_var, fn_name, method, path): @@ -255,7 +261,9 @@ def _make_lb_worker(self, tmp_path, name, config_var, fn_name, method, path): def test_post_route_passes_body(self, tmp_path): """POST LB routes forward the request body to the executor.""" self._write_lb_module(tmp_path, "api", "api_config", "handle") - worker = self._make_lb_worker(tmp_path, "api", "api_config", "handle", "POST", "/do") + worker = self._make_lb_worker( + tmp_path, "api", "api_config", "handle", "POST", "/do" + ) captured = {} async def fake_executor(config, fn, body): @@ -266,7 +274,9 @@ async def fake_executor(config, fn, body): sys.path.insert(0, str(tmp_path)) try: test_app = FastAPI() - _register_lb_routes(test_app, worker, tmp_path, "lb", executor=fake_executor) + _register_lb_routes( + test_app, worker, tmp_path, "lb", executor=fake_executor + ) client = TestClient(test_app) resp = client.post("/api/do", json={"key": "val"}) assert resp.status_code == 200 @@ -279,7 +289,9 @@ async def fake_executor(config, fn, body): def test_get_route_passes_query_params(self, tmp_path): """GET LB routes forward query params as a dict.""" self._write_lb_module(tmp_path, "search", "search_cfg", "find") - worker = self._make_lb_worker(tmp_path, "search", "search_cfg", "find", "GET", "/query") + worker = self._make_lb_worker( + tmp_path, "search", "search_cfg", "find", "GET", "/query" + ) captured = {} async def fake_executor(config, fn, body): @@ -289,7 +301,9 @@ async def fake_executor(config, fn, 
body): sys.path.insert(0, str(tmp_path)) try: test_app = FastAPI() - _register_lb_routes(test_app, worker, tmp_path, "lb", executor=fake_executor) + _register_lb_routes( + test_app, worker, tmp_path, "lb", executor=fake_executor + ) client = TestClient(test_app) resp = client.get("/search/query?q=test&limit=10") assert resp.status_code == 200 @@ -303,7 +317,9 @@ def test_all_body_methods(self, tmp_path): for method in ("POST", "PUT", "PATCH", "DELETE"): mod_name = f"mod_{method.lower()}" self._write_lb_module(tmp_path, mod_name, "cfg", "handler") - worker = self._make_lb_worker(tmp_path, mod_name, "cfg", "handler", method, "/ep") + worker = self._make_lb_worker( + tmp_path, mod_name, "cfg", "handler", method, "/ep" + ) async def noop_executor(config, fn, body): return {"ok": True} @@ -311,9 +327,12 @@ async def noop_executor(config, fn, body): sys.path.insert(0, str(tmp_path)) try: test_app = FastAPI() - _register_lb_routes(test_app, worker, tmp_path, "lb", executor=noop_executor) + _register_lb_routes( + test_app, worker, tmp_path, "lb", executor=noop_executor + ) route = next( - r for r in test_app.routes + r + for r in test_app.routes if hasattr(r, "path") and r.path == f"/{mod_name}/ep" ) assert method in route.methods @@ -346,7 +365,9 @@ def test_numeric_prefix_module(self, tmp_path): (subdir / "gpu_worker.py").write_text("VALUE = 'hello'\n") sys.path.insert(0, str(tmp_path)) try: - assert _import_from_module("01_hello.gpu_worker", "VALUE", tmp_path) == "hello" + assert ( + _import_from_module("01_hello.gpu_worker", "VALUE", tmp_path) == "hello" + ) finally: sys.path.remove(str(tmp_path)) sys.modules.pop("01_hello.gpu_worker", None) @@ -376,7 +397,10 @@ def test_matching_keys_spread_as_kwargs(self): def process(name: str, value: int): pass - assert _map_body_to_params(process, {"name": "t", "value": 1}) == {"name": "t", "value": 1} + assert _map_body_to_params(process, {"name": "t", "value": 1}) == { + "name": "t", + "value": 1, + } def 
test_mismatched_keys_wrap_in_first_param(self): from runpod_flash.cli.commands._run_server_helpers import _map_body_to_params From 6636bdfb04111176924b69a758c4f63d14fe335b Mon Sep 17 00:00:00 2001 From: zeke <40004347+KAJdev@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:29:53 -0800 Subject: [PATCH 23/26] fix: remove unused import flagged by ruff --- tests/unit/cli/test_run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index 231e8e6e..49d50fa4 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -2,7 +2,6 @@ import os import sys -from pathlib import Path from unittest.mock import MagicMock, patch import pytest From 89c81b368de69b02eeb0e6234a25c51baf9c96e5 Mon Sep 17 00:00:00 2001 From: zeke <40004347+KAJdev@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:31:49 -0800 Subject: [PATCH 24/26] fix: clean up env var leaks in tests, remove em-dash --- src/runpod_flash/cli/commands/run.py | 2 +- tests/unit/cli/test_run.py | 36 ++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index 37436824..6bfad066 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -148,7 +148,7 @@ def _print_startup_table(workers: List[WorkerInfo], host: str, port: int) -> Non console.print(table) console.print(f"\n Visit [bold]http://{host}:{port}/docs[/bold] for Swagger UI") console.print( - " Press [bold]Ctrl+C[/bold] to stop — provisioned endpoints are cleaned up automatically\n" + " Press [bold]Ctrl+C[/bold] to stop. 
Provisioned endpoints are cleaned up automatically.\n" ) diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index 49d50fa4..b4166ac4 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -40,6 +40,10 @@ def temp_project(tmp_path): def _run_cli(runner, project_dir, extra_args=None): """Invoke ``flash dev`` with subprocess mocked and return the Popen command.""" + saved_env = { + k: os.environ.get(k) + for k in ("FLASH_PROJECT_ROOT", "FLASH_IS_LIVE_PROVISIONING") + } with patch("runpod_flash.cli.commands.run.subprocess.Popen") as mock_popen: mock_process = MagicMock() mock_process.pid = 12345 @@ -54,6 +58,11 @@ def _run_cli(runner, project_dir, extra_args=None): runner.invoke(app, ["dev"] + (extra_args or [])) finally: os.chdir(old_cwd) + for k, v in saved_env.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v return mock_popen.call_args[0][0] @@ -105,8 +114,31 @@ def test_no_reload_flag(self, runner, temp_project): assert "--reload-dir" not in cmd def test_sets_project_root_env_var(self, runner, temp_project): - _run_cli(runner, temp_project) - assert os.environ.get("FLASH_PROJECT_ROOT") == str(temp_project) + """FLASH_PROJECT_ROOT is set when Popen is called (inherited by child).""" + captured_env = {} + + def capture_popen(cmd, **kwargs): + captured_env["FLASH_PROJECT_ROOT"] = os.environ.get("FLASH_PROJECT_ROOT") + mock_process = MagicMock() + mock_process.pid = 12345 + mock_process.wait.side_effect = KeyboardInterrupt() + return mock_process + + with patch( + "runpod_flash.cli.commands.run.subprocess.Popen", side_effect=capture_popen + ): + with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): + with patch("runpod_flash.cli.commands.run.os.killpg"): + old_cwd = os.getcwd() + try: + os.chdir(temp_project) + runner.invoke(app, ["dev"]) + finally: + os.chdir(old_cwd) + os.environ.pop("FLASH_PROJECT_ROOT", None) + os.environ.pop("FLASH_IS_LIVE_PROVISIONING", None) + + assert 
captured_env["FLASH_PROJECT_ROOT"] == str(temp_project) # --------------------------------------------------------------------------- From b04c436ea25717378af3959834c2dd85b91ff0a4 Mon Sep 17 00:00:00 2001 From: zeke <40004347+KAJdev@users.noreply.github.com> Date: Fri, 20 Feb 2026 14:41:12 -0800 Subject: [PATCH 25/26] fix claudes generated doc changes lol --- PRD.md | 6 +++--- src/runpod_flash/cli/commands/_dev_server.py | 6 +++--- src/runpod_flash/cli/commands/run.py | 3 +-- src/runpod_flash/cli/docs/flash-dev.md | 8 ++++---- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/PRD.md b/PRD.md index 1c9a3005..ad936c0a 100644 --- a/PRD.md +++ b/PRD.md @@ -146,7 +146,7 @@ my_project/ ### 7.1 Dev server launch -The dev server is built programmatically in-memory (no codegen). Uvicorn is invoked with the `--factory` flag pointing to the app factory function, and the project root is passed via the `FLASH_PROJECT_ROOT` environment variable: +Uvicorn is invoked with the `--factory` flag pointing to the app factory function, and the project root is passed via the `FLASH_PROJECT_ROOT` environment variable: ```bash uvicorn --factory runpod_flash.cli.commands._dev_server:create_app \ @@ -159,7 +159,7 @@ uvicorn --factory runpod_flash.cli.commands._dev_server:create_app \ 1. Scan project for all `@remote` functions (QB and LB) in any `.py` file - Skip: `.flash/`, `__pycache__`, `*.pyc`, `__init__.py` 2. If none found: print error with usage instructions, exit 1 -3. Build FastAPI app programmatically (no codegen, no `.flash/server.py`) +3. Build FastAPI app programmatically via `_dev_server.create_app()` 4. Start uvicorn with `--factory` and `--reload` watching the project root 5. Print startup table: local paths, resource names, types 6. Swagger UI available at `http://localhost:{port}/docs` @@ -231,7 +231,7 @@ Resource names are derived from file paths (slashes → underscores): ## 11. 
Dev Server App Structure -The dev server is built programmatically by `_dev_server.create_app()`. No file is generated. User modules are imported via `importlib.import_module()` and routes are registered with `app.add_api_route()`. Tracebacks point directly to the original source files. +The dev server is built programmatically by `_dev_server.create_app()`. User modules are imported via `importlib.import_module()` and routes are registered with `app.add_api_route()`. Tracebacks point directly to the original source files. Conceptual equivalent of the generated app: diff --git a/src/runpod_flash/cli/commands/_dev_server.py b/src/runpod_flash/cli/commands/_dev_server.py index 5242fcf5..6133b220 100644 --- a/src/runpod_flash/cli/commands/_dev_server.py +++ b/src/runpod_flash/cli/commands/_dev_server.py @@ -1,8 +1,8 @@ """Programmatic FastAPI dev server for flash dev. -Instead of generating a server.py file (codegen), this module builds the -FastAPI app in-memory. User modules are imported directly, so tracebacks -point to the original source files. +Builds the FastAPI app by scanning for @remote functions and registering +routes via add_api_route(). User modules are imported directly, so +tracebacks point to the original source files. """ import importlib diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index 6bfad066..f0d25722 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -335,8 +335,7 @@ def run_command( """Start Flash development server. Scans the project for @remote decorated functions and starts a FastAPI - dev server via uvicorn. The app is built programmatically (no codegen) - so tracebacks point directly to your source files. + dev server via uvicorn. Tracebacks point directly to your source files. No main.py or FastAPI boilerplate required. Any .py file with @remote decorated functions is a valid Flash project. 
diff --git a/src/runpod_flash/cli/docs/flash-dev.md b/src/runpod_flash/cli/docs/flash-dev.md index bbeca323..39cdfba2 100644 --- a/src/runpod_flash/cli/docs/flash-dev.md +++ b/src/runpod_flash/cli/docs/flash-dev.md @@ -17,7 +17,7 @@ With `flash dev`, your system runs in a **hybrid architecture**: │ YOUR MACHINE (localhost:8888) │ │ ┌─────────────────────────────────────┐ │ │ │ Programmatic FastAPI server │ │ -│ │ (built in-memory, no codegen) │ │ +│ │ (programmatic FastAPI app) │ │ │ │ - Discovers @remote functions │─────────┐ │ │ │ - Hot-reload via uvicorn │ │ │ │ └─────────────────────────────────────┘ │ │ @@ -40,7 +40,7 @@ With `flash dev`, your system runs in a **hybrid architecture**: - **`@remote` functions run on Runpod** as serverless endpoints - **Hot reload** watches your project directory via uvicorn's built-in reloader - **Endpoints are prefixed with `live-`** to distinguish development endpoints from production (e.g., `gpu-worker` becomes `live-gpu-worker`) -- **No codegen** - tracebacks point directly to your source files +- **Direct tracebacks** - errors point to your original source files This is different from `flash deploy`, where **everything** (including your FastAPI app) runs on Runpod. See [flash deploy](./flash-deploy.md) for the fully-deployed architecture. @@ -76,7 +76,7 @@ flash dev --host 0.0.0.0 --port 8000 ## What It Does 1. Scans project files for `@remote` decorated functions -2. Builds FastAPI routes programmatically (no `.flash/server.py` artifact) +2. Builds FastAPI routes programmatically via `create_app()` 3. Starts uvicorn server with hot-reload watching your project directory 4. GPU workers use LiveServerless (no packaging needed) @@ -88,7 +88,7 @@ When you call a `@remote` function using `flash dev`, Flash deploys a **Serverle flash dev │ ├── Scans project for @remote functions - ├── Builds FastAPI app in-memory + ├── Builds FastAPI app programmatically ├── Starts local server (e.g. 
localhost:8888) │ ├── QB routes: /{file_prefix}/run_sync (local execution) │ └── LB routes: /{file_prefix}/{path} (remote dispatch) From ad0356dd43dcba7ebaab0615f36cfa3178a70125 Mon Sep 17 00:00:00 2001 From: Ezekiel Wotring <40004347+KAJdev@users.noreply.github.com> Date: Fri, 20 Feb 2026 16:00:54 -0800 Subject: [PATCH 26/26] feat: auto-select next available port when default is in use (#213) --- src/runpod_flash/cli/commands/run.py | 33 ++++++++++ tests/unit/cli/test_run.py | 99 ++++++++++++++++++++++++---- 2 files changed, 119 insertions(+), 13 deletions(-) diff --git a/src/runpod_flash/cli/commands/run.py b/src/runpod_flash/cli/commands/run.py index f0d25722..740286c6 100644 --- a/src/runpod_flash/cli/commands/run.py +++ b/src/runpod_flash/cli/commands/run.py @@ -3,6 +3,7 @@ import logging import os import signal +import socket import subprocess import sys from dataclasses import dataclass, field @@ -26,6 +27,30 @@ # resource state file written by ResourceManager in the uvicorn subprocess _RESOURCE_STATE_FILE = Path(".runpod") / "resources.pkl" +_MAX_PORT_ATTEMPTS = 20 + + +def _find_available_port(host: str, start_port: int) -> int: + """Find the first available port starting from start_port. + + Tries up to _MAX_PORT_ATTEMPTS consecutive ports. Raises typer.Exit + if no port is available. + """ + for offset in range(_MAX_PORT_ATTEMPTS): + port = start_port + offset + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((host, port)) + return port + except OSError: + continue + + console.print( + f"[red]Error:[/red] No available port found in range " + f"{start_port}-{start_port + _MAX_PORT_ATTEMPTS - 1}." 
+ ) + raise typer.Exit(1) + @dataclass class WorkerInfo: @@ -376,6 +401,14 @@ def run_command( ) raise typer.Exit(1) + # find a free port, counting up from the requested one + actual_port = _find_available_port(host, port) + if actual_port != port: + console.print( + f"[yellow]Port {port} is in use, using {actual_port} instead.[/yellow]" + ) + port = actual_port + _print_startup_table(workers, host, port) # tell the factory function where the project lives diff --git a/tests/unit/cli/test_run.py b/tests/unit/cli/test_run.py index b4166ac4..ec930082 100644 --- a/tests/unit/cli/test_run.py +++ b/tests/unit/cli/test_run.py @@ -50,19 +50,24 @@ def _run_cli(runner, project_dir, extra_args=None): mock_process.wait.side_effect = KeyboardInterrupt() mock_popen.return_value = mock_process - with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): - with patch("runpod_flash.cli.commands.run.os.killpg"): - old_cwd = os.getcwd() - try: - os.chdir(project_dir) - runner.invoke(app, ["dev"] + (extra_args or [])) - finally: - os.chdir(old_cwd) - for k, v in saved_env.items(): - if v is None: - os.environ.pop(k, None) - else: - os.environ[k] = v + # bypass port probing so tests get the exact port they requested + with patch( + "runpod_flash.cli.commands.run._find_available_port", + side_effect=lambda host, port: port, + ): + with patch("runpod_flash.cli.commands.run.os.getpgid", return_value=12345): + with patch("runpod_flash.cli.commands.run.os.killpg"): + old_cwd = os.getcwd() + try: + os.chdir(project_dir) + runner.invoke(app, ["dev"] + (extra_args or [])) + finally: + os.chdir(old_cwd) + for k, v in saved_env.items(): + if v is None: + os.environ.pop(k, None) + else: + os.environ[k] = v return mock_popen.call_args[0][0] @@ -474,3 +479,71 @@ def run(input_data: dict): pass assert _map_body_to_params(run, {}) == {} + + +class TestFindAvailablePort: + def test_returns_start_port_when_free(self): + from runpod_flash.cli.commands.run import _find_available_port + 
+ # use port 0 trick: bind to 0 to get a free port, then test near it + import socket + + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", 0)) + free_port = s.getsockname()[1] + + # free_port is now unbound, so _find_available_port should return it + assert _find_available_port("localhost", free_port) == free_port + + def test_skips_occupied_port(self): + from runpod_flash.cli.commands.run import _find_available_port + + import socket + + # occupy a port + blocker = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + blocker.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + blocker.bind(("localhost", 0)) + occupied_port = blocker.getsockname()[1] + blocker.listen(1) + + try: + result = _find_available_port("localhost", occupied_port) + assert result > occupied_port + finally: + blocker.close() + + def test_exits_when_no_port_available(self): + from runpod_flash.cli.commands.run import _find_available_port + + import socket + + from runpod_flash.cli.commands.run import _MAX_PORT_ATTEMPTS + + blockers = [] + # find a free starting port first + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + start = s.getsockname()[1] + + # bind all ports in the range + for i in range(_MAX_PORT_ATTEMPTS): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + try: + s.bind(("127.0.0.1", start + i)) + s.listen(1) + blockers.append(s) + except OSError: + s.close() + blockers.append(None) + + try: + from click.exceptions import Exit as ClickExit + + with pytest.raises((SystemExit, ClickExit)): + _find_available_port("127.0.0.1", start) + finally: + for s in blockers: + if s: + s.close()