diff --git a/doc/source/llm/doc_code/serve/multi_gpu/dp_autoscaling_example.py b/doc/source/llm/doc_code/serve/multi_gpu/dp_autoscaling_example.py new file mode 100644 index 000000000000..4c6ce2c1ee74 --- /dev/null +++ b/doc/source/llm/doc_code/serve/multi_gpu/dp_autoscaling_example.py @@ -0,0 +1,93 @@ +""" +This file serves as a documentation example and CI test for autoscaling data parallel attention deployment. + +Structure: +1. Monkeypatch setup: Ensures serve.run is non-blocking and removes accelerator requirements for CI testing. +2. Docs example (between __dp_autoscaling_example_start/end__): Embedded in Sphinx docs via literalinclude. +3. Test validation (deployment status polling + cleanup) +""" + +import time +from ray import serve +from ray.serve.schema import ApplicationStatus +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve import llm + +_original_serve_run = serve.run +_original_build_dp_openai_app = llm.build_dp_openai_app + + +def _non_blocking_serve_run(app, **kwargs): + """Forces blocking=False for testing""" + kwargs["blocking"] = False + return _original_serve_run(app, **kwargs) + + +def _testing_build_dp_openai_app(builder_config, **kwargs): + """Removes accelerator requirements for testing""" + if "llm_config" in builder_config: + config = builder_config["llm_config"] + if hasattr(config, "accelerator_type") and config.accelerator_type is not None: + config.accelerator_type = None + return _original_build_dp_openai_app(builder_config, **kwargs) + + +serve.run = _non_blocking_serve_run +llm.build_dp_openai_app = _testing_build_dp_openai_app + + +from ray import serve +from ray.serve.llm import LLMConfig, build_dp_openai_app + +# __dp_autoscaling_example_start__ +config = LLMConfig( + model_loading_config={ + "model_id": "microsoft/Phi-tiny-MoE-instruct" + }, + deployment_config={ + "num_replicas": "auto", + "autoscaling_config": { + "min_replicas": 1, # Min number of DP groups + "max_replicas": 2, # Max number of 
DP groups + "initial_replicas": 1, # Initial number of DP groups + # Other Ray Serve autoscaling knobs still apply + "upscale_delay_s": 0.1, + "downscale_delay_s": 2, + }, + }, + engine_kwargs={ + "data_parallel_size": 2, # Number of DP replicas + "tensor_parallel_size": 1, # TP size per replica + "max_model_len": 1024, + "max_num_seqs": 32, + }, +) + +app = build_dp_openai_app({ + "llm_config": config +}) +# __dp_autoscaling_example_end__ + +serve.run(app, blocking=True) + +status = ApplicationStatus.NOT_STARTED +timeout_seconds = 300 +start_time = time.time() + +while ( + status != ApplicationStatus.RUNNING and time.time() - start_time < timeout_seconds +): + status = serve.status().applications[SERVE_DEFAULT_APP_NAME].status + + if status in [ApplicationStatus.DEPLOY_FAILED, ApplicationStatus.UNHEALTHY]: + raise AssertionError(f"Deployment failed with status: {status}") + + time.sleep(1) + +if status != ApplicationStatus.RUNNING: + raise AssertionError( + f"Deployment failed to reach RUNNING status within {timeout_seconds}s. Current status: {status}" + ) + +serve.shutdown() + diff --git a/doc/source/llm/doc_code/serve/multi_gpu/dp_basic_example.py b/doc/source/llm/doc_code/serve/multi_gpu/dp_basic_example.py index ca79f5ac9fe4..87ec8da4681b 100644 --- a/doc/source/llm/doc_code/serve/multi_gpu/dp_basic_example.py +++ b/doc/source/llm/doc_code/serve/multi_gpu/dp_basic_example.py @@ -44,6 +44,9 @@ def _testing_build_dp_openai_app(builder_config, **kwargs): model_loading_config={ "model_id": "microsoft/Phi-tiny-MoE-instruct" }, + deployment_config={ + "num_replicas": 2 + }, engine_kwargs={ "data_parallel_size": 2, # Number of DP replicas "tensor_parallel_size": 1, # TP size per replica @@ -51,10 +54,6 @@ def _testing_build_dp_openai_app(builder_config, **kwargs): "max_model_len": 1024, "max_num_seqs": 32, }, - experimental_configs={ - # This is a temporary required config. We will remove this in future versions. 
- "dp_size_per_node": 2, # DP replicas per node - }, ) app = build_dp_openai_app({ diff --git a/doc/source/llm/doc_code/serve/multi_gpu/dp_pd_example.py b/doc/source/llm/doc_code/serve/multi_gpu/dp_pd_example.py index e803e8fe0354..1a68e38e27e6 100644 --- a/doc/source/llm/doc_code/serve/multi_gpu/dp_pd_example.py +++ b/doc/source/llm/doc_code/serve/multi_gpu/dp_pd_example.py @@ -70,9 +70,6 @@ def _testing_build_dp_deployment(llm_config, **kwargs): "max_model_len": 1024, "max_num_seqs": 32, }, - experimental_configs={ - "dp_size_per_node": 2, - }, ) # Configure decode with data parallel attention @@ -91,9 +88,6 @@ def _testing_build_dp_deployment(llm_config, **kwargs): "max_model_len": 1024, "max_num_seqs": 32, }, - experimental_configs={ - "dp_size_per_node": 2, - }, ) # Build prefill and decode deployments with DP diff --git a/doc/source/serve/advanced-guides/performance.md b/doc/source/serve/advanced-guides/performance.md index 838c30542c1b..36c34aca2593 100644 --- a/doc/source/serve/advanced-guides/performance.md +++ b/doc/source/serve/advanced-guides/performance.md @@ -198,6 +198,98 @@ You can also configure each option individually. The following table details the You may want to enable throughput-optimized serving while customizing the options above. You can do this by setting `RAY_SERVE_THROUGHPUT_OPTIMIZED=1` and overriding the specific options. For example, to enable throughput-optimized serving and continue logging to stderr, you should set `RAY_SERVE_THROUGHPUT_OPTIMIZED=1` and override with `RAY_SERVE_LOG_TO_STDERR=1`. +(serve-haproxy)= +### Use HAProxy load balancing + +By default, Ray Serve uses a Python-based HTTP/gRPC proxy to route requests to replicas. You can replace this with [HAProxy](https://www.haproxy.org/), a high-performance C-based load balancer, for improved throughput and lower latency at high request rates. 
+ +When HAProxy mode is enabled: +- An `HAProxyManager` actor runs on each node (by default) and translates Serve's routing table into HAProxy configuration reloads. +- Each ingress replica opens a port, and HAProxy routes traffic directly to replicas, replacing the Python proxy entirely. +- Live traffic flows through the HAProxy subprocess, not through any Python actor. + +#### Prerequisites + +HAProxy must be installed and available on `$PATH` as `haproxy` on every node that runs a Serve proxy. The [official Ray Docker images](https://hub.docker.com/r/rayproject/ray) (2.55+) include HAProxy pre-built. No additional installation is needed when using `rayproject/ray` images. + +#### Enabling HAProxy + +Set the `RAY_SERVE_ENABLE_HA_PROXY` environment variable to `1` on all nodes before starting Ray: + +```bash +export RAY_SERVE_ENABLE_HA_PROXY=1 +``` + +This environment variable must be set on all nodes in the ray cluster. + +::::{tab-set} + +:::{tab-item} KubeRay +```yaml +# In the Ray container spec for head and worker groups: +env: + - name: RAY_SERVE_ENABLE_HA_PROXY + value: "1" +``` +::: + +:::{tab-item} VM cluster +```bash +# On every node (head and workers) +export RAY_SERVE_ENABLE_HA_PROXY=1 +ray start --head # or ray start --address=:6379 on workers +``` +::: + +:::: + +#### Installing HAProxy manually (example) + +If you are not using the official Ray Docker images, install HAProxy 2.8+ from source on every node. These steps are provided as an example only. In the future, HAProxy will be bundled with the `ray` Python package. 
+ +The following steps are for Ubuntu/Debian: +```bash +# Install build dependencies +apt-get update -y && apt-get install -y --no-install-recommends \ + build-essential ca-certificates curl libc6-dev \ + liblua5.3-dev libpcre3-dev libssl-dev zlib1g-dev + +# Build HAProxy from source +export HAPROXY_VERSION="2.8.12" +curl -sSfL -o /tmp/haproxy.tar.gz \ + "https://www.haproxy.org/download/2.8/src/haproxy-${HAPROXY_VERSION}.tar.gz" +mkdir -p /tmp/haproxy-build && tar -xzf /tmp/haproxy.tar.gz -C /tmp/haproxy-build --strip-components=1 +make -C /tmp/haproxy-build TARGET=linux-glibc \ + USE_OPENSSL=1 USE_ZLIB=1 USE_PCRE=1 USE_LUA=1 USE_PROMEX=1 -j$(nproc) +make -C /tmp/haproxy-build install SBINDIR=/usr/local/bin +rm -rf /tmp/haproxy-build /tmp/haproxy.tar.gz + +# Install runtime dependencies +apt-get install -y --no-install-recommends socat liblua5.3-0 + +# Create required directories +mkdir -p /etc/haproxy /run/haproxy /var/log/haproxy +``` + +The required build flags are `USE_OPENSSL=1 USE_ZLIB=1 USE_PCRE=1 USE_LUA=1 USE_PROMEX=1`. The runtime dependencies are `socat` (for the admin socket) and `liblua5.3-0` (Lua runtime library). + +(serve-interdeployment-grpc)= +### Use gRPC for interdeployment communication + +By default, when one deployment calls another via a `DeploymentHandle`, requests are sent through Ray's actor RPC system. You can switch this internal transport to gRPC by setting `RAY_SERVE_USE_GRPC_BY_DEFAULT=1` on all nodes before starting Ray. This makes all `DeploymentHandle` calls use gRPC transport, which serializes requests and sends them directly to the target replica's gRPC server. gRPC transport is most beneficial for high-throughput workloads with small payloads (under ~1 MB), where bypassing Ray's object store reduces per-request overhead. + +#### When not to use gRPC + +1. Since gRPC is required to serialize every payload, it should not be used for large payloads (greater than ~1 MB). 
If gRPC is enabled by default, an individual handle's transport mechanism can be manually set back to actor RPC with `handle.options(_by_reference=True)`, which passes larger objects by reference.
+ self._target = target.options(_by_reference=True) + + async def __call__(self, data: bytes) -> str: + return await self._target.remote(data) + + +@serve.deployment +class LargePayloadProcessor: + def __call__(self, data: bytes) -> str: + return f"processed {len(data)} bytes" + + +processor = LargePayloadProcessor.bind() +app = Caller.bind(processor) + +handle: DeploymentHandle = serve.run(app) +assert handle.remote(b"x" * 1024).result() == "processed 1024 bytes" +# __end_grpc_override__ + +serve.shutdown() diff --git a/doc/source/serve/llm/architecture/serving-patterns/data-parallel.md b/doc/source/serve/llm/architecture/serving-patterns/data-parallel.md index 2909ae45936a..9d6830c9daf9 100644 --- a/doc/source/serve/llm/architecture/serving-patterns/data-parallel.md +++ b/doc/source/serve/llm/architecture/serving-patterns/data-parallel.md @@ -12,15 +12,16 @@ In this serving pattern, engine replicas aren't isolated. In fact, they need to width: 700px name: dp-architecture --- -Data parallel attention architecture showing DPRankAssigner coordinating multiple LLMServer replicas. +Data parallel attention architecture showing LLMServer replicas coordination. ``` In data parallel attention serving: -- The system creates `dp_size` replicas of the LLM server. -- Each replica runs an independent inference engine with the same model. +- Ray Serve's controller creates `data_parallel_size` replicas as a cohesive gang (i.e. data parallel group), assigning each data parallel replica a unique rank (0 to `data_parallel_size-1`). +- Each data parallel replica runs an independent vLLM inference engine with its assigned data parallel rank. - Requests are distributed across replicas through Ray Serve's routing. -- All replicas work together as a cohesive unit. +- All data parallel replicas carry out GPU collective operations (e.g. dispatch-combine) as a cohesive unit, i.e. data parallel group. 
+- If any data parallel replica fails, the entire data parallel group is restarted atomically. ### When to use DP @@ -36,7 +37,7 @@ Data parallel attention serving works best when: Consider alternatives when: - **Low to medium throughput**: If you can't saturate the MoE layers, don't use DP. -- **Non-MLA Attention with sufficient TP**: DP is most beneficial with MLA (Multi-head Latent Attention), where KV cache can't be sharded along the head dimension. For models with GQA (Grouped Query Attention), you can use TP to shard the KV cache up to the degree where `TP_size <= num_kv_heads`. Beyond that point, TP requires KV cache replication, which wastes memory—DP becomes a better choice to avoid duplication. For example, for Qwen-235b, using `DP=2, TP=4, EP=8` makes more sense than `DP=8, EP=8` because you can still shard the KV cache with TP=4 before needing to replicate it. Benchmark these configurations with your workload to determine the optimal setup. +- **Non-MLA Attention with sufficient TP**: DP is most beneficial with MLA (Multi-head Latent Attention), where KV cache can't be sharded along the head dimension. For models with GQA (Grouped Query Attention), you can use TP to shard the KV cache up to the degree where `TP_size <= num_kv_heads`. Beyond that point, TP requires KV cache replication, which wastes memory—DP becomes a better choice to avoid duplication. For example, for Qwen-235B, using `DP=2, TP=4, EP=8` makes more sense than `DP=8, EP=8` because you can still shard the KV cache with TP=4 before needing to replicate it. Benchmark these configurations with your workload to determine the optimal setup. - **Non-MoE models**: The main reason for using DP at the cost of this complexity is to lift the effective batch size during decoding for saturating the experts. ## Components @@ -45,78 +46,86 @@ The following are the main components of DP deployments: ### DPServer -`DPServer` extends `LLMServer` with data parallel attention coordination. 
The following pseudocode shows the structure: +`DPServer` extends `LLMServer` with data parallel attention coordination. `DPServer` utilizes Ray Serve's gang scheduling capability to ensure that all replicas in a DP group start and fail together atomically. In the following sections, we will use "gang" and "DP group" interchangeably. The following pseudocode shows the structure: ```python from ray import serve class DPServer(LLMServer): """LLM server with data parallel attention coordination.""" - - async def __init__( - self, - llm_config: LLMConfig, - rank_assigner_handle: DeploymentHandle, - dp_size: int, - **kwargs - ): - self.rank_assigner = rank_assigner_handle - self.dp_size = dp_size - - # Get assigned rank from coordinator and pass it to engine. - replica_id = serve.get_replica_context().replica_id - llm_config.rank = await self.rank_assigner.assign_rank.remote(replica_id) - - # Call parent initialization - await super().__init__(llm_config, **kwargs) + + async def __init__(self, llm_config: LLMConfig): + # Get rank and gang info from Ray Serve's gang context. + gang_context = serve.get_replica_context().gang_context + + self.dp_rank = gang_context.rank + self.gang_id = gang_context.gang_id + + # Register and obtain master address / port + if self.dp_rank == 0: + address, rpc_port = get_address_and_port() + GangMasterInfoRegistry.register(self.gang_id, address, rpc_port) + else: + address, rpc_port = await GangMasterInfoRegistry.get(self.gang_id) + + # Pass DP metadata to the engine + llm_config.engine_kwargs["data_parallel_rank"] = self.dp_rank + llm_config.engine_kwargs["data_parallel_address"] = address + llm_config.engine_kwargs["data_parallel_rpc_port"] = rpc_port + + await super().__init__(llm_config) + + @classmethod + def get_deployment_options(cls, llm_config): + options = super().get_deployment_options(llm_config) + dp_size = llm_config.engine_kwargs.get("data_parallel_size", 1) + + # Configure gang scheduling for the DP group. 
+ # This tells Ray Serve controller to treat data parallel replicas within a DP group as a cohesive unit. + options["gang_scheduling_config"] = GangSchedulingConfig( + gang_size=dp_size, + gang_placement_strategy=GangPlacementStrategy.PACK, + runtime_failure_policy=GangRuntimeFailurePolicy.RESTART_GANG, + ) + return options ``` Key responsibilities: -- Register with the rank assigner coordinator. -- Obtain a unique rank (0 to `dp_size-1`). +- Obtain rank and gang identity from Ray Serve's gang context. +- Exchange DP master address/port through `GangMasterInfoRegistry`. - Coordinate with other replicas for collective operations. -- Handle replica failures and re-registration. +- Gracefully handle failures and re-registration. -### DPRankAssigner +### GangMasterInfoRegistry -`DPRankAssigner` is a singleton coordinator that manages rank assignment for data parallel attention replicas. The following pseudocode shows the structure: +`GangMasterInfoRegistry` is a GCS-backed KV store, persisting the DP master address and port. The following pseudocode shows the structure: ```python -class DPRankAssigner: - """Coordinator for data parallel attention rank assignment.""" - - def __init__(self, dp_size: int): - self.dp_size = dp_size - self.assigned_ranks: Set[int] = set() - self.rank_to_replica: Dict[int, str] = {} - self.lock = asyncio.Lock() - - async def assign_rank(self, replica_id: str) -> int: - """Assign a rank to a replica. 
- - Returns: - int: Assigned rank (0 to dp_size-1) - """ - async with self.lock: - # Find first available rank - for rank in range(self.dp_size): - if rank not in self.assigned_ranks: - self.assigned_ranks.add(rank) - self.rank_to_replica[rank] = replica_id - return rank - - async def release_rank(self, rank: int): - """Release a rank when replica dies.""" - async with self.lock: - self.assigned_ranks.discard(rank) - self.rank_to_replica.pop(rank, None) +class GangMasterInfoRegistry: + """Registry for gang DP master info using GCS KV store.""" + + @classmethod + def register(cls, gang_id: str, address: str, port: int): + """Persists address and port associated with a DP group (gang) in the KV store.""" + ... + + @classmethod + async def get(cls, gang_id: str, timeout: float) -> Tuple[str, int]: + """Polls for address and port for a given DP group.""" + ... + + @classmethod + def unregister(cls, gang_id: str): + """Remove the DP master info on shutdown.""" + ... ``` Key responsibilities: -- Assign unique ranks to replicas. -- Ensure exactly `dp_size` replicas are serving. +- Store DP master info in GCS KV store. +- Provide async polling for non-zero ranks to discover the master. +- Clean up entries on shutdown. ## Request flow @@ -131,69 +140,56 @@ Data parallel attention request flow from client to distributed replicas. The following is the request flow through a data parallel attention deployment: 1. **Client request**: HTTP request arrives at ingress. -2. **Ingress routing**: Ingress uses deployment handle to call DPServer. +2. **Ingress routing**: Ingress uses deployment handle to call `DPServer`. 3. **Ray Serve routing**: Ray Serve's request router selects a replica. - Default: Power of Two Choices (load balancing). - Custom: Prefix-aware, session-aware, etc. -4. **Replica processing**: Selected DPServer replica processes request. +4. **Replica processing**: Selected `DPServer` replica processes request. 5. **Engine inference**: vLLM engine generates response. 
6. **Streaming response**: Tokens stream back to client. -The key difference from basic serving is that all the `dp_size` replicas are working in coordination with each other rather than in isolation. - -## Scaling - -### Scaling behavior +The key difference from basic serving is that all the `dp_size` replicas coordinate with each other rather than in isolation. -Data parallel attention deployments require a fixed number of replicas equal to `dp_size`, as autoscaling isn't supported for this pattern. You must set `num_replicas` to `dp_size`, or if using `autoscaling_config`, both `min_replicas` and `max_replicas` must equal `dp_size`. +## Autoscaling +### Autoscaling behavior -## Design considerations +Data parallel attention deployments support autoscaling based on request queue length. Specify `min_replicas`, `max_replicas`, `initial_replicas` to configure autoscaling bound and starting point. Note that all `min_replicas`, `max_replicas`, `initial_replicas` refer to the number of DP groups, where each group has `dp_size` of engine instances. -### Coordination overhead -The `DPRankAssigner` introduces minimal coordination overhead: +```{literalinclude} ../../../../llm/doc_code/serve/multi_gpu/dp_autoscaling_example.py +:language: python +:start-after: __dp_autoscaling_example_start__ +:end-before: __dp_autoscaling_example_end__ +``` -- **Startup**: Each replica makes one RPC to get its rank. -- **Runtime**: No coordination overhead during request processing. -The singleton actor pattern ensures consistency during startup time. +## Design considerations ### Placement strategy -The PACK strategy places each replica's resources together: +`DPServer` always uses the `PACK` strategy to place each replica's resources together: - Tensor parallel workers for one replica pack on the same node when possible. - Different replicas can be on different nodes. - This minimizes inter-node communication within each replica. 
+### Fault tolerance + +If any DP replica in a DP group fails, Ray Serve controller restarts the entire DP group atomically. This ensures all replicas in a group are always in a consistent state, which is critical because DP replicas perform collective operations together. + ## Combining with other patterns ### DP + Prefill-decode disaggregation You can run data parallel attention on both prefill and decode phases: -``` -┌─────────────────────────────────────────────┐ -│ OpenAiIngress │ -└─────────────┬───────────────────────────────┘ - │ - ▼ - ┌─────────────┐ - │PDProxyServer│ - └──┬───────┬──┘ - │ │ - ┌─────┘ └─────┐ - ▼ ▼ -┌──────────┐ ┌──────────┐ -│ Prefill │ │ Decode │ -│ DP-2 │ │ DP-4 │ -│ │ │ │ -│ Replica0 │ │ Replica0 │ -│ Replica1 │ │ Replica1 │ -└──────────┘ │ Replica2 │ - │ Replica3 │ - └──────────┘ +```{figure} ../../images/dp_pd.png +--- +width: 700px +name: dp-pd +--- +Using DP attention pattern along with PD deployments with independent DP sizes. ``` ## See also diff --git a/doc/source/serve/llm/images/dp.png b/doc/source/serve/llm/images/dp.png index 369fc6e74b6c..ebbdbe5866ba 100644 Binary files a/doc/source/serve/llm/images/dp.png and b/doc/source/serve/llm/images/dp.png differ diff --git a/doc/source/serve/llm/images/dp_flow.png b/doc/source/serve/llm/images/dp_flow.png index 544e85dba0f4..79936900158f 100644 Binary files a/doc/source/serve/llm/images/dp_flow.png and b/doc/source/serve/llm/images/dp_flow.png differ diff --git a/doc/source/serve/llm/images/dp_pd.png b/doc/source/serve/llm/images/dp_pd.png new file mode 100644 index 000000000000..4abec4b024cd Binary files /dev/null and b/doc/source/serve/llm/images/dp_pd.png differ diff --git a/doc/source/serve/llm/user-guides/data-parallel-attention.md b/doc/source/serve/llm/user-guides/data-parallel-attention.md index 7622650922ad..c1e918a49ce6 100644 --- a/doc/source/serve/llm/user-guides/data-parallel-attention.md +++ b/doc/source/serve/llm/user-guides/data-parallel-attention.md @@ -27,7 +27,7 
@@ Consider this pattern when: ## Basic deployment -The following example shows how to deploy with data parallel attention: +The following example shows how to deploy with data parallel attention. Each data parallel deployment requires `num_replicas * data_parallel_size * tensor_parallel_size` GPUs. ```{literalinclude} ../../../llm/doc_code/serve/multi_gpu/dp_basic_example.py :language: python @@ -37,7 +37,7 @@ The following example shows how to deploy with data parallel attention: ## Production YAML configuration -For production deployments, use a YAML configuration file: +For production deployments, use a declarative YAML configuration file: ```yaml applications: @@ -48,46 +48,45 @@ applications: llm_config: model_loading_config: model_id: Qwen/Qwen2.5-0.5B-Instruct + deployment_config: + num_replicas: 2 engine_kwargs: data_parallel_size: 4 tensor_parallel_size: 2 - experimental_configs: - dp_size_per_node: 4 ``` -Deploy with: +Deploy with CLI: ```bash serve deploy dp_config.yaml ``` -:::{note} -The `num_replicas` in `deployment_config` must equal `data_parallel_size` in `engine_kwargs`. Autoscaling is not supported for data parallel attention deployments since all replicas must be present and coordinated. -::: - ## Configuration parameters ### Required parameters -- `data_parallel_size`: Number of data parallel replicas to create. Must be a positive integer. -- `dp_size_per_node`: Number of DP replicas per node. Must be set in `experimental_configs`. This controls how replicas are distributed across nodes. This is a temporary required config that we will remove in future versions. +- `data_parallel_size`: Number of data parallel replicas within a data parallel group. Must be a positive integer and passed in via `engine_kwargs`. ### Deployment configuration -- `num_replicas`: Must be set to `data_parallel_size`. Data parallel attention requires a fixed number of replicas. 
-- `placement_group_strategy`: Automatically set to `"STRICT_PACK"` to ensure replicas are properly placed. +- `num_replicas`: Can be set to any positive integer, unset (defaults to 1), or `"auto"` to enable autoscaling based on request queue length. + +:::{note} +Within a data parallel deployment, `num_replicas` under the `deployment_config` refers to the number of data parallel groups, which translates to `num_replicas * data_parallel_size` data parallel replicas (equivalent to the number of Ray serve replicas). Each data parallel replica inherently runs a vLLM data parallel server. +::: -## Understanding replica coordination +## Understanding data parallel replica coordination -In data parallel attention, all replicas work together as a cohesive unit: +In data parallel attention, all data parallel replicas within a data parallel group work together as a cohesive unit by leveraging Ray Serve's gang scheduling capability: -1. **Rank assignment**: Each replica receives a unique rank (0 to `dp_size-1`) from a coordinator. +1. **Rank assignment**: Each replica receives a unique rank (0 to `data_parallel_size-1`) from Ray Serve's controller to start a vLLM data parallel server. 2. **Request distribution**: Ray Serve's request router distributes requests across replicas using load balancing. -3. **Collective operations**: Replicas coordinate for collective operations (e.g., all-reduce) required by the model. -4. **Synchronization**: All replicas must be present and healthy for the deployment to function correctly. +3. **Collective operations**: Replicas coordinate for collective operations (e.g., all-reduce, dispatch and combine) required by the model. +4. **Synchronization**: All data parallel replicas in a data parallel group must be present and healthy. MoE layers use all-to-all collectives to route tokens to experts across DP ranks. If any data parallel replica is unavailable, these collectives hang and tokens can't reach experts assigned to that rank. +5. 
**Fault tolerance**: If any data parallel replica in a data parallel group fails, the entire group becomes unavailable because the remaining replicas can't complete collective operations. While Ray Serve controller detects the failure and restarts the entire group, other data parallel groups keep serving requests without any downtime if `num_replicas > 1`. -The coordination overhead is minimal: -- **Startup**: Each replica makes one RPC call to get its rank. +There's no coordination overhead introduced by Ray Serve LLM: +- **Startup**: Data parallel ranks are assigned when Ray Serve's controller creates the data parallel replica. - **Runtime**: No coordination overhead during request processing. For more details, see {doc}`../architecture/serving-patterns/data-parallel`. diff --git a/python/ray/autoscaler/v2/event_logger.py b/python/ray/autoscaler/v2/event_logger.py index b4db3d2b798b..244ac8fd3173 100644 --- a/python/ray/autoscaler/v2/event_logger.py +++ b/python/ray/autoscaler/v2/event_logger.py @@ -24,8 +24,9 @@ class AutoscalerEventLogger: - Rate limit the events if too spammy. """ - def __init__(self, logger: EventLoggerAdapter): + def __init__(self, logger: EventLoggerAdapter, log_cluster_shape: bool = True): self._logger = logger + self._log_cluster_shape = log_cluster_shape def log_cluster_scheduling_update( self, @@ -65,7 +66,7 @@ def log_cluster_scheduling_update( """ # Log any launch events. - if launch_requests: + if self._log_cluster_shape and launch_requests: launch_type_count = defaultdict(int) for req in launch_requests: launch_type_count[req.instance_type] += req.count @@ -76,7 +77,7 @@ def log_cluster_scheduling_update( logger.info(f"{log_str}") # Log any terminate events. 
- if terminate_requests: + if self._log_cluster_shape and terminate_requests: termination_by_causes_and_type = defaultdict(int) for req in terminate_requests: termination_by_causes_and_type[(req.cause, req.instance_type)] += 1 @@ -96,7 +97,7 @@ def log_cluster_scheduling_update( logger.info(f"{log_str}") # Cluster shape changes. - if launch_requests or terminate_requests: + if self._log_cluster_shape and (launch_requests or terminate_requests): num_cpus = cluster_resources.get("CPU", 0) log_str = f"Resized to {int(num_cpus)} CPUs" diff --git a/python/ray/autoscaler/v2/monitor.py b/python/ray/autoscaler/v2/monitor.py index 34e31e7ac649..e8cbf61576c7 100644 --- a/python/ray/autoscaler/v2/monitor.py +++ b/python/ray/autoscaler/v2/monitor.py @@ -34,6 +34,7 @@ from ray.autoscaler.v2.instance_manager.config import ( FileConfigReader, IConfigReader, + Provider, ReadOnlyProviderConfigReader, ) from ray.autoscaler.v2.metrics_reporter import AutoscalerMetricsReporter @@ -95,7 +96,11 @@ def __init__( ray_event_logger = get_event_logger( RayEvent.SourceType.AUTOSCALER, log_dir ) - self.event_logger = AutoscalerEventLogger(ray_event_logger) + self.event_logger = AutoscalerEventLogger( + ray_event_logger, + log_cluster_shape=config_reader.get_cached_autoscaling_config().provider + != Provider.READ_ONLY, + ) except Exception: self.event_logger = None else: diff --git a/python/ray/autoscaler/v2/tests/test_event_logger.py b/python/ray/autoscaler/v2/tests/test_event_logger.py index 1f7a339aa903..78559e1018f2 100644 --- a/python/ray/autoscaler/v2/tests/test_event_logger.py +++ b/python/ray/autoscaler/v2/tests/test_event_logger.py @@ -106,6 +106,24 @@ def test_log_scheduling_updates(): ] +def test_log_scheduling_updates_without_cluster_shape(): + mock_logger = MockEventLogger(logger) + event_logger = AutoscalerEventLogger(mock_logger, log_cluster_shape=False) + + event_logger.log_cluster_scheduling_update( + launch_requests=[launch_request("m4.large", 1)], + 
terminate_requests=[termination_request("m4.xlarge", OUTDATED)], + infeasible_requests=[ResourceRequestUtil.make({"CPU": 4})], + cluster_resources={"CPU": 5}, + ) + + assert mock_logger.get_logs("info") == [] + assert mock_logger.get_logs("warning") == [ + "No available node types can fulfill resource requests {'CPU': 4.0}*1. Add suitable node types to this cluster to resolve this issue." + ] + assert mock_logger.get_logs("debug") == [] + + if __name__ == "__main__": if os.environ.get("PARALLEL_CI"): sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__])) diff --git a/python/ray/data/_internal/execution/operators/hash_shuffle.py b/python/ray/data/_internal/execution/operators/hash_shuffle.py index db188297da75..682fcc0fa9e6 100644 --- a/python/ray/data/_internal/execution/operators/hash_shuffle.py +++ b/python/ray/data/_internal/execution/operators/hash_shuffle.py @@ -275,7 +275,7 @@ def _shuffle_block( block, block_type=BlockType.ARROW ) - if block.num_rows == 0: + if block.num_rows == 0 and not send_empty_blocks: empty = BlockAccessor.for_block(block).get_metadata( block_exec_stats=stats.build(block_ser_time_s=0), ) @@ -354,7 +354,7 @@ def _shuffle_block( block_exec_stats=stats.build(block_ser_time_s=0) ) - if logger.isEnabledFor(logging.DEBUG): + if logger.isEnabledFor(logging.DEBUG) and partition_shards_stats: num_rows_series, byte_sizes_series = zip( *[(s.num_rows, s.byte_size) for s in partition_shards_stats.values()] ) diff --git a/python/ray/data/_internal/plan.py b/python/ray/data/_internal/plan.py index 058e6a34c52a..68150e78d2b2 100644 --- a/python/ray/data/_internal/plan.py +++ b/python/ray/data/_internal/plan.py @@ -398,7 +398,10 @@ def _build_limited_plan(plan: "ExecutionPlan") -> "ExecutionPlan": def input_files(self) -> Optional[List[str]]: """Get the input files of the dataset, if available.""" - return self._logical_plan.dag.infer_metadata().input_files + input_files = self._logical_plan.dag.infer_metadata().input_files + if input_files 
is None: + return None + return list(set(input_files)) def meta_count(self) -> Optional[int]: """Get the number of rows after applying all plan optimizations, if possible. diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py index a08fbf620ef3..67acc06084c0 100644 --- a/python/ray/data/dataset.py +++ b/python/ray/data/dataset.py @@ -4016,7 +4016,7 @@ def input_files(self) -> List[str]: The list of input files used to create the dataset, or an empty list if the input files is not known. """ - return list(set(self._plan.input_files())) + return self._plan.input_files() or [] @ConsumptionAPI @PublicAPI(api_group=IOC_API_GROUP) diff --git a/python/ray/data/tests/test_join.py b/python/ray/data/tests/test_join.py index d557b6a9dcc7..0c667822ed15 100644 --- a/python/ray/data/tests/test_join.py +++ b/python/ray/data/tests/test_join.py @@ -727,9 +727,6 @@ def test_join_with_predicate_pushdown( else: filtered_ds = joined.filter(expr=col("right_val") < 1000) - # Verify correctness by computing expected result with pandas - from ray.data._internal.util import rows_same - left_pd = left_ds.to_pandas() right_pd = right_ds.to_pandas() @@ -850,6 +847,96 @@ def test_join_cross_side_column_comparison_no_pushdown(ray_start_regular_shared_ ) +def test_chained_left_outer_join_with_empty_blocks(ray_start_regular_shared_2_cpus): + """Regression test for https://github.com/ray-project/ray/issues/60013. + + The bug + ------- + When a hash-shuffle join receives an **empty-row** block as the very first + block for an input sequence, _shuffle_block() returns early (num_rows == 0) + without sending any data to any aggregator. The caller, however, marks + _has_schemas_broadcasted[input_index] = True immediately after submitting + the task. Every subsequent block for that sequence uses + send_empty_blocks=False. Aggregators that receive no non-empty rows from + those subsequent blocks end up with an empty queue. 
When finalize() is + called, _combine([]) builds an ArrowBlockBuilder with zero blocks and + returns a (0 rows, 0 columns) table. The downstream join then fails with + ColumnNotFoundError because the join key column is absent. + + We bypass the first join entirely and use ray.data.from_blocks() to build a + dataset whose very first block is an explicit zero-row Arrow table that + carries the full column schema. With num_partitions=20 and only 10 data + rows the second join has at least 10 aggregator partitions that receive no + non-empty data. Before the fix those partitions produce (0, 0) tables and + the join raises ColumnNotFoundError. After the fix schema-carrier blocks + are broadcast even for the empty first block, so every aggregator can + finalize correctly. + """ + import pyarrow as pa + + # Build a dataset that simulates the output of a first left-outer join: + # - block 0: explicitly empty (0 rows) but carries the full schema + # - blocks 1-10: one row each, with b_val populated for id >= 5 + # + # from_blocks() preserves block order and count exactly, so the empty block + # is guaranteed to be the first block the second join's shuffle sees. + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("a_val", pa.string()), + pa.field("b_val", pa.string()), + ] + ) + empty_block = schema.empty_table() # shape (0, 3), schema but no rows + + data_blocks = [ + pa.table( + { + "id": pa.array([i], type=pa.int64()), + "a_val": pa.array([f"a_{i}"], type=pa.string()), + "b_val": pa.array([f"b_{i}" if i >= 5 else None], type=pa.string()), + } + ) + for i in range(10) + ] + + # The first block must be the empty one so the bug fires. + # from_blocks guarantees block order and count are preserved as-is. 
+ joined_1 = ray.data.from_blocks([empty_block] + data_blocks) + + # Second dataset for the chained join + ds_c = ray.data.from_arrow( + pa.table( + { + "id": pa.array(range(10), type=pa.int64()), + "c_val": pa.array([f"c_{i}" for i in range(10)], type=pa.string()), + } + ) + ) + + # num_partitions=20 with only 10 data rows means at least 10 aggregator + # partitions receive no non-empty left-side data. + joined_2 = joined_1.join( + ds_c, + join_type="left_outer", + on=("id",), + num_partitions=20, + ) + + result = joined_2.to_pandas() + + expected = pd.DataFrame( + { + "id": list(range(10)), + "a_val": [f"a_{i}" for i in range(10)], + "b_val": [f"b_{i}" if i >= 5 else None for i in range(10)], + "c_val": [f"c_{i}" for i in range(10)], + } + ) + + assert rows_same(result, expected) + + if __name__ == "__main__": import sys diff --git a/python/ray/data/tests/unit/test_resource_manager.py b/python/ray/data/tests/unit/test_resource_manager.py index fbcaf73608f7..bb84a0d9a3b4 100644 --- a/python/ray/data/tests/unit/test_resource_manager.py +++ b/python/ray/data/tests/unit/test_resource_manager.py @@ -128,16 +128,18 @@ def test_does_not_double_count_usage_from_union(): topology, ExecutionOptions(), lambda: total_resources, DataContext.get_current() ) - # Create a 1-byte `RefBundle`. - block_ref = ray.ObjectRef(b"1" * 28) + # Create two 1-byte `RefBundle`s. + block_ref1 = ray.ObjectRef(b"1" * 28) + block_ref2 = ray.ObjectRef(b"2" * 28) block_metadata = BlockMetadata( num_rows=1, size_bytes=1, input_files=None, exec_stats=None ) - bundle = RefBundle([(block_ref, block_metadata)], owns_blocks=True, schema=None) + bundle1 = RefBundle([(block_ref1, block_metadata)], owns_blocks=True, schema=None) + bundle2 = RefBundle([(block_ref2, block_metadata)], owns_blocks=True, schema=None) # Add two 1-byte `RefBundle` to the union operator. 
- topology[union_op].add_output(bundle) - topology[union_op].add_output(bundle) + topology[union_op].add_output(bundle1) + topology[union_op].add_output(bundle2) resource_manager.update_usages() # The total object store memory usage should be 2. If the resource manager double- diff --git a/python/ray/includes/common.pxd b/python/ray/includes/common.pxd index 41cbd5cf3d7c..eb4bd8ac8eb4 100644 --- a/python/ray/includes/common.pxd +++ b/python/ray/includes/common.pxd @@ -647,6 +647,13 @@ cdef extern from "ray/gcs_rpc_client/accessor.h" nogil: c_string &rejection_reason_message ) + CRayStatus ResizeRayletResourceInstances( + const c_string &node_id, + const unordered_map[c_string, double] &resources, + int64_t timeout_ms, + unordered_map[c_string, double] &total_resources + ) + cdef cppclass CPublisherAccessor "ray::gcs::PublisherAccessor": CRayStatus PublishError( c_string key_id, diff --git a/python/ray/includes/gcs_client.pxi b/python/ray/includes/gcs_client.pxi index b9142eb379c0..dbd296a2990a 100644 --- a/python/ray/includes/gcs_client.pxi +++ b/python/ray/includes/gcs_client.pxi @@ -655,6 +655,23 @@ cdef class InnerGcsClient: return (is_accepted, rejection_reason_message.decode()) + def resize_raylet_resource_instances( + self, + node_id: c_string, + resources: unordered_map[c_string, cython.double], + timeout_s=None): + """Send the ResizeRayletResourceInstances request to GCS.""" + cdef: + int64_t timeout_ms = round(1000 * timeout_s) if timeout_s else -1 + unordered_map[c_string, cython.double] total_resources + with nogil: + check_status_timeout_as_rpc_error( + self.inner.get().Autoscaler().ResizeRayletResourceInstances( + node_id, resources, timeout_ms, total_resources + ) + ) + return {key.decode(): value for key, value in total_resources} + ############################################################# # Publisher methods ############################################################# diff --git a/python/ray/llm/_internal/serve/core/configs/llm_config.py 
b/python/ray/llm/_internal/serve/core/configs/llm_config.py index cd516b4adc13..3e6e644b0465 100644 --- a/python/ray/llm/_internal/serve/core/configs/llm_config.py +++ b/python/ray/llm/_internal/serve/core/configs/llm_config.py @@ -40,7 +40,7 @@ KVConnectorBackendFactory, ) from ray.llm._internal.serve.observability.logging import get_logger -from ray.serve._private.config import DeploymentConfig +from ray.serve._private.config import DeploymentConfig, handle_num_replicas_auto transformers = try_import("transformers") @@ -395,7 +395,17 @@ def validate_llm_engine(cls, value: str) -> str: def validate_deployment_config(cls, value: Dict[str, Any]) -> Dict[str, Any]: """Validates the deployment config dictionary.""" try: - DeploymentConfig(**value) + # Resolve "auto" for num_replicas before validating against DeploymentConfig + if value.get("num_replicas") == "auto": + resolved = {**value, "num_replicas": None} + _, autoscaling_config = handle_num_replicas_auto( + resolved.get("max_ongoing_requests"), + resolved.get("autoscaling_config"), + ) + resolved["autoscaling_config"] = autoscaling_config + DeploymentConfig(**resolved) + else: + DeploymentConfig(**value) except Exception as e: raise ValueError(f"Invalid deployment config: {value}") from e diff --git a/python/ray/llm/_internal/serve/serving_patterns/data_parallel/builder.py b/python/ray/llm/_internal/serve/serving_patterns/data_parallel/builder.py index c1b071927e35..9f1a0ecc63a8 100644 --- a/python/ray/llm/_internal/serve/serving_patterns/data_parallel/builder.py +++ b/python/ray/llm/_internal/serve/serving_patterns/data_parallel/builder.py @@ -13,9 +13,6 @@ ) from ray.llm._internal.serve.core.server.builder import build_llm_deployment from ray.llm._internal.serve.observability.logging import get_logger -from ray.llm._internal.serve.serving_patterns.data_parallel.dp_rank_assigner import ( - _DPRankAssigner, -) from ray.llm._internal.serve.serving_patterns.data_parallel.dp_server import ( DPServer, ) @@ -41,27 
+38,9 @@ def build_dp_deployment( Returns: The Ray Serve Application for the data parallel attention LLM deployment. """ - dp_size = llm_config.engine_kwargs.get("data_parallel_size", 1) - - # TODO(rui): figure out a better way to pass in dp_size_per_node. - # NOTE: we cannot use engine_kwargs.data_parallel_size_local to specify - # the number of ranks per node because that has special semantics in vLLM. - # When we make serve's rank asignment node affinity aware, then we won't - # need this hack to make the ranks orginally distributed across nodes. - dp_size_per_node = llm_config.experimental_configs.get("dp_size_per_node") - if dp_size_per_node is None: - raise ValueError( - "dp_size_per_node must be set in experimental_configs for DP deployment." - ) - - dp_rank_assigner = _DPRankAssigner.bind( - dp_size=dp_size, dp_size_per_node=dp_size_per_node - ) - return build_llm_deployment( llm_config, name_prefix=name_prefix, - bind_kwargs={"dp_rank_assigner": dp_rank_assigner}, override_serve_options=override_serve_options, deployment_cls=DPServer, ) diff --git a/python/ray/llm/_internal/serve/serving_patterns/data_parallel/dp_rank_assigner.py b/python/ray/llm/_internal/serve/serving_patterns/data_parallel/dp_rank_assigner.py deleted file mode 100644 index b9862a3fca44..000000000000 --- a/python/ray/llm/_internal/serve/serving_patterns/data_parallel/dp_rank_assigner.py +++ /dev/null @@ -1,128 +0,0 @@ -import asyncio -import logging -from typing import Dict, List, Optional - -from ray import serve - -logger = logging.getLogger(__name__) - - -@serve.deployment(num_replicas=1) -class _DPRankAssigner: - """ - Data Parallel Rank Assigner. - - This class is used to assign a rank to each replica in the data parallel - deployment. 
- """ - - def __init__(self, dp_size: int, dp_size_per_node: Optional[int] = None): - self.dp_size: int = dp_size - self.dp_size_per_node: Optional[int] = dp_size_per_node - self.lock: asyncio.Lock = asyncio.Lock() - self.dp_address: Optional[str] = None - self.dp_rpc_port: Optional[int] = None - self.master_info_event: asyncio.Event = asyncio.Event() - - # Fields for _register_random_placement(): - # Next rank to assign - self.next_rank: Optional[int] = None - - # Fields for _register_node_pack_placement(): - # Number of nodes to assign to - self.num_nodes: Optional[int] = None - # Map from node id to available ranks - self.node_to_avail_ranks: Dict[str, List[int]] = {} - - if dp_size_per_node is None: - self.next_rank = 0 - logger.info( - f"Using random placement rank assigner for DP size {self.dp_size}" - ) - else: - if self.dp_size_per_node <= 0: - raise ValueError( - f"dp_size_per_node {self.dp_size_per_node} must be greater than 0" - ) - if self.dp_size % self.dp_size_per_node != 0: - raise ValueError( - f"dp_size {self.dp_size} must be divisible by dp_size_per_node {self.dp_size_per_node}" - ) - self.num_nodes = self.dp_size // self.dp_size_per_node - logger.info( - f"Using node pack placement rank assigner for DP size {self.dp_size}" - f"with dp_size_per_node {self.dp_size_per_node}" - ) - - async def register(self, node_id: Optional[str] = None): - """ - Register a replica and assign a rank to it. - - Args: - node_id: The node id of the replica. - - Returns: - The rank of the replica. - """ - if self.dp_size_per_node is None: - return await self._register_random_placement() - else: - if node_id is None: - raise ValueError("node_id is required for node pack placement") - return await self._register_node_pack_placement(node_id) - - async def _register_random_placement(self): - """ - Assign a rank based on random placement. - - The ranks are assigned in a random order, regardless of its node id. 
- """ - async with self.lock: - if self.next_rank >= self.dp_size: - raise ValueError( - f"Attempted to assign rank {self.next_rank} but dp_size is {self.dp_size}" - ) - # TODO(rui): instead of using the naive increment approach, - # we should use the Ray Serve Replica Rank API to assign ranks. - rank = self.next_rank - self.next_rank += 1 - return rank - - async def _register_node_pack_placement(self, node_id: str): - """ - Assign a rank based on node pack placement. - - This should be used for DeepEP which assumes that the ranks ranging from - [dp_rank_per_node * node_rank, dp_rank_per_node * (node_rank + 1) - 1] are - assigned to the same node. - - For example, if dp_size_per_node is 8, and there are 16 ranks in total, then - the ranks [0, 7] should be assigned to one node, and ranks [8, 15] should be - assigned to another node. - """ - async with self.lock: - if not self.node_to_avail_ranks: - self.node_to_avail_ranks[node_id] = list( - range(1, self.dp_size_per_node) - ) - return 0 - elif node_id not in self.node_to_avail_ranks: - node_rank = len(self.node_to_avail_ranks) - assert node_rank < self.num_nodes - rank = node_rank * self.dp_size_per_node - self.node_to_avail_ranks[node_id] = list( - range(rank + 1, rank + self.dp_size_per_node) - ) - return rank - else: - rank = self.node_to_avail_ranks[node_id].pop(0) - return rank - - async def set_dp_master_info(self, dp_address: str, dp_rpc_port: int): - self.dp_address = dp_address - self.dp_rpc_port = dp_rpc_port - self.master_info_event.set() - - async def get_dp_master_info(self): - await self.master_info_event.wait() - return self.dp_address, self.dp_rpc_port diff --git a/python/ray/llm/_internal/serve/serving_patterns/data_parallel/dp_server.py b/python/ray/llm/_internal/serve/serving_patterns/data_parallel/dp_server.py index 93f31cdbd94c..2557b6e24537 100644 --- a/python/ray/llm/_internal/serve/serving_patterns/data_parallel/dp_server.py +++ 
b/python/ray/llm/_internal/serve/serving_patterns/data_parallel/dp_server.py @@ -1,37 +1,128 @@ +import asyncio +import json import logging +import os import time +from typing import List, Tuple +from ray import serve +from ray.experimental.internal_kv import ( + _internal_kv_del, + _internal_kv_get, + _internal_kv_put, +) from ray.llm._internal.serve.core.configs.llm_config import LLMConfig from ray.llm._internal.serve.core.server.llm_server import LLMServer -from ray.runtime_context import get_runtime_context -from ray.serve.handle import DeploymentHandle +from ray.llm._internal.serve.utils.pg_utils import get_bundle_indices_sorted_by_node +from ray.serve.config import ( + AutoscalingConfig, + GangPlacementStrategy, + GangRuntimeFailurePolicy, + GangSchedulingConfig, +) from ray.util.collective.collective import get_address_and_port +from ray.util.placement_group import get_placement_group logger = logging.getLogger(__name__) +TIMEOUT_SECONDS = 120 +POLL_INTERVAL_SECONDS = 0.5 + + +class GangMasterInfoRegistry: + """Registry for gang DP master info using GCS KV store.""" + + _KEY_PREFIX = "LLMServeRegistry:serve_global:gang_dp_master/" + + @classmethod + def _make_key(cls, gang_id: str) -> bytes: + return (cls._KEY_PREFIX + gang_id).encode("utf-8") + + @classmethod + def register(cls, gang_id: str, address: str, port: int) -> None: + """Store the DP master info in GCS KV store.""" + key = cls._make_key(gang_id) + value = json.dumps({"address": address, "port": port}).encode("utf-8") + _internal_kv_put(key, value, overwrite=True) + + @classmethod + def unregister(cls, gang_id: str) -> None: + """Remove the DP master info from GCS KV store.""" + key = cls._make_key(gang_id) + try: + _internal_kv_del(key) + except Exception: + logger.warning( + f"Failed to unregister gang master info for gang {gang_id}.", + exc_info=True, + ) + + @classmethod + async def get( + cls, + gang_id: str, + timeout: float = TIMEOUT_SECONDS, + poll_interval: float = POLL_INTERVAL_SECONDS, + 
) -> Tuple[str, int]: + """Retrieve the DP master info for gang_id, polling until available. + + Args: + gang_id: The ID of the gang. + timeout: The timeout in seconds. + poll_interval: The poll interval in seconds. + + Returns: + A tuple of (address, port). + + Raises: + TimeoutError: If the info is not available within `timeout` seconds. + """ + key = cls._make_key(gang_id) + deadline = time.monotonic() + timeout + while True: + data = _internal_kv_get(key) + if data is not None: + info = json.loads(data) + return info["address"], info["port"] + if time.monotonic() >= deadline: + raise TimeoutError( + f"Timed out waiting for DP master info for gang {gang_id} " + f"after {timeout}s." + ) + await asyncio.sleep(poll_interval) + class DPServer(LLMServer): """ - Data Parallel LLM Server. + Gang-scheduled Data Parallel LLM Server. - This class is used to serve data parallel attention (DP Attention) - deployment paradigm, where the attention layers are replicated and - the MoE layers are sharded. DP Attention is typically used for models - like DeepSeek-V3. + Uses Ray Serve's gang scheduling so that if any replica in a DP group deployment + fails, the entire group is restarted atomically. """ - async def __init__(self, llm_config: LLMConfig, dp_rank_assigner: DeploymentHandle): - self.dp_rank_assigner = dp_rank_assigner + async def __init__(self, llm_config: LLMConfig): + ctx = serve.get_replica_context() + gang_context = ctx.gang_context + + if gang_context is None: + raise RuntimeError( + "DPServer requires gang scheduling to be enabled. " + "Set gang_scheduling_config in the deployment options."
+ ) - node_id = get_runtime_context().get_node_id() - self.dp_rank = await self.dp_rank_assigner.register.remote(node_id) + self.dp_rank = gang_context.rank + self.gang_id = gang_context.gang_id + self.dp_size = gang_context.world_size - logger.info(f"DP rank {self.dp_rank} registered with rank assigner") + logger.info( + f"DPServer replica initialized: dp_rank={self.dp_rank}, " + f"dp_size={self.dp_size}, gang_id={self.gang_id}" + ) if self.dp_rank == 0: self.dp_address, self.dp_rpc_port = get_address_and_port() - await self.dp_rank_assigner.set_dp_master_info.remote( - self.dp_address, self.dp_rpc_port + GangMasterInfoRegistry.register( + self.gang_id, self.dp_address, self.dp_rpc_port ) logger.info( f"DP rank {self.dp_rank} has set DP master info: " @@ -40,10 +131,9 @@ async def __init__(self, llm_config: LLMConfig, dp_rank_assigner: DeploymentHand ) else: timestamp = time.time() - ( - self.dp_address, - self.dp_rpc_port, - ) = await self.dp_rank_assigner.get_dp_master_info.remote() + self.dp_address, self.dp_rpc_port = await GangMasterInfoRegistry.get( + self.gang_id + ) logger.info( f"DP rank {self.dp_rank} got DP master info: " f"data_parallel_address={self.dp_address}, " @@ -58,8 +148,60 @@ async def __init__(self, llm_config: LLMConfig, dp_rank_assigner: DeploymentHand data_parallel_rpc_port=self.dp_rpc_port, ) + engine_config = llm_config.get_engine_config() + + # Direct vLLM to use this replica's bundles within the gang placement group. + # Gang placement group concatenates per-replica bundles for all ranks, so + # rank i owns bundles [i*B, i*B+1, ..., i*B+B-1] where B is the number of + # bundles per DP replica. + # + # However, adjacent bundle indices in a placement group don't necessarily + # map to adjacent physical ranks. We use get_bundle_indices_sorted_by_node + # to reorder bundle indices so that same-node bundles are adjacent and the + # driver node's bundles come first. 
This prevents us from scattering adjacent + # TP ranks in the same DP rank across nodes. + # + # Example: dp_size=2, tp_size=2, 2 GPUs per node for simplicity + # Gang placement group = [{GPU: 1}, {GPU: 1}, {GPU: 1}, {GPU: 1}] + # Physical rank location: ^^N0R0^^ ^^N1R1^^ ^^N0R1^^ ^^N1R0^^ + # DP placement: ^^DP0^^^ ^^DP1^^^ ^^DP0^^^ ^^DP1^^^ + # + # placement_bundles below is the gang placement group, and therefore + # get_current_placement_group from the actor yields the gang placement group, + # not the per-replica placement group. + bundles_per_replica = len(engine_config.placement_bundles) + pg = get_placement_group(gang_context.pg_name) + sorted_indices = get_bundle_indices_sorted_by_node(pg) + os.environ["VLLM_RAY_BUNDLE_INDICES"] = self._compute_bundle_indices( + self.dp_rank, bundles_per_replica, sorted_indices + ) + await super().__init__(llm_config) + @staticmethod + def _compute_bundle_indices( + dp_rank: int, bundles_per_replica: int, sorted_indices: List[int] + ) -> str: + """Return the VLLM_RAY_BUNDLE_INDICES value for a given DP rank. + + Slices into sorted_indices (bundle indices reordered so that + same-node bundles are adjacent) to pick the bundles that belong to + this DP rank. + + Args: + dp_rank: This replica's data-parallel rank. + bundles_per_replica: Number of placement-group bundles each DP + replica owns. + sorted_indices: Bundle indices sorted by node. + + Returns: + Comma-separated bundle indices, e.g. "0,2". 
+ """ + start = dp_rank * bundles_per_replica + return ",".join( + str(sorted_indices[start + i]) for i in range(bundles_per_replica) + ) + @classmethod def get_deployment_options(cls, llm_config: "LLMConfig"): deployment_options = super().get_deployment_options(llm_config) @@ -67,26 +209,51 @@ def get_deployment_options(cls, llm_config: "LLMConfig"): dp_size = llm_config.engine_kwargs.get("data_parallel_size", 1) if not (isinstance(dp_size, int) and dp_size > 0): raise ValueError( - f"Invalid data_parallel_size: {dp_size}, expecting " "positive integer." + f"Invalid data_parallel_size: {dp_size}, expecting positive integer." ) if dp_size != 1: - if "num_replicas" in deployment_options: - raise ValueError( - "num_replicas should not be specified for DP deployment, " - f"use engine_kwargs.data_parallel_size={dp_size} instead." + num_replicas = deployment_options.get("num_replicas") + has_autoscaling = num_replicas == "auto" or ( + num_replicas is None and "autoscaling_config" in deployment_options + ) + if has_autoscaling: + autoscaling_config = AutoscalingConfig.default().dict() + user_config = deployment_options.get("autoscaling_config") + if user_config is not None: + autoscaling_config.update(user_config) + + logger.warning( + "In DP deployment, a replica refers to a DP group. " + "Multiplying autoscaling_config's min_replicas, max_replicas, and " + f"initial_replicas by data_parallel_size ({dp_size})." ) - if "autoscaling_config" in deployment_options: - raise ValueError( - "autoscaling_config is not supported for DP deployment, " - "remove autoscaling_config instead. The `num_replicas` " - "will be set to `data_parallel_size`." + for key in ["min_replicas", "max_replicas", "initial_replicas"]: + if autoscaling_config.get(key) is not None: + autoscaling_config[key] *= dp_size + + deployment_options["autoscaling_config"] = autoscaling_config + elif num_replicas is not None: + logger.warning( + "In DP deployment, num_replicas refers to the number of DP groups. 
" + f"Multiplying num_replicas ({num_replicas}) by data_parallel_size ({dp_size}) " + f"to get the total number of serve replicas ({num_replicas * dp_size})." ) - deployment_options["num_replicas"] = dp_size - if deployment_options["placement_group_strategy"] != "STRICT_PACK": + deployment_options["num_replicas"] = num_replicas * dp_size + else: + deployment_options["num_replicas"] = dp_size + + deployment_options["gang_scheduling_config"] = GangSchedulingConfig( + gang_size=dp_size, + gang_placement_strategy=GangPlacementStrategy.PACK, + runtime_failure_policy=GangRuntimeFailurePolicy.RESTART_GANG, + ) + # Remove per-replica placement_group_strategy. Ray Serve raises an error + # if both placement_group_strategy and gang_scheduling_config are provided. + if "placement_group_strategy" in deployment_options: logger.warning( - f"DP deployment with placement_strategy={deployment_options['placement_group_strategy']} " - "is not supported. Using STRICT_PACK instead." + "placement_group_strategy configured in the deployment config is ignored. " + "DP deployment uses PACK strategy for scheduling DP groups." 
) - deployment_options["placement_group_strategy"] = "STRICT_PACK" + deployment_options.pop("placement_group_strategy", None) return deployment_options diff --git a/python/ray/llm/_internal/serve/utils/pg_utils.py b/python/ray/llm/_internal/serve/utils/pg_utils.py index 73c9687f10e0..48d5c712dff6 100644 --- a/python/ray/llm/_internal/serve/utils/pg_utils.py +++ b/python/ray/llm/_internal/serve/utils/pg_utils.py @@ -3,7 +3,7 @@ from collections import defaultdict from typing import Dict, List -import ray +from ray.serve._private.utils import get_head_node_id from ray.util.placement_group import PlacementGroup, placement_group_table @@ -55,5 +55,5 @@ def get_bundle_indices_sorted_by_node( """ table = placement_group_table(pg) bundles_to_node_id = table["bundles_to_node_id"] - driver_node_id = ray.get_runtime_context().get_node_id() + driver_node_id = get_head_node_id() return _sort_bundle_indices_by_node(bundles_to_node_id, driver_node_id) diff --git a/python/ray/llm/tests/serve/cpu/deployments/data_parallel/test_dp_server.py b/python/ray/llm/tests/serve/cpu/deployments/data_parallel/test_dp_server.py index 744c1158a652..31692d35494b 100644 --- a/python/ray/llm/tests/serve/cpu/deployments/data_parallel/test_dp_server.py +++ b/python/ray/llm/tests/serve/cpu/deployments/data_parallel/test_dp_server.py @@ -1,65 +1,195 @@ +import asyncio import sys from copy import deepcopy +from unittest.mock import patch import pytest from ray.llm._internal.serve.core.configs.llm_config import LLMConfig -from ray.llm._internal.serve.serving_patterns.data_parallel.dp_server import DPServer +from ray.llm._internal.serve.serving_patterns.data_parallel.dp_server import ( + DPServer, + GangMasterInfoRegistry, +) +from ray.serve.config import ( + GangPlacementStrategy, + GangRuntimeFailurePolicy, + GangSchedulingConfig, +) class TestGetDeploymentOptions: + """Mirrors test_dp_server.py but verifies gang scheduling config.""" + @pytest.mark.parametrize( - 
"data_parallel_size,num_replica,allowed", + "data_parallel_size,num_replicas", [ - (None, 1, True), - (None, 2, True), - (None, 3, True), - (1, 1, True), - (1, 2, True), - (1, 3, True), - (2, 2, False), - (2, 3, False), - (4, 2, False), - (2, None, True), - (None, None, True), + (None, 1), + (2, None), + (1, 1), + (2, 4), ], ) - def test_multi_replica_dp_validation( - self, data_parallel_size, num_replica, allowed - ): - """Test that multi-replica and DP size are mutually exclusive. - - Ray.llm's implementation does not yet support multi-replica - deployment along with DP. - """ + def test_num_replicas_dp_validation(self, data_parallel_size, num_replicas): engine_kwargs = ( {} if data_parallel_size is None else {"data_parallel_size": data_parallel_size} ) - deployment_config = {} if num_replica is None else {"num_replicas": num_replica} + deployment_config = ( + {} if num_replicas is None else {"num_replicas": num_replicas} + ) + llm_config = LLMConfig( + model_loading_config=dict(model_id="test_model"), + engine_kwargs=deepcopy(engine_kwargs), + deployment_config=deepcopy(deployment_config), + ) - def get_serve_options_with_num_replica(): - llm_config = LLMConfig( - model_loading_config=dict(model_id="test_model"), - engine_kwargs=deepcopy(engine_kwargs), - deployment_config=deepcopy(deployment_config), + opts = DPServer.get_deployment_options(llm_config) + dp_size = data_parallel_size or 1 + if dp_size > 1: + expected_replicas = ( + num_replicas * dp_size if num_replicas is not None else dp_size + ) + assert opts["num_replicas"] == expected_replicas + assert isinstance(opts["gang_scheduling_config"], GangSchedulingConfig) + assert opts["gang_scheduling_config"].gang_size == dp_size + assert ( + opts["gang_scheduling_config"].gang_placement_strategy + == GangPlacementStrategy.PACK ) - deployment_options = DPServer.get_deployment_options(llm_config) + assert ( + opts["gang_scheduling_config"].runtime_failure_policy + == GangRuntimeFailurePolicy.RESTART_GANG + ) + 
else: + assert "gang_scheduling_config" not in opts - return deployment_options + def test_autoscaling_config(self): + llm_config = LLMConfig( + model_loading_config=dict(model_id="test_model"), + engine_kwargs={"data_parallel_size": 4}, + deployment_config={ + "autoscaling_config": { + "target_ongoing_requests": 10, + "min_replicas": 2, + "max_replicas": 8, + "initial_replicas": 3, + } + }, + ) + opts = DPServer.get_deployment_options(llm_config) + assert isinstance(opts["gang_scheduling_config"], GangSchedulingConfig) + assert opts["gang_scheduling_config"].gang_size == 4 + # Autoscaling config should have min/max/initial replicas multiplied by dp_size + autoscaling_config = opts["autoscaling_config"] + assert autoscaling_config["target_ongoing_requests"] == 10 + assert autoscaling_config["min_replicas"] == 2 * 4 + assert autoscaling_config["max_replicas"] == 8 * 4 + assert autoscaling_config["initial_replicas"] == 3 * 4 - if allowed: - serve_options = get_serve_options_with_num_replica() - actual_num_replicas = serve_options.get("num_replicas", 1) - expected_num_replicas = (data_parallel_size or 1) * (num_replica or 1) - assert actual_num_replicas == expected_num_replicas - else: - with pytest.raises( - ValueError, - match="use engine_kwargs.data_parallel_size", - ): - get_serve_options_with_num_replica() + +class TestGangMasterInfoRegistry: + _KV_MODULE = "ray.llm._internal.serve.serving_patterns.data_parallel.dp_server" + + def _make_kv_store(self): + # Mocks GCS KV store + store = {} + return ( + store, + lambda key, value, overwrite=False: store.__setitem__(key, value), + lambda key: store.get(key), + lambda key: store.pop(key, None) is not None, + lambda key: key in store, + ) + + @patch(f"{_KV_MODULE}._internal_kv_get") + @patch(f"{_KV_MODULE}._internal_kv_put") + def test_get_timeout(self, mock_put, mock_get): + mock_get.return_value = None + with pytest.raises(TimeoutError, match="Timed out"): + asyncio.get_event_loop().run_until_complete( + 
GangMasterInfoRegistry.get( + "gang-missing", timeout=0.5, poll_interval=0.1 + ) + ) + + @patch(f"{_KV_MODULE}._internal_kv_get") + @patch(f"{_KV_MODULE}._internal_kv_put") + def test_gang_isolation(self, mock_put, mock_get): + _, fake_put, fake_get, _, _ = self._make_kv_store() + mock_put.side_effect = fake_put + mock_get.side_effect = fake_get + + GangMasterInfoRegistry.register("gang-1", "10.0.0.1", 1111) + GangMasterInfoRegistry.register("gang-2", "10.0.0.2", 2222) + + loop = asyncio.get_event_loop() + addr1, port1 = loop.run_until_complete(GangMasterInfoRegistry.get("gang-1")) + addr2, port2 = loop.run_until_complete(GangMasterInfoRegistry.get("gang-2")) + + assert (addr1, port1) == ("10.0.0.1", 1111) + assert (addr2, port2) == ("10.0.0.2", 2222) + + +class TestBundleIndices: + @pytest.mark.parametrize( + "engine_kwargs,placement_group_config,dp_rank,sorted_indices,expected", + [ + # TP=1: 1 bundle per replica, identity ordering + ({"tensor_parallel_size": 1}, None, 0, list(range(4)), "0"), + ({"tensor_parallel_size": 1}, None, 3, list(range(4)), "3"), + ( + {"tensor_parallel_size": 1}, + {"bundles": [{"GPU": 1, "CPU": 1}]}, + 2, + list(range(4)), + "2", + ), + # TP=2: 2 bundles per replica, identity ordering + ({"tensor_parallel_size": 2}, None, 0, list(range(8)), "0,1"), + ({"tensor_parallel_size": 2}, None, 2, list(range(8)), "4,5"), + ( + {"tensor_parallel_size": 2}, + {"bundles": [{"GPU": 1, "CPU": 1}, {"GPU": 1}]}, + 1, + list(range(4)), + "2,3", + ), + # TP=2, PP=2: 4 bundles per replica, identity ordering + ( + {"tensor_parallel_size": 2, "pipeline_parallel_size": 2}, + None, + 0, + list(range(8)), + "0,1,2,3", + ), + ( + {"tensor_parallel_size": 2, "pipeline_parallel_size": 2}, + None, + 1, + list(range(8)), + "4,5,6,7", + ), + # Out-of-order sorted_indices: bundles reordered by node + ({"tensor_parallel_size": 2}, None, 1, [0, 2, 1, 3], "1,3"), + ({"tensor_parallel_size": 1}, None, 0, [2, 0, 3, 1], "2"), + ], + ) + def test_bundle_indices( + self, 
engine_kwargs, placement_group_config, dp_rank, sorted_indices, expected + ): + llm_config = LLMConfig( + model_loading_config=dict(model_id="test_model"), + engine_kwargs=engine_kwargs, + placement_group_config=placement_group_config, + ) + engine_config = llm_config.get_engine_config() + bundles_per_replica = len(engine_config.placement_bundles) + + result = DPServer._compute_bundle_indices( + dp_rank, bundles_per_replica, sorted_indices + ) + assert result == expected if __name__ == "__main__": diff --git a/python/ray/llm/tests/serve/utils/test_pg_utils.py b/python/ray/llm/tests/serve/utils/test_pg_utils.py index 618f4580e0ed..99e4ab159aab 100644 --- a/python/ray/llm/tests/serve/utils/test_pg_utils.py +++ b/python/ray/llm/tests/serve/utils/test_pg_utils.py @@ -8,6 +8,7 @@ import ray from ray._private.test_utils import placement_group_assert_no_leak from ray.llm._internal.serve.utils.pg_utils import get_bundle_indices_sorted_by_node +from ray.serve._private.utils import get_head_node_id # Realistic 56-char hex node IDs NODE_A = "ab7c1d2e3f4a5b6c7d8e9f0a1b2c3d4e5f6a7b8c9d0e1f2a3b4c" # driver node @@ -62,17 +63,17 @@ def test_sort_bundle_indices_by_node( bundles_to_node_id, expected_sorted_bundle_indices ): mock_pg = MagicMock() - mock_ctx = MagicMock() - mock_ctx.get_node_id.return_value = NODE_A with ( patch( "ray.llm._internal.serve.utils.pg_utils.placement_group_table", return_value={"bundles_to_node_id": bundles_to_node_id}, ), - patch("ray.llm._internal.serve.utils.pg_utils.ray") as mock_ray, + patch( + "ray.llm._internal.serve.utils.pg_utils.get_head_node_id", + return_value=NODE_A, + ), ): - mock_ray.get_runtime_context.return_value = mock_ctx result = get_bundle_indices_sorted_by_node(mock_pg) assert result == expected_sorted_bundle_indices @@ -88,7 +89,7 @@ def test_sort_bundles(ray_start_cluster, strategy): ray.init(address=cluster.address) cluster.wait_for_nodes() - driver_node_id = ray.get_runtime_context().get_node_id() + driver_node_id = 
get_head_node_id() pg = ray.util.placement_group( name=f"test_sort_bundles_{strategy}", diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py index 162f5e291415..d193530240cd 100644 --- a/python/ray/serve/_private/deployment_state.py +++ b/python/ray/serve/_private/deployment_state.py @@ -3401,11 +3401,14 @@ def _check_startup_replicas( # from the replica actor. The invariant is that the rank is assigned # during startup and before the replica is added to the replicas # data structure with RUNNING state. - # Recover rank from the replica actor during controller restart + # Skip if the rank is already assigned (e.g., health-check failure + # put the replica into RECOVERING without a controller crash, so the + # rank was never released). replica_id = replica.replica_id.unique_id - self._rank_manager.recover_rank( - replica_id, replica.actor_node_id, replica.rank - ) + if not self._rank_manager.has_replica_rank(replica_id): + self._rank_manager.recover_rank( + replica_id, replica.actor_node_id, replica.rank + ) # Register recovered gang replicas in the incremental # bookkeeping (newly created gang replicas are already # registered in _add_upscale_gang_replicas). diff --git a/python/ray/serve/_private/router.py b/python/ray/serve/_private/router.py index efb73257695c..0bd2a553b986 100644 --- a/python/ray/serve/_private/router.py +++ b/python/ray/serve/_private/router.py @@ -23,7 +23,7 @@ import ray from ray.actor import ActorHandle -from ray.exceptions import ActorDiedError, ActorUnavailableError, RayError +from ray.exceptions import ActorDiedError, ActorUnavailableError, RayError, RayTaskError from ray.serve._private.common import ( RUNNING_REQUESTS_KEY, DeploymentHandleSource, @@ -164,12 +164,15 @@ def __init__( # If the interval is set to 0, eagerly sets all metrics. 
self._cached_metrics_enabled = RAY_SERVE_METRICS_EXPORT_INTERVAL_MS != 0 self._cached_metrics_interval_s = RAY_SERVE_METRICS_EXPORT_INTERVAL_MS / 1000 + self._cached_metrics_task: Optional[asyncio.Task] = None if self._cached_metrics_enabled: self._cached_num_router_requests = defaultdict(int) def create_metrics_task(): - event_loop.create_task(self._report_cached_metrics_forever()) + self._cached_metrics_task = event_loop.create_task( + self._report_cached_metrics_forever() + ) # the constructor is called in the user thread, but its trying to create a task on the event loop # which is running in the router thread. This is not thread safe, so we need to use call_soon_threadsafe @@ -495,6 +498,13 @@ async def shutdown(self): self._shutdown = True + if self._cached_metrics_task is not None: + self._cached_metrics_task.cancel() + try: + await self._cached_metrics_task + except asyncio.CancelledError: + pass + class Router(ABC): @abstractmethod @@ -761,6 +771,7 @@ def _process_finished_request( self, replica_id: ReplicaID, internal_request_id: str, + replica_actor_id: Optional[ray.ActorID], result: Union[Any, RayError], ) -> None: if RAY_SERVE_COLLECT_AUTOSCALING_METRICS_ON_HANDLE: @@ -770,15 +781,10 @@ def _process_finished_request( if self.request_router: self.request_router.on_request_completed(replica_id, internal_request_id) - if isinstance(result, ActorDiedError): - # Replica has died but controller hasn't notified the router yet. - # Don't consider this replica for requests in the future, and retry - # routing request. - if self.request_router: - self.request_router.on_replica_actor_died(replica_id) - logger.warning( - f"{replica_id} will not be considered for future " - "requests because it has died." 
+ actor_died_error = self._get_actor_died_error(result) + if actor_died_error is not None: + self._handle_actor_died_error( + replica_id, replica_actor_id, actor_died_error ) elif isinstance(result, ActorUnavailableError): # There are network issues, or replica has died but GCS is down so @@ -792,6 +798,58 @@ def _process_finished_request( f"Request failed because {replica_id} is temporarily unavailable." ) + def _get_actor_died_error( + self, result: Union[Any, RayError] + ) -> Optional[ActorDiedError]: + if isinstance(result, ActorDiedError): + return result + + if isinstance(result, RayTaskError) and isinstance( + getattr(result, "cause", None), ActorDiedError + ): + # RayTaskError wrapping ActorDiedError (e.g., from failed object ref + # resolution in chained deployment calls). + return result.cause + + return None + + def _handle_actor_died_error( + self, + replica_id: ReplicaID, + replica_actor_id: Optional[ray.ActorID], + actor_died_error: ActorDiedError, + ) -> None: + # Only mark the replica as dead if the ActorDiedError refers to this + # replica. With chained DeploymentResponses, the error may come from + # an upstream deployment that was passed as an object ref to this + # replica. In that case, this replica is still healthy. + error_actor_id = getattr(actor_died_error, "actor_id", None) + replica_actor_id_hex = ( + replica_actor_id.hex() if replica_actor_id is not None else None + ) + # When error_actor_id or replica_actor_id_hex is None, we cannot + # definitively compare. Treat as match to preserve conservative + # behavior: mark replica dead rather than leaving it in rotation. + if ( + error_actor_id is None + or replica_actor_id_hex is None + or error_actor_id == replica_actor_id_hex + ): + # Replica has died but controller hasn't notified the router yet. + if self.request_router: + self.request_router.on_replica_actor_died(replica_id) + logger.warning( + f"{replica_id} will not be considered for future " + "requests because it has died." 
+ ) + else: + # Error from upstream dependency, not from this replica. + logger.debug( + f"ActorDiedError from upstream (actor_id={error_actor_id}), " + f"not from {replica_id} (actor_id={replica_actor_id_hex}). " + "Replica remains in rotation." + ) + async def _route_and_send_request_once( self, pr: PendingRequest, @@ -838,6 +896,7 @@ async def _route_and_send_request_once( self._process_finished_request, replica.replica_id, pr.metadata.internal_request_id, + replica.actor_id, ) result.add_done_callback(callback) callback_registered = True @@ -859,16 +918,9 @@ async def _route_and_send_request_once( result.cancel() raise - except ActorDiedError: - # Replica has died but controller hasn't notified the router yet. - # Don't consider this replica for requests in the future, and retry - # routing request. + except ActorDiedError as e: if replica is not None: - self.request_router.on_replica_actor_died(replica.replica_id) - logger.warning( - f"{replica.replica_id} will not be considered for future " - "requests because it has died." - ) + self._handle_actor_died_error(replica.replica_id, replica.actor_id, e) except ActorUnavailableError: # There are network issues, or replica has died but GCS is down so # ActorUnavailableError will be raised until GCS recovers. For the diff --git a/python/ray/serve/_private/test_utils.py b/python/ray/serve/_private/test_utils.py index 5f52e5f8dd8e..dc42aabb102e 100644 --- a/python/ray/serve/_private/test_utils.py +++ b/python/ray/serve/_private/test_utils.py @@ -1221,7 +1221,20 @@ def get_metric_dictionaries( name: str, timeout: float = 20, timeseries: Optional[PrometheusTimeseries] = None, + wait: bool = True, ) -> List[Dict]: + """Get metric samples as list of label dicts. + + Args: + name: Metric name to fetch. + timeout: Timeout for each fetch attempt. + timeseries: Optional shared timeseries to populate. + wait: If True (default), wait for metric to appear before returning. 
+ If False, fetch once and return immediately (possibly empty). + + Returns: + List of metric samples as label dicts. + """ if timeseries is None: timeseries = PrometheusTimeseries() @@ -1235,7 +1248,18 @@ def metric_available() -> bool: ) return True - wait_for_condition(metric_available, retry_interval_ms=1000, timeout=timeout * 4) + if wait: + wait_for_condition( + metric_available, retry_interval_ms=1000, timeout=timeout * 4 + ) + else: + # Fetch once without asserting - allows outer wait_for_condition to retry + fetch_prometheus_metric_timeseries( + [f"localhost:{TEST_METRICS_EXPORT_PORT}"], + timeseries, + timeout=PROMETHEUS_METRICS_TIMEOUT_S, + ) + metric_dicts = [] for sample in timeseries.metric_samples.values(): if sample.name == name: diff --git a/python/ray/serve/tests/test_gang_scheduling.py b/python/ray/serve/tests/test_gang_scheduling.py index d3caa73791c2..957177097181 100644 --- a/python/ray/serve/tests/test_gang_scheduling.py +++ b/python/ray/serve/tests/test_gang_scheduling.py @@ -1,6 +1,5 @@ import os import sys -import tempfile import threading import time @@ -14,7 +13,6 @@ from ray.serve._private.test_utils import ( check_apps_running, check_num_replicas_eq, - check_num_replicas_gte, ) from ray.serve._private.utils import get_all_live_placement_group_names from ray.serve.config import GangPlacementStrategy, GangSchedulingConfig @@ -720,30 +718,27 @@ def test_partial_constructor_failure(self, ray_shutdown): ray.init(num_cpus=1) serve.start() - with tempfile.TemporaryDirectory() as tmpdir: - file_path = os.path.join(tmpdir, "test_deploy.txt") + failed_replica_store = FailedReplicaStore.remote() - @serve.deployment( - num_replicas=4, - ray_actor_options={"num_cpus": 0.1}, - gang_scheduling_config=GangSchedulingConfig(gang_size=2), - ) - class GangPartialConstructorFailure: - def __init__(self): - if not os.path.exists(file_path): - with open(file_path, "w") as f: - f.write(serve.get_replica_context().replica_id.unique_id) - raise 
RuntimeError("Consistently throwing on same replica.") - else: - with open(file_path) as f: - content = f.read() - if content == serve.get_replica_context().replica_id.unique_id: - raise RuntimeError("Consistently throwing on same replica.") + @serve.deployment( + num_replicas=4, + ray_actor_options={"num_cpus": 0.1}, + gang_scheduling_config=GangSchedulingConfig(gang_size=2), + ) + class GangPartialConstructorFailure: + def __init__(self, store): + replica_id = serve.get_replica_context().replica_id.unique_id + is_first = ray.get(store.set_if_first.remote(replica_id)) + if is_first: + raise RuntimeError("Consistently throwing on same replica.") + failed_id = ray.get(store.get.remote()) + if replica_id == failed_id: + raise RuntimeError("Consistently throwing on same replica.") - async def __call__(self, request): - return "hi" + async def __call__(self, request): + return "hi" - serve.run(GangPartialConstructorFailure.bind()) + serve.run(GangPartialConstructorFailure.bind(failed_replica_store)) client = serve.context._get_global_client() deployment_id = DeploymentID(name="GangPartialConstructorFailure") @@ -760,26 +755,24 @@ def test_transient_constructor_failure(self, ray_shutdown): ray.init(num_cpus=1) serve.start() - with tempfile.TemporaryDirectory() as tmpdir: - file_path = os.path.join(tmpdir, "test_deploy.txt") + failed_replica_store = FailedReplicaStore.remote() - @serve.deployment( - num_replicas=4, - ray_actor_options={"num_cpus": 0.1}, - gang_scheduling_config=GangSchedulingConfig(gang_size=2), - ) - class GangTransientConstructorFailure: - def __init__(self): - if os.path.exists(file_path): - return - with open(file_path, "w") as f: - f.write("ONE") + @serve.deployment( + num_replicas=4, + ray_actor_options={"num_cpus": 0.1}, + gang_scheduling_config=GangSchedulingConfig(gang_size=2), + ) + class GangTransientConstructorFailure: + def __init__(self, store): + replica_id = serve.get_replica_context().replica_id.unique_id + is_first = 
ray.get(store.set_if_first.remote(replica_id)) + if is_first: raise RuntimeError("Intentionally throw on first try.") - async def __call__(self, request): - return "hi" + async def __call__(self, request): + return "hi" - serve.run(GangTransientConstructorFailure.bind()) + serve.run(GangTransientConstructorFailure.bind(failed_replica_store)) client = serve.context._get_global_client() deployment_id = DeploymentID(name="GangTransientConstructorFailure") @@ -1381,7 +1374,9 @@ def __call__(self): # Continuously send requests in a background thread. stop = threading.Event() - errors = [] + recovered = threading.Event() + errors_before_recovery = [] + errors_after_recovery = [] successes = [] def send_requests(): @@ -1390,7 +1385,10 @@ def send_requests(): result = handle.remote().result() successes.append(result) except Exception as e: - errors.append(e) + if recovered.is_set(): + errors_after_recovery.append(e) + else: + errors_before_recovery.append(e) time.sleep(0.1) sender = threading.Thread(target=send_requests, daemon=True) @@ -1399,7 +1397,8 @@ def send_requests(): cluster.remove_node(node_to_kill) - # Wait for full recovery. + expected_num_pgs = num_replicas // gang_size + def fully_recovered(): replicas = ray.get( controller._dump_replica_states_for_testing.remote(dep_id) @@ -1410,24 +1409,39 @@ def fully_recovered(): for r in running: if r.gang_context is None: return False + # Verify PG count has converged: the old affected PG should be removed + # and the replacement PG should be created. 
+ gang_pg_names = [ + n + for n in get_all_live_placement_group_names() + if n.startswith(GANG_PG_NAME_PREFIX) + ] + if len(gang_pg_names) != expected_num_pgs: + return False return True wait_for_condition(fully_recovered, timeout=60) + recovered.set() time.sleep(1) stop.set() sender.join(timeout=5) - assert len(errors) == 0 + # Requests may fail during the brief disruption window: the node + # is dead but the handle may still route to the dead replica actors + # until the controller detects the failure and restarts them. + # After full recovery, no errors should occur. + assert len(errors_after_recovery) == 0 assert len(successes) > 0 - # Verify no leaked PGs. + # Verify no leaked PGs (already checked in fully_recovered, but + # assert here for a clear failure message). gang_pg_names = [ n for n in get_all_live_placement_group_names() if n.startswith(GANG_PG_NAME_PREFIX) ] - assert len(gang_pg_names) == num_replicas // gang_size + assert len(gang_pg_names) == expected_num_pgs wait_for_condition(check_apps_running, apps=[app_name]) @@ -1436,24 +1450,10 @@ def fully_recovered(): class TestGangScaling: - @staticmethod - def _send_requests_background(handle, stop_event, errors, successes): - """Continuously send requests until *stop_event* is set. - - Any request that raises an exception is recorded in *errors*. - """ - while not stop_event.is_set(): - try: - handle.remote().result(timeout_s=10) - successes.append(1) - except Exception as e: - errors.append(str(e)) - time.sleep(0.05) - @pytest.mark.parametrize( "initial_num_replicas, final_num_replicas", [ - (4, 2), # Manual downscale: serve deploy num_replicas=4 -> 2 + (4, 2), # Manual downscale: serve deploy num_replicas = 4 -> 2 (8, 4), # Downscaling (4, 8), # Upscaling ], @@ -1497,14 +1497,20 @@ def __call__(self): initial_gang_ids.add(resp["gang_id"]) assert len(initial_gang_ids) == initial_num_gangs - # Monitor the deployment's replica states to ensure no downtime. 
+ # Monitor requests during scaling to ensure zero downtime errors, successes = [], [] stop_event = threading.Event() - t = threading.Thread( - target=self._send_requests_background, - args=(handle, stop_event, errors, successes), - daemon=True, - ) + + def send_requests(): + while not stop_event.is_set(): + try: + handle.remote().result(timeout_s=10) + successes.append(True) + except Exception as e: + errors.append(str(e)) + time.sleep(0.1) + + t = threading.Thread(target=send_requests, daemon=True) t.start() # Scale to the final replica count. @@ -1519,6 +1525,7 @@ def __call__(self): stop_event.set() t.join(timeout=5) + # Scaling should be zero-downtime: no requests should fail. assert len(errors) == 0 assert len(successes) > 0 @@ -1584,6 +1591,7 @@ async def __call__(self): name="GangAutoscale", target=2, app_name="gang_autoscale_app", + use_controller=True, ) # Send enough requests to trigger upscaling @@ -1591,11 +1599,12 @@ async def __call__(self): # Wait for scale-up to 8 replicas (4 complete gangs). wait_for_condition( - check_num_replicas_gte, + check_num_replicas_eq, name="GangAutoscale", target=8, app_name="gang_autoscale_app", timeout=60, + use_controller=True, ) # Replica count should always be a multiple of gang_size @@ -1619,6 +1628,7 @@ async def __call__(self): target=2, app_name="gang_autoscale_app", timeout=60, + use_controller=True, ) deployment = ( @@ -1669,6 +1679,7 @@ async def __call__(self): name="UnalignedUpscale", target=3, app_name="unaligned_upscale_app", + use_controller=True, ) # Send 9 blocking requests. 
With target_ongoing_requests=2: @@ -1700,6 +1711,7 @@ def upscaled_and_aligned(): target=3, app_name="unaligned_upscale_app", timeout=60, + use_controller=True, ) serve.delete("unaligned_upscale_app") @@ -1743,6 +1755,7 @@ async def __call__(self): name="UnalignedDownscale", target=9, app_name="unaligned_downscale_app", + use_controller=True, timeout=60, ) @@ -1762,7 +1775,7 @@ def downscaled_and_aligned(): assert running == 6 return True - wait_for_condition(downscaled_and_aligned, timeout=30) + wait_for_condition(downscaled_and_aligned, timeout=60) # Release all requests so the queue drains. signal.send.remote() diff --git a/python/ray/serve/tests/test_handle_2.py b/python/ray/serve/tests/test_handle_2.py index 213240748415..5af345e73fb4 100644 --- a/python/ray/serve/tests/test_handle_2.py +++ b/python/ray/serve/tests/test_handle_2.py @@ -1,4 +1,5 @@ import asyncio +import os import sys from typing import Any, List @@ -6,12 +7,18 @@ import ray from ray import serve -from ray._common.test_utils import SignalActor, async_wait_for_condition +from ray._common.test_utils import ( + SignalActor, + async_wait_for_condition, + wait_for_condition, +) from ray._common.utils import get_or_create_event_loop +from ray.exceptions import RayActorError from ray.serve._private.constants import ( RAY_SERVE_FORCE_LOCAL_TESTING_MODE, RAY_SERVE_USE_GRPC_BY_DEFAULT, ) +from ray.serve._private.test_utils import check_num_replicas_eq from ray.serve.exceptions import RayServeException from ray.serve.handle import ( DeploymentHandle, @@ -279,6 +286,418 @@ async def __call__(self, number: int, await_order: str) -> int: assert result == 3600, f"Expected 3600, got {result}" +@pytest.mark.skipif( + sys.platform == "win32", + reason="ActorDiedError not raised properly on Windows.", +) +@pytest.mark.skipif( + RAY_SERVE_USE_GRPC_BY_DEFAULT, + reason="Chained object refs use Ray actor transport; gRPC path differs.", +) +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + 
reason="local_testing_mode has different failure handling.", +) +def test_chained_deployment_response_upstream_crash_downstream_stays_in_rotation( + serve_instance, +): + """ + When upstream (Adder) crashes with os._exit in a chained DeploymentResponse + scenario, the downstream (Multiplier) must NOT be incorrectly marked as dead. + Subsequent requests should succeed after the upstream is restarted. + """ + + @serve.deployment + class Adder: + def __init__(self, increment: int): + self._increment = increment + + def __call__(self, val: int) -> int: + if val == 0: + os._exit(0) + return val + self._increment + + @serve.deployment + class Multiplier: + def __init__(self, multiple: int): + self._multiple = multiple + + def __call__(self, val: int) -> int: + return val * self._multiple + + def get_actor_id(self) -> str: + return ray.get_runtime_context().get_actor_id() + + @serve.deployment + class Ingress: + def __init__(self, adder: DeploymentHandle, multiplier: DeploymentHandle): + self._adder = adder + self._multiplier = multiplier + + async def __call__(self, input: int) -> int: + adder_response: DeploymentResponse = self._adder.remote(input) + multiplier_response: DeploymentResponse = self._multiplier.remote( + adder_response + ) + return await multiplier_response + + app = Ingress.bind( + Adder.bind(increment=1), + Multiplier.bind(multiple=2), + ) + handle: DeploymentHandle = serve.run(app) + + # Warmup: (5 + 1) * 2 = 12 + assert handle.remote(5).result(timeout_s=10) == 12 + + multiplier_handle = serve.get_deployment_handle("Multiplier", "default") + multiplier_actor_id_before = multiplier_handle.get_actor_id.remote().result() + + # Trigger Adder crash with os._exit(0) + crash_response = handle.remote(0) + with pytest.raises((RayActorError, Exception)): + crash_response.result(timeout_s=5) + + # Wait for Adder to be restarted by controller + wait_for_condition( + check_num_replicas_eq, + name="Adder", + target=1, + timeout=30, + ) + + # Multiplier should still 
be in rotation (don't mark it dead when error is + # from upstream). A new request should succeed. (2 + 1) * 2 = 6 + result = handle.remote(2).result(timeout_s=30) + assert result == 6, f"Expected 6, got {result}" + + # Multiplier replica must not have been restarted (same actor_id). + multiplier_actor_id_after = multiplier_handle.get_actor_id.remote().result() + assert multiplier_actor_id_before == multiplier_actor_id_after, ( + f"Multiplier was incorrectly marked dead and restarted: " + f"actor_id {multiplier_actor_id_before} -> {multiplier_actor_id_after}" + ) + + +@pytest.mark.skipif( + sys.platform == "win32", + reason="ActorDiedError not raised properly on Windows.", +) +@pytest.mark.skipif( + RAY_SERVE_USE_GRPC_BY_DEFAULT, + reason="Chained object refs use Ray actor transport; gRPC path differs.", +) +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode has different failure handling.", +) +def test_chained_deployment_response_downstream_crash_replica_marked_dead( + serve_instance, +): + """ + When downstream (Multiplier) crashes in a chained DeploymentResponse + scenario, the Multiplier replica MUST be correctly marked as dead. + Subsequent requests should succeed after the Multiplier is restarted. + """ + # Adder returns input+1. Multiplier crashes when input==0. + # So input=-1 -> Adder returns 0 -> Multiplier gets 0 and crashes. 
+ + @serve.deployment + class Adder: + def __init__(self, increment: int): + self._increment = increment + + def __call__(self, val: int) -> int: + return val + self._increment + + @serve.deployment + class Multiplier: + def __init__(self, multiple: int): + self._multiple = multiple + + def __call__(self, val: int) -> int: + if val == 0: + os._exit(0) + return val * self._multiple + + def get_actor_id(self) -> str: + return ray.get_runtime_context().get_actor_id() + + @serve.deployment + class Ingress: + def __init__(self, adder: DeploymentHandle, multiplier: DeploymentHandle): + self._adder = adder + self._multiplier = multiplier + + async def __call__(self, input: int) -> int: + adder_response: DeploymentResponse = self._adder.remote(input) + multiplier_response: DeploymentResponse = self._multiplier.remote( + adder_response + ) + return await multiplier_response + + app = Ingress.bind( + Adder.bind(increment=1), + Multiplier.bind(multiple=2), + ) + handle: DeploymentHandle = serve.run(app) + + # Warmup: (-1 + 1) * 2 = 0, but Multiplier crashes on 0. Use val=5: (5+1)*2=12 + assert handle.remote(5).result(timeout_s=10) == 12 + + multiplier_handle = serve.get_deployment_handle("Multiplier", "default") + multiplier_actor_id_before = multiplier_handle.get_actor_id.remote().result() + + # Trigger Multiplier crash: Adder returns 0, Multiplier gets 0 and os._exit(0) + crash_response = handle.remote(-1) + with pytest.raises((RayActorError, Exception)): + crash_response.result(timeout_s=5) + + # Wait for Multiplier to be restarted by controller + wait_for_condition( + check_num_replicas_eq, + name="Multiplier", + target=1, + timeout=30, + ) + + # New request should succeed with restarted Multiplier. (2 + 1) * 2 = 6 + result = handle.remote(2).result(timeout_s=30) + assert result == 6, f"Expected 6, got {result}" + + # Multiplier replica must have been restarted (different actor_id). 
+ multiplier_actor_id_after = multiplier_handle.get_actor_id.remote().result() + assert multiplier_actor_id_before != multiplier_actor_id_after, ( + f"Multiplier was not correctly marked dead and restarted: " + f"actor_id unchanged at {multiplier_actor_id_before}" + ) + + +@pytest.mark.skipif( + sys.platform == "win32", + reason="ActorDiedError not raised properly on Windows.", +) +@pytest.mark.skipif( + RAY_SERVE_USE_GRPC_BY_DEFAULT, + reason="Chained object refs use Ray actor transport; gRPC path differs.", +) +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode has different failure handling.", +) +def test_chained_deployment_response_middle_of_chain_crash(serve_instance): + """ + When the middle deployment (B) crashes in a 3-hop chain Ingress->A->B->C, + only B should be marked dead. A and C must NOT be incorrectly marked dead. + Subsequent requests should succeed after B is restarted. + """ + + @serve.deployment + class ServiceA: + def __call__(self, val: int) -> int: + return val * 2 + + @serve.deployment + class ServiceB: + def __call__(self, val: int) -> int: + if val == 0: + os._exit(0) + return val * 3 + + @serve.deployment + class ServiceC: + def __call__(self, val: int) -> int: + return val * 5 + + @serve.deployment + class Ingress: + def __init__( + self, + a: DeploymentHandle, + b: DeploymentHandle, + c: DeploymentHandle, + ): + self._a = a + self._b = b + self._c = c + + async def __call__(self, val: int) -> int: + a_resp: DeploymentResponse = self._a.remote(val) + b_resp: DeploymentResponse = self._b.remote(a_resp) + c_resp: DeploymentResponse = self._c.remote(b_resp) + return await c_resp + + app = Ingress.bind( + ServiceA.bind(), + ServiceB.bind(), + ServiceC.bind(), + ) + handle: DeploymentHandle = serve.run(app) + + # Warmup: 5 -> A=10 -> B=30 -> C=150 + assert handle.remote(5).result(timeout_s=10) == 150 + + # Trigger B crash: val=0 -> A=0 -> B gets 0, crashes + crash_response = handle.remote(0) + with 
pytest.raises((RayActorError, Exception)): + crash_response.result(timeout_s=5) + + # Wait for B to be restarted + wait_for_condition( + check_num_replicas_eq, + name="ServiceB", + target=1, + timeout=30, + ) + + # A and C should still be in rotation. New request: 2 -> A=4 -> B=12 -> C=60 + result = handle.remote(2).result(timeout_s=30) + assert result == 60, f"Expected 60, got {result}" + + +@pytest.mark.skipif( + sys.platform == "win32", + reason="ActorDiedError not raised properly on Windows.", +) +@pytest.mark.skipif( + RAY_SERVE_USE_GRPC_BY_DEFAULT, + reason="Chained object refs use Ray actor transport; gRPC path differs.", +) +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode has different failure handling.", +) +def test_chained_deployment_response_multiple_crashes_then_success(serve_instance): + """ + Multiple crash requests followed by multiple success requests. + Verifies recovery and routing after repeated upstream failures. + """ + + @serve.deployment + class Adder: + def __init__(self, increment: int): + self._increment = increment + + def __call__(self, val: int) -> int: + if val == 0: + os._exit(0) + return val + self._increment + + @serve.deployment + class Multiplier: + def __init__(self, multiple: int): + self._multiple = multiple + + def __call__(self, val: int) -> int: + return val * self._multiple + + @serve.deployment + class Ingress: + def __init__(self, adder: DeploymentHandle, multiplier: DeploymentHandle): + self._adder = adder + self._multiplier = multiplier + + async def __call__(self, input: int) -> int: + adder_response: DeploymentResponse = self._adder.remote(input) + multiplier_response: DeploymentResponse = self._multiplier.remote( + adder_response + ) + return await multiplier_response + + app = Ingress.bind( + Adder.bind(increment=1), + Multiplier.bind(multiple=2), + ) + handle: DeploymentHandle = serve.run(app) + + # Warmup + assert handle.remote(5).result(timeout_s=10) == 12 + + # Trigger 
multiple crashes (each kills Adder, controller restarts it) + for _ in range(3): + crash_response = handle.remote(0) + with pytest.raises((RayActorError, Exception)): + crash_response.result(timeout_s=5) + + # Wait for Adder to be healthy + wait_for_condition( + check_num_replicas_eq, + name="Adder", + target=1, + timeout=30, + ) + + # Multiple success requests should all succeed + for i in range(3): + result = handle.remote(2).result(timeout_s=30) + assert result == 6, f"Request {i}: expected 6, got {result}" + + +@pytest.mark.skipif( + RAY_SERVE_USE_GRPC_BY_DEFAULT, + reason="Chained object refs use Ray actor transport; gRPC path differs.", +) +@pytest.mark.skipif( + RAY_SERVE_FORCE_LOCAL_TESTING_MODE, + reason="local_testing_mode has different failure handling.", +) +def test_chained_deployment_response_upstream_raises_exception(serve_instance): + """ + When upstream raises a normal exception (not ActorDiedError), the error + propagates correctly and the downstream replica remains in rotation. 
+ """ + # No Windows skip: we use raise ValueError, not os._exit + + @serve.deployment + class Adder: + def __init__(self, increment: int): + self._increment = increment + + def __call__(self, val: int) -> int: + if val == 0: + raise ValueError("Adder rejects 0") + return val + self._increment + + @serve.deployment + class Multiplier: + def __init__(self, multiple: int): + self._multiple = multiple + + def __call__(self, val: int) -> int: + return val * self._multiple + + @serve.deployment + class Ingress: + def __init__(self, adder: DeploymentHandle, multiplier: DeploymentHandle): + self._adder = adder + self._multiplier = multiplier + + async def __call__(self, input: int) -> int: + adder_response: DeploymentResponse = self._adder.remote(input) + multiplier_response: DeploymentResponse = self._multiplier.remote( + adder_response + ) + return await multiplier_response + + app = Ingress.bind( + Adder.bind(increment=1), + Multiplier.bind(multiple=2), + ) + handle: DeploymentHandle = serve.run(app) + + # Warmup + assert handle.remote(5).result(timeout_s=10) == 12 + + # Upstream raises ValueError (may be wrapped in RayTaskError) + error_response = handle.remote(0) + with pytest.raises(Exception): + error_response.result(timeout_s=5) + + # Multiplier should still be in rotation. Next request succeeds. 
+ result = handle.remote(2).result(timeout_s=30) + assert result == 6, f"Expected 6, got {result}" + + def test_nested_deployment_response_error(serve_instance): """Test that passing a deployment response in a nested object to a downstream handle call errors, and with an informative error message.""" diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py index d26c8b76eb59..229eec87070b 100644 --- a/python/ray/serve/tests/test_metrics.py +++ b/python/ray/serve/tests/test_metrics.py @@ -811,7 +811,9 @@ def g(): wait_for_condition( lambda: len( - get_metric_dictionaries("ray_serve_deployment_request_counter_total") + get_metric_dictionaries( + "ray_serve_deployment_request_counter_total", wait=False + ) ) == 2, timeout=40, @@ -843,7 +845,9 @@ def g(): # Latency metrics wait_for_condition( lambda: len( - get_metric_dictionaries("ray_serve_deployment_processing_latency_ms_count") + get_metric_dictionaries( + "ray_serve_deployment_processing_latency_ms_count", wait=False + ) ) == 2, timeout=40, @@ -862,7 +866,9 @@ def g(): } == expected_output wait_for_condition( - lambda: len(get_metric_dictionaries("ray_serve_replica_processing_queries")) + lambda: len( + get_metric_dictionaries("ray_serve_replica_processing_queries", wait=False) + ) == 2 ) processing_queries = get_metric_dictionaries("ray_serve_replica_processing_queries") @@ -880,7 +886,11 @@ def h(): url_h = get_application_url("HTTP", "app3") assert 500 == httpx.get(url_h).status_code wait_for_condition( - lambda: len(get_metric_dictionaries("ray_serve_deployment_error_counter_total")) + lambda: len( + get_metric_dictionaries( + "ray_serve_deployment_error_counter_total", wait=False + ) + ) == 1, timeout=40, ) @@ -895,7 +905,9 @@ def h(): assert err_requests[0]["exception_type"] == "ZeroDivisionError" wait_for_condition( - lambda: len(get_metric_dictionaries("ray_serve_deployment_replica_healthy")) + lambda: len( + 
get_metric_dictionaries("ray_serve_deployment_replica_healthy", wait=False) + ) == 3, timeout=40, ) @@ -926,7 +938,7 @@ def raises_value_error(): def check_metric(): err_metrics = get_metric_dictionaries( - "ray_serve_deployment_error_counter_total" + "ray_serve_deployment_error_counter_total", wait=False ) value_error_metrics = [ m for m in err_metrics if m.get("exception_type") == "ValueError" @@ -964,7 +976,9 @@ async def __call__(self): def check_queue_wait_time_metric(): metrics = get_metric_dictionaries( - "ray_serve_request_router_fulfillment_time_ms_sum", timeseries=timeseries + "ray_serve_request_router_fulfillment_time_ms_sum", + timeseries=timeseries, + wait=False, ) if not metrics: return False @@ -1020,7 +1034,7 @@ async def __call__(self, request: Request): # Check that the router queue length metric appears with correct tags def check_router_queue_len(): metrics = get_metric_dictionaries( - "ray_serve_request_router_queue_len", timeseries=timeseries + "ray_serve_request_router_queue_len", timeseries=timeseries, wait=False ) if not metrics: return False @@ -1168,7 +1182,9 @@ def get_item(self, item_id: str): # Wait for metrics to be updated def metrics_available(): - metrics = get_metric_dictionaries("ray_serve_num_http_requests_total") + metrics = get_metric_dictionaries( + "ray_serve_num_http_requests_total", wait=False + ) api_metrics = [m for m in metrics if m.get("application") == "api_app"] return len(api_metrics) >= 3 @@ -1265,7 +1281,7 @@ async def record_routing_stats(self): # This metric is recorded when the controller polls routing stats from replicas def check_routing_stats_delay_metric(): metrics = get_metric_dictionaries( - "ray_serve_routing_stats_delay_ms_count", timeseries=timeseries + "ray_serve_routing_stats_delay_ms_count", timeseries=timeseries, wait=False ) if not metrics: return False @@ -1341,7 +1357,7 @@ def set_should_hang(self, value: bool): # Check that error metric with error_type="exception" is reported def 
check_exception_error_metric(): metrics = get_metric_dictionaries( - "ray_serve_routing_stats_error_total", timeseries=timeseries + "ray_serve_routing_stats_error_total", timeseries=timeseries, wait=False ) for metric in metrics: if ( @@ -1367,7 +1383,7 @@ def check_exception_error_metric(): # Check that error metric with error_type="timeout" is reported def check_timeout_error_metric(): metrics = get_metric_dictionaries( - "ray_serve_routing_stats_error_total", timeseries=timeseries + "ray_serve_routing_stats_error_total", timeseries=timeseries, wait=False ) for metric in metrics: if ( @@ -1435,7 +1451,9 @@ def _send_requests_forever(): # Wait for the utilization metric to be reported def check_utilization_metric_exists(): metrics = get_metric_dictionaries( - "ray_serve_replica_utilization_percent", timeseries=timeseries + "ray_serve_replica_utilization_percent", + timeseries=timeseries, + wait=False, ) if not metrics: return False diff --git a/python/ray/serve/tests/test_metrics_2.py b/python/ray/serve/tests/test_metrics_2.py index 2aec83edd5da..efd785cad03b 100644 --- a/python/ray/serve/tests/test_metrics_2.py +++ b/python/ray/serve/tests/test_metrics_2.py @@ -125,6 +125,7 @@ def h(): get_metric_dictionaries( "ray_serve_deployment_processing_latency_ms_sum", timeseries=timeseries, + wait=False, ) ) == 3, @@ -239,6 +240,7 @@ def __call__(self, *args, **kwargs): get_metric_dictionaries( "ray_serve_deployment_processing_latency_ms_sum", timeseries=timeseries, + wait=False, ) ) == 5, @@ -365,8 +367,10 @@ async def app2(self): wait_for_condition( lambda: len( get_metric_dictionaries( - "ray_serve_deployment_request_counter_total", timeseries=timeseries - ), + "ray_serve_deployment_request_counter_total", + timeseries=timeseries, + wait=False, + ) ) == 4, timeout=40, @@ -416,7 +420,9 @@ def route2(self): wait_for_condition( lambda: len( get_metric_dictionaries( - "ray_serve_deployment_request_counter_total", timeseries=timeseries + 
"ray_serve_deployment_request_counter_total", + timeseries=timeseries, + wait=False, ) ) == 2, @@ -486,7 +492,9 @@ def __call__(self): deployment_name, replica_id = resp.json() wait_for_condition( lambda: len( - get_metric_dictionaries("ray_my_gauge", timeseries=timeseries), + get_metric_dictionaries( + "ray_my_gauge", timeseries=timeseries, wait=False + ), ) == 1, timeout=40, @@ -631,7 +639,9 @@ async def __call__(self): timeseries = PrometheusTimeseries() wait_for_condition( lambda: len( - get_metric_dictionaries("ray_my_gauge", timeseries=timeseries), + get_metric_dictionaries( + "ray_my_gauge", timeseries=timeseries, wait=False + ), ) == 1, timeout=40, @@ -971,7 +981,7 @@ def f(): # Wait for the proxy to become healthy and metric to be reported def check_proxy_status(): metrics = get_metric_dictionaries( - "ray_serve_proxy_status", timeseries=timeseries + "ray_serve_proxy_status", timeseries=timeseries, wait=False ) if not metrics: return False @@ -1027,7 +1037,9 @@ def f(): # The histogram metric will have _sum and _count suffixes def check_shutdown_duration_metric_exists(): metrics = get_metric_dictionaries( - "ray_serve_proxy_shutdown_duration_ms_sum", timeseries=timeseries + "ray_serve_proxy_shutdown_duration_ms_sum", + timeseries=timeseries, + wait=False, ) if not metrics: return False @@ -1041,7 +1053,9 @@ def check_shutdown_duration_metric_exists(): # Verify the metric has the expected tags metrics = get_metric_dictionaries( - "ray_serve_proxy_shutdown_duration_ms_sum", timeseries=timeseries + "ray_serve_proxy_shutdown_duration_ms_sum", + timeseries=timeseries, + wait=False, ) assert len(metrics) == 1 for metric in metrics: @@ -1050,7 +1064,9 @@ def check_shutdown_duration_metric_exists(): # Also verify _count metric exists count_metrics = get_metric_dictionaries( - "ray_serve_proxy_shutdown_duration_ms_count", timeseries=timeseries + "ray_serve_proxy_shutdown_duration_ms_count", + timeseries=timeseries, + wait=False, ) assert len(count_metrics) == 1 
diff --git a/python/ray/serve/tests/test_metrics_3.py b/python/ray/serve/tests/test_metrics_3.py index d1ac65bea01f..374650f2581c 100644 --- a/python/ray/serve/tests/test_metrics_3.py +++ b/python/ray/serve/tests/test_metrics_3.py @@ -70,14 +70,14 @@ def deployment_b(): def check_status_metrics(): # Check deployment status metrics deployment_metrics = get_metric_dictionaries( - "ray_serve_deployment_status", timeseries=timeseries + "ray_serve_deployment_status", timeseries=timeseries, wait=False ) if len(deployment_metrics) < 2: return False # Check application status metrics app_metrics = get_metric_dictionaries( - "ray_serve_application_status", timeseries=timeseries + "ray_serve_application_status", timeseries=timeseries, wait=False ) if len(app_metrics) < 2: return False @@ -183,7 +183,8 @@ def check_initialization_latency_value(): # Assert that 2 metrics are recorded (one per replica) def check_metrics_count(): metrics = get_metric_dictionaries( - "ray_serve_replica_initialization_latency_ms_count" + "ray_serve_replica_initialization_latency_ms_count", + wait=False, ) assert len(metrics) == 2, f"Expected 2 metrics, got {len(metrics)}" # All metrics should have same deployment and application @@ -593,6 +594,7 @@ def check_metrics_delay_metrics(): "ray_serve_autoscaling_handle_metrics_delay_ms", timeout=5, timeseries=timeseries, + wait=False, ) for m in metrics_dicts: if ( @@ -616,6 +618,7 @@ def check_metrics_delay_metrics(): "ray_serve_autoscaling_replica_metrics_delay_ms", timeout=5, timeseries=timeseries, + wait=False, ) for m in metrics_dicts: if ( @@ -705,6 +708,7 @@ def check_metrics_delay_metric(): "ray_serve_autoscaling_async_inference_task_queue_metrics_delay_ms", timeout=5, timeseries=timeseries, + wait=False, ) for m in metrics_dicts: if ( @@ -786,6 +790,7 @@ def check_user_stats_latency_metric(): "ray_serve_user_autoscaling_stats_latency_ms_sum", timeout=5, timeseries=timeseries, + wait=False, ) for m in metrics_dicts: if ( @@ -838,6 +843,7 @@ def 
check_stats_failure_metric(): "ray_serve_record_autoscaling_stats_failed_total", timeout=5, timeseries=timeseries, + wait=False, ) for m in metrics_dicts: if ( @@ -1102,6 +1108,7 @@ def check_proxy_main_loop_metrics(): "ray_serve_event_loop_monitoring_iterations_total", timeout=10, timeseries=timeseries, + wait=False, ) for m in metrics: if m.get("component") == "proxy" and m.get("loop_type") == "main": @@ -1119,6 +1126,7 @@ def check_proxy_router_loop_metrics(): "ray_serve_event_loop_monitoring_iterations_total", timeout=10, timeseries=timeseries, + wait=False, ) for m in metrics: if m.get("component") == "proxy" and m.get("loop_type") == "router": @@ -1139,6 +1147,7 @@ def check_replica_main_loop_metrics(): "ray_serve_event_loop_monitoring_iterations_total", timeout=10, timeseries=timeseries, + wait=False, ) for m in metrics: if m.get("component") == "replica" and m.get("loop_type") == "main": @@ -1161,6 +1170,7 @@ def check_replica_user_code_loop_metrics(): "ray_serve_event_loop_monitoring_iterations_total", timeout=10, timeseries=timeseries, + wait=False, ) for m in metrics: if m.get("component") == "replica" and m.get("loop_type") == "user_code": @@ -1186,6 +1196,7 @@ def check_router_loop_metrics(): "ray_serve_event_loop_monitoring_iterations_total", timeout=10, timeseries=timeseries, + wait=False, ) for m in metrics: if m.get("component") == "replica" and m.get("loop_type") == "router": @@ -1207,6 +1218,7 @@ def check_scheduling_latency_metric(): "ray_serve_event_loop_scheduling_latency_ms_count", timeout=10, timeseries=timeseries, + wait=False, ) # Should have metrics for proxy main, replica main, replica user_code, router component_loop_pairs = set() @@ -1236,6 +1248,7 @@ def check_tasks_gauge_metric(): "ray_serve_event_loop_tasks", timeout=10, timeseries=timeseries, + wait=False, ) # Should have metrics for proxy main, replica main, replica user_code, router component_loop_pairs = set() diff --git a/python/ray/serve/tests/test_metrics_haproxy.py 
b/python/ray/serve/tests/test_metrics_haproxy.py index c5574094a835..51c300f0087b 100644 --- a/python/ray/serve/tests/test_metrics_haproxy.py +++ b/python/ray/serve/tests/test_metrics_haproxy.py @@ -801,7 +801,9 @@ def g(): wait_for_condition( lambda: len( - get_metric_dictionaries("ray_serve_deployment_request_counter_total") + get_metric_dictionaries( + "ray_serve_deployment_request_counter_total", wait=False + ) ) == 2, timeout=40, @@ -833,7 +835,9 @@ def g(): # Latency metrics wait_for_condition( lambda: len( - get_metric_dictionaries("ray_serve_deployment_processing_latency_ms_count") + get_metric_dictionaries( + "ray_serve_deployment_processing_latency_ms_count", wait=False + ) ) == 2, timeout=40, @@ -852,7 +856,9 @@ def g(): } == expected_output wait_for_condition( - lambda: len(get_metric_dictionaries("ray_serve_replica_processing_queries")) + lambda: len( + get_metric_dictionaries("ray_serve_replica_processing_queries", wait=False) + ) == 2 ) processing_queries = get_metric_dictionaries("ray_serve_replica_processing_queries") @@ -870,7 +876,11 @@ def h(): url_h = get_application_url("HTTP", "app3") assert 500 == httpx.get(url_h).status_code wait_for_condition( - lambda: len(get_metric_dictionaries("ray_serve_deployment_error_counter_total")) + lambda: len( + get_metric_dictionaries( + "ray_serve_deployment_error_counter_total", wait=False + ) + ) == 1, timeout=40, ) @@ -884,7 +894,9 @@ def h(): ) == expected_output wait_for_condition( - lambda: len(get_metric_dictionaries("ray_serve_deployment_replica_healthy")) + lambda: len( + get_metric_dictionaries("ray_serve_deployment_replica_healthy", wait=False) + ) == 3, timeout=40, ) diff --git a/python/ray/serve/tests/unit/test_deployment_state.py b/python/ray/serve/tests/unit/test_deployment_state.py index 5b5d360700b4..62d141921d62 100644 --- a/python/ray/serve/tests/unit/test_deployment_state.py +++ b/python/ray/serve/tests/unit/test_deployment_state.py @@ -5643,6 +5643,64 @@ def 
test_rank_assignment_with_replica_failures(self, mock_deployment_state_manag 2, }, f"Expected ranks [0, 1, 2], got {[r.rank for r in ranks_mapping.values()]}" + def test_rank_recovery_skips_when_already_assigned( + self, mock_deployment_state_manager + ): + """Verify that recover_rank is skipped when a replica's rank is already assigned.""" + create_dsm, _, _, _ = mock_deployment_state_manager + dsm: DeploymentStateManager = create_dsm() + + # Deploy 3 replicas: STARTING -> RUNNING (ranks get assigned). + info_1, v1 = deployment_info(num_replicas=3, version="1") + dsm.deploy(TEST_DEPLOYMENT_ID, info_1) + ds: DeploymentState = dsm._deployment_states[TEST_DEPLOYMENT_ID] + + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.STARTING, 3, v1)]) + + self._set_replicas_ready(ds, [ReplicaState.STARTING]) + dsm.update() + check_counts(ds, total=3, by_state=[(ReplicaState.RUNNING, 3, v1)]) + + # Record the actor names and ranks for recovery. + actor_names = [r.replica_id.to_full_id_str() for r in ds._replicas.get()] + original_ranks = { + r.replica_id.unique_id: ds._rank_manager.get_replica_rank( + r.replica_id.unique_id + ) + for r in ds._replicas.get() + } + dsm.save_checkpoint() + + # Simulate controller crash: create a new DSM with the live actor names. + new_dsm: DeploymentStateManager = create_dsm(actor_names) + new_ds = new_dsm._deployment_states[TEST_DEPLOYMENT_ID] + check_counts(new_ds, total=3, by_state=[(ReplicaState.RECOVERING, 3, v1)]) + + # Enable strict rank error mode so duplicate recover_rank raises. + new_ds._rank_manager._fail_on_rank_error = True + + # Pre-populate 1 replica's rank in the new rank manager, simulating + # the scenario where the rank was never released. 
+ target = new_ds._replicas.get(states=[ReplicaState.RECOVERING])[0] + target_id = target.replica_id.unique_id + target_rank = original_ranks[target_id] + new_ds._rank_manager.recover_rank(target_id, target.actor_node_id, target_rank) + assert new_ds._rank_manager.has_replica_rank(target_id) + + # Mark all recovering replicas as ready. + self._set_replicas_ready(new_ds, [ReplicaState.RECOVERING]) + + # This update() calls _check_startup_replicas(RECOVERING). For the + # pre-populated replica, has_replica_rank returns True so recover_rank + # is SKIPPED. + new_dsm.update() + check_counts(new_ds, total=3, by_state=[(ReplicaState.RUNNING, 3, v1)]) + + # Verify all ranks were recovered correctly. + for rid, expected_rank in original_ranks.items(): + assert new_ds._rank_manager.get_replica_rank(rid) == expected_rank + class TestGetOutboundDeployments: def test_basic_outbound_deployments(self, mock_deployment_state_manager): diff --git a/python/ray/serve/tests/unit/test_router.py b/python/ray/serve/tests/unit/test_router.py index 766f1bc5fa33..6c0c808ecaf7 100644 --- a/python/ray/serve/tests/unit/test_router.py +++ b/python/ray/serve/tests/unit/test_router.py @@ -12,7 +12,7 @@ import ray from ray._common.test_utils import async_wait_for_condition, wait_for_condition from ray._common.utils import get_or_create_event_loop -from ray.exceptions import ActorDiedError, ActorUnavailableError +from ray.exceptions import ActorDiedError, ActorUnavailableError, RayTaskError from ray.serve._private.common import ( DeploymentHandleSource, DeploymentID, @@ -103,16 +103,22 @@ def __init__( queue_len_info: Optional[ReplicaQueueLengthInfo] = None, is_cross_language: bool = False, error: Optional[Exception] = None, + actor_id: Optional[ray.ActorID] = None, ): self._replica_id = replica_id self._is_cross_language = is_cross_language self._queue_len_info = queue_len_info self._error = error + self._actor_id = actor_id @property def replica_id(self) -> ReplicaID: return self._replica_id + 
@property + def actor_id(self) -> Optional[ray.ActorID]: + return self._actor_id + @property def is_cross_language(self) -> bool: return self._is_cross_language @@ -260,8 +266,7 @@ def on_request_completed( @pytest.fixture -@pytest.mark.asyncio -def setup_router(request) -> Tuple[AsyncioRouter, FakeRequestRouter]: +async def setup_router(request) -> Tuple[AsyncioRouter, FakeRequestRouter]: if not hasattr(request, "param"): request.param = {} @@ -285,7 +290,8 @@ def setup_router(request) -> Tuple[AsyncioRouter, FakeRequestRouter]: prefer_local_node_routing=False, _request_router_initialized_event=asyncio.Event(), ) - return router, fake_request_router + yield router, fake_request_router + await router.shutdown() def dummy_request_metadata(is_streaming: bool = False) -> RequestMetadata: @@ -647,13 +653,18 @@ async def test_max_queued_requests_updated( async def test_replica_actor_died( self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] ): + """ActorDiedError whose actor_id matches the replica → replica is dropped.""" router, fake_request_router = setup_router d_id = DeploymentID(name="test") r1_id = ReplicaID(unique_id="r1", deployment_id=d_id) r2_id = ReplicaID(unique_id="r2", deployment_id=d_id) + r1_actor_id = ray.ActorID.from_random() + error = ActorDiedError() + error.actor_id = r1_actor_id.hex() + fake_request_router.set_replica_to_return( - FakeReplica(r1_id, error=ActorDiedError()) + FakeReplica(r1_id, error=error, actor_id=r1_actor_id) ) fake_request_router.set_replica_to_return_on_retry( FakeReplica( @@ -666,6 +677,185 @@ async def test_replica_actor_died( await router.assign_request(dummy_request_metadata()) assert r1_id in fake_request_router.dropped_replicas + @pytest.mark.parametrize( + "setup_router", + [ + { + "enable_strict_max_ongoing_requests": True, + "enable_queue_len_cache": True, + }, + ], + indirect=True, + ) + async def test_replica_actor_died_mismatched_actor_id( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + 
"""ActorDiedError whose actor_id does NOT match the replica → replica is NOT + dropped (the error came from an upstream dependency, not this replica).""" + router, fake_request_router = setup_router + d_id = DeploymentID(name="test") + r1_id = ReplicaID(unique_id="r1", deployment_id=d_id) + r2_id = ReplicaID(unique_id="r2", deployment_id=d_id) + + # r1 has one actor_id but the error reports a different actor_id (upstream). + r1_actor_id = ray.ActorID.from_random() + upstream_actor_id = ray.ActorID.from_random() + error = ActorDiedError() + error.actor_id = upstream_actor_id.hex() + + fake_request_router.set_replica_to_return( + FakeReplica(r1_id, error=error, actor_id=r1_actor_id) + ) + # r2 handles the retry after r1 fails. + fake_request_router.set_replica_to_return_on_retry( + FakeReplica( + r2_id, + queue_len_info=ReplicaQueueLengthInfo( + accepted=True, num_ongoing_requests=5 + ), + ) + ) + await router.assign_request(dummy_request_metadata()) + # r1 should NOT be dropped because the error came from a different actor. + assert r1_id not in fake_request_router.dropped_replicas + + async def test_actor_died_matching_id_via_callback( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Done-callback with ActorDiedError whose actor_id matches the replica → + replica is dropped via _process_finished_request.""" + router, fake_request_router = setup_router + d_id = DeploymentID(name="test") + r1_id = ReplicaID(unique_id="r1", deployment_id=d_id) + + r1_actor_id = ray.ActorID.from_random() + fake_request_router.set_replica_to_return( + FakeReplica(r1_id, actor_id=r1_actor_id) + ) + + result = await router.assign_request(dummy_request_metadata()) + + # Simulate the response finishing with an ActorDiedError from this replica. 
+ error = ActorDiedError() + error.actor_id = r1_actor_id.hex() + result.fire_done_callbacks(result=error) + await asyncio.sleep(0) + + assert r1_id in fake_request_router.dropped_replicas + + async def test_actor_died_mismatched_id_via_callback( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Done-callback with ActorDiedError whose actor_id does NOT match the + replica → replica is NOT dropped (upstream dependency failure).""" + router, fake_request_router = setup_router + d_id = DeploymentID(name="test") + r1_id = ReplicaID(unique_id="r1", deployment_id=d_id) + + r1_actor_id = ray.ActorID.from_random() + upstream_actor_id = ray.ActorID.from_random() + fake_request_router.set_replica_to_return( + FakeReplica(r1_id, actor_id=r1_actor_id) + ) + + result = await router.assign_request(dummy_request_metadata()) + + # Simulate the response finishing with an ActorDiedError from a different actor. + error = ActorDiedError() + error.actor_id = upstream_actor_id.hex() + result.fire_done_callbacks(result=error) + await asyncio.sleep(0) + + assert r1_id not in fake_request_router.dropped_replicas + + async def test_actor_died_none_actor_id_via_callback( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Done-callback with ActorDiedError(actor_id=None) and replica has valid + actor_id → replica IS dropped (conservative fallback when ID unknown).""" + router, fake_request_router = setup_router + d_id = DeploymentID(name="test") + r1_id = ReplicaID(unique_id="r1", deployment_id=d_id) + + r1_actor_id = ray.ActorID.from_random() + fake_request_router.set_replica_to_return( + FakeReplica(r1_id, actor_id=r1_actor_id) + ) + + result = await router.assign_request(dummy_request_metadata()) + + # ActorDiedError without actor_id (e.g., constructed without + # ActorDiedErrorContext). Conservative fallback: mark replica dead. 
+ error = ActorDiedError() + error.actor_id = None # Explicitly None + result.fire_done_callbacks(result=error) + await asyncio.sleep(0) + + assert r1_id in fake_request_router.dropped_replicas + + async def test_ray_task_error_wrapping_actor_died_matching_id_via_callback( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Done-callback with RayTaskError(cause=ActorDiedError) where actor_id + matches the replica → replica is dropped.""" + router, fake_request_router = setup_router + d_id = DeploymentID(name="test") + r1_id = ReplicaID(unique_id="r1", deployment_id=d_id) + + r1_actor_id = ray.ActorID.from_random() + fake_request_router.set_replica_to_return( + FakeReplica(r1_id, actor_id=r1_actor_id) + ) + + result = await router.assign_request(dummy_request_metadata()) + + actor_died = ActorDiedError() + actor_died.actor_id = r1_actor_id.hex() + wrapped = RayTaskError( + function_name="test_func", + traceback_str="", + cause=actor_died, + proctitle="test", + pid=12345, + ip="127.0.0.1", + ) + result.fire_done_callbacks(result=wrapped) + await asyncio.sleep(0) + + assert r1_id in fake_request_router.dropped_replicas + + async def test_ray_task_error_wrapping_actor_died_mismatched_id_via_callback( + self, setup_router: Tuple[AsyncioRouter, FakeRequestRouter] + ): + """Done-callback with RayTaskError(cause=ActorDiedError) where actor_id does + NOT match the replica → replica is NOT dropped (upstream failure).""" + router, fake_request_router = setup_router + d_id = DeploymentID(name="test") + r1_id = ReplicaID(unique_id="r1", deployment_id=d_id) + + r1_actor_id = ray.ActorID.from_random() + upstream_actor_id = ray.ActorID.from_random() + fake_request_router.set_replica_to_return( + FakeReplica(r1_id, actor_id=r1_actor_id) + ) + + result = await router.assign_request(dummy_request_metadata()) + + actor_died = ActorDiedError() + actor_died.actor_id = upstream_actor_id.hex() + wrapped = RayTaskError( + function_name="test_func", + traceback_str="", + 
cause=actor_died, + proctitle="test", + pid=12345, + ip="127.0.0.1", + ) + result.fire_done_callbacks(result=wrapped) + await asyncio.sleep(0) + + assert r1_id not in fake_request_router.dropped_replicas + @pytest.mark.parametrize( "setup_router", [ diff --git a/python/ray/tests/BUILD.bazel b/python/ray/tests/BUILD.bazel index 102c76e49f9c..6a312e6af42f 100644 --- a/python/ray/tests/BUILD.bazel +++ b/python/ray/tests/BUILD.bazel @@ -328,6 +328,7 @@ py_test_module_list( "test_ray_debugger.py", "test_ray_init.py", "test_ray_shutdown.py", + "test_resize_raylet.py", "test_resource_metrics.py", "test_runtime_context.py", "test_runtime_env_env_vars.py", diff --git a/python/ray/tests/test_resize_raylet.py b/python/ray/tests/test_resize_raylet.py new file mode 100644 index 000000000000..5abb77111ff9 --- /dev/null +++ b/python/ray/tests/test_resize_raylet.py @@ -0,0 +1,36 @@ +import sys + +import pytest + +import ray +from ray._raylet import GcsClient + + +def test_resize_raylet_resource_instances(ray_start_cluster): + cluster = ray_start_cluster + cluster.add_node() + ray.init(address=cluster.address) + worker_node = cluster.add_node(num_cpus=2) + cluster.wait_for_nodes() + + gcs_client = GcsClient(address=ray.get_runtime_context().gcs_address) + total_resources = gcs_client.resize_raylet_resource_instances( + worker_node.node_id, + {"CPU": 3.0}, + ) + + assert total_resources["CPU"] == 3.0 + + missing_node_id = ray.NodeID.from_random().hex() + with pytest.raises( + ValueError, + match=f"Raylet {missing_node_id} is not alive.", + ): + gcs_client.resize_raylet_resource_instances( + missing_node_id, + {"CPU": 3.0}, + ) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", __file__])) diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index a44aebde88c5..3dc94664b17e 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -23,7 +23,8 @@ from 
ray._private.accelerators.nvidia_gpu import CUDA_VISIBLE_DEVICES_ENV_VAR from ray._private.ray_constants import env_integer from ray.exceptions import RayActorError -from ray.train import Checkpoint, DataConfig +from ray.train._checkpoint import Checkpoint +from ray.train._internal.data_config import DataConfig from ray.train._internal.session import ( TrialInfo, _TrainingResult, diff --git a/python/ray/train/_internal/session.py b/python/ray/train/_internal/session.py index 7d8a08f053fa..02bab5fd161f 100644 --- a/python/ray/train/_internal/session.py +++ b/python/ray/train/_internal/session.py @@ -20,7 +20,7 @@ TIME_THIS_ITER_S, TIMESTAMP, ) -from ray.train import Checkpoint +from ray.train._checkpoint import Checkpoint from ray.train._internal.accelerator import Accelerator from ray.train._internal.storage import StorageContext from ray.train.constants import ( diff --git a/release/BUILD.bazel b/release/BUILD.bazel index e2c8968c3433..7d3eedb2c630 100644 --- a/release/BUILD.bazel +++ b/release/BUILD.bazel @@ -407,6 +407,16 @@ py_library( ], ) +py_library( + name = "github_client", + srcs = ["ray_release/github_client.py"], + imports = ["."], + visibility = ["//ci/ray_ci:__subpackages__"], + deps = [ + bk_require("requests"), + ], +) + py_library( name = "test", srcs = ["ray_release/test.py"], @@ -868,6 +878,22 @@ py_test( ], ) +py_test( + name = "test_github_client", + size = "small", + srcs = ["ray_release/tests/test_github_client.py"], + exec_compatible_with = ["//bazel:py3"], + tags = [ + "release_unit", + "team:ci", + ], + deps = [ + ":github_client", + bk_require("pytest"), + bk_require("responses"), + ], +) + py_test( name = "test_test", size = "small", diff --git a/release/llm_tests/serve/configs/serve_phi_tiny_moe_dp4_gang.yaml b/release/llm_tests/serve/configs/serve_phi_tiny_moe_dp4_gang.yaml new file mode 100644 index 000000000000..fc97da6a2874 --- /dev/null +++ b/release/llm_tests/serve/configs/serve_phi_tiny_moe_dp4_gang.yaml @@ -0,0 +1,26 @@ 
+applications: + - args: + llm_config: + model_loading_config: + model_id: microsoft/Phi-tiny-MoE-instruct + model_source: microsoft/Phi-tiny-MoE-instruct + deployment_config: + num_replicas: 2 + engine_kwargs: + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + data_parallel_size: 4 + distributed_executor_backend: ray + max_model_len: 1024 + max_num_seqs: 32 + enforce_eager: true + placement_group_config: + bundles: + - GPU: 1 + CPU: 1 + runtime_env: + env_vars: + VLLM_DISABLE_COMPILE_CACHE: "1" + import_path: ray.serve.llm:build_dp_openai_app + name: llm-endpoint + route_prefix: / diff --git a/release/llm_tests/serve/test_llm_serve_correctness.py b/release/llm_tests/serve/test_llm_serve_correctness.py index 00d914f811e1..620bbb105199 100644 --- a/release/llm_tests/serve/test_llm_serve_correctness.py +++ b/release/llm_tests/serve/test_llm_serve_correctness.py @@ -1,12 +1,12 @@ import subprocess import time -from typing import Literal -import requests import pytest from openai import OpenAI from ray import serve -from ray.serve.llm import LLMConfig, build_openai_app, ModelLoadingConfig +from ray.serve.llm import LLMConfig, ModelLoadingConfig, build_openai_app + +from test_utils import create_openai_client, wait_for_server_ready MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" RAY_MODEL_ID = "qwen-0.5b" @@ -48,12 +48,6 @@ def start_ray_serve( return ray_url -def create_openai_client(server_url: str) -> OpenAI: - """Create an OpenAI client.""" - openai_api_base = f"{server_url}/v1" - return OpenAI(base_url=openai_api_base, api_key="fake-key") - - def generate_completion(client: OpenAI, model_id: str, test_prompt: str) -> str: """Generate completion using the provided OpenAI client.""" response = client.completions.create( @@ -92,7 +86,7 @@ def __init__( self.model_id = model_id self.vllm_url = self._start_vllm_server() self.openai_client = create_openai_client(self.vllm_url) - wait_for_server_ready(self.vllm_url, server_type="vllm", timeout=240) + 
wait_for_server_ready(self.vllm_url, model_id=self.model_id, timeout=240) def _start_vllm_server(self) -> str: """Start vLLM server with specified parallelism parameters.""" @@ -131,50 +125,6 @@ def shutdown(self): self.process.kill() -def wait_for_server_ready( - url: str, - server_type: Literal["ray", "vllm"] = "ray", - timeout: int = 120, - retry_interval: int = 2, -) -> bool: - """Poll the server until it's ready or timeout is reached. - - Args: - url: The server URL to check - server_type: Either "ray" or "vllm" - timeout: Maximum time to wait in seconds - retry_interval: Time between retry attempts - """ - start_time = time.time() - while time.time() - start_time < timeout: - try: - # Directly test if the server can handle a completion request - model_id = MODEL_ID if server_type == "vllm" else RAY_MODEL_ID - test_data = { - "model": model_id, - "prompt": "test", - "max_tokens": 5, - "temperature": 0, - } - completion_response = requests.post( - f"{url}/v1/completions", json=test_data, timeout=10 - ) - if completion_response.status_code == 200: - print( - f"{server_type.upper()} server at {url} is ready to handle requests!" 
- ) - return True - except Exception: - pass - - print(f"Waiting for {server_type.upper()} server at {url} to be ready...") - time.sleep(retry_interval) - - raise TimeoutError( - f"{server_type.upper()} server at {url} did not become ready within {timeout} seconds" - ) - - @pytest.mark.parametrize( "tensor_parallel_size, pipeline_parallel_size", [ @@ -197,7 +147,7 @@ def test_llm_serve_correctness( ray_url = start_ray_serve(tensor_parallel_size, pipeline_parallel_size) ray_client = create_openai_client(ray_url) - wait_for_server_ready(ray_url, server_type="ray", timeout=240) + wait_for_server_ready(ray_url, model_id=RAY_MODEL_ID, timeout=240) time.sleep(5) # Buffer time for server to be ready ray_completion_output = generate_completion(ray_client, RAY_MODEL_ID, test_prompt) diff --git a/release/llm_tests/serve/test_llm_serve_dp_engine_fault.py b/release/llm_tests/serve/test_llm_serve_dp_engine_fault.py new file mode 100644 index 000000000000..ec6db6f5aea3 --- /dev/null +++ b/release/llm_tests/serve/test_llm_serve_dp_engine_fault.py @@ -0,0 +1,210 @@ +"""Test DP deployment fault tolerance when a vLLM GPU worker process is killed. + +Finds a RayWorkerWrapper actor (the GPU worker process spun up by vLLM's +ray distributed backend) and kills it to simulate a DP replica failure: + + worker killed → compiled DAG error → EngineCore fatal error + → engine_dead flag set → check_health() raises + → Serve RESTART_GANG tears down the gang + +The test verifies that the surviving DP replicas continue serving requests +and the faulty gang is fully replaced. 
+""" + +import asyncio +import os +import signal +import time + +import pytest + +import ray +from ray import serve +from ray._common.test_utils import wait_for_condition +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.serve.llm import LLMConfig, ModelLoadingConfig, build_dp_deployment +from ray.serve.schema import ApplicationStatus, ReplicaState +from ray.util.state import list_actors +from vllm.entrypoints.openai.completion.protocol import CompletionRequest + + +@pytest.fixture(autouse=True) +def cleanup(): + yield + serve.shutdown() + ray.shutdown() + + +def is_default_app_running(): + try: + app = serve.status().applications[SERVE_DEFAULT_APP_NAME] + return app.status == ApplicationStatus.RUNNING + except (KeyError, AttributeError): + return False + + +def get_num_running_replicas(deployment_name: str) -> int: + """Get the number of RUNNING replicas for a deployment.""" + dep = ( + serve.status().applications[SERVE_DEFAULT_APP_NAME].deployments[deployment_name] + ) + return dep.replica_states.get(ReplicaState.RUNNING, 0) + + +def find_vllm_worker_actors(): + """Find all alive RayWorkerWrapper actors created by vLLM's ray distributed executor backend.""" + actors = list_actors( + filters=[ + ("class_name", "=", "RayWorkerWrapper"), + ("state", "=", "ALIVE"), + ] + ) + return actors + + +def kill_one_vllm_worker(): + """Kill one RayWorkerWrapper actor by sending SIGKILL to its process. + + Returns the node_id of the killed worker so we can verify which gang gets torn down. 
+ """ + actors = find_vllm_worker_actors() + assert len(actors) > 0, "No alive RayWorkerWrapper actors found" + + target = actors[0] + target_pid = target.pid + target_node_id = target.node_id + + @ray.remote(num_cpus=0) + def kill_pid(pid): + os.kill(pid, signal.SIGKILL) + return f"Killed pid {pid}" + + result = ray.get( + kill_pid.options( + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=target_node_id, soft=False + ), + ).remote(target_pid) + ) + return target_node_id, result + + +def test_llm_serve_dp_engine_fault(): + """DP deployment recovers when a vLLM GPU worker process is killed.""" + deployment_name = "DPServer:microsoft--Phi-tiny-MoE-instruct" + dp_size = 2 + num_replicas = 2 + expected_serve_replicas = num_replicas * dp_size + + llm_config = LLMConfig( + model_loading_config=ModelLoadingConfig( + model_id="microsoft/Phi-tiny-MoE-instruct", + model_source="microsoft/Phi-tiny-MoE-instruct", + ), + deployment_config=dict( + num_replicas=num_replicas, + health_check_period_s=1, + health_check_timeout_s=3, + ), + engine_kwargs=dict( + tensor_parallel_size=1, + pipeline_parallel_size=1, + data_parallel_size=dp_size, + distributed_executor_backend="ray", + enable_expert_parallel=True, + max_model_len=1024, + max_num_seqs=32, + enforce_eager=True, + ), + runtime_env={ + "env_vars": { + "VLLM_DISABLE_COMPILE_CACHE": "1", + # Shorter compiled graph timeout so the dead worker is detected quickly. + "RAY_CGRAPH_get_timeout": "30", + }, + }, + ) + + handle = serve.run(build_dp_deployment(llm_config), blocking=False) + wait_for_condition(is_default_app_running, timeout=180) + + # Step 1: Verify steady state — all replicas running. + assert get_num_running_replicas(deployment_name) == expected_serve_replicas + + # Verify RayWorkerWrapper actors exist. + workers_before = find_vllm_worker_actors() + assert len(workers_before) > 0 + + # Step 2: Start continuous request sending, then kill a GPU worker. 
+ @ray.remote + class RequestSender: + def __init__(self, h): + self._handle = h.options(stream=True) + self.total = 0 + self.errors = [] + self._stop = False + + async def send(self, prompt="test", max_tokens=1, ignore_errors=False): + req = CompletionRequest( + model="microsoft/Phi-tiny-MoE-instruct", + prompt=prompt, + max_tokens=max_tokens, + ) + try: + async for _ in self._handle.completions.remote(req): + pass + self.total += 1 + except Exception as e: + if not ignore_errors: + self.errors.append(str(e)) + self.total += 1 + + async def run(self): + while not self._stop: + await self.send(prompt="Hello, world!", max_tokens=5) + await asyncio.sleep(0.1) + + def stop(self): + self._stop = True + + def get_results(self): + return self.total, self.errors + + sender = RequestSender.remote(handle) + + # Send a few warm-up requests to confirm the deployment works. + for _ in range(5): + sender.send.remote(ignore_errors=True) + time.sleep(0.5) + + # Kill one RayWorkerWrapper GPU worker process. + killed_node_id, kill_msg = kill_one_vllm_worker() + print(f"Killed vLLM worker on node {killed_node_id}: {kill_msg}") + + # Step 3: Wait for the faulty gang to be torn down. + # Detection: compiled DAG timeout (~30s) + health check overhead. + wait_for_condition( + lambda: get_num_running_replicas(deployment_name) < expected_serve_replicas, + timeout=60, + ) + + # Step 4: Verify the surviving replicas continue serving requests. + time.sleep(2) + sender.run.remote() + + # Step 5: Wait for full recovery — all replicas back to RUNNING. + wait_for_condition( + lambda: get_num_running_replicas(deployment_name) == expected_serve_replicas, + timeout=120, + ) + + # Step 6: Assert zero downtime — requests should have kept flowing + # through the surviving gang during recovery. 
+ ray.get(sender.stop.remote()) + total, errors = ray.get(sender.get_results.remote()) + assert total > 0, "Expected at least one successful request" + assert len(errors) == 0, f"Expected zero errors, got {errors}" + + +if __name__ == "__main__": + pytest.main(["-v", __file__]) diff --git a/release/llm_tests/serve/test_llm_serve_multi_node_integration.py b/release/llm_tests/serve/test_llm_serve_multi_node_integration.py index a3f109cec15a..e783b5ddee38 100644 --- a/release/llm_tests/serve/test_llm_serve_multi_node_integration.py +++ b/release/llm_tests/serve/test_llm_serve_multi_node_integration.py @@ -1,11 +1,23 @@ +import pathlib +from collections import defaultdict +import asyncio +import time + import pytest +import yaml import ray from ray import serve from ray._common.test_utils import wait_for_condition -from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME +from ray.experimental.internal_kv import _internal_kv_list +from ray.llm._internal.serve.serving_patterns.data_parallel.dp_server import ( + GangMasterInfoRegistry, +) +from ray.serve._private.common import DeploymentID, ReplicaState +from ray.serve._private.constants import SERVE_DEFAULT_APP_NAME, SERVE_NAMESPACE from ray.serve.llm import ( build_dp_deployment, + build_dp_openai_app, build_pd_openai_app, build_openai_app, LLMConfig, @@ -13,6 +25,9 @@ ModelLoadingConfig, ) from ray.serve.schema import ApplicationStatus +from vllm.entrypoints.openai.completion.protocol import CompletionRequest + +CONFIGS_DIR = pathlib.Path(__file__).parent / "configs" @pytest.fixture(autouse=True) @@ -80,14 +95,154 @@ def test_llm_serve_multi_node(tp_size, pp_size): wait_for_condition(is_default_app_running, timeout=300) -def test_llm_serve_data_parallelism(): - """Test Data Parallelism deployment with STRICT_PACK override. 
+@pytest.mark.parametrize( + "tp_size,dp_size,num_replicas,placement_group_config", + [ + # TP=1 cases + (1, 4, None, {"bundles": [{"GPU": 1, "CPU": 1}]}), # Single group, single node + (1, 8, None, {"bundles": [{"GPU": 1, "CPU": 1}]}), # Single group, multi-node + (1, 2, 2, {"bundles": [{"GPU": 1, "CPU": 1}]}), # Multi-group, single node + (1, 4, 2, {"bundles": [{"GPU": 1, "CPU": 1}]}), # Multi-group, multi-node + # TP=2 cases — auto-generates correct bundles from TP size + (2, 2, 1, None), # TP, single group, single node + (2, 2, 2, None), # TP, multi-group, multi-node + # TP=2 cases — explicit placement_group_config with 2 bundles for TP=2 + (2, 2, None, {"bundles": [{"GPU": 1, "CPU": 1}, {"GPU": 1}]}), + (2, 2, 2, {"bundles": [{"GPU": 1, "CPU": 1}, {"GPU": 1}]}), + ], +) +def test_llm_serve_data_parallelism( + tp_size, dp_size, num_replicas, placement_group_config +): + """Test Data Parallelism deployment.""" + deployment_config = dict() + if num_replicas is not None: + deployment_config["num_replicas"] = num_replicas - Validates that DP deployments work correctly with placement group configs - """ + llm_config = LLMConfig( + model_loading_config=ModelLoadingConfig( + model_id="microsoft/Phi-tiny-MoE-instruct", + model_source="microsoft/Phi-tiny-MoE-instruct", + ), + deployment_config=deployment_config, + engine_kwargs=dict( + tensor_parallel_size=tp_size, + pipeline_parallel_size=1, + data_parallel_size=dp_size, + distributed_executor_backend="ray", + max_model_len=1024, + max_num_seqs=32, + enforce_eager=True, + ), + placement_group_config=placement_group_config, + runtime_env={"env_vars": {"VLLM_DISABLE_COMPILE_CACHE": "1"}}, + ) + + app = build_dp_deployment(llm_config) + serve.run(app, blocking=False) + + wait_for_condition(is_default_app_running, timeout=300) + + +def test_llm_serve_data_parallelism_autoscaling(): + """Test that DP deployment with autoscaling scales up under load.""" + dp_size = 2 + deployment_name = 
"DPServer:microsoft--Phi-tiny-MoE-instruct" + + deployment_config = { + "num_replicas": "auto", + "autoscaling_config": dict( + min_replicas=1, + max_replicas=2, + upscale_delay_s=0.1, + downscale_delay_s=5, + metrics_interval_s=1, + look_back_period_s=2, + target_ongoing_requests=1, + ), + "max_ongoing_requests": 20, + } + + llm_config = LLMConfig( + model_loading_config=ModelLoadingConfig( + model_id="microsoft/Phi-tiny-MoE-instruct", + model_source="microsoft/Phi-tiny-MoE-instruct", + ), + deployment_config=deployment_config, + engine_kwargs=dict( + tensor_parallel_size=2, + pipeline_parallel_size=1, + data_parallel_size=dp_size, + distributed_executor_backend="ray", + max_model_len=1024, + max_num_seqs=32, + enforce_eager=True, + ), + runtime_env={"env_vars": {"VLLM_DISABLE_COMPILE_CACHE": "1"}}, + ) + + app = build_dp_deployment(llm_config) + handle = serve.run(app, blocking=False) + + wait_for_condition(is_default_app_running, timeout=300) + + def check_num_replicas_eq(target): + dep = ( + serve.status() + .applications[SERVE_DEFAULT_APP_NAME] + .deployments[deployment_name] + ) + running = dep.replica_states.get("RUNNING", 0) + assert running == target + return True + + def get_total_replicas(): + dep = ( + serve.status() + .applications[SERVE_DEFAULT_APP_NAME] + .deployments[deployment_name] + ) + return sum(dep.replica_states.values()) + + def check_total_replicas_gte(target): + assert get_total_replicas() >= target + return True + + wait_for_condition(check_num_replicas_eq, target=2, timeout=60) + + request = CompletionRequest( + model="microsoft/Phi-tiny-MoE-instruct", + prompt="Write a very long detailed story about", + max_tokens=1024, + ) + # Send enough concurrent requests to trigger upscaling + streaming_handle = handle.options(stream=True) + results = [streaming_handle.completions.remote(request) for _ in range(20)] + + wait_for_condition(check_total_replicas_gte, target=4, timeout=120) + + total = get_total_replicas() + assert total % dp_size == 0 
+ + # Drain requests by consuming streaming responses + for res in results: + for _ in res: + pass + + # After requests drain, verify scale-down back to min_replicas + wait_for_condition(check_num_replicas_eq, target=2, timeout=120) + + dep = ( + serve.status().applications[SERVE_DEFAULT_APP_NAME].deployments[deployment_name] + ) + running = dep.replica_states.get("RUNNING", 0) + assert running % dp_size == 0 + + +def test_llm_serve_data_parallelism_cleanup(): + """Test that Data Parallelism KV entries are cleaned up on shutdown.""" placement_group_config = { "bundles": [{"GPU": 1, "CPU": 1}], - "strategy": "SPREAD", # Will be overridden to STRICT_PACK } llm_config = LLMConfig( @@ -95,25 +250,39 @@ def test_llm_serve_data_parallelism(): model_id="microsoft/Phi-tiny-MoE-instruct", model_source="microsoft/Phi-tiny-MoE-instruct", ), - deployment_config=dict(), # DP sets num_replicas, not autoscaling + deployment_config=dict(), engine_kwargs=dict( tensor_parallel_size=1, pipeline_parallel_size=1, - data_parallel_size=4, # 4 DP replicas, need to fill 2x4GPU workers + data_parallel_size=4, distributed_executor_backend="ray", max_model_len=1024, max_num_seqs=32, enforce_eager=True, ), - experimental_configs=dict( - dp_size_per_node=2, - ), placement_group_config=placement_group_config, runtime_env={"env_vars": {"VLLM_DISABLE_COMPILE_CACHE": "1"}}, ) app = build_dp_deployment(llm_config) serve.run(app, blocking=False) + wait_for_condition(is_default_app_running, timeout=300) + + master_keys = _internal_kv_list(GangMasterInfoRegistry._KEY_PREFIX) + assert len(master_keys) > 0 + + serve.shutdown() + + +def test_llm_serve_data_parallelism_declarative(): + """Test Data Parallelism deployment via declarative config.""" + config_path = CONFIGS_DIR / "serve_phi_tiny_moe_dp4_gang.yaml" + with open(config_path) as f: + config = yaml.safe_load(f) + + app_config = config["applications"][0] + app = build_dp_openai_app(app_config["args"]) + serve.run(app, blocking=False) 
wait_for_condition(is_default_app_running, timeout=300) @@ -153,7 +322,6 @@ def test_llm_serve_prefill_decode_with_data_parallelism(): }, }, experimental_configs={ - "dp_size_per_node": 4, "NIXL_SIDE_CHANNEL_PORT_BASE": 40000, # Prefill port range }, runtime_env={"env_vars": {"VLLM_DISABLE_COMPILE_CACHE": "1"}}, @@ -170,7 +338,6 @@ def test_llm_serve_prefill_decode_with_data_parallelism(): }, }, experimental_configs={ - "dp_size_per_node": 4, "NIXL_SIDE_CHANNEL_PORT_BASE": 41000, # Decode port range (different) }, runtime_env={"env_vars": {"VLLM_DISABLE_COMPILE_CACHE": "1"}}, @@ -188,5 +355,120 @@ def test_llm_serve_prefill_decode_with_data_parallelism(): wait_for_condition(is_default_app_running, timeout=300) +def test_llm_serve_data_parallelism_worker_fault(): + """Test DP deployment recovers after replica death.""" + + deployment_name = "DPServer:microsoft--Phi-tiny-MoE-instruct" + dp_size = 2 + num_replicas = 2 + expected_serve_replicas = num_replicas * dp_size + + llm_config = LLMConfig( + model_loading_config=ModelLoadingConfig( + model_id="microsoft/Phi-tiny-MoE-instruct", + model_source="microsoft/Phi-tiny-MoE-instruct", + ), + deployment_config=dict( + num_replicas=2, + health_check_period_s=1, + health_check_timeout_s=5, + ), + engine_kwargs=dict( + tensor_parallel_size=1, + pipeline_parallel_size=1, + data_parallel_size=dp_size, + distributed_executor_backend="ray", + max_model_len=1024, + max_num_seqs=32, + enforce_eager=True, + ), + runtime_env={"env_vars": {"VLLM_DISABLE_COMPILE_CACHE": "1"}}, + ) + handle = serve.run(build_dp_deployment(llm_config), blocking=False) + wait_for_condition(is_default_app_running, timeout=300) + + deployment_id = DeploymentID(name=deployment_name, app_name=SERVE_DEFAULT_APP_NAME) + controller = serve.context._global_client._controller + + # Verify 4 running replicas in 2 gangs. 
+ replicas = ray.get( + controller._dump_replica_states_for_testing.remote(deployment_id) + ) + running = replicas.get([ReplicaState.RUNNING]) + assert len(running) == expected_serve_replicas + + gangs = defaultdict(list) + for r in running: + assert r.gang_context is not None + gangs[r.gang_context.gang_id].append(r) + assert len(gangs) == 2 + + # Pick one gang and kill one of its replicas. + target_gang_id = list(gangs.keys())[0] + victim = gangs[target_gang_id][0] + victim_actor = ray.get_actor( + victim.replica_id.to_full_id_str(), namespace=SERVE_NAMESPACE + ) + original_replica_ids = {r.replica_id.unique_id for r in running} + + # Background request sender + @ray.remote + class RequestSender: + def __init__(self, handle): + self._handle = handle.options(stream=True) + self.total = 0 + self.errors = [] + self._stop = False + + async def run(self): + request = CompletionRequest( + model="microsoft/Phi-tiny-MoE-instruct", + prompt="Hello, world!", + max_tokens=5, + ) + while not self._stop: + try: + async for _ in self._handle.completions.remote(request): + pass + self.total += 1 + except Exception as e: + self.errors.append(str(e)) + self.total += 1 + await asyncio.sleep(0.1) + + def stop(self): + self._stop = True + + def get_results(self): + return self.total, self.errors + + sender = RequestSender.remote(handle) + sender.run.remote() + time.sleep(1) + + # Kill the victim replica + ray.kill(victim_actor, no_restart=False) + + # Wait for the killed gang to be replaced + def all_replicas_recovered(): + replicas = ray.get( + controller._dump_replica_states_for_testing.remote(deployment_id) + ) + running = replicas.get([ReplicaState.RUNNING]) + if len(running) != expected_serve_replicas: + return False + current_ids = {r.replica_id.unique_id for r in running} + new_ids = current_ids - original_replica_ids + return len(new_ids) == dp_size + + wait_for_condition(all_replicas_recovered, timeout=180) + + # Stop sender and verify no requests were dropped + 
ray.get(sender.stop.remote()) + total, errors = ray.get(sender.get_results.remote()) + assert total > 0 + assert len(errors) == 0 + + if __name__ == "__main__": pytest.main(["-v", __file__]) diff --git a/release/llm_tests/serve/test_utils.py b/release/llm_tests/serve/test_utils.py index 662f0333cf4b..15ab53533da6 100644 --- a/release/llm_tests/serve/test_utils.py +++ b/release/llm_tests/serve/test_utils.py @@ -2,11 +2,14 @@ import logging import os import re +import time import uuid from contextlib import contextmanager from typing import Any, Dict, List, Optional, Union -import time +import requests +from openai import OpenAI + import anyscale import boto3 import ray @@ -17,6 +20,7 @@ from ray._common.test_utils import wait_for_condition from ray.serve._private.utils import get_random_string + logger = logging.getLogger(__file__) logging.basicConfig(level=logging.INFO) REGION_NAME = "us-west-2" @@ -231,3 +235,38 @@ def get_vllm_s3_storage_path() -> str: storage_path = f"s3://{S3_BUCKET}/{S3_PREFIX}/vllm-perf-results-{unique_id}.jsonl" return storage_path + + +def create_openai_client(server_url: str) -> OpenAI: + return OpenAI(base_url=f"{server_url}/v1", api_key="fake-key") + + +def wait_for_server_ready( + url: str, + model_id: str, + timeout: int = 300, + retry_interval: int = 2, +) -> None: + """Poll the server until it's ready or timeout is reached.""" + start_time = time.time() + while time.time() - start_time < timeout: + try: + resp = requests.post( + f"{url}/v1/completions", + json={ + "model": model_id, + "prompt": "test", + "max_tokens": 5, + "temperature": 0, + }, + timeout=10, + ) + if resp.status_code == 200: + print(f"Server at {url} is ready to handle requests!") + return + except Exception: + pass + + print(f"Waiting for server at {url} to be ready...") + time.sleep(retry_interval) + raise TimeoutError(f"Server at {url} not ready within {timeout}s") diff --git a/release/ray_release/github_client.py b/release/ray_release/github_client.py new file 
mode 100644 index 000000000000..db82e7d4518b --- /dev/null +++ b/release/ray_release/github_client.py @@ -0,0 +1,87 @@ +""" +Minimal GitHub REST API client using requests, replacing PyGithub. + +Supports the operations used by ray_release: reading/creating/updating issues, +reading labels, and reading pull requests. +""" +from dataclasses import dataclass +from typing import Any, Dict, Optional + +import requests + + +class GitHubException(Exception): + """Raised when the GitHub API returns a non-2xx response.""" + + def __init__(self, status: int, data: Any = None, headers: Any = None) -> None: + self.status = status + self.data = data + self.headers = headers + super().__init__(f"GitHub API error {status}: {data}") + + +@dataclass +class GitHubRepo: + """A GitHub repository.""" + + full_name: str + _client: "GitHubClient" + + +class GitHubClient: + """Authenticated client for the GitHub REST API.""" + + BASE_URL = "https://api.github.com" + + def __init__(self, token: str) -> None: + """Initialize the client with a personal access token or app token.""" + self._session = requests.Session() + self._session.headers.update( + { + "Authorization": f"Bearer {token}", + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + ) + + def get_repo(self, full_name: str) -> "GitHubRepo": + """Return a repo handle for owner/name (e.g. 
'ray-project/ray').""" + return GitHubRepo(full_name=full_name, _client=self) + + def _raise_for_response(self, resp: requests.Response) -> None: + if not resp.ok: + try: + data = resp.json() + except requests.exceptions.JSONDecodeError: + data = resp.text + raise GitHubException(resp.status_code, data, resp.headers) + + def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> dict: + resp = self._session.get(f"{self.BASE_URL}{path}", params=params) + self._raise_for_response(resp) + return resp.json() + + def _get_paginated( + self, path: str, params: Optional[Dict[str, Any]] = None + ) -> list: + params = dict(params or {}) + params["per_page"] = 100 + results = [] + url: Optional[str] = f"{self.BASE_URL}{path}" + while url: + resp = self._session.get(url, params=params) + self._raise_for_response(resp) + results.extend(resp.json()) + url = resp.links.get("next", {}).get("url") + params = None # subsequent URLs from Link header already include all params + return results + + def _post(self, path: str, data: dict) -> dict: + resp = self._session.post(f"{self.BASE_URL}{path}", json=data) + self._raise_for_response(resp) + return resp.json() + + def _patch(self, path: str, data: dict) -> dict: + resp = self._session.patch(f"{self.BASE_URL}{path}", json=data) + self._raise_for_response(resp) + return resp.json() diff --git a/release/ray_release/tests/test_github_client.py b/release/ray_release/tests/test_github_client.py new file mode 100644 index 000000000000..180390e4e1ac --- /dev/null +++ b/release/ray_release/tests/test_github_client.py @@ -0,0 +1,48 @@ +""" +Tests for the GitHub REST API client. + +Uses the `responses` library to intercept outbound HTTP requests without +patching Python internals, so tests exercise the full client code path: +URL construction, headers, JSON parsing, and dataclass population. 
+""" +import sys + +import pytest + +from ray_release.github_client import ( + GitHubClient, + GitHubException, + GitHubRepo, +) + +REPO = "owner/repo" +BASE = "https://api.github.com" + + +def _client() -> GitHubClient: + return GitHubClient("test-token") + + +def _repo(client: GitHubClient = None) -> GitHubRepo: + return (_client() if client is None else client).get_repo(REPO) + + +# --------------------------------------------------------------------------- +# GitHubException +# --------------------------------------------------------------------------- + + +def test_github_exception_stores_status_and_data(): + exc = GitHubException(404, {"message": "Not Found"}, {"x-header": "val"}) + assert exc.status == 404 + assert exc.data == {"message": "Not Found"} + assert exc.headers == {"x-header": "val"} + assert "404" in str(exc) + + +def test_github_exception_is_exception(): + assert isinstance(GitHubException(500), Exception) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-v", __file__])) diff --git a/release/release_tests.yaml b/release/release_tests.yaml index ceba4590201f..45e5629a09cf 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -3993,6 +3993,23 @@ script: > pytest -sv test_llm_serve_multi_node_integration.py +- name: llm_serve_dp_engine_fault + frequency: nightly + python: "3.11" + group: llm-serve + team: llm + working_dir: llm_tests/serve + + cluster: + byod: + type: llm-cu128 + cluster_compute: llm_2x_4xl4.yaml + + run: + timeout: 3600 + script: > + pytest -sv test_llm_serve_dp_engine_fault.py + - name: llm_serve_llama_3dot2_1B_no_accelerator frequency: nightly python: "3.11" diff --git a/release/requirements_py310.in b/release/requirements_py310.in index ba5bc311820c..000070858db1 100644 --- a/release/requirements_py310.in +++ b/release/requirements_py310.in @@ -13,6 +13,7 @@ jinja2 msal protobuf >= 3.15.3, != 3.19.5 pytest +responses pyyaml pybuildkite PyGithub diff --git a/release/requirements_py310.txt 
b/release/requirements_py310.txt index cdb5e9728567..3eb79d13827f 100644 --- a/release/requirements_py310.txt +++ b/release/requirements_py310.txt @@ -1663,6 +1663,10 @@ requests-toolbelt==1.0.0 \ --hash=sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6 \ --hash=sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06 # via twine +responses==0.25.3 \ + --hash=sha256:521efcbc82081ab8daa588e08f7e8a64ce79b91c39f6e62199b19159bea7dbcb \ + --hash=sha256:617b9247abd9ae28313d57a75880422d55ec63c29d33d629697590a034358dba + # via -r release/requirements_py310.in rfc3986==2.0.0 \ --hash=sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd \ --hash=sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c diff --git a/src/ray/gcs/gcs_autoscaler_state_manager.cc b/src/ray/gcs/gcs_autoscaler_state_manager.cc index 85dd16fd7d66..c5095523aba3 100644 --- a/src/ray/gcs/gcs_autoscaler_state_manager.cc +++ b/src/ray/gcs/gcs_autoscaler_state_manager.cc @@ -552,11 +552,10 @@ void GcsAutoscalerStateManager::HandleResizeRayletResourceInstances( raylet_client->ResizeLocalResourceInstances( std::move(*request.mutable_resources()), [reply, send_reply_callback]( - const Status &status, - const rpc::ResizeLocalResourceInstancesReply &raylet_reply) { + const Status &status, rpc::ResizeLocalResourceInstancesReply &&raylet_reply) { if (status.ok()) { - reply->mutable_total_resources()->insert(raylet_reply.total_resources().begin(), - raylet_reply.total_resources().end()); + *reply->mutable_total_resources() = + std::move(*raylet_reply.mutable_total_resources()); } send_reply_callback(status, nullptr, nullptr); }); diff --git a/src/ray/gcs_rpc_client/accessor.cc b/src/ray/gcs_rpc_client/accessor.cc index 3bfc7d9790ed..91a6d8bc3b08 100644 --- a/src/ray/gcs_rpc_client/accessor.cc +++ b/src/ray/gcs_rpc_client/accessor.cc @@ -1233,6 +1233,22 @@ Status AutoscalerStateAccessor::DrainNode(const std::string &node_id, return Status::OK(); 
} +Status AutoscalerStateAccessor::ResizeRayletResourceInstances( + const std::string &node_id, + const std::unordered_map &resources, + int64_t timeout_ms, + std::unordered_map &total_resources) { + rpc::autoscaler::ResizeRayletResourceInstancesRequest request; + request.set_node_id(NodeID::FromHex(node_id).Binary()); + request.mutable_resources()->insert(resources.begin(), resources.end()); + + rpc::autoscaler::ResizeRayletResourceInstancesReply reply; + RAY_RETURN_NOT_OK(client_impl_->GetGcsRpcClient().SyncResizeRayletResourceInstances( + std::move(request), &reply, timeout_ms)); + total_resources.insert(reply.total_resources().begin(), reply.total_resources().end()); + return Status::OK(); +} + PublisherAccessor::PublisherAccessor(GcsClient *client_impl) : client_impl_(client_impl) {} diff --git a/src/ray/gcs_rpc_client/accessor.h b/src/ray/gcs_rpc_client/accessor.h index 6ef9a73ac71f..008be7b6ec93 100644 --- a/src/ray/gcs_rpc_client/accessor.h +++ b/src/ray/gcs_rpc_client/accessor.h @@ -792,6 +792,12 @@ class AutoscalerStateAccessor { bool &is_accepted, std::string &rejection_reason_message); + virtual Status ResizeRayletResourceInstances( + const std::string &node_id, + const std::unordered_map &resources, + int64_t timeout_ms, + std::unordered_map &total_resources); + private: GcsClient *client_impl_; }; diff --git a/src/ray/raylet_rpc_client/raylet_client.cc b/src/ray/raylet_rpc_client/raylet_client.cc index 0f77672a1c81..47c3fab96851 100644 --- a/src/ray/raylet_rpc_client/raylet_client.cc +++ b/src/ray/raylet_rpc_client/raylet_client.cc @@ -392,12 +392,13 @@ void RayletClient::ResizeLocalResourceInstances( const rpc::ClientCallback &callback) { rpc::ResizeLocalResourceInstancesRequest request; *request.mutable_resources() = std::move(resources); - INVOKE_RPC_CALL(NodeManagerService, - ResizeLocalResourceInstances, - request, - callback, - grpc_client_, - /*method_timeout_ms*/ -1); + INVOKE_RETRYABLE_RPC_CALL(retryable_grpc_client_, + NodeManagerService, + 
ResizeLocalResourceInstances, + request, + callback, + grpc_client_, + /*method_timeout_ms*/ -1); } void RayletClient::IsLocalWorkerDead(