From de6fb36cde29d758708ab931228e565294d5e92a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 02:50:48 +0000 Subject: [PATCH 1/5] Initial plan From c44b1feaa2d3f8d7c63c868c2342c9816b2ad675 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 02:54:43 +0000 Subject: [PATCH 2/5] Add XPU support for PD disaggregation with NIXL backend Co-authored-by: zhenwei-intel <109187816+zhenwei-intel@users.noreply.github.com> --- lmcache/v1/storage_backend/p2p_backend.py | 1 + lmcache/v1/storage_backend/pd_backend.py | 10 ++++++-- lmcache/v1/transfer_channel/nixl_channel.py | 24 +++++++++++++++---- lmcache/v1/transfer_channel/transfer_utils.py | 6 +++-- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/lmcache/v1/storage_backend/p2p_backend.py b/lmcache/v1/storage_backend/p2p_backend.py index 89a1818a8b..3139b5774e 100644 --- a/lmcache/v1/storage_backend/p2p_backend.py +++ b/lmcache/v1/storage_backend/p2p_backend.py @@ -231,6 +231,7 @@ def __init__( peer_lookup_url=self.peer_lookup_url, backends=config.nixl_backends, event_loop=loop, + device="cpu", ) self.running = asyncio.Event() diff --git a/lmcache/v1/storage_backend/pd_backend.py b/lmcache/v1/storage_backend/pd_backend.py index 3fbdec40b8..926137209e 100644 --- a/lmcache/v1/storage_backend/pd_backend.py +++ b/lmcache/v1/storage_backend/pd_backend.py @@ -189,6 +189,7 @@ def __init__( tp_rank=self.tp_rank, peer_init_url=peer_init_url, backends=config.nixl_backends, + device=self.pd_config.buffer_device, ) if self.pd_config.role == "sender": @@ -217,8 +218,13 @@ def initialize_allocator( config.pd_buffer_device, metadata.worker_id, ) - logger.info(f"Setting cuda device to {corrected_device} ") - torch.cuda.set_device(corrected_device) + logger.info(f"Setting device to {corrected_device}") + + # Set device based on device type + if corrected_device.startswith("cuda"): + torch.cuda.set_device(corrected_device) + elif corrected_device.startswith("xpu"): + torch.xpu.set_device(corrected_device) paged_mem_allocator = PagedCpuGpuMemoryAllocator() paged_mem_allocator.init_gpu_memory_allocator( diff --git a/lmcache/v1/transfer_channel/nixl_channel.py b/lmcache/v1/transfer_channel/nixl_channel.py index 85744b4d67..22eb57b321 100644 --- a/lmcache/v1/transfer_channel/nixl_channel.py +++ b/lmcache/v1/transfer_channel/nixl_channel.py @@ -81,6 +81,9 @@ def __init__( else: backends = ["UCX"] + # Extract device from kwargs (optional, defaults to "cuda" for backwards compatibility) + device = kwargs.get("device", "cuda") + self.role = kwargs["role"] self.nixl_wrapper = NixlAgentWrapper( @@ -89,6 +92,7 @@ def __init__( page_size=kwargs["align_bytes"], tp_rank=kwargs["tp_rank"], backends=backends, + device=device, ) self.nixl_agent = self.nixl_wrapper.agent @@ -579,6 +583,7 @@ def __init__( page_size: int, tp_rank: int, backends: list[str], + device: str = "cuda", ): """ Initialize the NIXL agent. @@ -590,6 +595,8 @@ def __init__( the lmcache memory allocator. tp_rank (int): The tensor parallel rank. backends (list[str]): The list of backends to use. + device (str): The device type string (e.g., "cuda:0", "xpu:0"). + Defaults to "cuda" for backward compatibility. Returns: NixlWrapper: The NIXL agent. @@ -608,6 +615,16 @@ def __init__( if backends is None: backends = ["UCX"] + # Determine memory type based on device string + # device can be "cuda", "cuda:0", "xpu", "xpu:0", etc. + if device.startswith("cuda"): + mem_type = "cuda" + elif device.startswith("xpu"): + mem_type = "xpu" + else: + # Default to cuda for backward compatibility + mem_type = "cuda" + # Create a NIXL agent nixl_agent = NixlAgent( str(uuid.uuid4()), @@ -618,8 +635,7 @@ def __init__( # The four fields are (base_addr, length, dev_id, meta_info) # https://github.com/ai-dynamo/nixl/blob/main/src/api/cpp/nixl_descriptors.h#L152 memory_desc = [(buffer_ptr, buffer_size, tp_rank, "")] - # TODO(Jiayi): remove hardcode `mem_type` - reg_descs = nixl_agent.get_reg_descs(memory_desc, mem_type="cuda") + reg_descs = nixl_agent.get_reg_descs(memory_desc, mem_type=mem_type) nixl_agent.register_memory(reg_descs) # Create xfer handlers @@ -627,8 +643,8 @@ def __init__( for base_addr in range(buffer_ptr, buffer_ptr + buffer_size, page_size): xfer_desc.append((base_addr, page_size, tp_rank)) - xfer_descs = nixl_agent.get_xfer_descs(xfer_desc, mem_type="cuda") - xfer_handler = nixl_agent.prep_xfer_dlist("", xfer_descs, mem_type="cuda") + xfer_descs = nixl_agent.get_xfer_descs(xfer_desc, mem_type=mem_type) + xfer_handler = nixl_agent.prep_xfer_dlist("", xfer_descs, mem_type=mem_type) self.agent = nixl_agent self.reg_descs = reg_descs diff --git a/lmcache/v1/transfer_channel/transfer_utils.py b/lmcache/v1/transfer_channel/transfer_utils.py index 0e8e5b9b1f..f13a9c272e 100644 --- a/lmcache/v1/transfer_channel/transfer_utils.py +++ b/lmcache/v1/transfer_channel/transfer_utils.py @@ -11,8 +11,8 @@ def get_correct_device(device: str, worker_id: int) -> str: Get the correct device based on the given device string. Args: - device (str): The device string, could be cpu or cuda. - worker_id (int): The worker id to determine the cuda device. + device (str): The device string, could be cpu, cuda, or xpu. + worker_id (int): The worker id to determine the device. Returns: str: The correct device string with device id. @@ -21,6 +21,8 @@ def get_correct_device(device: str, worker_id: int) -> str: return "cpu" elif device.startswith("cuda"): return f"cuda:{worker_id}" + elif device.startswith("xpu"): + return f"xpu:{worker_id}" else: raise ValueError(f"Invalid device: {device}") From e93eed82cf00504822801e034a05004324e3d64a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 02:56:38 +0000 Subject: [PATCH 3/5] Fix linting issues (line length and formatting) Co-authored-by: zhenwei-intel <109187816+zhenwei-intel@users.noreply.github.com> --- lmcache/v1/storage_backend/pd_backend.py | 2 +- lmcache/v1/transfer_channel/nixl_channel.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lmcache/v1/storage_backend/pd_backend.py b/lmcache/v1/storage_backend/pd_backend.py index 926137209e..aeb722272e 100644 --- a/lmcache/v1/storage_backend/pd_backend.py +++ b/lmcache/v1/storage_backend/pd_backend.py @@ -219,7 +219,7 @@ def initialize_allocator( metadata.worker_id, ) logger.info(f"Setting device to {corrected_device}") - + # Set device based on device type if corrected_device.startswith("cuda"): torch.cuda.set_device(corrected_device) diff --git a/lmcache/v1/transfer_channel/nixl_channel.py b/lmcache/v1/transfer_channel/nixl_channel.py index 22eb57b321..4f56ec397d 100644 --- a/lmcache/v1/transfer_channel/nixl_channel.py +++ b/lmcache/v1/transfer_channel/nixl_channel.py @@ -81,7 +81,8 @@ def __init__( else: backends = ["UCX"] - # Extract device from kwargs (optional, defaults to "cuda" for backwards compatibility) + # Extract device from kwargs (optional, defaults to "cuda" for + # backwards compatibility) device = kwargs.get("device", "cuda") self.role = kwargs["role"] From 55b14e91277130cefbe86b9435538aa9c3350195 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 02:59:27 +0000 Subject: [PATCH 4/5] Address code review feedback: improve error handling and validation Co-authored-by: zhenwei-intel <109187816+zhenwei-intel@users.noreply.github.com> --- lmcache/v1/storage_backend/pd_backend.py | 5 +++++ lmcache/v1/transfer_channel/nixl_channel.py | 11 ++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lmcache/v1/storage_backend/pd_backend.py b/lmcache/v1/storage_backend/pd_backend.py index aeb722272e..5c4517c432 100644 --- a/lmcache/v1/storage_backend/pd_backend.py +++ b/lmcache/v1/storage_backend/pd_backend.py @@ -224,6 +224,11 @@ def initialize_allocator( if corrected_device.startswith("cuda"): torch.cuda.set_device(corrected_device) elif corrected_device.startswith("xpu"): + if not hasattr(torch, "xpu"): + raise RuntimeError( + "XPU device is not available. Please ensure PyTorch is built " + "with XPU support." + ) torch.xpu.set_device(corrected_device) paged_mem_allocator = PagedCpuGpuMemoryAllocator() diff --git a/lmcache/v1/transfer_channel/nixl_channel.py b/lmcache/v1/transfer_channel/nixl_channel.py index 4f56ec397d..8c0ef70e21 100644 --- a/lmcache/v1/transfer_channel/nixl_channel.py +++ b/lmcache/v1/transfer_channel/nixl_channel.py @@ -617,14 +617,19 @@ def __init__( backends = ["UCX"] # Determine memory type based on device string - # device can be "cuda", "cuda:0", "xpu", "xpu:0", etc. + # device can be "cuda", "cuda:0", "xpu", "xpu:0", "cpu", etc. if device.startswith("cuda"): mem_type = "cuda" elif device.startswith("xpu"): mem_type = "xpu" + elif device.startswith("cpu"): + mem_type = "cpu" else: - # Default to cuda for backward compatibility - mem_type = "cuda" + # Raise error for unsupported device types + raise ValueError( + f"Unsupported device type: {device}. " + "Supported device types are: cuda, xpu, cpu" + ) # Create a NIXL agent nixl_agent = NixlAgent( From bea0f190308390ce3bda084ce16c8dc75c290685 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 28 Jan 2026 03:02:53 +0000 Subject: [PATCH 5/5] Fix XPU support in NIXL storage backend and cache engine Co-authored-by: zhenwei-intel <109187816+zhenwei-intel@users.noreply.github.com> --- lmcache/v1/cache_engine.py | 13 +++++++++++-- lmcache/v1/storage_backend/nixl_storage_backend.py | 11 ++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/lmcache/v1/cache_engine.py b/lmcache/v1/cache_engine.py index 61231b2aff..3900eded69 100644 --- a/lmcache/v1/cache_engine.py +++ b/lmcache/v1/cache_engine.py @@ -1794,8 +1794,17 @@ def _Create_memory_allocator( buffer.data_ptr(), config.nixl_buffer_size, 0 ) else: - logger.info(f"Setting cuda device to {corrected_device} ") - torch.cuda.set_device(corrected_device) + logger.info(f"Setting device to {corrected_device}") + # Set device based on device type + if corrected_device.startswith("cuda"): + torch.cuda.set_device(corrected_device) + elif corrected_device.startswith("xpu"): + if not hasattr(torch, "xpu"): + raise RuntimeError( + "XPU device is not available. Please ensure PyTorch " + "is built with XPU support." + ) + torch.xpu.set_device(corrected_device) return PagedTensorMemoryAllocator( buffer, diff --git a/lmcache/v1/storage_backend/nixl_storage_backend.py b/lmcache/v1/storage_backend/nixl_storage_backend.py index 6bbc3abc49..6f7521838e 100644 --- a/lmcache/v1/storage_backend/nixl_storage_backend.py +++ b/lmcache/v1/storage_backend/nixl_storage_backend.py @@ -532,7 +532,16 @@ def initialize_allocator( base_buffer, self.buffer = _allocate_gpu_memory( config.nixl_buffer_size, corrected_device ) - torch.cuda.set_device(corrected_device) + # Set device based on device type + if corrected_device.startswith("cuda"): + torch.cuda.set_device(corrected_device) + elif corrected_device.startswith("xpu"): + if not hasattr(torch, "xpu"): + raise RuntimeError( + "XPU device is not available. Please ensure PyTorch is " + "built with XPU support." + ) + torch.xpu.set_device(corrected_device) self.base_buffer = base_buffer # Prevents early GC of the aligned tensor. self.free_pinned_buffer = False