From de6fb36cde29d758708ab931228e565294d5e92a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 02:50:48 +0000
Subject: [PATCH 1/5] Initial plan


From c44b1feaa2d3f8d7c63c868c2342c9816b2ad675 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 02:54:43 +0000
Subject: [PATCH 2/5] Add XPU support for PD disaggregation with NIXL backend

Co-authored-by: zhenwei-intel <109187816+zhenwei-intel@users.noreply.github.com>
---
 lmcache/v1/storage_backend/p2p_backend.py     |  1 +
 lmcache/v1/storage_backend/pd_backend.py      | 10 ++++++--
 lmcache/v1/transfer_channel/nixl_channel.py   | 24 +++++++++++++++----
 lmcache/v1/transfer_channel/transfer_utils.py |  6 +++--
 4 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/lmcache/v1/storage_backend/p2p_backend.py b/lmcache/v1/storage_backend/p2p_backend.py
index 89a1818a8b..3139b5774e 100644
--- a/lmcache/v1/storage_backend/p2p_backend.py
+++ b/lmcache/v1/storage_backend/p2p_backend.py
@@ -231,6 +231,7 @@ def __init__(
             peer_lookup_url=self.peer_lookup_url,
             backends=config.nixl_backends,
             event_loop=loop,
+            device="cpu",
         )
 
         self.running = asyncio.Event()
diff --git a/lmcache/v1/storage_backend/pd_backend.py b/lmcache/v1/storage_backend/pd_backend.py
index 3fbdec40b8..926137209e 100644
--- a/lmcache/v1/storage_backend/pd_backend.py
+++ b/lmcache/v1/storage_backend/pd_backend.py
@@ -189,6 +189,7 @@ def __init__(
             tp_rank=self.tp_rank,
             peer_init_url=peer_init_url,
             backends=config.nixl_backends,
+            device=self.pd_config.buffer_device,
         )
 
         if self.pd_config.role == "sender":
@@ -217,8 +218,13 @@ def initialize_allocator(
             config.pd_buffer_device,
             metadata.worker_id,
         )
-        logger.info(f"Setting cuda device to {corrected_device} ")
-        torch.cuda.set_device(corrected_device)
+        logger.info(f"Setting device to {corrected_device}")
+        
+        # Set device based on device type
+        if corrected_device.startswith("cuda"):
+            torch.cuda.set_device(corrected_device)
+        elif corrected_device.startswith("xpu"):
+            torch.xpu.set_device(corrected_device)
 
         paged_mem_allocator = PagedCpuGpuMemoryAllocator()
         paged_mem_allocator.init_gpu_memory_allocator(
diff --git a/lmcache/v1/transfer_channel/nixl_channel.py b/lmcache/v1/transfer_channel/nixl_channel.py
index 85744b4d67..22eb57b321 100644
--- a/lmcache/v1/transfer_channel/nixl_channel.py
+++ b/lmcache/v1/transfer_channel/nixl_channel.py
@@ -81,6 +81,9 @@ def __init__(
         else:
             backends = ["UCX"]
 
+        # Extract device from kwargs (optional, defaults to "cuda" for backwards compatibility)
+        device = kwargs.get("device", "cuda")
+
         self.role = kwargs["role"]
 
         self.nixl_wrapper = NixlAgentWrapper(
@@ -89,6 +92,7 @@ def __init__(
             page_size=kwargs["align_bytes"],
             tp_rank=kwargs["tp_rank"],
             backends=backends,
+            device=device,
         )
         self.nixl_agent = self.nixl_wrapper.agent
 
@@ -579,6 +583,7 @@ def __init__(
         page_size: int,
         tp_rank: int,
         backends: list[str],
+        device: str = "cuda",
     ):
         """
         Initialize the NIXL agent.
@@ -590,6 +595,8 @@ def __init__(
                 the lmcache memory allocator.
             tp_rank (int): The tensor parallel rank.
             backends (list[str]): The list of backends to use.
+            device (str): The device type string (e.g., "cuda:0", "xpu:0").
+                Defaults to "cuda" for backward compatibility.
 
         Returns:
             NixlWrapper: The NIXL agent.
@@ -608,6 +615,16 @@ def __init__(
         if backends is None:
             backends = ["UCX"]
 
+        # Determine memory type based on device string
+        # device can be "cuda", "cuda:0", "xpu", "xpu:0", etc.
+        if device.startswith("cuda"):
+            mem_type = "cuda"
+        elif device.startswith("xpu"):
+            mem_type = "xpu"
+        else:
+            # Default to cuda for backward compatibility
+            mem_type = "cuda"
+
         # Create a NIXL agent
         nixl_agent = NixlAgent(
             str(uuid.uuid4()),
@@ -618,8 +635,7 @@ def __init__(
         # The four fields are (base_addr, length, dev_id, meta_info)
         # https://github.com/ai-dynamo/nixl/blob/main/src/api/cpp/nixl_descriptors.h#L152
         memory_desc = [(buffer_ptr, buffer_size, tp_rank, "")]
-        # TODO(Jiayi): remove hardcode `mem_type`
-        reg_descs = nixl_agent.get_reg_descs(memory_desc, mem_type="cuda")
+        reg_descs = nixl_agent.get_reg_descs(memory_desc, mem_type=mem_type)
         nixl_agent.register_memory(reg_descs)
 
         # Create xfer handlers
@@ -627,8 +643,8 @@ def __init__(
         for base_addr in range(buffer_ptr, buffer_ptr + buffer_size, page_size):
             xfer_desc.append((base_addr, page_size, tp_rank))
 
-        xfer_descs = nixl_agent.get_xfer_descs(xfer_desc, mem_type="cuda")
-        xfer_handler = nixl_agent.prep_xfer_dlist("", xfer_descs, mem_type="cuda")
+        xfer_descs = nixl_agent.get_xfer_descs(xfer_desc, mem_type=mem_type)
+        xfer_handler = nixl_agent.prep_xfer_dlist("", xfer_descs, mem_type=mem_type)
 
         self.agent = nixl_agent
         self.reg_descs = reg_descs
diff --git a/lmcache/v1/transfer_channel/transfer_utils.py b/lmcache/v1/transfer_channel/transfer_utils.py
index 0e8e5b9b1f..f13a9c272e 100644
--- a/lmcache/v1/transfer_channel/transfer_utils.py
+++ b/lmcache/v1/transfer_channel/transfer_utils.py
@@ -11,8 +11,8 @@ def get_correct_device(device: str, worker_id: int) -> str:
     Get the correct device based on the given device string.
 
     Args:
-        device (str): The device string, could be cpu or cuda.
-        worker_id (int): The worker id to determine the cuda device.
+        device (str): The device string, could be cpu, cuda, or xpu.
+        worker_id (int): The worker id to determine the device.
 
     Returns:
         str: The correct device string with device id.
@@ -21,6 +21,8 @@ def get_correct_device(device: str, worker_id: int) -> str:
         return "cpu"
     elif device.startswith("cuda"):
         return f"cuda:{worker_id}"
+    elif device.startswith("xpu"):
+        return f"xpu:{worker_id}"
     else:
         raise ValueError(f"Invalid device: {device}")
 

From e93eed82cf00504822801e034a05004324e3d64a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 02:56:38 +0000
Subject: [PATCH 3/5] Fix linting issues (line length and formatting)

Co-authored-by: zhenwei-intel <109187816+zhenwei-intel@users.noreply.github.com>
---
 lmcache/v1/storage_backend/pd_backend.py    | 2 +-
 lmcache/v1/transfer_channel/nixl_channel.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/lmcache/v1/storage_backend/pd_backend.py b/lmcache/v1/storage_backend/pd_backend.py
index 926137209e..aeb722272e 100644
--- a/lmcache/v1/storage_backend/pd_backend.py
+++ b/lmcache/v1/storage_backend/pd_backend.py
@@ -219,7 +219,7 @@ def initialize_allocator(
             metadata.worker_id,
         )
         logger.info(f"Setting device to {corrected_device}")
-        
+
         # Set device based on device type
         if corrected_device.startswith("cuda"):
             torch.cuda.set_device(corrected_device)
diff --git a/lmcache/v1/transfer_channel/nixl_channel.py b/lmcache/v1/transfer_channel/nixl_channel.py
index 22eb57b321..4f56ec397d 100644
--- a/lmcache/v1/transfer_channel/nixl_channel.py
+++ b/lmcache/v1/transfer_channel/nixl_channel.py
@@ -81,7 +81,8 @@ def __init__(
         else:
             backends = ["UCX"]
 
-        # Extract device from kwargs (optional, defaults to "cuda" for backwards compatibility)
+        # Extract device from kwargs (optional, defaults to "cuda" for
+        # backwards compatibility)
         device = kwargs.get("device", "cuda")
 
         self.role = kwargs["role"]

From 55b14e91277130cefbe86b9435538aa9c3350195 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 02:59:27 +0000
Subject: [PATCH 4/5] Address code review feedback: improve error handling and
 validation

Co-authored-by: zhenwei-intel <109187816+zhenwei-intel@users.noreply.github.com>
---
 lmcache/v1/storage_backend/pd_backend.py    |  5 +++++
 lmcache/v1/transfer_channel/nixl_channel.py | 11 ++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/lmcache/v1/storage_backend/pd_backend.py b/lmcache/v1/storage_backend/pd_backend.py
index aeb722272e..5c4517c432 100644
--- a/lmcache/v1/storage_backend/pd_backend.py
+++ b/lmcache/v1/storage_backend/pd_backend.py
@@ -224,6 +224,11 @@ def initialize_allocator(
         if corrected_device.startswith("cuda"):
             torch.cuda.set_device(corrected_device)
         elif corrected_device.startswith("xpu"):
+            if not hasattr(torch, "xpu"):
+                raise RuntimeError(
+                    "XPU device is not available. Please ensure PyTorch is built "
+                    "with XPU support."
+                )
             torch.xpu.set_device(corrected_device)
 
         paged_mem_allocator = PagedCpuGpuMemoryAllocator()
diff --git a/lmcache/v1/transfer_channel/nixl_channel.py b/lmcache/v1/transfer_channel/nixl_channel.py
index 4f56ec397d..8c0ef70e21 100644
--- a/lmcache/v1/transfer_channel/nixl_channel.py
+++ b/lmcache/v1/transfer_channel/nixl_channel.py
@@ -617,14 +617,19 @@ def __init__(
             backends = ["UCX"]
 
         # Determine memory type based on device string
-        # device can be "cuda", "cuda:0", "xpu", "xpu:0", etc.
+        # device can be "cuda", "cuda:0", "xpu", "xpu:0", "cpu", etc.
         if device.startswith("cuda"):
             mem_type = "cuda"
         elif device.startswith("xpu"):
             mem_type = "xpu"
+        elif device.startswith("cpu"):
+            mem_type = "cpu"
         else:
-            # Default to cuda for backward compatibility
-            mem_type = "cuda"
+            # Raise error for unsupported device types
+            raise ValueError(
+                f"Unsupported device type: {device}. "
+                "Supported device types are: cuda, xpu, cpu"
+            )
 
         # Create a NIXL agent
         nixl_agent = NixlAgent(

From bea0f190308390ce3bda084ce16c8dc75c290685 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 03:02:53 +0000
Subject: [PATCH 5/5] Fix XPU support in NIXL storage backend and cache engine

Co-authored-by: zhenwei-intel <109187816+zhenwei-intel@users.noreply.github.com>
---
 lmcache/v1/cache_engine.py                         | 13 +++++++++++--
 lmcache/v1/storage_backend/nixl_storage_backend.py | 11 ++++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/lmcache/v1/cache_engine.py b/lmcache/v1/cache_engine.py
index 61231b2aff..3900eded69 100644
--- a/lmcache/v1/cache_engine.py
+++ b/lmcache/v1/cache_engine.py
@@ -1794,8 +1794,17 @@ def _Create_memory_allocator(
                     buffer.data_ptr(), config.nixl_buffer_size, 0
                 )
             else:
-                logger.info(f"Setting cuda device to {corrected_device} ")
-                torch.cuda.set_device(corrected_device)
+                logger.info(f"Setting device to {corrected_device}")
+                # Set device based on device type
+                if corrected_device.startswith("cuda"):
+                    torch.cuda.set_device(corrected_device)
+                elif corrected_device.startswith("xpu"):
+                    if not hasattr(torch, "xpu"):
+                        raise RuntimeError(
+                            "XPU device is not available. Please ensure PyTorch "
+                            "is built with XPU support."
+                        )
+                    torch.xpu.set_device(corrected_device)
 
             return PagedTensorMemoryAllocator(
                 buffer,
diff --git a/lmcache/v1/storage_backend/nixl_storage_backend.py b/lmcache/v1/storage_backend/nixl_storage_backend.py
index 6bbc3abc49..6f7521838e 100644
--- a/lmcache/v1/storage_backend/nixl_storage_backend.py
+++ b/lmcache/v1/storage_backend/nixl_storage_backend.py
@@ -532,7 +532,16 @@ def initialize_allocator(
             base_buffer, self.buffer = _allocate_gpu_memory(
                 config.nixl_buffer_size, corrected_device
             )
-            torch.cuda.set_device(corrected_device)
+            # Set device based on device type
+            if corrected_device.startswith("cuda"):
+                torch.cuda.set_device(corrected_device)
+            elif corrected_device.startswith("xpu"):
+                if not hasattr(torch, "xpu"):
+                    raise RuntimeError(
+                        "XPU device is not available. Please ensure PyTorch is "
+                        "built with XPU support."
+                    )
+                torch.xpu.set_device(corrected_device)
             self.base_buffer = base_buffer  # Prevents early GC of the aligned tensor.
             self.free_pinned_buffer = False