From c82cd357aada9df150ee3fb55b30ee640aeb8f33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Wed, 25 Mar 2026 17:24:46 +0300
Subject: [PATCH 01/11] server: clear idle slots' KV from VRAM
 (LLAMA_KV_KEEP_ONLY_ACTIVE)

---
 tools/server/server-context.cpp               | 52 +++++++++++-
 tools/server/server-task.cpp                  |  2 +-
 .../tests/unit/test_kv_keep_only_active.py    | 81 +++++++++++++++++++
 tools/server/tests/utils.py                   | 15 +++-
 4 files changed, 143 insertions(+), 7 deletions(-)
 create mode 100644 tools/server/tests/unit/test_kv_keep_only_active.py

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index b79a5270b52..bbddf5e5564 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -573,6 +573,7 @@ struct server_context_impl {
     int slots_debug = 0;
     int n_empty_consecutive = 0;
 
+    bool kv_keep_only_active = false;
     std::unique_ptr<server_prompt_cache> prompt_cache;
 
     server_metrics metrics;
@@ -864,6 +865,21 @@ struct server_context_impl {
 
         metrics.init();
 
+        // LLAMA_KV_KEEP_ONLY_ACTIVE: clear idle slots' KV from VRAM before each decode batch
+        {
+            const char * env = getenv("LLAMA_KV_KEEP_ONLY_ACTIVE");
+            if (env && atoi(env)) {
+                if (!params_base.kv_unified) {
+                    SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires unified KV cache, ignoring");
+                } else if (params_base.cache_ram_mib == 0) {
+                    SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires --cache-ram, ignoring");
+                } else {
+                    kv_keep_only_active = true;
+                    SRV_INF("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE: idle slots' KV will be cleared from VRAM before each decode");
+                }
+            }
+        }
+
         // populate webui settings
         {
             if (!params_base.webui_config_json.empty()) {
@@ -1010,15 +1026,15 @@ struct server_context_impl {
             // cache prompts only for completion tasks
             update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION;
 
-            // don't update the cache if the slot's context is empty
-            update_cache = update_cache && tokens.size() > 0;
-
             if (update_cache) {
                 SRV_WRN("%s", "updating prompt cache\n");
 
                 const int64_t t_start = ggml_time_us();
 
-                ret->prompt_save(*prompt_cache);
+                // don't save the slot's state if its context is empty
+                if (tokens.size() > 0) {
+                    ret->prompt_save(*prompt_cache);
+                }
 
                 if (!ret->prompt_load(*prompt_cache, task.tokens)) {
                     ret->prompt_clear(false);
@@ -2689,6 +2705,34 @@ struct server_context_impl {
             n_empty_consecutive = 0;
         }
 
+        if (kv_keep_only_active && batch.n_tokens > 0) { // LLAMA_KV_KEEP_ONLY_ACTIVE: clear idle slots' KV
+            int kv_used = 0;
+            int n_cleared = 0;
+
+            for (auto & slot : slots) {
+                const int n_tokens = slot.prompt.n_tokens();
+                if (n_tokens == 0) {
+                    continue;
+                }
+                if (slot.is_processing()) {
+                    kv_used += n_tokens;
+                    continue;
+                }
+
+                slot.prompt_save(*prompt_cache);
+                slot.prompt_clear(false);
+                ++n_cleared;
+
+                SLT_DBG(slot, "kv_keep_only_active: cleared idle slot with %d tokens\n", n_tokens);
+            }
+
+            if (n_cleared > 0) {
+                prompt_cache->update();
+
+                SRV_INF("kv_keep_only_active: cleared %d slot(s), kv: %d/%d\n", n_cleared, kv_used, n_ctx);
+            }
+        }
+
         int32_t i_next = 0;
 
         // process the created batch of tokens
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 7d543b9292b..0534409c032 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -2010,7 +2010,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
 bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) {
     const int lcp_best = prompt.tokens.get_common_prefix(tokens_new);
 
-    float f_keep_best = float(lcp_best) / prompt.tokens.size();
+    float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins
     float sim_best    = float(lcp_best) / tokens_new.size();
 
     SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
new file mode 100644
index 00000000000..7d8d79e2842
--- /dev/null
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -0,0 +1,81 @@
+import os
+import tempfile
+import pytest
+from utils import *
+
+server = ServerPreset.tinyllama2()
+
+class LogReader:
+    def __init__(self, path):
+        self.path = path
+        self.pos = 0
+    def drain(self):
+        with open(self.path) as f:
+            f.seek(self.pos)
+            content = f.read()
+            self.pos = f.tell()
+        return content
+
+@pytest.fixture(autouse=True)
+def create_server():
+    global server
+    os.environ["LLAMA_KV_KEEP_ONLY_ACTIVE"] = "1"
+    server = ServerPreset.tinyllama2()
+    server.n_slots = 2
+    server.n_predict = 4
+    server.temperature = 0.0
+    server.server_slots = True
+    server.cache_ram = 100
+    server.kv_unified = True
+    fd, server.log_path = tempfile.mkstemp(suffix='.log')
+    os.close(fd)
+    yield
+    os.environ.pop("LLAMA_KV_KEEP_ONLY_ACTIVE", None)
+    if os.path.exists(server.log_path):
+        os.unlink(server.log_path)
+
+
+LONG_PROMPT = (
+    "Once upon a time in a land far away, there lived a brave knight "
+    "who traveled across mountains and rivers to find the legendary "
+    "golden sword hidden deep within the enchanted forest of whispers. "
+    "He met many creatures along the way including dragons and fairies "
+    "and wizards who helped him on his noble quest to save the kingdom."
+)
+
+
+# idle slot cleared by LLAMA_KV_KEEP_ONLY_ACTIVE should restore from cache-ram
+def test_clear_and_restore():
+    global server
+    server.start()
+    log = LogReader(server.log_path)
+
+    # verify feature is enabled
+    assert "LLAMA_KV_KEEP_ONLY_ACTIVE" in log.drain()
+
+    res = server.make_request("POST", "/completion", data={
+        "prompt": LONG_PROMPT,
+        "id_slot": 0,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    original_prompt_n = res.body["timings"]["prompt_n"]
+
+    # Request on slot 1 triggers batch, clearing idle slot 0
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "The quick brown fox",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert "kv_keep_only_active: cleared 1 slot" in log.drain()
+
+    # Re-send same prompt — should restore from cache-ram
+    res = server.make_request("POST", "/completion", data={
+        "prompt": LONG_PROMPT,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert "updating prompt cache" in log.drain()
+    assert res.body["timings"]["cache_n"] > 0
+    assert res.body["timings"]["prompt_n"] < original_prompt_n
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index c6fe11261f9..1edda66a84a 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -102,6 +102,8 @@ class ServerProcess:
     mmproj_url: str | None = None
     media_path: str | None = None
     sleep_idle_seconds: int | None = None
+    cache_ram: int | None = None
+    log_path: str | None = None
     webui_mcp_proxy: bool = False
 
     # session variables
@@ -237,6 +239,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--media-path", self.media_path])
         if self.sleep_idle_seconds is not None:
             server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
+        if self.cache_ram is not None:
+            server_args.extend(["--cache-ram", self.cache_ram])
         if self.webui_mcp_proxy:
             server_args.append("--webui-mcp-proxy")
 
@@ -249,11 +253,16 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             flags |= subprocess.CREATE_NEW_PROCESS_GROUP
             flags |= subprocess.CREATE_NO_WINDOW
 
+        if self.log_path:
+            self._log = open(self.log_path, "w")
+        else:
+            self._log = sys.stdout
+
         self.process = subprocess.Popen(
             [str(arg) for arg in [server_path, *server_args]],
             creationflags=flags,
-            stdout=sys.stdout,
-            stderr=sys.stdout,
+            stdout=self._log,
+            stderr=self._log if self._log != sys.stdout else sys.stdout,
             env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None,
         )
         server_instances.add(self)
@@ -290,6 +299,8 @@ def stop(self) -> None:
             print(f"Stopping server with pid={self.process.pid}")
             self.process.kill()
             self.process = None
+        if hasattr(self, '_log') and self._log != sys.stdout:
+            self._log.close()
 
     def make_request(
         self,

From 2563b4ae21fc26fded04b2018a3c769d2ab7f8c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Fri, 27 Mar 2026 23:01:45 +0300
Subject: [PATCH 02/11] server: move idle slot KV clearing to slot release

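Save and clear a slot's KV in the slot's release callback instead of in the
pre-decode loop. The cost of saving to the prompt cache is now paid by the
request that is finishing, rather than by the next decode batch.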
---
 tools/server/server-context.cpp               | 39 +++++--------------
 .../tests/unit/test_kv_keep_only_active.py    |  2 +-
 2 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index bbddf5e5564..b37408c71d2 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -788,6 +788,15 @@ struct server_context_impl {
 
             slot.callback_on_release = [this](int id_slot) {
                 queue_tasks.pop_deferred_task(id_slot);
+
+                if (kv_keep_only_active) {
+                    auto & slot = slots[id_slot];
+                    if (slot.prompt.n_tokens() > 0) {
+                        slot.prompt_save(*prompt_cache);
+                        slot.prompt_clear(false);
+                        prompt_cache->update();
+                    }
+                }
             };
 
             slot.reset();
@@ -875,7 +884,7 @@ struct server_context_impl {
                     SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires --cache-ram, ignoring");
                 } else {
                     kv_keep_only_active = true;
-                    SRV_INF("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE: idle slots' KV will be cleared from VRAM before each decode");
+                    SRV_INF("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE: idle slots' KV will be saved to cache-ram and cleared on release");
                 }
             }
         }
@@ -2705,34 +2714,6 @@ struct server_context_impl {
             n_empty_consecutive = 0;
         }
 
-        if (kv_keep_only_active && batch.n_tokens > 0) { // LLAMA_KV_KEEP_ONLY_ACTIVE: clear idle slots' KV
-            int kv_used = 0;
-            int n_cleared = 0;
-
-            for (auto & slot : slots) {
-                const int n_tokens = slot.prompt.n_tokens();
-                if (n_tokens == 0) {
-                    continue;
-                }
-                if (slot.is_processing()) {
-                    kv_used += n_tokens;
-                    continue;
-                }
-
-                slot.prompt_save(*prompt_cache);
-                slot.prompt_clear(false);
-                ++n_cleared;
-
-                SLT_DBG(slot, "kv_keep_only_active: cleared idle slot with %d tokens\n", n_tokens);
-            }
-
-            if (n_cleared > 0) {
-                prompt_cache->update();
-
-                SRV_INF("kv_keep_only_active: cleared %d slot(s), kv: %d/%d\n", n_cleared, kv_used, n_ctx);
-            }
-        }
-
         int32_t i_next = 0;
 
         // process the created batch of tokens
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index 7d8d79e2842..4d7213ec433 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -68,7 +68,7 @@ def test_clear_and_restore():
         "cache_prompt": True,
     })
     assert res.status_code == 200
-    assert "kv_keep_only_active: cleared 1 slot" in log.drain()
+    assert "clearing prompt with" in log.drain()
 
     # Re-send same prompt — should restore from cache-ram
     res = server.make_request("POST", "/completion", data={

From ced2476cbd13c28797bf1cd04f6ed4723fc5de22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Fri, 27 Mar 2026 23:05:44 +0300
Subject: [PATCH 03/11] server: add --kv-clear-idle flag, enable by default

---
 common/arg.cpp                                |  8 ++++++++
 common/common.h                               |  1 +
 tools/server/server-context.cpp               | 20 ++++++++-----------
 .../tests/unit/test_kv_keep_only_active.py    |  4 +---
 4 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 0096e7a68d6..99b725b6c8f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1309,6 +1309,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.kv_unified = value;
         }
     ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
+    add_opt(common_arg(
+        {"--kv-clear-idle"},
+        {"--no-kv-clear-idle"},
+        "save and clear idle slots' KV on release (default: enabled, requires unified KV and cache-ram)",
+        [](common_params & params, bool value) {
+            params.kv_clear_idle = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
diff --git a/common/common.h b/common/common.h
index 62201ea1ad3..6ee835c70e2 100644
--- a/common/common.h
+++ b/common/common.h
@@ -530,6 +530,7 @@ struct common_params {
     bool ctx_shift         = false; // context shift on infinite text generation
     bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified        = false; // enable unified KV cache
+    bool kv_clear_idle     = true;  // save and clear idle slots' KV on release
 
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap          = true;  // enable mmap to use filesystem cache
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index b37408c71d2..7911f502e3d 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -874,18 +874,14 @@ struct server_context_impl {
 
         metrics.init();
 
-        // LLAMA_KV_KEEP_ONLY_ACTIVE: clear idle slots' KV from VRAM before each decode batch
-        {
-            const char * env = getenv("LLAMA_KV_KEEP_ONLY_ACTIVE");
-            if (env && atoi(env)) {
-                if (!params_base.kv_unified) {
-                    SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires unified KV cache, ignoring");
-                } else if (params_base.cache_ram_mib == 0) {
-                    SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires --cache-ram, ignoring");
-                } else {
-                    kv_keep_only_active = true;
-                    SRV_INF("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE: idle slots' KV will be saved to cache-ram and cleared on release");
-                }
+        if (params_base.kv_clear_idle) {
+            if (!params_base.kv_unified) {
+                SRV_WRN("%s\n", "--kv-clear-idle requires --kv-unified, disabling");
+            } else if (params_base.cache_ram_mib == 0) {
+                SRV_WRN("%s\n", "--kv-clear-idle requires --cache-ram, disabling");
+            } else {
+                kv_keep_only_active = true;
+                SRV_INF("%s\n", "kv-clear-idle: idle slots' KV will be saved to cache-ram and cleared on release");
             }
         }
 
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index 4d7213ec433..79129c6bceb 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -19,7 +19,6 @@ def drain(self):
 @pytest.fixture(autouse=True)
 def create_server():
     global server
-    os.environ["LLAMA_KV_KEEP_ONLY_ACTIVE"] = "1"
     server = ServerPreset.tinyllama2()
     server.n_slots = 2
     server.n_predict = 4
@@ -30,7 +29,6 @@ def create_server():
     fd, server.log_path = tempfile.mkstemp(suffix='.log')
     os.close(fd)
     yield
-    os.environ.pop("LLAMA_KV_KEEP_ONLY_ACTIVE", None)
     if os.path.exists(server.log_path):
         os.unlink(server.log_path)
 
@@ -51,7 +49,7 @@ def test_clear_and_restore():
     log = LogReader(server.log_path)
 
     # verify feature is enabled
-    assert "LLAMA_KV_KEEP_ONLY_ACTIVE" in log.drain()
+    assert "kv-clear-idle" in log.drain()
 
     res = server.make_request("POST", "/completion", data={
         "prompt": LONG_PROMPT,

From f6c4a3d21be27e1d6ea73f0acce35870194e10a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Sat, 28 Mar 2026 07:52:50 +0300
Subject: [PATCH 04/11] server: skip clearing last idle slot, clear on launch

---
 tools/server/server-context.cpp               | 31 ++++++++++++++++---
 .../tests/unit/test_kv_keep_only_active.py    | 15 +++++++--
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 7911f502e3d..0f1a4d92f95 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -606,6 +606,15 @@ struct server_context_impl {
         llama_batch_free(batch);
     }
 
+    void slot_save_and_clear(server_slot & slot) {
+        if (slot.prompt.n_tokens() == 0) {
+            return;
+        }
+        slot.prompt_save(*prompt_cache);
+        slot.prompt_clear(false);
+        prompt_cache->update();
+    }
+
     void handle_sleeping_state(bool new_state) {
         GGML_ASSERT(sleeping != new_state);
         if (new_state) {
@@ -790,11 +799,15 @@ struct server_context_impl {
                 queue_tasks.pop_deferred_task(id_slot);
 
                 if (kv_keep_only_active) {
-                    auto & slot = slots[id_slot];
-                    if (slot.prompt.n_tokens() > 0) {
-                        slot.prompt_save(*prompt_cache);
-                        slot.prompt_clear(false);
-                        prompt_cache->update();
+                    bool should_clear = false;
+                    for (const auto & s : slots) {
+                        if (s.id != id_slot && s.prompt.n_tokens() > 0) {
+                            should_clear = true;
+                            break;
+                        }
+                    }
+                    if (should_clear) {
+                        slot_save_and_clear(slots[id_slot]);
                     }
                 }
             };
@@ -1100,6 +1113,14 @@ struct server_context_impl {
     }
 
     bool launch_slot_with_task(server_slot & slot, server_task && task) {
+        if (kv_keep_only_active) {
+            for (auto & s : slots) {
+                if (s.id != slot.id && !s.is_processing()) {
+                    slot_save_and_clear(s);
+                }
+            }
+        }
+
         // process per-request lora adapters
         if (!task.params.lora.empty()) {
             auto task_loras = construct_lora_list(task.params.lora);
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index 79129c6bceb..aa8b2d8b9fd 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -42,7 +42,7 @@ def create_server():
 )
 
 
-# idle slot cleared by LLAMA_KV_KEEP_ONLY_ACTIVE should restore from cache-ram
+# idle slot cleared on release should restore from cache-ram
 def test_clear_and_restore():
     global server
     server.start()
@@ -59,7 +59,10 @@ def test_clear_and_restore():
     assert res.status_code == 200
     original_prompt_n = res.body["timings"]["prompt_n"]
 
-    # Request on slot 1 triggers batch, clearing idle slot 0
+    # Slot 0 is the only slot with KV — should NOT be cleared
+    assert "clearing prompt with" not in log.drain()
+
+    # Launching slot 1 clears idle slot 0
     res = server.make_request("POST", "/completion", data={
         "prompt": "The quick brown fox",
         "id_slot": 1,
@@ -77,3 +80,11 @@ def test_clear_and_restore():
     assert "updating prompt cache" in log.drain()
     assert res.body["timings"]["cache_n"] > 0
     assert res.body["timings"]["prompt_n"] < original_prompt_n
+
+    # Follow-up — slot 0 kept its KV, no clearing needed
+    res = server.make_request("POST", "/completion", data={
+        "prompt": LONG_PROMPT + " The knight finally reached the castle gates.",
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert "clearing prompt with" not in log.drain()

From 698e9a3c9cdf1f25d26a109057886567eb7a2d1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Sat, 28 Mar 2026 10:06:36 +0300
Subject: [PATCH 05/11] server: test --no-kv-clear-idle flag

---
 .../tests/unit/test_kv_keep_only_active.py    | 26 +++++++++++++++++++
 tools/server/tests/utils.py                   |  3 +++
 2 files changed, 29 insertions(+)

diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index aa8b2d8b9fd..2fb560e5f30 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -88,3 +88,29 @@ def test_clear_and_restore():
     })
     assert res.status_code == 200
     assert "clearing prompt with" not in log.drain()
+
+
+def test_disabled_with_flag():
+    global server
+    server.no_kv_clear_idle = True
+    server.start()
+    log = LogReader(server.log_path)
+
+    # Feature should not be enabled
+    assert "kv-clear-idle" not in log.drain()
+
+    res = server.make_request("POST", "/completion", data={
+        "prompt": LONG_PROMPT,
+        "id_slot": 0,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+
+    # Request on different slot — should NOT trigger clearing
+    res = server.make_request("POST", "/completion", data={
+        "prompt": "The quick brown fox",
+        "id_slot": 1,
+        "cache_prompt": True,
+    })
+    assert res.status_code == 200
+    assert "clearing prompt with" not in log.drain()
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index 1edda66a84a..8c28bb8049a 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -103,6 +103,7 @@ class ServerProcess:
     media_path: str | None = None
     sleep_idle_seconds: int | None = None
     cache_ram: int | None = None
+    no_kv_clear_idle: bool = False
     log_path: str | None = None
     webui_mcp_proxy: bool = False
 
@@ -241,6 +242,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
         if self.cache_ram is not None:
             server_args.extend(["--cache-ram", self.cache_ram])
+        if self.no_kv_clear_idle:
+            server_args.append("--no-kv-clear-idle")
         if self.webui_mcp_proxy:
             server_args.append("--webui-mcp-proxy")
 

From d658a62c265b16536cf94306ceb760e8b35bd7d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Sat, 28 Mar 2026 10:28:51 +0300
Subject: [PATCH 06/11] server: simplify on-release clearing loop

---
 tools/server/server-context.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 0f1a4d92f95..5745a0fbb07 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -799,16 +799,12 @@ struct server_context_impl {
                 queue_tasks.pop_deferred_task(id_slot);
 
                 if (kv_keep_only_active) {
-                    bool should_clear = false;
                     for (const auto & s : slots) {
                         if (s.id != id_slot && s.prompt.n_tokens() > 0) {
-                            should_clear = true;
+                            slot_save_and_clear(slots[id_slot]);
                             break;
                         }
                     }
-                    if (should_clear) {
-                        slot_save_and_clear(slots[id_slot]);
-                    }
                 }
             };
 

From 3d338c9d99621fc7aab8065d0774f68c67fd8cd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Tue, 31 Mar 2026 15:23:16 +0300
Subject: [PATCH 07/11] server: remove on-release KV clearing, keep launch-only

---
 tools/server/server-context.cpp                     | 9 ---------
 tools/server/tests/unit/test_kv_keep_only_active.py | 2 +-
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 5745a0fbb07..93af288d225 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -797,15 +797,6 @@ struct server_context_impl {
 
             slot.callback_on_release = [this](int id_slot) {
                 queue_tasks.pop_deferred_task(id_slot);
-
-                if (kv_keep_only_active) {
-                    for (const auto & s : slots) {
-                        if (s.id != id_slot && s.prompt.n_tokens() > 0) {
-                            slot_save_and_clear(slots[id_slot]);
-                            break;
-                        }
-                    }
-                }
             };
 
             slot.reset();
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index 2fb560e5f30..0d993c2e0fd 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -42,7 +42,7 @@ def create_server():
 )
 
 
-# idle slot cleared on release should restore from cache-ram
+# idle slot cleared on launch should restore from cache-ram
 def test_clear_and_restore():
     global server
     server.start()

From 09ed3c03e017c65bdd5509029b0356fc071d784f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 31 Mar 2026 16:34:21 +0300
Subject: [PATCH 08/11] cont : clean-up

---
 common/arg.cpp                  | 10 +++++-----
 common/common.h                 |  6 +++---
 tools/cli/README.md             |  4 ++--
 tools/completion/README.md      |  4 ++--
 tools/server/README.md          |  5 +++--
 tools/server/server-context.cpp | 33 ++++++++++++++++-----------------
 tools/server/tests/utils.py     |  6 +++---
 7 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 99b725b6c8f..00b747a72a5 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1310,13 +1310,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
-        {"--kv-clear-idle"},
-        {"--no-kv-clear-idle"},
-        "save and clear idle slots' KV on release (default: enabled, requires unified KV and cache-ram)",
+        {"--clear-idle"},
+        {"--no-clear-idle"},
+        "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)",
         [](common_params & params, bool value) {
-            params.kv_clear_idle = value;
+            params.clear_idle = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
diff --git a/common/common.h b/common/common.h
index 6ee835c70e2..ccffe44978e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -530,7 +530,6 @@ struct common_params {
     bool ctx_shift         = false; // context shift on infinite text generation
     bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified        = false; // enable unified KV cache
-    bool kv_clear_idle     = true;  // save and clear idle slots' KV on release
 
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap          = true;  // enable mmap to use filesystem cache
@@ -579,8 +578,9 @@ struct common_params {
     int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
     bool    cache_prompt        = true;  // whether to enable prompt caching
-    int32_t n_ctx_checkpoints   = 32;     // max number of context checkpoints per slot
-    int32_t checkpoint_every_nt = 8192;   // make a checkpoint every n tokens during prefill
+    bool    clear_idle          = true;  // save and clear idle slots upon starting a new task
+    int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
+    int32_t checkpoint_every_nt = 8192;  // make a checkpoint every n tokens during prefill
     int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
 
     std::string hostname      = "127.0.0.1";
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 840976a8848..de0b7804091 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -176,8 +176,8 @@
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index 25884ed92d0..fe1a036a38c 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -255,8 +255,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 
diff --git a/tools/server/README.md b/tools/server/README.md
index 5a5d3aa02b7..d6348014b0e 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -168,6 +168,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)<br/>(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) |
 | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
 | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
+| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)<br/>(env: LLAMA_ARG_CLEAR_IDLE) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
 | `-sp, --special` | special tokens output enabled (default: false) |
@@ -220,8 +221,8 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
 | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)<br/>(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) |
-| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
-| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
+| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)<br/>(env: LLAMA_ARG_SKIP_CHAT_PARSING) |
 | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) |
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 93af288d225..47b989a31bd 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -573,7 +573,6 @@ struct server_context_impl {
     int slots_debug = 0;
     int n_empty_consecutive = 0;
 
-    bool kv_keep_only_active = false;
     std::unique_ptr<server_prompt_cache> prompt_cache;
 
     server_metrics metrics;
@@ -610,6 +609,7 @@ struct server_context_impl {
         if (slot.prompt.n_tokens() == 0) {
             return;
         }
+        SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
         slot.prompt_save(*prompt_cache);
         slot.prompt_clear(false);
         prompt_cache->update();
@@ -874,14 +874,15 @@ struct server_context_impl {
 
         metrics.init();
 
-        if (params_base.kv_clear_idle) {
+        if (params_base.clear_idle) {
             if (!params_base.kv_unified) {
-                SRV_WRN("%s\n", "--kv-clear-idle requires --kv-unified, disabling");
+                SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__);
+                params_base.clear_idle = false;
             } else if (params_base.cache_ram_mib == 0) {
-                SRV_WRN("%s\n", "--kv-clear-idle requires --cache-ram, disabling");
+                SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__);
+                params_base.clear_idle = false;
             } else {
-                kv_keep_only_active = true;
-                SRV_INF("%s\n", "kv-clear-idle: idle slots' KV will be saved to cache-ram and cleared on release");
+                SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
             }
         }
 
@@ -1100,14 +1101,6 @@ struct server_context_impl {
     }
 
     bool launch_slot_with_task(server_slot & slot, server_task && task) {
-        if (kv_keep_only_active) {
-            for (auto & s : slots) {
-                if (s.id != slot.id && !s.is_processing()) {
-                    slot_save_and_clear(s);
-                }
-            }
-        }
-
         // process per-request lora adapters
         if (!task.params.lora.empty()) {
             auto task_loras = construct_lora_list(task.params.lora);
@@ -1721,9 +1714,7 @@ struct server_context_impl {
                     const int id_slot = task.id_slot;
                     const int id_task = task.id;
 
-                    server_slot * slot = id_slot != -1
-                                            ? get_slot_by_id(id_slot)
-                                            : get_available_slot(task);
+                    server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
 
                     //
                     // slot scheduling logic
@@ -1760,6 +1751,14 @@ struct server_context_impl {
                         SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task);
                         break; // drop the task
                     }
+
+                    if (params_base.clear_idle) {
+                        for (auto & s : slots) {
+                            if (!s.is_processing()) {
+                                slot_save_and_clear(s);
+                            }
+                        }
+                    }
                 } break;
             case SERVER_TASK_TYPE_CANCEL:
                 {
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index 8c28bb8049a..62b6c994361 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -103,7 +103,7 @@ class ServerProcess:
     media_path: str | None = None
     sleep_idle_seconds: int | None = None
     cache_ram: int | None = None
-    no_kv_clear_idle: bool = False
+    no_clear_idle: bool = False
     log_path: str | None = None
     webui_mcp_proxy: bool = False
 
@@ -242,8 +242,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds])
         if self.cache_ram is not None:
             server_args.extend(["--cache-ram", self.cache_ram])
-        if self.no_kv_clear_idle:
-            server_args.append("--no-kv-clear-idle")
+        if self.no_clear_idle:
+            server_args.append("--no-clear-idle")
         if self.webui_mcp_proxy:
             server_args.append("--webui-mcp-proxy")
 

From 645aebc4ce7b01d0bdea69a18d1b4264c08bf69f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Tue, 31 Mar 2026 16:41:43 +0300
Subject: [PATCH 09/11] tests: update log strings after --clear-idle rename

---
 .../server/tests/unit/test_kv_keep_only_active.py  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index 0d993c2e0fd..98ffced0eee 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -49,7 +49,7 @@ def test_clear_and_restore():
     log = LogReader(server.log_path)
 
     # verify feature is enabled
-    assert "kv-clear-idle" in log.drain()
+    assert "idle slots will be saved to prompt cache" in log.drain()
 
     res = server.make_request("POST", "/completion", data={
         "prompt": LONG_PROMPT,
@@ -60,7 +60,7 @@ def test_clear_and_restore():
     original_prompt_n = res.body["timings"]["prompt_n"]
 
     # Slot 0 is the only slot with KV — should NOT be cleared
-    assert "clearing prompt with" not in log.drain()
+    assert "saving idle slot to prompt cache" not in log.drain()
 
     # Launching slot 1 clears idle slot 0
     res = server.make_request("POST", "/completion", data={
@@ -69,7 +69,7 @@ def test_clear_and_restore():
         "cache_prompt": True,
     })
     assert res.status_code == 200
-    assert "clearing prompt with" in log.drain()
+    assert "saving idle slot to prompt cache" in log.drain()
 
     # Re-send same prompt — should restore from cache-ram
     res = server.make_request("POST", "/completion", data={
@@ -87,17 +87,17 @@ def test_clear_and_restore():
         "cache_prompt": True,
     })
     assert res.status_code == 200
-    assert "clearing prompt with" not in log.drain()
+    assert "saving idle slot to prompt cache" not in log.drain()
 
 
 def test_disabled_with_flag():
     global server
-    server.no_kv_clear_idle = True
+    server.no_clear_idle = True
     server.start()
     log = LogReader(server.log_path)
 
     # Feature should not be enabled
-    assert "kv-clear-idle" not in log.drain()
+    assert "idle slots will be saved to prompt cache" not in log.drain()
 
     res = server.make_request("POST", "/completion", data={
         "prompt": LONG_PROMPT,
@@ -113,4 +113,4 @@ def test_disabled_with_flag():
         "cache_prompt": True,
     })
     assert res.status_code == 200
-    assert "clearing prompt with" not in log.drain()
+    assert "saving idle slot to prompt cache" not in log.drain()

From a22ba24395042c1d4cfa2d098f67700eaffd3e00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Wed, 1 Apr 2026 19:36:29 +0300
Subject: [PATCH 10/11] tests: use debug tags instead of log message matching

---
 tools/server/server-context.cpp                     |  2 ++
 tools/server/tests/unit/test_kv_keep_only_active.py | 13 +++++++------
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 47b989a31bd..2fd863ef070 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -610,6 +610,7 @@ struct server_context_impl {
             return;
         }
         SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
+        SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
         slot.prompt_save(*prompt_cache);
         slot.prompt_clear(false);
         prompt_cache->update();
@@ -883,6 +884,7 @@ struct server_context_impl {
                 params_base.clear_idle = false;
             } else {
                 SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
+                SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
             }
         }
 
diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index 98ffced0eee..8e26986ccb1 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -26,6 +26,7 @@ def create_server():
     server.server_slots = True
     server.cache_ram = 100
     server.kv_unified = True
+    server.debug = True
     fd, server.log_path = tempfile.mkstemp(suffix='.log')
     os.close(fd)
     yield
@@ -49,7 +50,7 @@ def test_clear_and_restore():
     log = LogReader(server.log_path)
 
     # verify feature is enabled
-    assert "idle slots will be saved to prompt cache" in log.drain()
+    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
 
     res = server.make_request("POST", "/completion", data={
         "prompt": LONG_PROMPT,
@@ -60,7 +61,7 @@ def test_clear_and_restore():
     original_prompt_n = res.body["timings"]["prompt_n"]
 
     # Slot 0 is the only slot with KV — should NOT be cleared
-    assert "saving idle slot to prompt cache" not in log.drain()
+    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
 
     # Launching slot 1 clears idle slot 0
     res = server.make_request("POST", "/completion", data={
@@ -69,7 +70,7 @@ def test_clear_and_restore():
         "cache_prompt": True,
     })
     assert res.status_code == 200
-    assert "saving idle slot to prompt cache" in log.drain()
+    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain()
 
     # Re-send same prompt — should restore from cache-ram
     res = server.make_request("POST", "/completion", data={
@@ -87,7 +88,7 @@ def test_clear_and_restore():
         "cache_prompt": True,
     })
     assert res.status_code == 200
-    assert "saving idle slot to prompt cache" not in log.drain()
+    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
 
 
 def test_disabled_with_flag():
@@ -97,7 +98,7 @@ def test_disabled_with_flag():
     log = LogReader(server.log_path)
 
     # Feature should not be enabled
-    assert "idle slots will be saved to prompt cache" not in log.drain()
+    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
 
     res = server.make_request("POST", "/completion", data={
         "prompt": LONG_PROMPT,
@@ -113,4 +114,4 @@ def test_disabled_with_flag():
         "cache_prompt": True,
     })
     assert res.status_code == 200
-    assert "saving idle slot to prompt cache" not in log.drain()
+    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()

From 5fc169814b9b57d294e451b47ace3740cb4a12ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nearly=20Ther=D1=A5?=
 <188969017+yychyo@users.noreply.github.com>
Date: Fri, 3 Apr 2026 11:27:45 +0300
Subject: [PATCH 11/11] tests: fix Windows CI by dropping temp log file unlink

---
 tools/server/tests/unit/test_kv_keep_only_active.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py
index 8e26986ccb1..da93d50011e 100644
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -30,8 +30,6 @@ def create_server():
     fd, server.log_path = tempfile.mkstemp(suffix='.log')
     os.close(fd)
     yield
-    if os.path.exists(server.log_path):
-        os.unlink(server.log_path)
 
 
 LONG_PROMPT = (