From c82cd357aada9df150ee3fb55b30ee640aeb8f33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Wed, 25 Mar 2026 17:24:46 +0300 Subject: [PATCH 01/11] server: clear idle slots KV from VRAM (LLAMA_KV_KEEP_ONLY_ACTIVE) --- tools/server/server-context.cpp | 52 +++++++++++- tools/server/server-task.cpp | 2 +- .../tests/unit/test_kv_keep_only_active.py | 81 +++++++++++++++++++ tools/server/tests/utils.py | 15 +++- 4 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 tools/server/tests/unit/test_kv_keep_only_active.py diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b79a5270b52..bbddf5e5564 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -573,6 +573,7 @@ struct server_context_impl { int slots_debug = 0; int n_empty_consecutive = 0; + bool kv_keep_only_active = false; std::unique_ptr prompt_cache; server_metrics metrics; @@ -864,6 +865,21 @@ struct server_context_impl { metrics.init(); + // LLAMA_KV_KEEP_ONLY_ACTIVE: clear idle slots' KV from VRAM before each decode batch + { + const char * env = getenv("LLAMA_KV_KEEP_ONLY_ACTIVE"); + if (env && atoi(env)) { + if (!params_base.kv_unified) { + SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires unified KV cache, ignoring"); + } else if (params_base.cache_ram_mib == 0) { + SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires --cache-ram, ignoring"); + } else { + kv_keep_only_active = true; + SRV_INF("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE: idle slots' KV will be cleared from VRAM before each decode"); + } + } + } + // populate webui settings { if (!params_base.webui_config_json.empty()) { @@ -1010,15 +1026,15 @@ struct server_context_impl { // cache prompts only for completion tasks update_cache = update_cache && task.type == SERVER_TASK_TYPE_COMPLETION; - // don't update the cache if the slot's context is empty - update_cache = update_cache && tokens.size() > 0; - if (update_cache) { SRV_WRN("%s", "updating prompt cache\n"); const int64_t t_start = ggml_time_us(); - ret->prompt_save(*prompt_cache); + // don't save the slot's state if its context is empty + if (tokens.size() > 0) { + ret->prompt_save(*prompt_cache); + } if (!ret->prompt_load(*prompt_cache, task.tokens)) { ret->prompt_clear(false); @@ -2689,6 +2705,34 @@ struct server_context_impl { n_empty_consecutive = 0; } + if (kv_keep_only_active && batch.n_tokens > 0) { // LLAMA_KV_KEEP_ONLY_ACTIVE: clear idle slots' KV + int kv_used = 0; + int n_cleared = 0; + + for (auto & slot : slots) { + const int n_tokens = slot.prompt.n_tokens(); + if (n_tokens == 0) { + continue; + } + if (slot.is_processing()) { + kv_used += n_tokens; + continue; + } + + slot.prompt_save(*prompt_cache); + slot.prompt_clear(false); + ++n_cleared; + + SLT_DBG(slot, "kv_keep_only_active: cleared idle slot with %d tokens\n", n_tokens); + } + + if (n_cleared > 0) { + prompt_cache->update(); + + SRV_INF("kv_keep_only_active: cleared %d slot(s), kv: %d/%d\n", n_cleared, kv_used, n_ctx); + } + } + int32_t i_next = 0; // process the created batch of tokens diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 7d543b9292b..0534409c032 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -2010,7 +2010,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tokens_new, llama_context * ctx, int32_t id_slot) { const int lcp_best = prompt.tokens.get_common_prefix(tokens_new); - float f_keep_best = float(lcp_best) / prompt.tokens.size(); + float f_keep_best = prompt.tokens.size() > 0 ? float(lcp_best) / prompt.tokens.size() : -1.0f; // empty slot: any cache entry wins float sim_best = float(lcp_best) / tokens_new.size(); SRV_WRN(" - looking for better prompt, base f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py new file mode 100644 index 00000000000..7d8d79e2842 --- /dev/null +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -0,0 +1,81 @@ +import os +import tempfile +import pytest +from utils import * + +server = ServerPreset.tinyllama2() + +class LogReader: + def __init__(self, path): + self.path = path + self.pos = 0 + def drain(self): + with open(self.path) as f: + f.seek(self.pos) + content = f.read() + self.pos = f.tell() + return content + +@pytest.fixture(autouse=True) +def create_server(): + global server + os.environ["LLAMA_KV_KEEP_ONLY_ACTIVE"] = "1" + server = ServerPreset.tinyllama2() + server.n_slots = 2 + server.n_predict = 4 + server.temperature = 0.0 + server.server_slots = True + server.cache_ram = 100 + server.kv_unified = True + fd, server.log_path = tempfile.mkstemp(suffix='.log') + os.close(fd) + yield + os.environ.pop("LLAMA_KV_KEEP_ONLY_ACTIVE", None) + if os.path.exists(server.log_path): + os.unlink(server.log_path) + + +LONG_PROMPT = ( + "Once upon a time in a land far away, there lived a brave knight " + "who traveled across mountains and rivers to find the legendary " + "golden sword hidden deep within the enchanted forest of whispers. " + "He met many creatures along the way including dragons and fairies " + "and wizards who helped him on his noble quest to save the kingdom." +) + + +# idle slot cleared by LLAMA_KV_KEEP_ONLY_ACTIVE should restore from cache-ram +def test_clear_and_restore(): + global server + server.start() + log = LogReader(server.log_path) + + # verify feature is enabled + assert "LLAMA_KV_KEEP_ONLY_ACTIVE" in log.drain() + + res = server.make_request("POST", "/completion", data={ + "prompt": LONG_PROMPT, + "id_slot": 0, + "cache_prompt": True, + }) + assert res.status_code == 200 + original_prompt_n = res.body["timings"]["prompt_n"] + + # Request on slot 1 triggers batch, clearing idle slot 0 + res = server.make_request("POST", "/completion", data={ + "prompt": "The quick brown fox", + "id_slot": 1, + "cache_prompt": True, + }) + assert res.status_code == 200 + assert "kv_keep_only_active: cleared 1 slot" in log.drain() + + # Re-send same prompt — should restore from cache-ram + res = server.make_request("POST", "/completion", data={ + "prompt": LONG_PROMPT, + "cache_prompt": True, + }) + assert res.status_code == 200 + assert "updating prompt cache" in log.drain() + assert res.body["timings"]["cache_n"] > 0 + assert res.body["timings"]["prompt_n"] < original_prompt_n diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index c6fe11261f9..1edda66a84a 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -102,6 +102,8 @@ class ServerProcess: mmproj_url: str | None = None media_path: str | None = None sleep_idle_seconds: int | None = None + cache_ram: int | None = None + log_path: str | None = None webui_mcp_proxy: bool = False # session variables @@ -237,6 +239,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--media-path", self.media_path]) if self.sleep_idle_seconds is not None: server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds]) + if self.cache_ram is not None: + server_args.extend(["--cache-ram", self.cache_ram]) if self.webui_mcp_proxy: server_args.append("--webui-mcp-proxy") @@ -249,11 +253,16 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: flags |= subprocess.CREATE_NEW_PROCESS_GROUP flags |= subprocess.CREATE_NO_WINDOW + if self.log_path: + self._log = open(self.log_path, "w") + else: + self._log = sys.stdout + self.process = subprocess.Popen( [str(arg) for arg in [server_path, *server_args]], creationflags=flags, - stdout=sys.stdout, - stderr=sys.stdout, + stdout=self._log, + stderr=self._log if self._log != sys.stdout else sys.stdout, env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None, ) server_instances.add(self) @@ -290,6 +299,8 @@ def stop(self) -> None: print(f"Stopping server with pid={self.process.pid}") self.process.kill() self.process = None + if hasattr(self, '_log') and self._log != sys.stdout: + self._log.close() def make_request( self, From 2563b4ae21fc26fded04b2018a3c769d2ab7f8c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Fri, 27 Mar 2026 23:01:45 +0300 Subject: [PATCH 02/11] server: move idle slot KV clearing to slot release The save "cost" is now paid by the finishing request. --- tools/server/server-context.cpp | 39 +++++-------------- .../tests/unit/test_kv_keep_only_active.py | 2 +- 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index bbddf5e5564..b37408c71d2 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -788,6 +788,15 @@ struct server_context_impl { slot.callback_on_release = [this](int id_slot) { queue_tasks.pop_deferred_task(id_slot); + + if (kv_keep_only_active) { + auto & slot = slots[id_slot]; + if (slot.prompt.n_tokens() > 0) { + slot.prompt_save(*prompt_cache); + slot.prompt_clear(false); + prompt_cache->update(); + } + } }; slot.reset(); @@ -875,7 +884,7 @@ struct server_context_impl { SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires --cache-ram, ignoring"); } else { kv_keep_only_active = true; - SRV_INF("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE: idle slots' KV will be cleared from VRAM before each decode"); + SRV_INF("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE: idle slots' KV will be saved to cache-ram and cleared on release"); } } } @@ -2705,34 +2714,6 @@ struct server_context_impl { n_empty_consecutive = 0; } - if (kv_keep_only_active && batch.n_tokens > 0) { // LLAMA_KV_KEEP_ONLY_ACTIVE: clear idle slots' KV - int kv_used = 0; - int n_cleared = 0; - - for (auto & slot : slots) { - const int n_tokens = slot.prompt.n_tokens(); - if (n_tokens == 0) { - continue; - } - if (slot.is_processing()) { - kv_used += n_tokens; - continue; - } - - slot.prompt_save(*prompt_cache); - slot.prompt_clear(false); - ++n_cleared; - - SLT_DBG(slot, "kv_keep_only_active: cleared idle slot with %d tokens\n", n_tokens); - } - - if (n_cleared > 0) { - prompt_cache->update(); - - SRV_INF("kv_keep_only_active: cleared %d slot(s), kv: %d/%d\n", n_cleared, kv_used, n_ctx); - } - } - int32_t i_next = 0; // process the created batch of tokens diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index 7d8d79e2842..4d7213ec433 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -68,7 +68,7 @@ def test_clear_and_restore(): "cache_prompt": True, }) assert res.status_code == 200 - assert "kv_keep_only_active: cleared 1 slot" in log.drain() + assert "clearing prompt with" in log.drain() # Re-send same prompt — should restore from cache-ram res = server.make_request("POST", "/completion", data={ From ced2476cbd13c28797bf1cd04f6ed4723fc5de22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Fri, 27 Mar 2026 23:05:44 +0300 Subject: [PATCH 03/11] server: add --kv-clear-idle flag, enable by default --- common/arg.cpp | 8 ++++++++ common/common.h | 1 + tools/server/server-context.cpp | 20 ++++++++----------- .../tests/unit/test_kv_keep_only_active.py | 4 +--- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0096e7a68d6..99b725b6c8f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1309,6 +1309,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.kv_unified = value; } ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL})); + add_opt(common_arg( + {"--kv-clear-idle"}, + {"--no-kv-clear-idle"}, + "save and clear idle slots' KV on release (default: enabled, requires unified KV and cache-ram)", + [](common_params & params, bool value) { + params.kv_clear_idle = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--context-shift"}, {"--no-context-shift"}, diff --git a/common/common.h b/common/common.h index 62201ea1ad3..6ee835c70e2 100644 --- a/common/common.h +++ b/common/common.h @@ -530,6 +530,7 @@ struct common_params { bool ctx_shift = false; // context shift on infinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) bool kv_unified = false; // enable unified KV cache + bool kv_clear_idle = true; // save and clear idle slots' KV on release bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool use_mmap = true; // enable mmap to use filesystem cache diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b37408c71d2..7911f502e3d 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -874,18 +874,14 @@ struct server_context_impl { metrics.init(); - // LLAMA_KV_KEEP_ONLY_ACTIVE: clear idle slots' KV from VRAM before each decode batch - { - const char * env = getenv("LLAMA_KV_KEEP_ONLY_ACTIVE"); - if (env && atoi(env)) { - if (!params_base.kv_unified) { - SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires unified KV cache, ignoring"); - } else if (params_base.cache_ram_mib == 0) { - SRV_WRN("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE requires --cache-ram, ignoring"); - } else { - kv_keep_only_active = true; - SRV_INF("%s\n", "LLAMA_KV_KEEP_ONLY_ACTIVE: idle slots' KV will be saved to cache-ram and cleared on release"); - } + if (params_base.kv_clear_idle) { + if (!params_base.kv_unified) { + SRV_WRN("%s\n", "--kv-clear-idle requires --kv-unified, disabling"); + } else if (params_base.cache_ram_mib == 0) { + SRV_WRN("%s\n", "--kv-clear-idle requires --cache-ram, disabling"); + } else { + kv_keep_only_active = true; + SRV_INF("%s\n", "kv-clear-idle: idle slots' KV will be saved to cache-ram and cleared on release"); } } diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index 4d7213ec433..79129c6bceb 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -19,7 +19,6 @@ def drain(self): @pytest.fixture(autouse=True) def create_server(): global server - os.environ["LLAMA_KV_KEEP_ONLY_ACTIVE"] = "1" server = ServerPreset.tinyllama2() server.n_slots = 2 server.n_predict = 4 @@ -30,7 +29,6 @@ def create_server(): fd, server.log_path = tempfile.mkstemp(suffix='.log') os.close(fd) yield - os.environ.pop("LLAMA_KV_KEEP_ONLY_ACTIVE", None) if os.path.exists(server.log_path): os.unlink(server.log_path) @@ -51,7 +49,7 @@ def test_clear_and_restore(): log = LogReader(server.log_path) # verify feature is enabled - assert "LLAMA_KV_KEEP_ONLY_ACTIVE" in log.drain() + assert "kv-clear-idle" in log.drain() res = server.make_request("POST", "/completion", data={ "prompt": LONG_PROMPT, From f6c4a3d21be27e1d6ea73f0acce35870194e10a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Sat, 28 Mar 2026 07:52:50 +0300 Subject: [PATCH 04/11] server: skip clearing last idle slot, clear on launch --- tools/server/server-context.cpp | 31 ++++++++++++++++--- .../tests/unit/test_kv_keep_only_active.py | 15 +++++++-- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 7911f502e3d..0f1a4d92f95 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -606,6 +606,15 @@ struct server_context_impl { llama_batch_free(batch); } + void slot_save_and_clear(server_slot & slot) { + if (slot.prompt.n_tokens() == 0) { + return; + } + slot.prompt_save(*prompt_cache); + slot.prompt_clear(false); + prompt_cache->update(); + } + void handle_sleeping_state(bool new_state) { GGML_ASSERT(sleeping != new_state); if (new_state) { @@ -790,11 +799,15 @@ struct server_context_impl { queue_tasks.pop_deferred_task(id_slot); if (kv_keep_only_active) { - auto & slot = slots[id_slot]; - if (slot.prompt.n_tokens() > 0) { - slot.prompt_save(*prompt_cache); - slot.prompt_clear(false); - prompt_cache->update(); + bool should_clear = false; + for (const auto & s : slots) { + if (s.id != id_slot && s.prompt.n_tokens() > 0) { + should_clear = true; + break; + } + } + if (should_clear) { + slot_save_and_clear(slots[id_slot]); } } }; @@ -1100,6 +1113,14 @@ struct server_context_impl { } bool launch_slot_with_task(server_slot & slot, server_task && task) { + if (kv_keep_only_active) { + for (auto & s : slots) { + if (s.id != slot.id && !s.is_processing()) { + slot_save_and_clear(s); + } + } + } + // process per-request lora adapters if (!task.params.lora.empty()) { auto task_loras = construct_lora_list(task.params.lora); diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index 79129c6bceb..aa8b2d8b9fd 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -42,7 +42,7 @@ def create_server(): ) -# idle slot cleared by LLAMA_KV_KEEP_ONLY_ACTIVE should restore from cache-ram +# idle slot cleared on release should restore from cache-ram def test_clear_and_restore(): global server server.start() @@ -59,7 +59,10 @@ def test_clear_and_restore(): assert res.status_code == 200 original_prompt_n = res.body["timings"]["prompt_n"] - # Request on slot 1 triggers batch, clearing idle slot 0 + # Slot 0 is the only slot with KV — should NOT be cleared + assert "clearing prompt with" not in log.drain() + + # Launching slot 1 clears idle slot 0 res = server.make_request("POST", "/completion", data={ "prompt": "The quick brown fox", "id_slot": 1, @@ -77,3 +80,11 @@ def test_clear_and_restore(): assert "updating prompt cache" in log.drain() assert res.body["timings"]["cache_n"] > 0 assert res.body["timings"]["prompt_n"] < original_prompt_n + + # Follow-up — slot 0 kept its KV, no clearing needed + res = server.make_request("POST", "/completion", data={ + "prompt": LONG_PROMPT + " The knight finally reached the castle gates.", + "cache_prompt": True, + }) + assert res.status_code == 200 + assert "clearing prompt with" not in log.drain() From 698e9a3c9cdf1f25d26a109057886567eb7a2d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Sat, 28 Mar 2026 10:06:36 +0300 Subject: [PATCH 05/11] server: test --no-kv-clear-idle flag --- .../tests/unit/test_kv_keep_only_active.py | 26 +++++++++++++++++++ tools/server/tests/utils.py | 3 +++ 2 files changed, 29 insertions(+) diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index aa8b2d8b9fd..2fb560e5f30 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -88,3 +88,29 @@ def test_clear_and_restore(): }) assert res.status_code == 200 assert "clearing prompt with" not in log.drain() + + +def test_disabled_with_flag(): + global server + server.no_kv_clear_idle = True + server.start() + log = LogReader(server.log_path) + + # Feature should not be enabled + assert "kv-clear-idle" not in log.drain() + + res = server.make_request("POST", "/completion", data={ + "prompt": LONG_PROMPT, + "id_slot": 0, + "cache_prompt": True, + }) + assert res.status_code == 200 + + # Request on different slot — should NOT trigger clearing + res = server.make_request("POST", "/completion", data={ + "prompt": "The quick brown fox", + "id_slot": 1, + "cache_prompt": True, + }) + assert res.status_code == 200 + assert "clearing prompt with" not in log.drain() diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 1edda66a84a..8c28bb8049a 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -103,6 +103,7 @@ class ServerProcess: media_path: str | None = None sleep_idle_seconds: int | None = None cache_ram: int | None = None + no_kv_clear_idle: bool = False log_path: str | None = None webui_mcp_proxy: bool = False @@ -241,6 +242,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds]) if self.cache_ram is not None: server_args.extend(["--cache-ram", self.cache_ram]) + if self.no_kv_clear_idle: + server_args.append("--no-kv-clear-idle") if self.webui_mcp_proxy: server_args.append("--webui-mcp-proxy") From d658a62c265b16536cf94306ceb760e8b35bd7d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Sat, 28 Mar 2026 10:28:51 +0300 Subject: [PATCH 06/11] server: simplify on-release clearing loop --- tools/server/server-context.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 0f1a4d92f95..5745a0fbb07 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -799,16 +799,12 @@ struct server_context_impl { queue_tasks.pop_deferred_task(id_slot); if (kv_keep_only_active) { - bool should_clear = false; for (const auto & s : slots) { if (s.id != id_slot && s.prompt.n_tokens() > 0) { - should_clear = true; + slot_save_and_clear(slots[id_slot]); break; } } - if (should_clear) { - slot_save_and_clear(slots[id_slot]); - } } }; From 3d338c9d99621fc7aab8065d0774f68c67fd8cd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:23:16 +0300 Subject: [PATCH 07/11] server: remove on-release KV clearing, keep launch-only --- tools/server/server-context.cpp | 9 --------- tools/server/tests/unit/test_kv_keep_only_active.py | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 5745a0fbb07..93af288d225 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -797,15 +797,6 @@ struct server_context_impl { slot.callback_on_release = [this](int id_slot) { queue_tasks.pop_deferred_task(id_slot); - - if (kv_keep_only_active) { - for (const auto & s : slots) { - if (s.id != id_slot && s.prompt.n_tokens() > 0) { - slot_save_and_clear(slots[id_slot]); - break; - } - } - } }; slot.reset(); diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index 2fb560e5f30..0d993c2e0fd 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -42,7 +42,7 @@ def create_server(): ) -# idle slot cleared on release should restore from cache-ram +# idle slot cleared on launch should restore from cache-ram def test_clear_and_restore(): global server server.start() From 09ed3c03e017c65bdd5509029b0356fc071d784f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Mar 2026 16:34:21 +0300 Subject: [PATCH 08/11] cont : clean-up --- common/arg.cpp | 10 +++++----- common/common.h | 6 +++--- tools/cli/README.md | 4 ++-- tools/completion/README.md | 4 ++-- tools/server/README.md | 5 +++-- tools/server/server-context.cpp | 33 ++++++++++++++++----------------- tools/server/tests/utils.py | 6 +++--- 7 files changed, 34 insertions(+), 34 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 99b725b6c8f..00b747a72a5 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1310,13 +1310,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED, LLAMA_EXAMPLE_BENCH, LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( - {"--kv-clear-idle"}, - {"--no-kv-clear-idle"}, - "save and clear idle slots' KV on release (default: enabled, requires unified KV and cache-ram)", + {"--clear-idle"}, + {"--no-clear-idle"}, + "save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)", [](common_params & params, bool value) { - params.kv_clear_idle = value; + params.clear_idle = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER})); + ).set_env("LLAMA_ARG_CLEAR_IDLE").set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--context-shift"}, {"--no-context-shift"}, diff --git a/common/common.h b/common/common.h index 6ee835c70e2..ccffe44978e 100644 --- a/common/common.h +++ b/common/common.h @@ -530,7 +530,6 @@ struct common_params { bool ctx_shift = false; // context shift on infinite text generation bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055) bool kv_unified = false; // enable unified KV cache - bool kv_clear_idle = true; // save and clear idle slots' KV on release bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool use_mmap = true; // enable mmap to use filesystem cache @@ -579,8 +578,9 @@ struct common_params { int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting bool cache_prompt = true; // whether to enable prompt caching - int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot - int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill + bool clear_idle = true; // save and clear idle slots upon starting a new task + int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot + int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc. std::string hostname = "127.0.0.1"; diff --git a/tools/cli/README.md b/tools/cli/README.md index 840976a8848..de0b7804091 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -176,8 +176,8 @@ | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | diff --git a/tools/completion/README.md b/tools/completion/README.md index 25884ed92d0..fe1a036a38c 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -255,8 +255,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/server/README.md b/tools/server/README.md index 5a5d3aa02b7..d6348014b0e 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -168,6 +168,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | +| `--clear-idle, --no-clear-idle` | save and clear idle slots on new task (default: enabled, requires unified KV and cache-ram)
(env: LLAMA_ARG_CLEAR_IDLE) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | | `-sp, --special` | special tokens output enabled (default: false) | @@ -220,8 +221,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 93af288d225..47b989a31bd 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -573,7 +573,6 @@ struct server_context_impl { int slots_debug = 0; int n_empty_consecutive = 0; - bool kv_keep_only_active = false; std::unique_ptr prompt_cache; server_metrics metrics; @@ -610,6 +609,7 @@ struct server_context_impl { if (slot.prompt.n_tokens() == 0) { return; } + SLT_INF(slot, "%s", "saving idle slot to prompt cache\n"); slot.prompt_save(*prompt_cache); slot.prompt_clear(false); prompt_cache->update(); @@ -874,14 +874,15 @@ struct server_context_impl { metrics.init(); - if (params_base.kv_clear_idle) { + if (params_base.clear_idle) { if (!params_base.kv_unified) { - SRV_WRN("%s\n", "--kv-clear-idle requires --kv-unified, disabling"); + SRV_WRN("%s: --clear-idle requires --kv-unified, disabling\n", __func__); + params_base.clear_idle = false; } else if (params_base.cache_ram_mib == 0) { - SRV_WRN("%s\n", "--kv-clear-idle requires --cache-ram, disabling"); + SRV_WRN("%s: --clear-idle requires --cache-ram, disabling\n", __func__); + params_base.clear_idle = false; } else { - kv_keep_only_active = true; - SRV_INF("%s\n", "kv-clear-idle: idle slots' KV will be saved to cache-ram and cleared on release"); + SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__); } } @@ -1100,14 +1101,6 @@ struct server_context_impl { } bool launch_slot_with_task(server_slot & slot, server_task && task) { - if (kv_keep_only_active) { - for (auto & s : slots) { - if (s.id != slot.id && !s.is_processing()) { - slot_save_and_clear(s); - } - } - } - // process per-request lora adapters if (!task.params.lora.empty()) { auto task_loras = construct_lora_list(task.params.lora); @@ -1721,9 +1714,7 @@ struct server_context_impl { const int id_slot = task.id_slot; const int id_task = task.id; - server_slot * slot = id_slot != -1 - ? get_slot_by_id(id_slot) - : get_available_slot(task); + server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); // // slot scheduling logic @@ -1760,6 +1751,14 @@ struct server_context_impl { SRV_ERR("failed to launch slot with task, id_task = %d\n", id_task); break; // drop the task } + + if (params_base.clear_idle) { + for (auto & s : slots) { + if (!s.is_processing()) { + slot_save_and_clear(s); + } + } + } } break; case SERVER_TASK_TYPE_CANCEL: { diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index 8c28bb8049a..62b6c994361 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -103,7 +103,7 @@ class ServerProcess: media_path: str | None = None sleep_idle_seconds: int | None = None cache_ram: int | None = None - no_kv_clear_idle: bool = False + no_clear_idle: bool = False log_path: str | None = None webui_mcp_proxy: bool = False @@ -242,8 +242,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--sleep-idle-seconds", self.sleep_idle_seconds]) if self.cache_ram is not None: server_args.extend(["--cache-ram", self.cache_ram]) - if self.no_kv_clear_idle: - server_args.append("--no-kv-clear-idle") + if self.no_clear_idle: + server_args.append("--no-clear-idle") if self.webui_mcp_proxy: server_args.append("--webui-mcp-proxy") From 645aebc4ce7b01d0bdea69a18d1b4264c08bf69f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Tue, 31 Mar 2026 16:41:43 +0300 Subject: [PATCH 09/11] tests: update log strings after --clear-idle rename --- .../server/tests/unit/test_kv_keep_only_active.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index 0d993c2e0fd..98ffced0eee 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -49,7 +49,7 @@ def test_clear_and_restore(): log = LogReader(server.log_path) # verify feature is enabled - assert "kv-clear-idle" in log.drain() + assert "idle slots will be saved to prompt cache" in log.drain() res = server.make_request("POST", "/completion", data={ "prompt": LONG_PROMPT, @@ -60,7 +60,7 @@ def test_clear_and_restore(): original_prompt_n = res.body["timings"]["prompt_n"] # Slot 0 is the only slot with KV — should NOT be cleared - assert "clearing prompt with" not in log.drain() + assert "saving idle slot to prompt cache" not in log.drain() # Launching slot 1 clears idle slot 0 res = server.make_request("POST", "/completion", data={ @@ -69,7 +69,7 @@ def test_clear_and_restore(): "cache_prompt": True, }) assert res.status_code == 200 - assert "clearing prompt with" in log.drain() + assert "saving idle slot to prompt cache" in log.drain() # Re-send same prompt — should restore from cache-ram res = server.make_request("POST", "/completion", data={ @@ -87,17 +87,17 @@ def test_clear_and_restore(): "cache_prompt": True, }) assert res.status_code == 200 - assert "clearing prompt with" not in log.drain() + assert "saving idle slot to prompt cache" not in log.drain() def test_disabled_with_flag(): global server - server.no_kv_clear_idle = True + server.no_clear_idle = True server.start() log = LogReader(server.log_path) # Feature should not be enabled - assert "kv-clear-idle" not in log.drain() + assert "idle slots will be saved to prompt cache" not in log.drain() res = server.make_request("POST", "/completion", data={ "prompt": LONG_PROMPT, @@ -113,4 +113,4 @@ def test_disabled_with_flag(): "cache_prompt": True, }) assert res.status_code == 200 - assert "clearing prompt with" not in log.drain() + assert "saving idle slot to prompt cache" not in log.drain() From a22ba24395042c1d4cfa2d098f67700eaffd3e00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Wed, 1 Apr 2026 19:36:29 +0300 Subject: [PATCH 10/11] tests: use debug tags instead of log message matching --- tools/server/server-context.cpp | 2 ++ tools/server/tests/unit/test_kv_keep_only_active.py | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 47b989a31bd..2fd863ef070 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -610,6 +610,7 @@ struct server_context_impl { return; } SLT_INF(slot, "%s", "saving idle slot to prompt cache\n"); + SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n"); slot.prompt_save(*prompt_cache); slot.prompt_clear(false); prompt_cache->update(); @@ -883,6 +884,7 @@ struct server_context_impl { params_base.clear_idle = false; } else { SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__); + SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n"); } } diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index 98ffced0eee..8e26986ccb1 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -26,6 +26,7 @@ def create_server(): server.server_slots = True server.cache_ram = 100 server.kv_unified = True + server.debug = True fd, server.log_path = tempfile.mkstemp(suffix='.log') os.close(fd) yield @@ -49,7 +50,7 @@ def test_clear_and_restore(): log = LogReader(server.log_path) # verify feature is enabled - assert "idle slots will be saved to prompt cache" in log.drain() + assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain() res = server.make_request("POST", "/completion", data={ "prompt": LONG_PROMPT, @@ -60,7 +61,7 @@ def test_clear_and_restore(): original_prompt_n = res.body["timings"]["prompt_n"] # Slot 0 is the only slot with KV — should NOT be cleared - assert "saving idle slot to prompt cache" not in log.drain() + assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() # Launching slot 1 clears idle slot 0 res = server.make_request("POST", "/completion", data={ @@ -69,7 +70,7 @@ def test_clear_and_restore(): "cache_prompt": True, }) assert res.status_code == 200 - assert "saving idle slot to prompt cache" in log.drain() + assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain() # Re-send same prompt — should restore from cache-ram res = server.make_request("POST", "/completion", data={ @@ -87,7 +88,7 @@ def test_clear_and_restore(): "cache_prompt": True, }) assert res.status_code == 200 - assert "saving idle slot to prompt cache" not in log.drain() + assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() def test_disabled_with_flag(): @@ -97,7 +98,7 @@ def test_disabled_with_flag(): log = LogReader(server.log_path) # Feature should not be enabled - assert "idle slots will be saved to prompt cache" not in log.drain() + assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain() res = server.make_request("POST", "/completion", data={ "prompt": LONG_PROMPT, @@ -113,4 +114,4 @@ def test_disabled_with_flag(): "cache_prompt": True, }) assert res.status_code == 200 - assert "saving idle slot to prompt cache" not in log.drain() + assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() From 5fc169814b9b57d294e451b47ace3740cb4a12ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nearly=20Ther=D1=A5?= <188969017+yychyo@users.noreply.github.com> Date: Fri, 3 Apr 2026 11:27:45 +0300 Subject: [PATCH 11/11] test: fix Windows CI by dropping temp log file unlink --- tools/server/tests/unit/test_kv_keep_only_active.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index 8e26986ccb1..da93d50011e 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -30,8 +30,6 @@ def create_server(): fd, server.log_path = tempfile.mkstemp(suffix='.log') os.close(fd) yield - if os.path.exists(server.log_path): - os.unlink(server.log_path) LONG_PROMPT = (