diff --git a/common/arg.cpp b/common/arg.cpp
index 099f0aeab24..e4e0b031269 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3166,7 +3166,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
-        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
+        string_format(
+            "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n"
+            "disabled with --cache-idle-slots and --parallel 1\n",
+            params.slot_prompt_similarity
+        ),
         [](common_params & params, const std::string & value) {
             params.slot_prompt_similarity = std::stof(value);
         }
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index a5372572f01..78b45ebec84 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -686,9 +686,6 @@ struct server_context_impl {
 
     json json_webui_settings = json::object();
 
-    // Necessary similarity of prompt for slot selection
-    float slot_prompt_similarity = 0.0f;
-
     std::string model_name; // name of the loaded model, to be used by API
     std::set<std::string> model_aliases; // additional names for the model
     std::set<std::string> model_tags; // informational tags
@@ -853,9 +850,6 @@ struct server_context_impl {
            }
        }
 
-        // Necessary similarity of prompt for slot selection
-        slot_prompt_similarity = params_base.slot_prompt_similarity;
-
        // setup slots
        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
 
@@ -999,6 +993,16 @@
            }
        }
 
+        // prompt similarity doesn't work with idle slots, rely on the unified cache instead
+        if (params_base.cache_idle_slots) {
+            params_base.slot_prompt_similarity = 0.0f;
+        }
+
+        // bypass prompt similarity when we only have one slot
+        if (params_base.n_parallel == 1) {
+            params_base.slot_prompt_similarity = 0.0f;
+        }
+
        // populate webui settings
        {
            if (!params_base.webui_config_json.empty()) {
@@ -1073,7 +1077,7 @@
        bool update_cache = false;
 
        // find the slot that has at least n% prompt similarity
-        if (ret == nullptr && slot_prompt_similarity != 0.0f) {
+        if (ret == nullptr && params_base.slot_prompt_similarity != 0.0f) {
            float sim_best = 0;
 
            for (server_slot & slot : slots) {
@@ -1093,7 +1097,7 @@
                const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size();
 
                // select the current slot if the criteria match
-                if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) {
+                if (sim_cur > sim_best && sim_cur > params_base.slot_prompt_similarity) {
                    sim_best = sim_cur;
 
                    ret = &slot;
@@ -1104,7 +1108,7 @@
            const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();
 
            SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
-                    sim_best, slot_prompt_similarity, f_keep);
+                    sim_best, params_base.slot_prompt_similarity, f_keep);
 
            // if we are about to lose a large portion of the existing context - save it in the prompt cache
            if (f_keep < 0.5f) {