Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3166,7 +3166,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
string_format(
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n"
"disabled with --cache-idle-slots and --parallel 1\n",
params.slot_prompt_similarity
),
[](common_params & params, const std::string & value) {
params.slot_prompt_similarity = std::stof(value);
}
Expand Down
22 changes: 13 additions & 9 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -686,9 +686,6 @@ struct server_context_impl {

json json_webui_settings = json::object();

// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;

std::string model_name; // name of the loaded model, to be used by API
std::set<std::string> model_aliases; // additional names for the model
std::set<std::string> model_tags; // informational tags
Expand Down Expand Up @@ -853,9 +850,6 @@ struct server_context_impl {
}
}

// Necessary similarity of prompt for slot selection
slot_prompt_similarity = params_base.slot_prompt_similarity;

// setup slots
SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);

Expand Down Expand Up @@ -999,6 +993,16 @@ struct server_context_impl {
}
}

// Force slot_prompt_similarity to 0.0f in the two configurations below, overwriting
// whatever value was configured. The slot-selection code only runs its similarity
// search when this value is != 0.0f, and the --slot-prompt-similarity help text
// documents 0.0 as "disabled".

// prompt similarity doesn't work with idle slots, rely on the unified cache instead
if (params_base.cache_idle_slots) {
params_base.slot_prompt_similarity = 0.0f;
}

// bypass prompt similarity when we only have one slot
// (n_parallel is the value reported as n_slots when the slots are initialized)
if (params_base.n_parallel == 1) {
params_base.slot_prompt_similarity = 0.0f;
}

// populate webui settings
{
if (!params_base.webui_config_json.empty()) {
Expand Down Expand Up @@ -1073,7 +1077,7 @@ struct server_context_impl {
bool update_cache = false;

// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
if (ret == nullptr && params_base.slot_prompt_similarity != 0.0f) {
float sim_best = 0;

for (server_slot & slot : slots) {
Expand All @@ -1093,7 +1097,7 @@ struct server_context_impl {
const float sim_cur = float(tokens.get_common_prefix(task.tokens)) / task.tokens.size();

// select the current slot if the criteria match
if (sim_cur > sim_best && sim_cur > slot_prompt_similarity) {
if (sim_cur > sim_best && sim_cur > params_base.slot_prompt_similarity) {
sim_best = sim_cur;

ret = &slot;
Expand All @@ -1104,7 +1108,7 @@ struct server_context_impl {
const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size();

SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n",
sim_best, slot_prompt_similarity, f_keep);
sim_best, params_base.slot_prompt_similarity, f_keep);

// if we are about to lose a large portion of the existing context - save it in the prompt cache
if (f_keep < 0.5f) {
Expand Down