diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 9869ef2cc14c..6bf144e913b3 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -642,6 +642,21 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
         } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
             params.no_op_offload = false;
         }
+    } else if (!strcmp(optname, "split_mode") || !strcmp(optname, "sm")) {
+        // Accepts: none | layer | row | tensor (the latter requires a llama.cpp build
+        // that includes ggml-org/llama.cpp#19378, FlashAttention enabled, and KV-cache
+        // quantization disabled).
+        if (optval != NULL) {
+            if (optval_str == "none") {
+                params.split_mode = LLAMA_SPLIT_MODE_NONE;
+            } else if (optval_str == "layer") {
+                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+            } else if (optval_str == "row") {
+                params.split_mode = LLAMA_SPLIT_MODE_ROW;
+            } else if (optval_str == "tensor") {
+                params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
+            }
+        }
     } else if (!strcmp(optname, "kv_unified") || !strcmp(optname, "unified_kv")) {
         if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
             params.kv_unified = true;
diff --git a/docs/content/features/text-generation.md b/docs/content/features/text-generation.md
index 05efcc8fe504..fdc4edc728e0 100644
--- a/docs/content/features/text-generation.md
+++ b/docs/content/features/text-generation.md
@@ -514,6 +514,7 @@ The `llama.cpp` backend supports additional configuration options that can be sp
 | `no_op_offload` | boolean | Disable offloading host tensor operations to device. Default: `false`. | `no_op_offload:true` |
 | `kv_unified` or `unified_kv` | boolean | Enable unified KV cache. Default: `false`. | `kv_unified:true` |
 | `n_ctx_checkpoints` or `ctx_checkpoints` | integer | Maximum number of context checkpoints per slot. Default: `8`. | `ctx_checkpoints:4` |
+| `split_mode` or `sm` | string | How to split the model across multiple GPUs: `none` (single GPU only), `layer` (default — split layers and KV across GPUs), `row` (split rows across GPUs), `tensor` (experimental tensor parallelism — requires `flash_attention: true`, no KV-cache quantization, manually set `context_size`, and a llama.cpp build that includes [#19378](https://github.com/ggml-org/llama.cpp/pull/19378)). | `split_mode:tensor` |
 
 **Example configuration with options:**
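
For illustration, here is a minimal sketch of a model definition that exercises the new option. It assumes the usual LocalAI model YAML layout (`name`, `backend`, `context_size`, `flash_attention`, `parameters`, `options`); the model name and GGUF path are placeholders, and the constraints mirror the requirements listed in the documentation row above:

```yaml
# Hypothetical model config (placeholder name/path) showing split_mode:tensor.
name: my-model
backend: llama-cpp
context_size: 8192        # tensor parallelism requires context_size to be set manually
flash_attention: true     # required for split_mode:tensor
parameters:
  model: my-model.gguf    # placeholder GGUF file
options:
  - split_mode:tensor     # alternatives: none | layer (default) | row
```

With `layer` or `row`, the `flash_attention` and `context_size` requirements listed for `tensor` do not apply.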