15 changes: 15 additions & 0 deletions backend/cpp/llama-cpp/grpc-server.cpp
@@ -642,6 +642,21 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
            } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
                params.no_op_offload = false;
            }
        } else if (!strcmp(optname, "split_mode") || !strcmp(optname, "sm")) {
            // Accepts: none | layer | row | tensor (the latter requires a llama.cpp build
            // that includes ggml-org/llama.cpp#19378, FlashAttention enabled, and KV-cache
            // quantization disabled).
            if (optval != NULL) {
                if (optval_str == "none") {
                    params.split_mode = LLAMA_SPLIT_MODE_NONE;
                } else if (optval_str == "layer") {
                    params.split_mode = LLAMA_SPLIT_MODE_LAYER;
                } else if (optval_str == "row") {
                    params.split_mode = LLAMA_SPLIT_MODE_ROW;
                } else if (optval_str == "tensor") {
                    params.split_mode = LLAMA_SPLIT_MODE_TENSOR;
                }
            }
        } else if (!strcmp(optname, "kv_unified") || !strcmp(optname, "unified_kv")) {
            if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
                params.kv_unified = true;
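
Note that the new branch has no final `else`, so an unrecognized value is silently ignored and `params.split_mode` keeps llama.cpp's default (`layer`). For illustration only, the same string-to-enum mapping can be written as a table-driven lookup; the sketch below is self-contained and not part of this PR, and `split_mode_sketch`/`parse_split_mode` are hypothetical stand-ins for llama.cpp's `LLAMA_SPLIT_MODE_*` values and the inline parsing above.

```cpp
// Sketch (assumed names, not part of the PR): table-driven option parsing.
#include <cassert>
#include <optional>
#include <string_view>

enum split_mode_sketch { SM_NONE, SM_LAYER, SM_ROW, SM_TENSOR };

static std::optional<split_mode_sketch> parse_split_mode(std::string_view v) {
    struct entry { std::string_view name; split_mode_sketch mode; };
    static constexpr entry table[] = {
        {"none", SM_NONE}, {"layer", SM_LAYER},
        {"row", SM_ROW},   {"tensor", SM_TENSOR},
    };
    for (const auto& e : table) {
        if (v == e.name) return e.mode;   // recognized value
    }
    return std::nullopt;                  // unknown: caller keeps the default
}

int main() {
    assert(parse_split_mode("tensor") == SM_TENSOR);
    assert(!parse_split_mode("bogus").has_value()); // default preserved
    return 0;
}
```

Returning `std::nullopt` for unrecognized values mirrors the PR's behavior of leaving `params.split_mode` untouched rather than failing.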
1 change: 1 addition & 0 deletions docs/content/features/text-generation.md
@@ -514,6 +514,7 @@ The `llama.cpp` backend supports additional configuration options that can be sp
| `no_op_offload` | boolean | Disable offloading host tensor operations to device. Default: `false`. | `no_op_offload:true` |
| `kv_unified` or `unified_kv` | boolean | Enable unified KV cache. Default: `false`. | `kv_unified:true` |
| `n_ctx_checkpoints` or `ctx_checkpoints` | integer | Maximum number of context checkpoints per slot. Default: `8`. | `ctx_checkpoints:4` |
| `split_mode` or `sm` | string | How to split the model across multiple GPUs: `none` (single GPU only), `layer` (default — split layers and KV across GPUs), `row` (split rows across GPUs), `tensor` (experimental tensor parallelism — requires `flash_attention: true`, no KV-cache quantization, manually set `context_size`, and a llama.cpp build that includes [#19378](https://github.com/ggml-org/llama.cpp/pull/19378)). | `split_mode:tensor` |

**Example configuration with options:**
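
A minimal sketch of a model YAML enabling the new option, assuming LocalAI's standard model-config layout; the model name and GGUF file are placeholders, and the `flash_attention` and `context_size` settings follow the requirements listed in the table above:

```yaml
name: my-model                 # placeholder model name
backend: llama-cpp
context_size: 8192             # split_mode:tensor requires an explicitly set context size
flash_attention: true          # required for split_mode:tensor
# Leave KV-cache quantization at its default (disabled); tensor split does not support it.
parameters:
  model: my-model.gguf         # placeholder GGUF file
options:
  - split_mode:tensor          # alternatives: none | layer (default) | row
```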
