diff --git a/common/arg.cpp b/common/arg.cpp
index 07ba7193525..f1f4c12a3ce 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2223,7 +2223,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",
-            "comma separated list of RPC servers (host:port)",
+            "comma-separated list of RPC servers (host:port)",
            [](common_params & params, const std::string & value) {
                add_rpc_devices(value);
                GGML_UNUSED(params);
@@ -3555,7 +3555,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
        {"--spec-type"}, common_speculative_all_types_str(),
-        string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
+        string_format("comma-separated list of types of speculative decoding to use (default: %s)\n",
            common_speculative_type_name_str(params.speculative.types).c_str()),
        [](common_params & params, const std::string & value) {
            const auto enabled_types = string_split<std::string>(value, ',');
diff --git a/common/common.h b/common/common.h
index aafc376f2e7..a3cd1743957 100644
--- a/common/common.h
+++ b/common/common.h
@@ -157,9 +157,9 @@ enum common_params_sampling_config : uint64_t {

 enum common_speculative_type {
     COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
-    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
-    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // standalone draft model speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
+    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
     COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
@@ -342,6 +342,7 @@ struct common_params_speculative_ngram_cache {
 struct common_params_speculative {
     std::vector<common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };

+    // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
     common_params_speculative_draft draft;

     common_params_speculative_ngram_mod ngram_mod;
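Side note on the renamed types: `common_speculative_init` (in common/speculative.cpp below) checks each requested type against a bitmask built from `params.types`, e.g. `enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)`. A minimal, self-contained sketch of that pattern, with an abridged enum and `enabled_mask` as a hypothetical stand-in for `common_get_enabled_speculative_configs()`:

```cpp
#include <cstdint>
#include <vector>

// abridged mirror of common_speculative_type (common/common.h)
enum spec_type : uint32_t {
    SPEC_TYPE_NONE = 0,
    SPEC_TYPE_DRAFT_SIMPLE,
    SPEC_TYPE_DRAFT_EAGLE3,
    SPEC_TYPE_NGRAM_SIMPLE,
};

// hypothetical stand-in for common_get_enabled_speculative_configs():
// fold the requested types into a mask so each type is a single-bit test
static uint32_t enabled_mask(const std::vector<spec_type> & types) {
    uint32_t mask = 0;
    for (spec_type t : types) {
        mask |= 1u << t;
    }
    return mask;
}

int main() {
    const uint32_t mask = enabled_mask({ SPEC_TYPE_DRAFT_SIMPLE, SPEC_TYPE_NGRAM_SIMPLE });
    const bool has_draft_simple = (mask & (1u << SPEC_TYPE_DRAFT_SIMPLE)) != 0; // true
    return has_draft_simple ? 0 : 1;
}
```

Because each enumerator maps to one bit of the mask, the ordering of the enum is load-bearing, which is worth keeping in mind when renaming entries as this patch does.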
diff --git a/common/speculative.cpp b/common/speculative.cpp
index e487e003d39..0eebcb3dcfe 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -21,8 +21,8 @@ const std::map<std::string, common_speculative_type> common_speculative_type_from_name_map = {
     {"none",         COMMON_SPECULATIVE_TYPE_NONE},
-    {"draft",        COMMON_SPECULATIVE_TYPE_DRAFT},
-    {"eagle3",       COMMON_SPECULATIVE_TYPE_EAGLE3},
+    {"draft-simple", COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE},
+    {"draft-eagle3", COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3},
     {"ngram-simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
     {"ngram-map-k",  COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
     {"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
@@ -145,15 +145,15 @@ struct common_speculative_impl {
     virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0;
 };

-struct common_speculative_state_draft : public common_speculative_impl {
+struct common_speculative_impl_draft_simple : public common_speculative_impl {
     common_params_speculative_draft params;

     llama_batch batch;

     std::vector<common_sampler *> smpls;

-    common_speculative_state_draft(const common_params_speculative & params, uint32_t n_seq)
-        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT, n_seq)
+    common_speculative_impl_draft_simple(const common_params_speculative & params, uint32_t n_seq)
+        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, n_seq)
         , params(params.draft) {
         auto * ctx_dft = this->params.ctx_dft;
@@ -206,7 +206,7 @@ struct common_speculative_state_draft : public common_speculative_impl {
         }
     }

-    ~common_speculative_state_draft() override {
+    ~common_speculative_impl_draft_simple() override {
         llama_batch_free(batch);
     }
@@ -340,11 +340,11 @@ struct common_speculative_state_draft : public common_speculative_impl {
     }
 };

-struct common_speculative_state_eagle3 : public common_speculative_impl {
+struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
     //common_params_speculative_eagle3 params;

-    common_speculative_state_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq)
-        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_EAGLE3, n_seq) {}
+    common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq)
+        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {}

     void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
         // noop
@@ -365,13 +365,13 @@
 };

 // state of self-speculation (simple implementation, not ngram-map)
-struct common_speculative_state_ngram_simple : public common_speculative_impl {
+struct common_speculative_impl_ngram_simple : public common_speculative_impl {
     common_params_speculative_ngram_map params;

     // shared across all sequences
     common_ngram_simple_config config;

-    common_speculative_state_ngram_simple(
+    common_speculative_impl_ngram_simple(
         const common_params_speculative & params, uint32_t n_seq, common_ngram_simple_config config)
         : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq)
@@ -405,13 +405,13 @@ struct common_speculative_state_ngram_simple : public common_speculative_impl {
     }
 };

-struct common_speculative_state_ngram_map_k : public common_speculative_impl {
+struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
     common_params_speculative_ngram_map params;

     // n_seq configs
     std::vector<common_ngram_map> config;

-    common_speculative_state_ngram_map_k(
+    common_speculative_impl_ngram_map_k(
         const common_params_speculative & params,
         const common_ngram_map & config,
         uint32_t n_seq)
@@ -453,7 +453,7 @@
     }
 };

-struct common_speculative_state_ngram_mod : public common_speculative_impl {
+struct common_speculative_impl_ngram_mod : public common_speculative_impl {
     common_params_speculative_ngram_mod params;

     // shared across all sequences
@@ -475,7 +475,7 @@
     std::vector sinfos;

-    common_speculative_state_ngram_mod(
+    common_speculative_impl_ngram_mod(
         const common_params_speculative & params, uint32_t n_seq)
         : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, n_seq)
@@ -621,7 +621,7 @@
     }
 };

-struct common_speculative_state_ngram_cache : public common_speculative_impl {
+struct common_speculative_impl_ngram_cache : public common_speculative_impl {
     common_params_speculative_ngram_cache params;

     uint16_t n_draft;
@@ -639,7 +639,7 @@
     std::vector sinfos;

-    common_speculative_state_ngram_cache(
+    common_speculative_impl_ngram_cache(
         const common_params_speculative & params,
         uint32_t n_seq,
         uint16_t n_draft,
@@ -775,7 +775,7 @@
     return common_ngram_map(size_key, size_value, key_only, min_hits);
 }

-static common_speculative_state_ngram_cache create_state_ngram_cache(
+static common_speculative_impl_ngram_cache create_state_ngram_cache(
     const common_speculative_config & config,
     uint32_t n_seq,
     const std::string & path_static,
@@ -786,7 +786,7 @@
     bool save_static  = false;
     bool save_dynamic = false;

-    common_speculative_state_ngram_cache state(config.params, n_seq, n_draft, path_static, path_dynamic, save_static, save_dynamic);
+    common_speculative_impl_ngram_cache state(config.params, n_seq, n_draft, path_static, path_dynamic, save_static, save_dynamic);

     return state;
 }
@@ -818,8 +818,8 @@ const char * common_speculative_all_types_str() {
 std::string common_speculative_type_to_str(common_speculative_type type) {
     switch (type) {
         case COMMON_SPECULATIVE_TYPE_NONE:          return "none";
-        case COMMON_SPECULATIVE_TYPE_DRAFT:         return "draft";
-        case COMMON_SPECULATIVE_TYPE_EAGLE3:        return "eagle3";
+        case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE:  return "draft-simple";
+        case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3:  return "draft-eagle3";
         case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:  return "ngram-simple";
         case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:   return "ngram-map-k";
         case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v";
@@ -872,9 +872,9 @@ common_speculative * common_speculative_init(common_params_speculative & params,
     {
         uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);

-        bool has_draft       = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT));
-        bool has_draft_model = !params.draft.mparams.path.empty();
+        bool has_draft_model_path = !params.draft.mparams.path.empty();
+
+        bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
         // bool has_mtp = false; // TODO: add MTP here
         bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
@@ -906,22 +906,22 @@ common_speculative * common_speculative_init(common_params_speculative & params,
         if (has_ngram_cache) {
             configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
         }

-        if (has_draft) {
-            if (!has_draft_model) {
+        if (has_draft_simple) {
+            if (!has_draft_model_path) {
                 LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__);
-                has_draft = false;
+                has_draft_simple = false;
             }
-        } else if (has_draft_model) {
+        } else if (has_draft_model_path) {
             LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__);
-            has_draft = true;
+            has_draft_simple = true;
         }

-        if (has_draft) {
-            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT, params));
+        if (has_draft_simple) {
+            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params));
         }

         // TODO: add MTP here
         if (has_draft_eagle3) {
-            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_EAGLE3, params));
+            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params));
         }
     }
@@ -932,12 +932,12 @@ common_speculative * common_speculative_init(common_params_speculative & params,
         switch (config.type) {
             case COMMON_SPECULATIVE_TYPE_NONE:
                 break;
-            case COMMON_SPECULATIVE_TYPE_DRAFT: {
-                impls.push_back(std::make_unique<common_speculative_state_draft>(config.params, n_seq));
+            case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: {
+                impls.push_back(std::make_unique<common_speculative_impl_draft_simple>(config.params, n_seq));
                 break;
             }
-            case COMMON_SPECULATIVE_TYPE_EAGLE3: {
-                impls.push_back(std::make_unique<common_speculative_state_eagle3>(config.params, n_seq));
+            case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: {
+                impls.push_back(std::make_unique<common_speculative_impl_draft_eagle3>(config.params, n_seq));
                 break;
             }
             case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
@@ -950,7 +950,7 @@
                     /* .size_ngram = */ ngram_size_key,
                     /* .size_mgram = */ mgram_size_value
                 };
-                auto state = std::make_unique<common_speculative_state_ngram_simple>(
+                auto state = std::make_unique<common_speculative_impl_ngram_simple>(
                     /* .params = */ config.params,
                     /* .n_seq  = */ n_seq,
                     /* .state  = */ config_simple
@@ -961,13 +961,13 @@
             case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
             case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
                 impls.push_back(
-                    std::make_unique<common_speculative_state_ngram_map_k>(
+                    std::make_unique<common_speculative_impl_ngram_map_k>(
                         config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq));
                 break;
             }
             case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
                 impls.push_back(
-                    std::make_unique<common_speculative_state_ngram_mod>(config.params, n_seq));
+                    std::make_unique<common_speculative_impl_ngram_mod>(config.params, n_seq));
                 break;
             }
             case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
@@ -975,7 +975,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
                     config, n_seq,
                     params.ngram_cache.lookup_cache_static, params.ngram_cache.lookup_cache_dynamic);

-                impls.push_back(std::make_unique<common_speculative_state_ngram_cache>(state));
+                impls.push_back(std::make_unique<common_speculative_impl_ngram_cache>(state));
                 break;
             }
             default:
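With `--spec-type` now accepting several types at once, the handler in common/arg.cpp splits the value on commas and resolves each token through `common_speculative_type_from_name_map` above. A self-contained sketch of that resolution step, using an abridged map and a hypothetical helper name:

```cpp
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

enum spec_type { SPEC_TYPE_NONE, SPEC_TYPE_DRAFT_SIMPLE, SPEC_TYPE_NGRAM_MOD };

// abridged mirror of common_speculative_type_from_name_map
static const std::map<std::string, spec_type> spec_type_from_name = {
    {"none",         SPEC_TYPE_NONE},
    {"draft-simple", SPEC_TYPE_DRAFT_SIMPLE},
    {"ngram-mod",    SPEC_TYPE_NGRAM_MOD},
};

// hypothetical helper: split "a,b,c" and map every token to its enum value
static std::vector<spec_type> parse_spec_types(const std::string & value) {
    std::vector<spec_type> out;
    std::stringstream ss(value);
    std::string tok;
    while (std::getline(ss, tok, ',')) {
        const auto it = spec_type_from_name.find(tok);
        if (it == spec_type_from_name.end()) {
            throw std::invalid_argument("unknown speculative type: " + tok);
        }
        out.push_back(it->second);
    }
    return out;
}

int main() {
    // mirrors --spec-type draft-simple,ngram-mod
    return parse_spec_types("draft-simple,ngram-mod").size() == 2 ? 0 : 1;
}
```

Unknown tokens are rejected up front; the valid names shown in the `--spec-type` help text come from `common_speculative_all_types_str()`.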
diff --git a/examples/llama-eval/llama-eval.py b/examples/llama-eval/llama-eval.py
index b33a3615be3..a2948273512 100755
--- a/examples/llama-eval/llama-eval.py
+++ b/examples/llama-eval/llama-eval.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# type: ignore

 import argparse
 import json
@@ -100,6 +99,8 @@ def wilson_interval(correct: int, total: int, z: float = 1.96) -> Tuple[float, f


 class BaseDataset(ABC):
+    questions: List[Dict]
+
     @abstractmethod
     def get_question(self, index: int) -> Dict:
         pass
@@ -573,7 +574,7 @@ def normalize_number(s: str) -> Optional[int]:
 class AimeDataset(BaseDataset):
     def __init__(self, split: str = "train"):
         self.split = split
-        self.questions: List[Dict] = []
+        self.questions = []
         self._load_dataset()

     def _load_dataset(self):
@@ -618,7 +619,7 @@ def get_prompt(self, question: Dict) -> str:

 class Aime2025Dataset(BaseDataset):
     def __init__(self):
-        self.questions: List[Dict] = []
+        self.questions = []
         self._load_dataset()

     def _load_dataset(self):
@@ -681,7 +682,7 @@ def get_prompt(self, question: Dict) -> str:

 class Gsm8kDataset(BaseDataset):
     def __init__(self, split: str = "test"):
         self.split = split
-        self.questions: List[Dict] = []
+        self.questions = []
         self._load_dataset()

     def _load_dataset(self):
@@ -742,7 +743,7 @@ class GpqaDataset(BaseDataset):
     def __init__(self, variant: str = "diamond", seed: int = 1234):
         self.variant = variant
         self.seed = seed
-        self.questions: List[Dict] = []
+        self.questions = []
         self._load_dataset()

     def _load_dataset(self):
diff --git a/ggml/src/ggml-zendnn/CMakeLists.txt b/ggml/src/ggml-zendnn/CMakeLists.txt
index 4f321a25257..f1e4f991fae 100644
--- a/ggml/src/ggml-zendnn/CMakeLists.txt
+++ b/ggml/src/ggml-zendnn/CMakeLists.txt
@@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
     ExternalProject_Add(
         zendnn
         GIT_REPOSITORY https://github.com/amd/ZenDNN.git
-        GIT_TAG f79f7321a1add65ced6397a6bfab7edba6e3e14e # ZenDNN-2026-WW13
+        GIT_TAG ac9e580d9434b7b98985f2627a7ebfb5eba4bb0d # ZenDNN-2026-WW17
         PREFIX ${ZENDNN_PREFIX}
         SOURCE_DIR ${ZENDNN_SOURCE_DIR}
         BINARY_DIR ${ZENDNN_BUILD_DIR}
diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
index 2b82c7c1dbb..6a83bb6b1ec 100644
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@@ -47,6 +47,7 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
     params.dtypes.dst = ggml_to_zendnn_type();
     params.num_threads = ctx->n_threads;

+    zendnnl::lowoha::matmul::matmul_batch_params_t batch_params;
     zendnnl::error_handling::status_t status = zendnnl::lowoha::matmul::matmul_direct(
         'r', false, true, // row-major, don't transpose B, transpose A (because it's column-major)
         n,                // M: rows of B and C
@@ -59,7 +60,7 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
         0.0f,             // beta
         C, ldc,           // output C[n,m]
         true,             // is_weights_const
-        {},               // batch_params
+        batch_params,     // batch_params
         params            // params
     );

@@ -520,6 +521,12 @@ static ggml_backend_buffer_t ggml_backend_zendnn_device_buffer_from_host_ptr(ggm
     GGML_UNUSED(max_tensor_size);
 }

+static bool ggml_zendnn_adaptive_fallback_enabled() {
+    static const bool enabled = std::getenv("GGML_ZENDNN_ADAPTIVE_FALLBACK") == nullptr ||
+                                std::atoi(std::getenv("GGML_ZENDNN_ADAPTIVE_FALLBACK")) != 0;
+    return enabled;
+}
+
 static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_NONE:
@@ -538,12 +545,25 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
             const int64_t ne10 = inputs->ne[0];

             const int64_t ne0 = op->ne[0];
             const int64_t ne1 = op->ne[1];
             const int64_t min_batch = 1;
-            if (!ggml_is_contiguous(weights) || !ggml_is_contiguous(inputs) ||
-                ne0 < min_batch || ne1 < min_batch || ne10 < min_batch) {
+
+            if (!ggml_is_contiguous(weights) || !ggml_is_contiguous(inputs)) {
+                return false;
+            }
+
+            if (ggml_zendnn_adaptive_fallback_enabled()) {
+                const int64_t K = inputs->ne[0];
+                const int64_t N = (inputs->ne[1]*inputs->ne[2]*inputs->ne[3]);
+                const int64_t M = weights->ne[1];
+                if (K <= 256 || N <= 128 || M <= 96) {
                     return false;
+                }
             }
+            else if (ne0 < min_batch || ne1 < min_batch || ne10 < min_batch) {
+                return false;
+            }
+
             // MUL_MAT_ID performs best with a moderate number of experts due to its
             // gather + batched matmul + scatter approach. Future versions will leverage
             // ZenDNN's grouped_gemm for better scalability with larger expert counts:
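The new `ggml_zendnn_adaptive_fallback_enabled()` gate is on unless the `GGML_ZENDNN_ADAPTIVE_FALLBACK` environment variable is set to `0`; when active, small GEMM shapes are refused so they fall back to the default CPU path. An illustrative, standalone sketch of the same decision (thresholds copied from the hunk above; the function names here are hypothetical):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// same opt-out pattern as the patch: enabled unless GGML_ZENDNN_ADAPTIVE_FALLBACK=0
static bool adaptive_fallback_enabled() {
    static const bool enabled = [] {
        const char * v = std::getenv("GGML_ZENDNN_ADAPTIVE_FALLBACK");
        return v == nullptr || std::atoi(v) != 0;
    }();
    return enabled;
}

// thresholds from the patch: refuse shapes where ZenDNN is unlikely to win
static bool zendnn_should_take_mul_mat(int64_t K, int64_t N, int64_t M) {
    if (adaptive_fallback_enabled() && (K <= 256 || N <= 128 || M <= 96)) {
        return false; // fall back to the default backend for small GEMMs
    }
    return true;
}

int main() {
    // e.g. single-token decode against a 4096x4096 weight: N = 1, so fall back
    std::printf("take: %d\n", zendnn_should_take_mul_mat(4096, 1, 4096));
    return 0;
}
```

Setting `GGML_ZENDNN_ADAPTIVE_FALLBACK=0` restores the previous behavior, where only the `min_batch` sanity check applies.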
diff --git a/tools/cli/README.md b/tools/cli/README.md
index 02c564a2906..9f0574d25d3 100644
--- a/tools/cli/README.md
+++ b/tools/cli/README.md
@@ -55,6 +55,7 @@
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br>(default: f16)<br>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br>(env: LLAMA_ARG_N_PARALLEL) |
+| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)<br>(env: LLAMA_ARG_RPC) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br>(env: LLAMA_ARG_MLOCK) |
 | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br>(env: LLAMA_ARG_MMAP) |
 | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br>(env: LLAMA_ARG_DIO) |
@@ -198,7 +199,7 @@
 | `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br>use --list-devices to see a list of available devices |
 | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
 | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
-| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)<br><br>(env: LLAMA_ARG_SPEC_TYPE) |
+| `--spec-type none,draft-simple,draft-eagle3,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)<br><br>(env: LLAMA_ARG_SPEC_TYPE) |
 | `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) |
 | `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) |
 | `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) |
diff --git a/tools/completion/README.md b/tools/completion/README.md
index 7042889db13..048cf7416fc 100644
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -138,6 +138,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br>(default: f16)<br>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br>(env: LLAMA_ARG_N_PARALLEL) |
+| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)<br>(env: LLAMA_ARG_RPC) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br>(env: LLAMA_ARG_MLOCK) |
 | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br>(env: LLAMA_ARG_MMAP) |
 | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br>(env: LLAMA_ARG_DIO) |
diff --git a/tools/server/README.md b/tools/server/README.md
index 7f856faa813..07e180929e5 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -72,6 +72,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `-ctk, --cache-type-k TYPE` | KV cache data type for K<br>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br>(default: f16)<br>(env: LLAMA_ARG_CACHE_TYPE_K) |
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br>(default: f16)<br>(env: LLAMA_ARG_CACHE_TYPE_V) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br>(env: LLAMA_ARG_DEFRAG_THOLD) |
+| `--rpc SERVERS` | comma-separated list of RPC servers (host:port)<br>(env: LLAMA_ARG_RPC) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br>(env: LLAMA_ARG_MLOCK) |
 | `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br>(env: LLAMA_ARG_MMAP) |
 | `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br>(env: LLAMA_ARG_DIO) |
@@ -247,7 +248,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--spec-draft-device, -devd, --device-draft <dev1,dev2,..>` | comma-separated list of devices to use for offloading the draft model (none = don't offload)<br>use --list-devices to see a list of available devices |
 | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
 | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br>(env: LLAMA_ARG_SPEC_DRAFT_MODEL) |
-| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)<br><br>(env: LLAMA_ARG_SPEC_TYPE) |
+| `--spec-type none,draft-simple,draft-eagle3,ngram-simple,ngram-map-k,ngram-map-k4v,ngram-mod,ngram-cache` | comma-separated list of types of speculative decoding to use (default: none)<br><br>(env: LLAMA_ARG_SPEC_TYPE) |
 | `--spec-ngram-mod-n-min N` | minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48) |
 | `--spec-ngram-mod-n-max N` | maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64) |
 | `--spec-ngram-mod-n-match N` | ngram-mod lookup length (default: 24) |
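For reference, the reconciliation between an explicit `--spec-type` selection and a `-md/--model-draft` path (the `has_draft_simple`/`has_draft_model_path` logic in common/speculative.cpp) boils down to the following sketch; the names are simplified, and the warnings mirror the `LOG_WRN` messages in the hunk above:

```cpp
#include <cstdio>
#include <string>

// simplified mirror of the draft-simple enabling logic in common_speculative_init()
static bool resolve_draft_simple(bool requested, const std::string & draft_model_path) {
    const bool has_path = !draft_model_path.empty();
    if (requested && !has_path) {
        std::fprintf(stderr, "draft model is not specified - cannot use 'draft' type\n");
        return false; // requested but unusable without a model
    }
    if (!requested && has_path) {
        std::fprintf(stderr, "draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n");
        return true;  // auto-enable, preserving the old -md behavior
    }
    return requested;
}

int main() {
    // -md draft.gguf given without draft-simple in --spec-type: auto-enabled with a warning
    return resolve_draft_simple(false, "draft.gguf") ? 0 : 1;
}
```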