From d4976a0e0159de502be52a5538e07677e3b08c0a Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Mon, 11 May 2026 15:21:38 -0400
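Subject: [PATCH 1/3] Add local server metadata introspection

Add a GET /props endpoint that returns a JSON description of the running
server: model identity and path, runtime configuration, reasoning and
sampling defaults, context memory estimates, KV disk cache settings, tool
replay counters, and the supported API surface. In this commit the
endpoint answers loopback peers only; other clients receive a 404.

Also add a "meta" object (backend, routed expert quant bits, MTP and
reasoning fields) to the model JSON, and cover both payloads with unit
tests.
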
---
 ds4_server.c | 354 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 337 insertions(+), 17 deletions(-)

diff --git a/ds4_server.c b/ds4_server.c
index 0ae9767..e096d71 100644
--- a/ds4_server.c
+++ b/ds4_server.c
@@ -7460,10 +7460,25 @@ typedef struct {
 static bool id_list_contains(const stop_list *ids, const char *id);
 static void id_list_push_unique(stop_list *ids, const char *id);
 
+typedef struct {
+    const char *model_path;   /* model.path in /props; NULL -> null */
+    ds4_backend backend;      /* reported by name; feeds the memory estimate */
+    int ctx_size;             /* context size used by /props */
+    int default_tokens;       /* default max tokens used by /props */
+    int n_threads;            /* reported as "threads" */
+    int routed_quant_bits;    /* values <= 0 serialize as JSON null */
+    bool mtp_enabled;         /* reported as "mtp" */
+    int mtp_draft_tokens;     /* reported as "mtp_draft_tokens" */
+    bool quality;             /* reported as "quality" */
+    bool warm_weights;        /* reported as "warm_weights" */
+} server_runtime_config;      /* filled once in main(); read by the JSON emitters */
+
+
 struct server {
     ds4_engine *engine;
     ds4_session *session;
     int default_tokens;
+    server_runtime_config runtime;
     kv_disk_cache kv;
     tool_memory tool_mem;
     live_tool_state responses_live;
@@ -11042,8 +11057,46 @@ typedef struct {
     int fd;
 } client_arg;
 
-static void append_model_json_values(buf *b, int ctx, int default_tokens) {
-    const int max_completion = default_tokens < ctx ? default_tokens : ctx;
+static int effective_max_completion_tokens(int ctx, int default_tokens) {
+    return default_tokens < ctx ? default_tokens : ctx; /* smaller of the two */
+}
+
+static void append_model_supported_parameters_json(buf *b) {
+    buf_puts(b,  /* one JSON array, reused by the model object and /props "api" */
+        "["
+            "\"tools\","
+            "\"tool_choice\","
+            "\"max_tokens\","
+            "\"temperature\","
+            "\"top_p\","
+            "\"top_k\","
+            "\"min_p\","
+            "\"stop\","
+            "\"seed\","
+            "\"stream\","
+            "\"reasoning_effort\"]");
+}
+
+static void append_model_meta_json(buf *b, const server_runtime_config *rt) {
+    buf_puts(b, "{\"backend\":");  /* object start; the caller writes the "meta" key */
+    json_escape(b, ds4_backend_name(rt->backend));  /* emitted as a quoted string */
+    buf_puts(b, ",\"routed_expert_quant_bits\":");
+    if (rt->routed_quant_bits > 0) buf_printf(b, "%d", rt->routed_quant_bits);
+    else buf_puts(b, "null");  /* non-positive bit count is reported as null */
+    buf_printf(b,
+        ",\"mtp\":%s,"
+        "\"mtp_draft_tokens\":%d,"
+        "\"reasoning\":true,"
+        "\"reasoning_default\":\"high\","
+        "\"reasoning_max_min_context\":%u}",
+        rt->mtp_enabled ? "true" : "false",
+        rt->mtp_draft_tokens,
+        ds4_think_max_min_context());  /* NOTE(review): %u assumes an unsigned return; confirm */
+}
+
+static void append_model_json_values(buf *b, int ctx, int default_tokens,
+                                     const server_runtime_config *rt) {
+    const int max_completion = effective_max_completion_tokens(ctx, default_tokens);
     buf_printf(b,
         "{\"id\":\"deepseek-v4-flash\","
         "\"object\":\"model\","
@@ -11055,25 +11108,143 @@ static void append_model_json_values(buf *b, int ctx, int default_tokens) {
             "\"context_length\":%d,"
             "\"max_completion_tokens\":%d,"
             "\"is_moderated\":false},"
-        "\"supported_parameters\":["
-            "\"tools\","
-            "\"tool_choice\","
-            "\"max_tokens\","
-            "\"temperature\","
-            "\"top_p\","
-            "\"top_k\","
-            "\"min_p\","
-            "\"stop\","
-            "\"seed\","
-            "\"stream\","
-            "\"reasoning_effort\"]}",
+        "\"supported_parameters\":",
         ctx,
         ctx,
         max_completion);
+    append_model_supported_parameters_json(b);
+    buf_puts(b, ",\"meta\":");
+    append_model_meta_json(b, rt);
+    buf_putc(b, '}');
 }
 
 static void append_model_json(buf *b, const server *s) {
-    append_model_json_values(b, ds4_session_ctx(s->session), s->default_tokens);
+    append_model_json_values(b, ds4_session_ctx(s->session), s->default_tokens, &s->runtime);
+}
+
+static void append_nullable_string_json(buf *b, const char *s) {
+    if (s) json_escape(b, s);  /* a NULL pointer becomes the JSON literal null */
+    else buf_puts(b, "null");
+}
+
+static void append_props_json(buf *b, server *s) {
+    const server_runtime_config *rt = &s->runtime;
+    const int ctx = rt->ctx_size > 0 ? rt->ctx_size : ds4_session_ctx(s->session);
+    const int default_tokens = rt->default_tokens > 0 ? rt->default_tokens : s->default_tokens;
+    const int effective_tokens = effective_max_completion_tokens(ctx, default_tokens);
+    ds4_context_memory mem = ds4_context_memory_estimate(rt->backend, ctx);
+
+    int tool_ids = 0;
+    size_t tool_bytes = 0;
+    pthread_mutex_lock(&s->tool_mu);  /* read both counters together under tool_mu */
+    tool_ids = s->tool_mem.entries;
+    tool_bytes = s->tool_mem.bytes;
+    pthread_mutex_unlock(&s->tool_mu);
+
+    buf_puts(b, "{\"server\":{\"name\":\"ds4-server\"},\"model\":{");  /* root object opens here */
+    buf_puts(b, "\"id\":\"deepseek-v4-flash\",\"name\":\"DeepSeek V4 Flash\",\"path\":");
+    append_nullable_string_json(b, rt->model_path);
+    buf_puts(b, ",\"routed_expert_quant_bits\":");
+    if (rt->routed_quant_bits > 0) buf_printf(b, "%d", rt->routed_quant_bits);
+    else buf_puts(b, "null");  /* non-positive bit count is reported as null */
+    buf_printf(b, ",\"mtp\":%s,\"mtp_draft_tokens\":%d}",
+               rt->mtp_enabled ? "true" : "false",
+               rt->mtp_draft_tokens);
+
+    buf_puts(b, ",\"runtime\":{\"backend\":");
+    json_escape(b, ds4_backend_name(rt->backend));
+    buf_printf(b,
+        ",\"ctx_size\":%d,"
+        "\"default_max_tokens\":%d,"
+        "\"effective_max_completion_tokens\":%d,"
+        "\"threads\":%d,"
+        "\"quality\":%s,"
+        "\"warm_weights\":%s}",
+        ctx,
+        default_tokens,
+        effective_tokens,
+        rt->n_threads,
+        rt->quality ? "true" : "false",
+        rt->warm_weights ? "true" : "false");
+
+    buf_printf(b,
+        ",\"reasoning\":{"
+            "\"supported_efforts\":[\"low\",\"medium\",\"high\",\"xhigh\",\"max\"],"
+            "\"aliases\":{\"low\":\"high\",\"medium\":\"high\",\"xhigh\":\"high\"},"
+            "\"default\":\"high\","
+            "\"effective_default\":\"%s\","
+            "\"think_max_min_context\":%u}",
+        ds4_think_mode_name(ds4_think_mode_for_context(DS4_THINK_HIGH, ctx)),
+        ds4_think_max_min_context());
+
+    buf_puts(b,
+        ",\"sampling\":{"
+            "\"defaults\":{\"temperature\":1,\"top_p\":1,\"top_k\":0,\"min_p\":0},"
+            "\"thinking_override\":{\"enabled\":true,\"temperature\":1,\"top_p\":1,\"top_k\":0,\"min_p\":0},"
+            "\"tool_protocol_sampling\":{\"structural_temperature\":0}}");
+
+    buf_printf(b,
+        ",\"context_memory\":{"
+            "\"total_bytes\":%llu,"
+            "\"raw_bytes\":%llu,"
+            "\"compressed_bytes\":%llu,"
+            "\"scratch_bytes\":%llu,"
+            "\"prefill_cap\":%u,"
+            "\"raw_cap\":%u,"
+            "\"comp_cap\":%u}",
+        (unsigned long long)mem.total_bytes,
+        (unsigned long long)mem.raw_bytes,
+        (unsigned long long)mem.compressed_bytes,
+        (unsigned long long)mem.scratch_bytes,
+        mem.prefill_cap,
+        mem.raw_cap,
+        mem.comp_cap);
+
+    buf_puts(b, ",\"kv_disk_cache\":{");
+    buf_printf(b, "\"enabled\":%s,\"dir\":", s->kv.enabled ? "true" : "false");
+    append_nullable_string_json(b, s->kv.enabled ? s->kv.dir : NULL);
+    buf_printf(b,
+        ",\"budget_bytes\":%llu,"
+        "\"reject_different_quant\":%s,"
+        "\"policy\":{"
+            "\"min_tokens\":%d,"
+            "\"cold_max_tokens\":%d,"
+            "\"continued_interval_tokens\":%d,"
+            "\"boundary_trim_tokens\":%d,"
+            "\"boundary_align_tokens\":%d},"
+        "\"entries\":%d}",
+        (unsigned long long)(s->kv.enabled ? s->kv.budget_bytes : 0),  /* 0 when disabled */
+        s->kv.reject_different_quant ? "true" : "false",
+        s->kv.opt.min_tokens,
+        s->kv.opt.cold_max_tokens,
+        s->kv.opt.continued_interval_tokens,
+        s->kv.opt.boundary_trim_tokens,
+        s->kv.opt.boundary_align_tokens,
+        s->kv.enabled ? s->kv.len : 0);  /* 0 when disabled */
+
+    buf_printf(b,
+        ",\"tool_replay\":{"
+            "\"exact_dsml_replay_enabled\":%s,"
+            "\"max_ids\":%d,"
+            "\"current_ids\":%d,"
+            "\"current_bytes\":%llu}",
+        s->disable_exact_dsml_tool_replay ? "false" : "true",  /* "disable" flag, so inverted */
+        s->tool_mem.max_entries,  /* NOTE(review): read without tool_mu; confirm fixed after startup */
+        tool_ids,
+        (unsigned long long)tool_bytes);
+
+    buf_puts(b,
+        ",\"api\":{"
+            "\"endpoints\":["
+                "\"GET /v1/models\","
+                "\"GET /v1/models/deepseek-v4-flash\","
+                "\"GET /props\","
+                "\"POST /v1/chat/completions\","
+                "\"POST /v1/completions\","
+                "\"POST /v1/messages\"],"
+            "\"supported_request_parameters\":");
+    append_model_supported_parameters_json(b);
+    buf_puts(b, "}}\n");  /* closes "api" and the root object */
+}
 
 static bool send_model(server *s, int fd) {
@@ -11085,6 +11256,14 @@ static bool send_model(server *s, int fd) {
     return ok;
 }
 
+static bool send_props(server *s, int fd) {
+    buf b = {0};
+    append_props_json(&b, s);  /* build the whole payload in memory first */
+    bool ok = http_response(fd, 200, "application/json", b.ptr);
+    buf_free(&b);              /* freed whether or not the send succeeded */
+    return ok;                 /* result of http_response() */
+}
+
 static bool send_models(server *s, int fd) {
     buf b = {0};
     buf_puts(&b, "{\"object\":\"list\",\"data\":[");
@@ -11102,6 +11281,22 @@ static void client_done(server *s) {
     pthread_mutex_unlock(&s->mu);
 }
 
+static bool client_peer_is_loopback(int fd) {
+    struct sockaddr_storage ss;
+    socklen_t len = sizeof(ss);
+    if (getpeername(fd, (struct sockaddr *)&ss, &len) != 0) return false;
+    if (ss.ss_family == AF_INET) {
+        const struct sockaddr_in *in = (const struct sockaddr_in *)&ss;
+        uint32_t addr = ntohl(in->sin_addr.s_addr);
+        return (addr >> 24) == 127;
+    }
+    if (ss.ss_family == AF_INET6) {
+        const struct sockaddr_in6 *in6 = (const struct sockaddr_in6 *)&ss;
+        return IN6_IS_ADDR_LOOPBACK(&in6->sin6_addr);
+    }
+    return false;
+}
+
 static void set_client_socket_nonblocking(int fd);
 
 static void *client_main(void *arg) {
@@ -11126,6 +11321,12 @@ static void *client_main(void *arg) {
         http_request_free(&hr);
         goto done;
     }
+    if (!strcmp(hr.method, "GET") && !strcmp(hr.path, "/props")) {
+        if (client_peer_is_loopback(fd)) send_props(s, fd);
+        else http_error(fd, 404, "unknown endpoint");
+        http_request_free(&hr);
+        goto done;
+    }
 
     request req;
     char err[160];
@@ -11550,6 +11751,18 @@ int main(int argc, char **argv) {
     s.engine = engine;
     s.session = session;
     s.default_tokens = cfg.default_tokens;
+    s.runtime = (server_runtime_config){
+        .model_path = cfg.engine.model_path,
+        .backend = cfg.engine.backend,
+        .ctx_size = cfg.ctx_size,
+        .default_tokens = cfg.default_tokens,
+        .n_threads = cfg.engine.n_threads,
+        .routed_quant_bits = ds4_engine_routed_quant_bits(engine),
+        .mtp_enabled = cfg.engine.mtp_path != NULL,
+        .mtp_draft_tokens = cfg.engine.mtp_draft_tokens,
+        .quality = cfg.engine.quality,
+        .warm_weights = cfg.engine.warm_weights,
+    };
     s.disable_exact_dsml_tool_replay = cfg.disable_exact_dsml_tool_replay;
     s.tool_mem.max_entries = cfg.tool_memory_max_ids;
     if (cfg.kv_disk_dir) {
@@ -13690,18 +13903,122 @@ static void test_json_skip_has_nesting_limit(void) {
 }
 
 static void test_model_metadata_clamps_completion_to_context(void) {
+    server_runtime_config rt = {
+        .backend = DS4_BACKEND_CPU,
+        .routed_quant_bits = 2,
+        .mtp_enabled = false,
+        .mtp_draft_tokens = 1,
+    };
     buf b = {0};
-    append_model_json_values(&b, 32768, 393216);
+    append_model_json_values(&b, 32768, 393216, &rt);
     TEST_ASSERT(strstr(b.ptr, "\"context_length\":32768") != NULL);
     TEST_ASSERT(strstr(b.ptr, "\"max_completion_tokens\":32768") != NULL);
     buf_free(&b);
 
-    append_model_json_values(&b, 100000, 4096);
+    append_model_json_values(&b, 100000, 4096, &rt);
     TEST_ASSERT(strstr(b.ptr, "\"context_length\":100000") != NULL);
     TEST_ASSERT(strstr(b.ptr, "\"max_completion_tokens\":4096") != NULL);
     buf_free(&b);
 }
 
+static void test_model_metadata_contains_meta_fields(void) {
+    server_runtime_config rt = {  /* fields not listed here stay zero */
+        .backend = DS4_BACKEND_CPU,
+        .routed_quant_bits = 4,
+        .mtp_enabled = true,
+        .mtp_draft_tokens = 2,
+    };
+    buf b = {0};
+    append_model_json_values(&b, 32768, 393216, &rt);
+    TEST_ASSERT(strstr(b.ptr, "\"meta\":{") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"backend\":\"cpu\"") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"routed_expert_quant_bits\":4") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"mtp\":true") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"mtp_draft_tokens\":2") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"reasoning\":true") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"reasoning_default\":\"high\"") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"reasoning_max_min_context\":393216") != NULL);  /* NOTE(review): assumes ds4_think_max_min_context() == 393216 */
+    buf_free(&b);
+}
+
+static void test_props_server_init(server *s) {
+    memset(s, 0, sizeof(*s));  /* zeroed: kv.enabled stays false unless a test sets it */
+    s->default_tokens = 393216;
+    s->runtime = (server_runtime_config){
+        .model_path = "/tmp/ds4flash.gguf",
+        .backend = DS4_BACKEND_CPU,
+        .ctx_size = 32768,
+        .default_tokens = 393216,  /* larger than ctx_size, so the effective limit is clamped */
+        .n_threads = 7,
+        .routed_quant_bits = 2,
+        .mtp_enabled = true,
+        .mtp_draft_tokens = 3,
+        .quality = true,
+        .warm_weights = true,
+    };
+    s->tool_mem.max_entries = 1234;
+    pthread_mutex_init(&s->tool_mu, NULL);  /* each caller destroys it when done */
+}
+
+static void test_props_json_includes_runtime_and_api_metadata(void) {
+    server s;
+    test_props_server_init(&s);
+    s.kv.enabled = true;
+    s.kv.dir = xstrdup("/tmp/ds4-kv");
+    s.kv.budget_bytes = 8192;
+    s.kv.reject_different_quant = true;
+    s.kv.opt = kv_cache_default_options();
+    s.kv.len = 5;
+
+    buf b = {0};
+    append_props_json(&b, &s);
+    TEST_ASSERT(strstr(b.ptr, "\"server\":{\"name\":\"ds4-server\"}") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"path\":\"/tmp/ds4flash.gguf\"") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"backend\":\"cpu\"") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"ctx_size\":32768") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"default_max_tokens\":393216") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"effective_max_completion_tokens\":32768") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"threads\":7") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"quality\":true") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"warm_weights\":true") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"default\":\"high\"") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"think_max_min_context\":393216") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"context_memory\":{") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"total_bytes\":") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"raw_bytes\":") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"compressed_bytes\":") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"scratch_bytes\":") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"kv_disk_cache\":{") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"dir\":\"/tmp/ds4-kv\"") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"budget_bytes\":8192") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"entries\":5") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"exact_dsml_replay_enabled\":true") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"max_ids\":1234") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"GET /props\"") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"POST /v1/chat/completions\"") != NULL);
+
+    buf_free(&b);
+    free(s.kv.dir);
+    pthread_mutex_destroy(&s.tool_mu);
+}
+
+static void test_props_json_reports_disabled_disk_kv(void) {
+    server s;
+    test_props_server_init(&s);
+    s.disable_exact_dsml_tool_replay = true;
+
+    buf b = {0};
+    append_props_json(&b, &s);
+    TEST_ASSERT(strstr(b.ptr, "\"kv_disk_cache\":{\"enabled\":false") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"dir\":null") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"budget_bytes\":0") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"entries\":0") != NULL);
+    TEST_ASSERT(strstr(b.ptr, "\"exact_dsml_replay_enabled\":false") != NULL);
+
+    buf_free(&b);
+    pthread_mutex_destroy(&s.tool_mu);
+}
+
 static void test_client_socket_nonblocking_flag(void) {
     int sv[2];
     TEST_ASSERT(socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == 0);
@@ -14468,6 +14785,9 @@ static void ds4_server_unit_tests_run(void) {
     test_stop_list_streaming_holds_and_trims_stop_text();
     test_json_skip_has_nesting_limit();
     test_model_metadata_clamps_completion_to_context();
+    test_model_metadata_contains_meta_fields();
+    test_props_json_includes_runtime_and_api_metadata();
+    test_props_json_reports_disabled_disk_kv();
     test_client_socket_nonblocking_flag();
     test_thinking_state_tracks_prompt_and_generated_tags();
     test_thinking_checkpoint_remember_gate();

From 3c4ad28d9d844eea366f1afd9b2d5318af86632a Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Mon, 11 May 2026 16:03:35 -0400
Subject: [PATCH 2/3] Address /props review polish

- Drop the loopback gate on GET /props; the endpoint is now reachable like
  /v1/models, matching llama-server convention. The payload includes local
  filesystem paths (model path, KV cache directory), so operators are
  responsible for binding to a trusted interface.
- Remove ctx_size / default_tokens fallback ladders in append_props_json
  by relying on the runtime config being unconditionally populated in
  main().
- Switch MTP introspection to ds4_engine_has_mtp() and
  ds4_engine_mtp_draft_tokens() so the flag reflects engine state rather
  than coupling to the cfg.engine.mtp_path argv shape.
- Pull sampling defaults (temperature, top_p, top_k, min_p, tool-call
  structural temperature) into named macros and use them in request_init,
  the thinking override path, and the /props payload. %g formatting keeps
  the current JSON byte-identical while letting future non-integer
  defaults serialize cleanly.
- Note that the kv.len read in append_props_json is intentionally
  lockless and consistent with other kv.* reads in the function.
- Comment the api.endpoints array as a sync point with client_main()
  routing.
- Add a short header comment on append_props_json describing its
  payload sections.
---
 ds4_server.c | 70 ++++++++++++++++++++++++++--------------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/ds4_server.c b/ds4_server.c
index e096d71..b8fac45 100644
--- a/ds4_server.c
+++ b/ds4_server.c
@@ -40,6 +40,14 @@ static volatile sig_atomic_t g_listen_fd = -1;
 #define DS4_SERVER_IO_TIMEOUT_SEC 10
 #define DS4_SERVER_SEND_STALL_TIMEOUT_MS 2000
 
+/* Sampling defaults applied at request_init time and surfaced in /props. */
+#define DS4_DEFAULT_TEMPERATURE 1.0f
+#define DS4_DEFAULT_TOP_P       1.0f
+#define DS4_DEFAULT_TOP_K       0
+#define DS4_DEFAULT_MIN_P       0.0f
+/* Structural temperature used during tool-call protocol encoding. */
+#define DS4_TOOL_STRUCTURAL_TEMPERATURE 0.0f
+
 static void stop_signal_handler(int sig) {
     (void)sig;
     if (g_stop_requested) _exit(130);
@@ -742,10 +750,10 @@ static void request_init(request *r, req_kind kind, int max_tokens) {
     r->api = API_OPENAI;
     r->model = xstrdup("deepseek-v4-flash");
     r->max_tokens = max_tokens;
-    r->top_k = 0;
-    r->temperature = 1.0f;
-    r->top_p = 1.0f;
-    r->min_p = 0.0f;
+    r->top_k = DS4_DEFAULT_TOP_K;
+    r->temperature = DS4_DEFAULT_TEMPERATURE;
+    r->top_p = DS4_DEFAULT_TOP_P;
+    r->min_p = DS4_DEFAULT_MIN_P;
     r->think_mode = DS4_THINK_HIGH;
 }
 
@@ -10471,13 +10479,13 @@ static void generate_job(server *s, job *j) {
         float top_p = j->req.top_p;
         float min_p = j->req.min_p;
         if (ds4_think_mode_enabled(j->req.think_mode)) {
-            temperature = 1.0f;
-            top_k = 0;
-            top_p = 1.0f;
-            min_p = 0.0f;
+            temperature = DS4_DEFAULT_TEMPERATURE;
+            top_k = DS4_DEFAULT_TOP_K;
+            top_p = DS4_DEFAULT_TOP_P;
+            min_p = DS4_DEFAULT_MIN_P;
         }
         if (in_tool_call && !dsml_decode_state_uses_payload_sampling(dsml_state)) {
-            temperature = 0.0f;
+            temperature = DS4_TOOL_STRUCTURAL_TEMPERATURE;
         }
         int token = ds4_session_sample(s->session, temperature, top_k, top_p, min_p, &rng);
         if (token == ds4_token_eos(s->engine)) {
@@ -11127,10 +11135,12 @@ static void append_nullable_string_json(buf *b, const char *s) {
     else buf_puts(b, "null");
 }
 
+/* Emit the /props payload: server identity, model, runtime config, reasoning, sampling
+ * defaults, context memory estimates, KV disk cache, tool replay, and API surface. */
 static void append_props_json(buf *b, server *s) {
     const server_runtime_config *rt = &s->runtime;
-    const int ctx = rt->ctx_size > 0 ? rt->ctx_size : ds4_session_ctx(s->session);
-    const int default_tokens = rt->default_tokens > 0 ? rt->default_tokens : s->default_tokens;
+    const int ctx = rt->ctx_size;
+    const int default_tokens = rt->default_tokens;
     const int effective_tokens = effective_max_completion_tokens(ctx, default_tokens);
     ds4_context_memory mem = ds4_context_memory_estimate(rt->backend, ctx);
 
@@ -11177,11 +11187,16 @@ static void append_props_json(buf *b, server *s) {
         ds4_think_mode_name(ds4_think_mode_for_context(DS4_THINK_HIGH, ctx)),
         ds4_think_max_min_context());
 
-    buf_puts(b,
+    buf_printf(b,
         ",\"sampling\":{"
-            "\"defaults\":{\"temperature\":1,\"top_p\":1,\"top_k\":0,\"min_p\":0},"
-            "\"thinking_override\":{\"enabled\":true,\"temperature\":1,\"top_p\":1,\"top_k\":0,\"min_p\":0},"
-            "\"tool_protocol_sampling\":{\"structural_temperature\":0}}");
+            "\"defaults\":{\"temperature\":%g,\"top_p\":%g,\"top_k\":%d,\"min_p\":%g},"
+            "\"thinking_override\":{\"enabled\":true,\"temperature\":%g,\"top_p\":%g,\"top_k\":%d,\"min_p\":%g},"
+            "\"tool_protocol_sampling\":{\"structural_temperature\":%g}}",
+        (double)DS4_DEFAULT_TEMPERATURE, (double)DS4_DEFAULT_TOP_P,
+        DS4_DEFAULT_TOP_K, (double)DS4_DEFAULT_MIN_P,
+        (double)DS4_DEFAULT_TEMPERATURE, (double)DS4_DEFAULT_TOP_P,
+        DS4_DEFAULT_TOP_K, (double)DS4_DEFAULT_MIN_P,
+        (double)DS4_TOOL_STRUCTURAL_TEMPERATURE);
 
     buf_printf(b,
         ",\"context_memory\":{"
@@ -11200,6 +11215,7 @@ static void append_props_json(buf *b, server *s) {
         mem.raw_cap,
         mem.comp_cap);
 
+    /* kv.len is only mutated on the worker thread; lockless read here matches the rest of kv.* in this function. */
     buf_puts(b, ",\"kv_disk_cache\":{");
     buf_printf(b, "\"enabled\":%s,\"dir\":", s->kv.enabled ? "true" : "false");
     append_nullable_string_json(b, s->kv.enabled ? s->kv.dir : NULL);
@@ -11233,6 +11249,7 @@ static void append_props_json(buf *b, server *s) {
         tool_ids,
         (unsigned long long)tool_bytes);
 
+    // Keep this list in sync with the GET/POST routing in client_main().
     buf_puts(b,
         ",\"api\":{"
             "\"endpoints\":["
@@ -11281,22 +11298,6 @@ static void client_done(server *s) {
     pthread_mutex_unlock(&s->mu);
 }
 
-static bool client_peer_is_loopback(int fd) {
-    struct sockaddr_storage ss;
-    socklen_t len = sizeof(ss);
-    if (getpeername(fd, (struct sockaddr *)&ss, &len) != 0) return false;
-    if (ss.ss_family == AF_INET) {
-        const struct sockaddr_in *in = (const struct sockaddr_in *)&ss;
-        uint32_t addr = ntohl(in->sin_addr.s_addr);
-        return (addr >> 24) == 127;
-    }
-    if (ss.ss_family == AF_INET6) {
-        const struct sockaddr_in6 *in6 = (const struct sockaddr_in6 *)&ss;
-        return IN6_IS_ADDR_LOOPBACK(&in6->sin6_addr);
-    }
-    return false;
-}
-
 static void set_client_socket_nonblocking(int fd);
 
 static void *client_main(void *arg) {
@@ -11322,8 +11323,7 @@ static void *client_main(void *arg) {
         goto done;
     }
     if (!strcmp(hr.method, "GET") && !strcmp(hr.path, "/props")) {
-        if (client_peer_is_loopback(fd)) send_props(s, fd);
-        else http_error(fd, 404, "unknown endpoint");
+        send_props(s, fd);
         http_request_free(&hr);
         goto done;
     }
@@ -11758,8 +11758,8 @@ int main(int argc, char **argv) {
         .default_tokens = cfg.default_tokens,
         .n_threads = cfg.engine.n_threads,
         .routed_quant_bits = ds4_engine_routed_quant_bits(engine),
-        .mtp_enabled = cfg.engine.mtp_path != NULL,
-        .mtp_draft_tokens = cfg.engine.mtp_draft_tokens,
+        .mtp_enabled = ds4_engine_has_mtp(engine),
+        .mtp_draft_tokens = ds4_engine_mtp_draft_tokens(engine),
         .quality = cfg.engine.quality,
         .warm_weights = cfg.engine.warm_weights,
     };

From 33a474ac249cb6147a7bf6dfff76d79b191983de Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Mon, 11 May 2026 16:04:56 -0400
Subject: [PATCH 3/3] Wrap over-long /props lockless-read comment

The previous polish commit landed a 121-char single-line block comment,
which is the longest single-line block comment in the file and past the
~90-char convention used elsewhere in ds4_server.c. Wrap it with the
leading-asterisk style used by other multi-line block comments in this
file.
---
 ds4_server.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ds4_server.c b/ds4_server.c
index b8fac45..5c31b0f 100644
--- a/ds4_server.c
+++ b/ds4_server.c
@@ -11215,7 +11215,8 @@ static void append_props_json(buf *b, server *s) {
         mem.raw_cap,
         mem.comp_cap);
 
-    /* kv.len is only mutated on the worker thread; lockless read here matches the rest of kv.* in this function. */
+    /* kv.len is only mutated on the worker thread; lockless read here
+     * matches the rest of kv.* in this function. */
     buf_puts(b, ",\"kv_disk_cache\":{");
     buf_printf(b, "\"enabled\":%s,\"dir\":", s->kv.enabled ? "true" : "false");
     append_nullable_string_json(b, s->kv.enabled ? s->kv.dir : NULL);