From 7c1d873ec8685ba293a8bcfb2d2fb23537142578 Mon Sep 17 00:00:00 2001 From: Audrey Tang Date: Tue, 12 May 2026 06:50:23 -0400 Subject: [PATCH] feat(server): add /v1/messages/count_tokens endpoint Anthropic's count_tokens API takes the same request shape as /v1/messages but only returns the prompt token count without running inference. This short-circuits before enqueueing a job: parse_anthropic_request renders and tokenizes the prompt the same way it would for a real generation, then we serialize {"input_tokens": N} and release the request. Useful for clients that need to plan context budgets before committing to a generation, e.g. the Anthropic SDK token-counting flow. Co-Authored-By: Claude Opus 4.7 (1M context) --- ds4_server.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/ds4_server.c b/ds4_server.c index bc8abbbd..ddcd683f 100644 --- a/ds4_server.c +++ b/ds4_server.c @@ -7701,8 +7701,19 @@ static void *client_main(void *arg) { request req; char err[160]; bool ok = false; + bool count_tokens_only = false; const int ctx_size = ds4_session_ctx(s->session); - if (!strcmp(hr.method, "POST") && !strcmp(hr.path, "/v1/messages")) { + if (!strcmp(hr.method, "POST") && !strcmp(hr.path, "/v1/messages/count_tokens")) { + /* Anthropic's count_tokens endpoint takes the same request shape as + * /v1/messages but only returns the prompt token total — no inference + * runs, so we short-circuit before the worker queue. Pass a NULL + * server so parse_anthropic_request skips the tool-memory and + * KV-cache lookups it would normally do; both helpers no-op cleanly + * on NULL, leaving shared state untouched for a read-only count. */ + ok = parse_anthropic_request(s->engine, NULL, hr.body, s->default_tokens, + ctx_size, &req, err, sizeof(err)); + if (ok) count_tokens_only = true; + } else if (!strcmp(hr.method, "POST") && !strcmp(hr.path, "/v1/messages")) { ok = parse_anthropic_request(s->engine, s, hr.body, s->default_tokens, ctx_size, &req, err, sizeof(err)); } else if (!strcmp(hr.method, "POST") && !strcmp(hr.path, "/v1/chat/completions")) { @@ -7723,6 +7734,14 @@ static void *client_main(void *arg) { goto done; } + if (count_tokens_only) { + char body[64]; + snprintf(body, sizeof(body), "{\"input_tokens\":%d}", req.prompt.len); + http_response(fd, 200, "application/json", body); + request_free(&req); + goto done; + } + set_client_socket_nonblocking(fd); job j; memset(&j, 0, sizeof(j)); @@ -7957,7 +7976,7 @@ static void usage(FILE *fp) { " ./ds4-server --ctx 100000 --kv-disk-dir /tmp/ds4-kv --kv-disk-space-mb 8192\n" "\n" "Notes:\n" - " Use /v1/chat/completions, /v1/completions, or /v1/messages.\n" + " Use /v1/chat/completions, /v1/completions, /v1/messages, or /v1/messages/count_tokens.\n" " Larger --ctx values allocate more KV memory at startup; the startup log prints the estimate.\n" " Disk KV caching is best for agents that resend long prompts with stable prefixes.\n" "\n"