38 commits
28e0149
feat(routing): add billing recorder and stats backend foundation
richiejp May 5, 2026
f19adfe
feat(routing): expose usage stats in REST, UI, and MCP
richiejp May 5, 2026
53ec0bd
feat(routing): add regex PII filter with REST and MCP surfaces
richiejp May 5, 2026
f4b1e24
feat(routing): record usage end-to-end in no-auth mode
richiejp May 6, 2026
1329da3
feat(routing): per-model PII gating + middleware admin page
richiejp May 6, 2026
5dc6963
feat(routing): rule-based intelligent router (subsystem 2 MVP)
richiejp May 6, 2026
e3843de
feat(routing): streaming PII filter with buffered-emit invariant
richiejp May 6, 2026
fda4c1c
feat(routing): PII pattern editor in model config UI
richiejp May 6, 2026
b53b6b0
feat(routing): streaming PII filter on Anthropic /v1/messages and /v1…
richiejp May 6, 2026
4eb40f1
feat(routing): cloud passthrough proxy (subsystem 4 MVP)
richiejp May 7, 2026
af75e2b
docs(routing): cloud passthrough proxy feature page
richiejp May 7, 2026
f337780
feat(routing): MITM proxy for subscription-auth Claude Code / Codex
richiejp May 7, 2026
c6df98c
feat(mitm): negotiate HTTP/2 with h1.1 fallback
richiejp May 7, 2026
0b5382d
refactor(cloudproxy): extract shared SSE wire helpers, trim dead stat…
richiejp May 7, 2026
b874aaf
feat(import-model): add cloud-proxy templates to YAML editor
richiejp May 7, 2026
1455e99
Revert "feat(import-model): add cloud-proxy templates to YAML editor"
richiejp May 7, 2026
69e79b5
feat(model-editor): add cloud-proxy templates to Add Model picker
richiejp May 7, 2026
4ade7be
feat(mitm): runtime control of listener and intercept allowlist
richiejp May 7, 2026
c8a4fa7
feat(middleware-ui): MITM proxy admin tab
richiejp May 7, 2026
50ff664
refactor(mitm): simplify-pass cleanup
richiejp May 7, 2026
d6902b4
feat(mitm): emit proxy_connect + proxy_traffic audit events
richiejp May 7, 2026
e5ccb8a
test(mitm): cover tunneled-host event + Events tab kind filter
richiejp May 7, 2026
d5098e1
fix(mitm): restore listener from runtime_settings.json on restart
richiejp May 7, 2026
519ce28
fix(routing): address code-review findings across pii/mitm/router
richiejp May 8, 2026
866c3fb
feat(middleware): per-pattern PII toggle, model-config-owned MITM hosts
richiejp May 8, 2026
cbb721f
refactor(store/local): extract in-process vector store library
richiejp May 11, 2026
6b6d852
feat(routing): KNN + LLM classifiers and per-model admission control
richiejp May 11, 2026
8a412dc
refactor(store): keep the vector store out of the main process
richiejp May 11, 2026
d673473
feat(backend): TokenClassify RPC + transformers NER pipeline
richiejp May 11, 2026
5ffd986
fix(openai): add missing auth import to chat.go
richiejp May 11, 2026
449e63a
feat(pii): NER tier in the redactor
richiejp May 11, 2026
980817a
feat(middleware-ui): router template + Create routing model link
richiejp May 11, 2026
a4ed57d
fix(model-editor): code-editor crash on structured template values
richiejp May 11, 2026
cd51c34
feat(model-editor): structured router-candidates editor + proxy chat …
richiejp May 11, 2026
2c086c9
fix(router-candidates): one textarea per exemplar, multi-line-safe
richiejp May 11, 2026
650a136
feat(router): KNN consumes a benchmarker-produced routing dataset
richiejp May 12, 2026
6b4b938
docs(router): recommend nomic-embed-text-v1.5 over Longformer
richiejp May 12, 2026
99f79f4
feat(routing): Score gRPC primitive, score classifier, L2 embedding c…
richiejp May 13, 2026
1 change: 1 addition & 0 deletions .dockerignore
@@ -4,6 +4,7 @@
.devcontainer
models
backends
volumes
examples/chatbot-ui/models
backend/go/image/stablediffusion-ggml/build/
backend/go/*/build
86 changes: 86 additions & 0 deletions backend/backend.proto
@@ -37,6 +37,22 @@ service Backend {

rpc Rerank(RerankRequest) returns (RerankResult) {}

// TokenClassify runs a token-classification (NER) model on the
// supplied text and returns each detected entity span. Used by the
// PII redactor's optional NER tier — the regex tier still handles
// formatted hits cheaply, while this catches names, locations, and
// other unformatted PII that regex misses.
rpc TokenClassify(TokenClassifyRequest) returns (TokenClassifyResponse) {}

// Score evaluates the model's joint log-probability of each
// supplied candidate continuation given a shared prompt. The
// prompt's KV cache is computed once and reused across candidates.
// Used for routing-policy multi-label classification, reranking,
// calibrated confidence, and reward-model scoring — any task where
// the consumer wants the model's confidence in a pre-specified
// continuation rather than a generated one.
rpc Score(ScoreRequest) returns (ScoreResponse) {}

rpc GetMetrics(MetricsRequest) returns (MetricsResponse);

rpc VAD(VADRequest) returns (VADResponse) {}
@@ -81,6 +97,76 @@ message MetricsResponse {
int32 prompt_tokens_processed = 5;
}

// TokenClassifyRequest carries the text to classify plus an optional
// score threshold. The transformers backend interprets threshold as
// the minimum confidence to include in the response; 0 = include all.
message TokenClassifyRequest {
string text = 1;
float threshold = 2;
}

// TokenClassifyEntity is one detected entity span. Byte offsets are
// into the original UTF-8 text — start..end is a half-open range that
// addresses the substring corresponding to entity_group.
//
// entity_group follows HuggingFace's aggregated-tag convention (e.g.
// "PER", "LOC", "ORG", or a PII-specific label like "EMAIL" /
// "SSN" depending on the model). The redactor's per-pattern action
// map keys off this string.
message TokenClassifyEntity {
string entity_group = 1;
int32 start = 2;
int32 end = 3;
float score = 4;
string text = 5;
}

message TokenClassifyResponse {
repeated TokenClassifyEntity entities = 1;
}
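
For orientation, here is a minimal consumer-side sketch of applying these spans. The Entity struct and redact helper below are hypothetical stand-ins, not the redactor from this PR; the point is the half-open [start, end) byte convention and replacing spans right-to-left so earlier offsets stay valid.

```cpp
// Hypothetical illustration of consuming TokenClassifyEntity spans.
// The Entity struct mirrors the proto message; the real redactor keys
// its per-pattern action map off entity_group.
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct Entity {
    std::string entity_group; // e.g. "PER", "EMAIL"
    int32_t start;            // byte offset into the UTF-8 text, inclusive
    int32_t end;              // byte offset, exclusive
};

std::string redact(std::string text, std::vector<Entity> entities) {
    // Replace right-to-left so earlier byte offsets are not shifted
    // by substitutions made further along the string.
    std::sort(entities.begin(), entities.end(),
              [](const Entity &a, const Entity &b) { return a.start > b.start; });
    for (const auto &e : entities) {
        text.replace(e.start, e.end - e.start, "[" + e.entity_group + "]");
    }
    return text;
}
```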

// ScoreRequest carries one shared prompt and one or more continuations
// to score against it. The backend tokenises the prompt once and reuses
// the resulting KV cache across all candidates in this request.
message ScoreRequest {
string prompt = 1;
repeated string candidates = 2;
// Return per-token logprobs for each candidate when true. Default
// false to keep the wire response small; the joint log_prob field
// covers the common ranking case.
bool include_token_logprobs = 3;
// When true, the response also populates length_normalized_log_prob
// (joint log-prob divided by candidate token count). Useful when
// candidates differ in length and the consumer wants a per-token
// measure comparable across them (PMI-style scoring).
bool length_normalize = 4;
}

// CandidateScore is one row in the ScoreResponse, matching by index
// the candidate in ScoreRequest.candidates.
message CandidateScore {
// Sum of log P(token_i | prompt, candidate_token_<i) across the
// candidate's tokens. The primary ranking signal.
double log_prob = 1;
// log_prob / num_tokens — populated when length_normalize=true on
// the request.
double length_normalized_log_prob = 2;
// Per-token detail — populated when include_token_logprobs=true.
repeated TokenLogProb tokens = 3;
// Number of tokens the backend tokenised this candidate into, after
// any backend-specific normalisation (e.g. leading-space handling).
int32 num_tokens = 4;
}

message TokenLogProb {
string token = 1;
double log_prob = 2;
}

message ScoreResponse {
repeated CandidateScore candidates = 1;
}
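
As a rough usage sketch (not code from this PR), a routing classifier could call Score through the generated C++ stubs as below. The address, prompt, and labels are placeholders, and the headers are assumed to follow protoc's default backend.pb.h / backend.grpc.pb.h naming. The intended pattern: send one prompt with every label as a candidate, ask for length-normalised scores because labels tokenise to different lengths, and take the argmax.

```cpp
// Hypothetical Score client: rank routing labels for one prompt.
// Assumes standard protoc/grpc++ codegen for backend.proto.
#include <memory>
#include <grpcpp/grpcpp.h>
#include "backend.grpc.pb.h"

int main() {
    auto channel = grpc::CreateChannel("localhost:50051", // placeholder address
                                       grpc::InsecureChannelCredentials());
    std::unique_ptr<backend::Backend::Stub> stub = backend::Backend::NewStub(channel);

    backend::ScoreRequest req;
    req.set_prompt("Classify the request: \"refactor my dockerfile\"\nLabel: ");
    req.add_candidates("code");     // placeholder routing labels
    req.add_candidates("chat");
    req.add_candidates("math");
    req.set_length_normalize(true); // labels differ in token length

    backend::ScoreResponse resp;
    grpc::ClientContext ctx;
    grpc::Status status = stub->Score(&ctx, req, &resp);
    if (!status.ok()) return 1;

    int best = 0;
    for (int i = 1; i < resp.candidates_size(); i++) {
        if (resp.candidates(i).length_normalized_log_prob() >
            resp.candidates(best).length_normalized_log_prob()) {
            best = i;
        }
    }
    // req.candidates(best) is the label the model is most confident in.
    return 0;
}
```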

message RerankRequest {
string query = 1;
repeated string documents = 2;
202 changes: 202 additions & 0 deletions backend/cpp/llama-cpp/grpc-server.cpp
@@ -34,6 +34,7 @@
#include <regex>
#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iterator>
@@ -3095,6 +3096,207 @@ class BackendServiceImpl final : public backend::Backend::Service {
return grpc::Status::OK;
}

// Score returns the model's joint log-probability of each candidate
// continuation given a shared prompt.
//
// WHY bypass the slot/task queue: upstream server_context exposes
// get_llama_context as "main thread only" and the slot loop's
// update_slots() owns the context whenever a task is in flight.
// No public synchronization primitive is available — so Score is
// unsafe to call concurrently with active generation through this
// backend. In practice routing-classifier calls happen before the
// request is routed to a generation backend, so the model used
// for Score is typically idle. Concurrent Score calls are
// serialised by a local mutex; KV-cache state on the shared
// sequence ID is cleared before and after each candidate.
//
// A patch to server-context.cpp that adds SERVER_TASK_TYPE_SCORE
// and routes scoring through the slot loop would be the correct
// long-term fix; tracked as a follow-up.
//
// Perf TODO (measured: ~450 ms warm for 3 candidates on Arch-
// Router-1.5B Q4_K_M + Intel SYCL): the current loop re-decodes
// `prompt + candidate` from scratch for every candidate, throwing
// away the prompt's KV cache between iterations. A smarter
// version would:
// 1. Decode just the prompt once into score_seq_id.
// 2. Snapshot/cp that sequence (llama_memory_seq_cp) into a
// per-candidate sequence id.
// 3. For each candidate, decode only its tokens onto the copy
// (continuing from the saved prompt state), read logits.
// 4. llama_memory_seq_rm the copy.
// Estimated speedup: 3-candidate calls 450 ms -> ~150-200 ms,
// 6-candidate calls 630 ms -> ~220 ms. Single source-file change,
// no proto / Go-side changes needed. Worth doing once routing is
// wired into the middleware and Score is on the hot path of every
// chat request.
grpc::Status Score(ServerContext* context, const backend::ScoreRequest* request, backend::ScoreResponse* response) override {
auto auth = checkAuth(context);
if (!auth.ok()) return auth;
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
if (request->candidates_size() == 0) {
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "candidates must be non-empty");
}

// Serialise concurrent Score calls. The slot loop is still
// free to race with us — see the class comment above.
static std::mutex score_mutex;
std::lock_guard<std::mutex> score_lock(score_mutex);

llama_context * lctx = ctx_server.get_llama_context();
if (lctx == nullptr) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "llama context unavailable (sleeping?)");
}
const llama_vocab * vocab = ctx_server.impl->vocab;
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
const int32_t n_ctx = llama_n_ctx(lctx);
llama_memory_t mem = llama_get_memory(lctx);

// The KV-cache is sized to seq_to_stream.size() at load
// (typically equal to n_slots, often 1). Sequence IDs must
// be in [0, n_seq_max), so we can't pick a high-value
// "private" ID — we have to share with the slot. We clear
// the cache before AND after each candidate to keep
// scoring isolated from whatever state the slot held, and
// the static mutex above guarantees no other Score call is
// racing in the meantime. The slot loop is still free to
// race (see comment on this method) — Score must not run
// concurrently with generation through this backend.
const llama_seq_id score_seq_id = 0;
llama_memory_seq_rm(mem, score_seq_id, -1, -1);

// Tokenize the shared prompt once with add_special=true so
// BOS is prepended when the model requires it. parse_special
// keeps chat-template markers in the prompt intact.
const std::string prompt = request->prompt();
std::vector<llama_token> prompt_tokens = common_tokenize(vocab, prompt, /*add_special=*/true, /*parse_special=*/true);
const int32_t prompt_len = (int32_t) prompt_tokens.size();

for (int ci = 0; ci < request->candidates_size(); ci++) {
const std::string & candidate_text = request->candidates(ci);

// Re-tokenize prompt + candidate as a single string. BPE
// merges across the boundary can shift the tokenization
// versus tokenize(prompt) ++ tokenize(candidate), so we
// find the divergence point against prompt_tokens.
std::vector<llama_token> full_tokens = common_tokenize(vocab, prompt + candidate_text, /*add_special=*/true, /*parse_special=*/true);
int32_t divergence = prompt_len;
const int32_t min_len = std::min<int32_t>(prompt_len, (int32_t) full_tokens.size());
for (int32_t i = 0; i < min_len; i++) {
if (prompt_tokens[i] != full_tokens[i]) {
divergence = i;
break;
}
}
const int32_t cand_len = (int32_t) full_tokens.size() - divergence;
backend::CandidateScore * cs = response->add_candidates();
cs->set_num_tokens(cand_len);
if (cand_len <= 0) {
cs->set_log_prob(0.0);
if (request->length_normalize()) {
cs->set_length_normalized_log_prob(0.0);
}
continue;
}
if (divergence < 1) {
// Need at least one prior token (typically BOS) to
// predict the first candidate token's logit. Tokeniser
// models without BOS + an empty prompt fall in here.
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
"Score: prompt produced no leading tokens; need at least one (e.g. BOS) to predict candidate");
}
if ((int32_t) full_tokens.size() > n_ctx) {
return grpc::Status(grpc::StatusCode::OUT_OF_RANGE,
"Score: prompt+candidate exceeds context size (got " +
std::to_string(full_tokens.size()) + ", n_ctx=" + std::to_string(n_ctx) + ")");
}

// Build a batch covering the entire prompt+candidate. We
// need logits at (divergence-1) onward — those are the
// predictions for each candidate token.
llama_batch batch = llama_batch_init((int32_t) full_tokens.size(), 0, 1);
for (int32_t i = 0; i < (int32_t) full_tokens.size(); i++) {
batch.token[i] = full_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = score_seq_id;
// logits[i] is "do we want the prediction *for the
// next token*, computed from this position?"
// We want predictions for candidate tokens at
// positions divergence .. full_tokens.size()-1, which
// come from logits at positions (divergence-1) ..
// (full_tokens.size()-2).
bool need_logit = (i >= divergence - 1) && (i < (int32_t) full_tokens.size() - 1);
batch.logits[i] = need_logit ? 1 : 0;
}
batch.n_tokens = (int32_t) full_tokens.size();

// Decode the batch. If decode fails (e.g. KV slot
// exhaustion), surface as INTERNAL — the caller will
// typically fall back to a sampling-based classifier.
int decode_err = llama_decode(lctx, batch);
if (decode_err != 0) {
llama_batch_free(batch);
llama_memory_seq_rm(mem, score_seq_id, -1, -1);
return grpc::Status(grpc::StatusCode::INTERNAL,
"llama_decode failed during Score: " + std::to_string(decode_err));
}

// Sum log-probabilities of the actual candidate tokens.
double total_log_prob = 0.0;
for (int32_t k = 0; k < cand_len; k++) {
// The k-th candidate token sits at full_tokens index
// (divergence + k). Its predicting logit is at batch
// position (divergence + k - 1).
int32_t logit_pos = divergence + k - 1;
const float * logits = llama_get_logits_ith(lctx, logit_pos);
if (logits == nullptr) {
llama_batch_free(batch);
llama_memory_seq_rm(mem, score_seq_id, -1, -1);
return grpc::Status(grpc::StatusCode::INTERNAL,
"llama_get_logits_ith returned null at position " + std::to_string(logit_pos));
}
llama_token target_token = full_tokens[divergence + k];

// Compute log_softmax(logits)[target_token] with the
// max-subtraction stability trick.
float max_logit = logits[0];
for (int32_t v = 1; v < n_vocab; v++) {
if (logits[v] > max_logit) max_logit = logits[v];
}
double sum_exp = 0.0;
for (int32_t v = 0; v < n_vocab; v++) {
sum_exp += std::exp((double)(logits[v] - max_logit));
}
double token_log_prob = (double)(logits[target_token] - max_logit) - std::log(sum_exp);
total_log_prob += token_log_prob;

if (request->include_token_logprobs()) {
backend::TokenLogProb * tlp = cs->add_tokens();
std::string piece = common_token_to_piece(lctx, target_token);
tlp->set_token(piece);
tlp->set_log_prob(token_log_prob);
}
}

cs->set_log_prob(total_log_prob);
if (request->length_normalize() && cand_len > 0) {
cs->set_length_normalized_log_prob(total_log_prob / (double) cand_len);
}

llama_batch_free(batch);
// Drop this candidate's KV-cache contribution so the next
// candidate starts from a clean state. Without this, the
// next decode would conflict at positions 0..N-1 for our
// sequence ID.
llama_memory_seq_rm(mem, score_seq_id, -1, -1);
}

return grpc::Status::OK;
}
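
The perf TODO above outlines reusing the prompt's KV cache across candidates instead of re-decoding prompt + candidate each time. A very rough sketch of that inner loop follows; it is not part of this PR, it assumes the cache is created with at least two sequences (n_seq_max >= 2), and decode_tokens is a hypothetical helper wrapping llama_batch_init / llama_decode.

```cpp
// Sketch only: prompt-KV-reuse per the TODO above. Assumes n_seq_max >= 2
// and a hypothetical decode_tokens(ctx, tokens, seq_id, pos0) helper that
// builds a llama_batch, sets positions/seq ids/logit flags, and decodes.
llama_memory_seq_rm(mem, 0, -1, -1);
// Decode the shared prompt once into sequence 0; the last prompt position
// must request logits so the first candidate token can be scored.
decode_tokens(lctx, prompt_tokens, /*seq_id=*/0, /*pos0=*/0);

for (const auto & cand_tokens : candidate_token_lists) {
    // Copy the prompt's cached state into a scratch sequence.
    llama_memory_seq_cp(mem, /*src=*/0, /*dst=*/1, -1, -1);
    // Decode only the candidate tokens, continuing from the prompt state.
    decode_tokens(lctx, cand_tokens, /*seq_id=*/1, /*pos0=*/prompt_len);
    // ... read logits at the candidate positions and sum log-probs ...
    // Drop the scratch copy before the next candidate.
    llama_memory_seq_rm(mem, 1, -1, -1);
}
```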

grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
auto auth = checkAuth(context);
if (!auth.ok()) return auth;