38 commits
28e0149
feat(routing): add billing recorder and stats backend foundation
richiejp May 5, 2026
f19adfe
feat(routing): expose usage stats in REST, UI, and MCP
richiejp May 5, 2026
53ec0bd
feat(routing): add regex PII filter with REST and MCP surfaces
richiejp May 5, 2026
f4b1e24
feat(routing): record usage end-to-end in no-auth mode
richiejp May 6, 2026
1329da3
feat(routing): per-model PII gating + middleware admin page
richiejp May 6, 2026
5dc6963
feat(routing): rule-based intelligent router (subsystem 2 MVP)
richiejp May 6, 2026
e3843de
feat(routing): streaming PII filter with buffered-emit invariant
richiejp May 6, 2026
fda4c1c
feat(routing): PII pattern editor in model config UI
richiejp May 6, 2026
b53b6b0
feat(routing): streaming PII filter on Anthropic /v1/messages and /v1…
richiejp May 6, 2026
4eb40f1
feat(routing): cloud passthrough proxy (subsystem 4 MVP)
richiejp May 7, 2026
af75e2b
docs(routing): cloud passthrough proxy feature page
richiejp May 7, 2026
f337780
feat(routing): MITM proxy for subscription-auth Claude Code / Codex
richiejp May 7, 2026
c6df98c
feat(mitm): negotiate HTTP/2 with h1.1 fallback
richiejp May 7, 2026
0b5382d
refactor(cloudproxy): extract shared SSE wire helpers, trim dead stat…
richiejp May 7, 2026
b874aaf
feat(import-model): add cloud-proxy templates to YAML editor
richiejp May 7, 2026
1455e99
Revert "feat(import-model): add cloud-proxy templates to YAML editor"
richiejp May 7, 2026
69e79b5
feat(model-editor): add cloud-proxy templates to Add Model picker
richiejp May 7, 2026
4ade7be
feat(mitm): runtime control of listener and intercept allowlist
richiejp May 7, 2026
c8a4fa7
feat(middleware-ui): MITM proxy admin tab
richiejp May 7, 2026
50ff664
refactor(mitm): simplify-pass cleanup
richiejp May 7, 2026
d6902b4
feat(mitm): emit proxy_connect + proxy_traffic audit events
richiejp May 7, 2026
e5ccb8a
test(mitm): cover tunneled-host event + Events tab kind filter
richiejp May 7, 2026
d5098e1
fix(mitm): restore listener from runtime_settings.json on restart
richiejp May 7, 2026
519ce28
fix(routing): address code-review findings across pii/mitm/router
richiejp May 8, 2026
866c3fb
feat(middleware): per-pattern PII toggle, model-config-owned MITM hosts
richiejp May 8, 2026
cbb721f
refactor(store/local): extract in-process vector store library
richiejp May 11, 2026
6b6d852
feat(routing): KNN + LLM classifiers and per-model admission control
richiejp May 11, 2026
8a412dc
refactor(store): keep the vector store out of the main process
richiejp May 11, 2026
d673473
feat(backend): TokenClassify RPC + transformers NER pipeline
richiejp May 11, 2026
5ffd986
fix(openai): add missing auth import to chat.go
richiejp May 11, 2026
449e63a
feat(pii): NER tier in the redactor
richiejp May 11, 2026
980817a
feat(middleware-ui): router template + Create routing model link
richiejp May 11, 2026
a4ed57d
fix(model-editor): code-editor crash on structured template values
richiejp May 11, 2026
cd51c34
feat(model-editor): structured router-candidates editor + proxy chat …
richiejp May 11, 2026
2c086c9
fix(router-candidates): one textarea per exemplar, multi-line-safe
richiejp May 11, 2026
650a136
feat(router): KNN consumes a benchmarker-produced routing dataset
richiejp May 12, 2026
6b4b938
docs(router): recommend nomic-embed-text-v1.5 over Longformer
richiejp May 12, 2026
99f79f4
feat(routing): Score gRPC primitive, score classifier, L2 embedding c…
richiejp May 13, 2026
1 change: 1 addition & 0 deletions .dockerignore
@@ -4,6 +4,7 @@
.devcontainer
models
backends
volumes
examples/chatbot-ui/models
backend/go/image/stablediffusion-ggml/build/
backend/go/*/build
86 changes: 86 additions & 0 deletions backend/backend.proto
@@ -37,6 +37,22 @@ service Backend {

rpc Rerank(RerankRequest) returns (RerankResult) {}

// TokenClassify runs a token-classification (NER) model on the
// supplied text and returns each detected entity span. Used by the
// PII redactor's optional NER tier — the regex tier still handles
// formatted hits cheaply, while this catches names, locations, and
// other unformatted PII that regex misses.
rpc TokenClassify(TokenClassifyRequest) returns (TokenClassifyResponse) {}

// Score evaluates the model's joint log-probability of each
// supplied candidate continuation given a shared prompt. The
// prompt's KV cache is computed once and reused across candidates.
// Used for routing-policy multi-label classification, reranking,
// calibrated confidence, and reward-model scoring — any task where
// the consumer wants the model's confidence in a pre-specified
// continuation rather than a generated one.
rpc Score(ScoreRequest) returns (ScoreResponse) {}

rpc GetMetrics(MetricsRequest) returns (MetricsResponse);

rpc VAD(VADRequest) returns (VADResponse) {}
@@ -81,6 +97,76 @@ message MetricsResponse {
int32 prompt_tokens_processed = 5;
}

// TokenClassifyRequest carries the text to classify plus an optional
// score threshold. The transformers backend interprets threshold as
// the minimum confidence to include in the response; 0 = include all.
message TokenClassifyRequest {
string text = 1;
float threshold = 2;
}

// TokenClassifyEntity is one detected entity span. Byte offsets are
// into the original UTF-8 text — start..end is a half-open range that
// addresses the substring corresponding to entity_group.
//
// entity_group follows HuggingFace's aggregated-tag convention (e.g.
// "PER", "LOC", "ORG", or a PII-specific label like "EMAIL" /
// "SSN" depending on the model). The redactor's per-pattern action
// map keys off this string.
message TokenClassifyEntity {
string entity_group = 1;
int32 start = 2;
int32 end = 3;
float score = 4;
string text = 5;
}

message TokenClassifyResponse {
repeated TokenClassifyEntity entities = 1;
}
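
For orientation, here is a minimal consumer-side sketch of applying these spans. The Entity struct and redact helper below are hypothetical stand-ins, not the redactor from this PR; the point is the half-open [start, end) byte convention and replacing spans right-to-left so earlier offsets stay valid.

```cpp
// Hypothetical illustration of consuming TokenClassifyEntity spans.
// The Entity struct mirrors the proto message; the real redactor keys
// its per-pattern action map off entity_group.
#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct Entity {
    std::string entity_group; // e.g. "PER", "EMAIL"
    int32_t start;            // byte offset into the UTF-8 text, inclusive
    int32_t end;              // byte offset, exclusive
};

std::string redact(std::string text, std::vector<Entity> entities) {
    // Replace right-to-left so earlier byte offsets are not shifted
    // by substitutions made further along the string.
    std::sort(entities.begin(), entities.end(),
              [](const Entity &a, const Entity &b) { return a.start > b.start; });
    for (const auto &e : entities) {
        text.replace(e.start, e.end - e.start, "[" + e.entity_group + "]");
    }
    return text;
}
```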

// ScoreRequest carries one shared prompt and one or more continuations
// to score against it. The backend tokenises the prompt once and reuses
// the resulting KV cache across all candidates in this request.
message ScoreRequest {
string prompt = 1;
repeated string candidates = 2;
// Return per-token logprobs for each candidate when true. Default
// false to keep the wire response small; the joint log_prob field
// covers the common ranking case.
bool include_token_logprobs = 3;
// When true, the response also populates length_normalized_log_prob
// (joint log-prob divided by candidate token count). Useful when
// candidates differ in length and the consumer wants a per-token
// measure comparable across them (PMI-style scoring).
bool length_normalize = 4;
}

// CandidateScore is one row in the ScoreResponse, matching by index
// the candidate in ScoreRequest.candidates.
message CandidateScore {
// Sum of log P(token_i | prompt, candidate_token_<i) across the
// candidate's tokens. The primary ranking signal.
double log_prob = 1;
// log_prob / num_tokens — populated when length_normalize=true on
// the request.
double length_normalized_log_prob = 2;
// Per-token detail — populated when include_token_logprobs=true.
repeated TokenLogProb tokens = 3;
// Number of tokens the backend tokenised this candidate into, after
// any backend-specific normalisation (e.g. leading-space handling).
int32 num_tokens = 4;
}

message TokenLogProb {
string token = 1;
double log_prob = 2;
}

message ScoreResponse {
repeated CandidateScore candidates = 1;
}
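
As a rough usage sketch (not code from this PR), a routing classifier could call Score through the generated C++ stubs as below. The address, prompt, and labels are placeholders, and the headers are assumed to follow protoc's default backend.pb.h / backend.grpc.pb.h naming. The intended pattern: send one prompt with every label as a candidate, ask for length-normalised scores because labels tokenise to different lengths, and take the argmax.

```cpp
// Hypothetical Score client: rank routing labels for one prompt.
// Assumes standard protoc/grpc++ codegen for backend.proto.
#include <memory>
#include <grpcpp/grpcpp.h>
#include "backend.grpc.pb.h"

int main() {
    auto channel = grpc::CreateChannel("localhost:50051", // placeholder address
                                       grpc::InsecureChannelCredentials());
    std::unique_ptr<backend::Backend::Stub> stub = backend::Backend::NewStub(channel);

    backend::ScoreRequest req;
    req.set_prompt("Classify the request: \"refactor my dockerfile\"\nLabel: ");
    req.add_candidates("code");     // placeholder routing labels
    req.add_candidates("chat");
    req.add_candidates("math");
    req.set_length_normalize(true); // labels differ in token length

    backend::ScoreResponse resp;
    grpc::ClientContext ctx;
    grpc::Status status = stub->Score(&ctx, req, &resp);
    if (!status.ok()) return 1;

    int best = 0;
    for (int i = 1; i < resp.candidates_size(); i++) {
        if (resp.candidates(i).length_normalized_log_prob() >
            resp.candidates(best).length_normalized_log_prob()) {
            best = i;
        }
    }
    // req.candidates(best) is the label the model is most confident in.
    return 0;
}
```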

message RerankRequest {
string query = 1;
repeated string documents = 2;
202 changes: 202 additions & 0 deletions backend/cpp/llama-cpp/grpc-server.cpp
@@ -34,6 +34,7 @@
#include <regex>
#include <algorithm>
#include <atomic>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iterator>
@@ -3095,6 +3096,207 @@ class BackendServiceImpl final : public backend::Backend::Service {
return grpc::Status::OK;
}

// Score returns the model's joint log-probability of each candidate
// continuation given a shared prompt.
//
// WHY bypass the slot/task queue: upstream server_context exposes
// get_llama_context as "main thread only" and the slot loop's
// update_slots() owns the context whenever a task is in flight.
// No public synchronization primitive is available — so Score is
// unsafe to call concurrently with active generation through this
// backend. In practice routing-classifier calls happen before the
// request is routed to a generation backend, so the model used
// for Score is typically idle. Concurrent Score calls are
// serialised by a local mutex; KV-cache state on the shared
// sequence ID is cleared before and after each candidate.
//
// A patch to server-context.cpp that adds SERVER_TASK_TYPE_SCORE
// and routes scoring through the slot loop would be the correct
// long-term fix; tracked as a follow-up.
//
// Perf TODO (measured: ~450 ms warm for 3 candidates on Arch-
// Router-1.5B Q4_K_M + Intel SYCL): the current loop re-decodes
// `prompt + candidate` from scratch for every candidate, throwing
// away the prompt's KV cache between iterations. A smarter
// version would:
// 1. Decode just the prompt once into score_seq_id.
// 2. Snapshot/cp that sequence (llama_memory_seq_cp) into a
// per-candidate sequence id.
// 3. For each candidate, decode only its tokens onto the copy
// (continuing from the saved prompt state), read logits.
// 4. llama_memory_seq_rm the copy.
// Estimated speedup: 3-candidate calls 450 ms -> ~150-200 ms,
// 6-candidate calls 630 ms -> ~220 ms. Single source-file change,
// no proto / Go-side changes needed. Worth doing once routing is
// wired into the middleware and Score is on the hot path of every
// chat request.
grpc::Status Score(ServerContext* context, const backend::ScoreRequest* request, backend::ScoreResponse* response) override {
auto auth = checkAuth(context);
if (!auth.ok()) return auth;
if (params_base.model.path.empty()) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "Model not loaded");
}
if (request->candidates_size() == 0) {
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "candidates must be non-empty");
}

// Serialise concurrent Score calls. The slot loop is still
// free to race with us — see the class comment above.
static std::mutex score_mutex;
std::lock_guard<std::mutex> score_lock(score_mutex);

llama_context * lctx = ctx_server.get_llama_context();
if (lctx == nullptr) {
return grpc::Status(grpc::StatusCode::FAILED_PRECONDITION, "llama context unavailable (sleeping?)");
}
const llama_vocab * vocab = ctx_server.impl->vocab;
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
const int32_t n_ctx = llama_n_ctx(lctx);
llama_memory_t mem = llama_get_memory(lctx);

// The KV-cache is sized to seq_to_stream.size() at load
// (typically equal to n_slots, often 1). Sequence IDs must
// be in [0, n_seq_max), so we can't pick a high-value
// "private" ID — we have to share with the slot. We clear
// the cache before AND after each candidate to keep
// scoring isolated from whatever state the slot held, and
// the static mutex above guarantees no other Score call is
// racing in the meantime. The slot loop is still free to
// race (see comment on this method) — Score must not run
// concurrently with generation through this backend.
const llama_seq_id score_seq_id = 0;
llama_memory_seq_rm(mem, score_seq_id, -1, -1);

// Tokenize the shared prompt once with add_special=true so
// BOS is prepended when the model requires it. parse_special
// keeps chat-template markers in the prompt intact.
const std::string prompt = request->prompt();
std::vector<llama_token> prompt_tokens = common_tokenize(vocab, prompt, /*add_special=*/true, /*parse_special=*/true);
const int32_t prompt_len = (int32_t) prompt_tokens.size();

for (int ci = 0; ci < request->candidates_size(); ci++) {
const std::string & candidate_text = request->candidates(ci);

// Re-tokenize prompt + candidate as a single string. BPE
// merges across the boundary can shift the tokenization
// versus tokenize(prompt) ++ tokenize(candidate), so we
// find the divergence point against prompt_tokens.
std::vector<llama_token> full_tokens = common_tokenize(vocab, prompt + candidate_text, /*add_special=*/true, /*parse_special=*/true);
int32_t divergence = prompt_len;
const int32_t min_len = std::min<int32_t>(prompt_len, (int32_t) full_tokens.size());
for (int32_t i = 0; i < min_len; i++) {
if (prompt_tokens[i] != full_tokens[i]) {
divergence = i;
break;
}
}
const int32_t cand_len = (int32_t) full_tokens.size() - divergence;
backend::CandidateScore * cs = response->add_candidates();
cs->set_num_tokens(cand_len);
if (cand_len <= 0) {
cs->set_log_prob(0.0);
if (request->length_normalize()) {
cs->set_length_normalized_log_prob(0.0);
}
continue;
}
if (divergence < 1) {
// Need at least one prior token (typically BOS) to
// predict the first candidate token's logit. Tokeniser
// models without BOS + an empty prompt fall in here.
return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
"Score: prompt produced no leading tokens; need at least one (e.g. BOS) to predict candidate");
}
if ((int32_t) full_tokens.size() > n_ctx) {
return grpc::Status(grpc::StatusCode::OUT_OF_RANGE,
"Score: prompt+candidate exceeds context size (got " +
std::to_string(full_tokens.size()) + ", n_ctx=" + std::to_string(n_ctx) + ")");
}

// Build a batch covering the entire prompt+candidate. We
// need logits at (divergence-1) onward — those are the
// predictions for each candidate token.
llama_batch batch = llama_batch_init((int32_t) full_tokens.size(), 0, 1);
for (int32_t i = 0; i < (int32_t) full_tokens.size(); i++) {
batch.token[i] = full_tokens[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = score_seq_id;
// logits[i] is "do we want the prediction *for the
// next token*, computed from this position?"
// We want predictions for candidate tokens at
// positions divergence .. full_tokens.size()-1, which
// come from logits at positions (divergence-1) ..
// (full_tokens.size()-2).
bool need_logit = (i >= divergence - 1) && (i < (int32_t) full_tokens.size() - 1);
batch.logits[i] = need_logit ? 1 : 0;
}
batch.n_tokens = (int32_t) full_tokens.size();

// Decode the batch. If decode fails (e.g. KV slot
// exhaustion), surface as INTERNAL — the caller will
// typically fall back to a sampling-based classifier.
int decode_err = llama_decode(lctx, batch);
if (decode_err != 0) {
llama_batch_free(batch);
llama_memory_seq_rm(mem, score_seq_id, -1, -1);
return grpc::Status(grpc::StatusCode::INTERNAL,
"llama_decode failed during Score: " + std::to_string(decode_err));
}

// Sum log-probabilities of the actual candidate tokens.
double total_log_prob = 0.0;
for (int32_t k = 0; k < cand_len; k++) {
// The k-th candidate token sits at full_tokens index
// (divergence + k). Its predicting logit is at batch
// position (divergence + k - 1).
int32_t logit_pos = divergence + k - 1;
const float * logits = llama_get_logits_ith(lctx, logit_pos);
if (logits == nullptr) {
llama_batch_free(batch);
llama_memory_seq_rm(mem, score_seq_id, -1, -1);
return grpc::Status(grpc::StatusCode::INTERNAL,
"llama_get_logits_ith returned null at position " + std::to_string(logit_pos));
}
llama_token target_token = full_tokens[divergence + k];

// Compute log_softmax(logits)[target_token] with the
// max-subtraction stability trick.
float max_logit = logits[0];
for (int32_t v = 1; v < n_vocab; v++) {
if (logits[v] > max_logit) max_logit = logits[v];
}
double sum_exp = 0.0;
for (int32_t v = 0; v < n_vocab; v++) {
sum_exp += std::exp((double)(logits[v] - max_logit));
}
double token_log_prob = (double)(logits[target_token] - max_logit) - std::log(sum_exp);
total_log_prob += token_log_prob;

if (request->include_token_logprobs()) {
backend::TokenLogProb * tlp = cs->add_tokens();
std::string piece = common_token_to_piece(lctx, target_token);
tlp->set_token(piece);
tlp->set_log_prob(token_log_prob);
}
}

cs->set_log_prob(total_log_prob);
if (request->length_normalize() && cand_len > 0) {
cs->set_length_normalized_log_prob(total_log_prob / (double) cand_len);
}

llama_batch_free(batch);
// Drop this candidate's KV-cache contribution so the next
// candidate starts from a clean state. Without this, the
// next decode would conflict at positions 0..N-1 for our
// sequence ID.
llama_memory_seq_rm(mem, score_seq_id, -1, -1);
}

return grpc::Status::OK;
}
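
The perf TODO above outlines reusing the prompt's KV cache across candidates instead of re-decoding prompt + candidate each time. A very rough sketch of that inner loop follows; it is not part of this PR, it assumes the cache is created with at least two sequences (n_seq_max >= 2), and decode_tokens is a hypothetical helper wrapping llama_batch_init / llama_decode.

```cpp
// Sketch only: prompt-KV-reuse per the TODO above. Assumes n_seq_max >= 2
// and a hypothetical decode_tokens(ctx, tokens, seq_id, pos0) helper that
// builds a llama_batch, sets positions/seq ids/logit flags, and decodes.
llama_memory_seq_rm(mem, 0, -1, -1);
// Decode the shared prompt once into sequence 0; the last prompt position
// must request logits so the first candidate token can be scored.
decode_tokens(lctx, prompt_tokens, /*seq_id=*/0, /*pos0=*/0);

for (const auto & cand_tokens : candidate_token_lists) {
    // Copy the prompt's cached state into a scratch sequence.
    llama_memory_seq_cp(mem, /*src=*/0, /*dst=*/1, -1, -1);
    // Decode only the candidate tokens, continuing from the prompt state.
    decode_tokens(lctx, cand_tokens, /*seq_id=*/1, /*pos0=*/prompt_len);
    // ... read logits at the candidate positions and sum log-probs ...
    // Drop the scratch copy before the next candidate.
    llama_memory_seq_rm(mem, 1, -1, -1);
}
```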

grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response) override {
auto auth = checkAuth(context);
if (!auth.ok()) return auth;