From c846dde08b5d1307d56221b2c212edadeb74937d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 5 Apr 2026 19:16:57 +0000
Subject: [PATCH 1/8] Extract append_task helper to replace 3 near-identical
 task-construction blocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 5–6-line pattern of constructing a server_task (assign id, index,
prompt_tokens, oaicompat, push) appeared in near-identical form in embed,
handleRerank, and handleEmbeddings.  A single append_task() helper in
jni_server_helpers.hpp covers all three; the rerank call site passes the
result of format_rerank() as the prompt_tokens argument.  The old rerank
block never assigned params.oaicompat; through the helper's default
argument it is now set explicitly to OAICOMPAT_TYPE_NONE.  The helper is
unit-testable without JNI.

https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7
---
 src/main/cpp/jllama.cpp             | 28 ++++------------------
 src/main/cpp/jni_server_helpers.hpp | 36 +++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index 04304abd..8f7696e3 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -831,16 +831,7 @@ JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env,
     auto tokens = tokenize_mixed(ctx_server->vocab, prompt, true, true);
     std::vector<server_task> tasks;
 
-    server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING);
-
-    task.id = ctx_server->queue_tasks.get_new_id();
-    task.index = 0;
-    task.prompt_tokens = server_tokens(tokens, false);
-
-    // OAI-compat
-    task.params.oaicompat = OAICOMPAT_TYPE_NONE;
-
-    tasks.push_back(std::move(task));
+    append_task(ctx_server, tasks, SERVER_TASK_TYPE_EMBEDDING, tokens, 0);
 
     const auto task_ids = dispatch_tasks(ctx_server, tasks);
     const auto id_task = *task_ids.begin();
@@ -917,12 +908,8 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e
 
     tasks.reserve(tokenized_docs.size());
     for (size_t i = 0; i < tokenized_docs.size(); i++) {
-        auto task = server_task(SERVER_TASK_TYPE_RERANK);
-        task.id = ctx_server->queue_tasks.get_new_id();
-        task.index = i;
-        auto tokens = format_rerank(ctx_server->vocab, tokenized_query, tokenized_docs[i]);
-        task.prompt_tokens = server_tokens(tokens, false);
-        tasks.push_back(std::move(task));
+        append_task(ctx_server, tasks, SERVER_TASK_TYPE_RERANK,
+                    format_rerank(ctx_server->vocab, tokenized_query, tokenized_docs[i]), i);
     }
     const auto task_ids = dispatch_tasks(ctx_server, tasks);
 
@@ -1259,14 +1246,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn
     tasks.reserve(tokenized_prompts.size());
 
     for (size_t i = 0; i < tokenized_prompts.size(); i++) {
-        server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING);
-
-        task.id = ctx_server->queue_tasks.get_new_id();
-        task.index = i;
-        task.prompt_tokens = server_tokens(tokenized_prompts[i], false);
-        task.params.oaicompat = oaicompat;
-
-        tasks.push_back(std::move(task));
+        append_task(ctx_server, tasks, SERVER_TASK_TYPE_EMBEDDING, tokenized_prompts[i], i, oaicompat);
     }
 
     const auto task_ids = dispatch_tasks(ctx_server, tasks);
diff --git a/src/main/cpp/jni_server_helpers.hpp b/src/main/cpp/jni_server_helpers.hpp
index dd404a55..41878279 100644
--- a/src/main/cpp/jni_server_helpers.hpp
+++ b/src/main/cpp/jni_server_helpers.hpp
@@ -11,6 +11,7 @@
 //   build_completion_tasks_impl  — tokenise and build a server_task vector
 //   collect_task_results_impl    — drain results from the response queue
 //   recv_slot_task_result_impl   — recv + check a single slot-action result
+//   append_task                  — construct and push a single server_task
 //
 // All parameters are explicit (no module-level globals) so each function can
 // be exercised in unit tests using local server objects and a mock JNIEnv.
@@ -31,6 +32,7 @@
 //   6. results_to_json_impl        — no dependencies on helpers above
 //   7. results_to_jstring_impl     — uses 2 + 6
 //   8. check_infill_support_impl   — no dependencies on helpers above
+//   9. append_task                 — no dependencies on helpers above
 
 #include "jni.h"
 
@@ -236,6 +238,40 @@
 // Extracted from the 10-line compatibility block in handleInfill so it can
 // be unit-tested independently of the JNI dispatch layer.
 // ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+// append_task
+//
+// Constructs a server_task of the given type and appends it to `tasks`.
+// Captures the repeated 5–6-line block that appears in embed (single task),
+// handleEmbeddings (loop), and handleRerank (loop):
+//
+//   server_task task(type);
+//   task.id            = ctx_server->queue_tasks.get_new_id();
+//   task.index         = index;
+//   task.prompt_tokens = server_tokens(prompt_tokens, false);
+//   task.params.oaicompat = oaicompat;
+//   tasks.push_back(std::move(task));
+//
+// The caller is responsible for pre-computing `prompt_tokens` (e.g. via
+// format_rerank() for rerank tasks).  `oaicompat` defaults to NONE so the
+// rerank call site needs no explicit argument.
+//
+// Unit-testable without JNI: takes only C++ objects, no JNIEnv calls.
+// ---------------------------------------------------------------------------
+inline void append_task(server_context           *ctx_server,
+                        std::vector<server_task> &tasks,
+                        server_task_type          type,
+                        const llama_tokens       &prompt_tokens,
+                        size_t                    index,
+                        oaicompat_type            oaicompat = OAICOMPAT_TYPE_NONE) {
+    server_task task(type);
+    task.id               = ctx_server->queue_tasks.get_new_id();
+    task.index            = index;
+    task.prompt_tokens    = server_tokens(prompt_tokens, false);
+    task.params.oaicompat = oaicompat;
+    tasks.push_back(std::move(task));
+}
+
 [[nodiscard]] inline bool check_infill_support_impl(JNIEnv            *env,
                                                      const llama_vocab *vocab,
                                                      jclass             error_class) {

From 7885ced5fd0037bada542d73ec66ef86df361c1e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 5 Apr 2026 19:17:37 +0000
Subject: [PATCH 2/8] Use get_result_error_message in handleEmbeddings error
 path

The raw result->to_json()["message"].get<std::string>() expression was
still used in handleEmbeddings despite get_result_error_message() being
extracted for exactly this purpose.  All other call sites already use
the helper; this brings handleEmbeddings into line.

https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7
---
 src/main/cpp/jllama.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index 8f7696e3..960362a3 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -1258,8 +1258,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn
 
         if (result->is_error()) {
             ctx_server->queue_results.remove_waiting_task_ids(task_ids);
-            std::string error_msg = result->to_json()["message"].get<std::string>();
-            env->ThrowNew(c_llama_error, error_msg.c_str());
+            env->ThrowNew(c_llama_error, get_result_error_message(result).c_str());
             return nullptr;
         }
 

From 31772065e1c95fae4b9915168f48b810b97fa136 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 5 Apr 2026 19:19:20 +0000
Subject: [PATCH 3/8] Replace duplicate n_threads/n_threads_batch blocks with
 local lambda

The two near-identical if-blocks in configureParallelInference differed
only in JSON key and target field.  A local lambda captures env, config,
and c_llama_error, leaving only the varying key and target as parameters.

https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7
---
 src/main/cpp/jllama.cpp | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index 960362a3..dab359d6 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -1372,23 +1372,18 @@ JNIEXPORT jboolean JNICALL Java_de_kherud_llama_LlamaModel_configureParallelInfe
         ctx_server->slot_prompt_similarity = similarity;
     }
 
-    if (config.contains("n_threads")) {
-        int n_threads = config["n_threads"].get<int>();
-        if (n_threads <= 0) {
-            env->ThrowNew(c_llama_error, "n_threads must be greater than 0");
-            return JNI_FALSE;
-        }
-        ctx_server->params_base.cpuparams.n_threads = n_threads;
-    }
-
-    if (config.contains("n_threads_batch")) {
-        int n_threads_batch = config["n_threads_batch"].get<int>();
-        if (n_threads_batch <= 0) {
-            env->ThrowNew(c_llama_error, "n_threads_batch must be greater than 0");
-            return JNI_FALSE;
+    auto apply_thread_count = [&](const char *key, int &target) -> bool {
+        if (!config.contains(key)) return true;
+        int v = config[key].get<int>();
+        if (v <= 0) {
+            env->ThrowNew(c_llama_error, (std::string(key) + " must be greater than 0").c_str());
+            return false;
         }
-        ctx_server->params_base.cpuparams_batch.n_threads = n_threads_batch;
-    }
+        target = v;
+        return true;
+    };
+    if (!apply_thread_count("n_threads",       ctx_server->params_base.cpuparams.n_threads))       return JNI_FALSE;
+    if (!apply_thread_count("n_threads_batch", ctx_server->params_base.cpuparams_batch.n_threads)) return JNI_FALSE;
 
     return JNI_TRUE;
 }

From 49062fd20863b1d2e3df42b0df3cd88c8a606ed2 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 5 Apr 2026 19:24:23 +0000
Subject: [PATCH 4/8] Use collect_task_results in handleRerank and
 handleEmbeddings

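Both functions manually duplicated the drain-and-error-check loop that
collect_task_results already encapsulates.  Replace each with a call to
the helper followed by a second-pass loop over the collected results for
per-result processing (index/score extraction for rerank, to_json() for
embeddings).  The explicit remove_waiting_task_ids() calls in both
functions are dropped as well; this assumes collect_task_results performs
that cleanup on the success and error paths (worth confirming in review).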

https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7
---
 src/main/cpp/jllama.cpp | 34 ++++++++++------------------------
 1 file changed, 10 insertions(+), 24 deletions(-)

diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index dab359d6..153f090a 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -913,20 +913,15 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e
     }
     const auto task_ids = dispatch_tasks(ctx_server, tasks);
 
-    json results_json = json::array();
-
-    for (size_t i = 0; i < task_ids.size(); i++) {
-        server_task_result_ptr result = ctx_server->queue_results.recv(task_ids);
-        if (result->is_error()) {
-            ctx_server->queue_results.remove_waiting_task_ids(task_ids);
-            env->ThrowNew(c_llama_error, get_result_error_message(result).c_str());
-            return nullptr;
-        }
+    std::vector<server_task_result_ptr> results;
+    results.reserve(task_ids.size());
+    if (!collect_task_results(env, ctx_server, task_ids, results)) return nullptr;
 
+    json results_json = json::array();
+    for (const auto &result : results) {
         const auto out_res = result->to_json();
         int index = out_res["index"].get<int>();
         float score = out_res["score"].get<float>();
-
         results_json.push_back({
             {"document", document_vector[index]},
             {"index", index},
@@ -934,8 +929,6 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e
         });
     }
 
-    ctx_server->queue_results.remove_waiting_task_ids(task_ids);
-
     return json_to_jstring(env, results_json);
 }
 
@@ -1251,22 +1244,15 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn
 
     const auto task_ids = dispatch_tasks(ctx_server, tasks);
 
-    json responses = json::array();
-
-    for (size_t i = 0; i < task_ids.size(); i++) {
-        server_task_result_ptr result = ctx_server->queue_results.recv(task_ids);
-
-        if (result->is_error()) {
-            ctx_server->queue_results.remove_waiting_task_ids(task_ids);
-            env->ThrowNew(c_llama_error, get_result_error_message(result).c_str());
-            return nullptr;
-        }
+    std::vector<server_task_result_ptr> results;
+    results.reserve(task_ids.size());
+    if (!collect_task_results(env, ctx_server, task_ids, results)) return nullptr;
 
+    json responses = json::array();
+    for (const auto &result : results) {
         responses.push_back(result->to_json());
     }
 
-    ctx_server->queue_results.remove_waiting_task_ids(task_ids);
-
     json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING
                     ? format_embeddings_response_oaicompat(body, responses, use_base64)
                     : json(responses);

From 3f35a2e4e7c7daaf86b666981eae0eef3d9361d6 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 5 Apr 2026 19:25:03 +0000
Subject: [PATCH 5/8] Collapse repeated get_server_context guard into
 REQUIRE_SERVER_CONTEXT macro

Nineteen JNI entry points opened with the same two-line pattern:
  auto *ctx_server = get_server_context(env, obj);
  if (!ctx_server) return <sentinel>;

A variadic macro covers all four sentinel variants (nullptr, 0, JNI_FALSE,
and bare return for void functions) while declaring ctx_server in the
caller's scope for use in the rest of the function body.  The macro
expands to references to `env` and `obj`, so it only works in functions
whose JNI parameters use those names.

https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7
---
 src/main/cpp/jllama.cpp | 64 +++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 38 deletions(-)

diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index 153f090a..37bb1292 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -457,6 +457,13 @@ void log_callback_trampoline(ggml_log_level level, const char *text, void *user_
 }
 } // namespace
 
+// Validates the server_context at every JNI entry point.  Declares `ctx_server`
+// in the caller's scope and returns the given sentinel (omit for void functions)
+// if the model is not loaded.
+#define REQUIRE_SERVER_CONTEXT(...) \
+    auto *ctx_server = get_server_context(env, obj); \
+    if (!ctx_server) return __VA_ARGS__
+
 /**
  * The VM calls JNI_OnLoad when the native library is loaded (for example, through `System.loadLibrary`).
  * `JNI_OnLoad` must return the JNI version needed by the native library.
@@ -763,8 +770,7 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
 }
 
 JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv *env, jobject obj, jstring jparams) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return 0;
+    REQUIRE_SERVER_CONTEXT(0);
 
     json data = parse_json_params(env, jparams);
 
@@ -786,15 +792,13 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv
 }
 
 JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_releaseTask(JNIEnv *env, jobject obj, jint id_task) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return;
+    REQUIRE_SERVER_CONTEXT();
     ctx_server->queue_results.remove_waiting_task_id(id_task);
 }
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletionJson(JNIEnv *env, jobject obj,
                                                                                jint id_task) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     server_task_result_ptr result = ctx_server->queue_results.recv(id_task);
 
@@ -815,8 +819,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletionJson(
 }
 
 JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env, jobject obj, jstring jprompt) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     if (!ctx_server->params_base.embedding) {
         env->ThrowNew(c_llama_error,
@@ -885,8 +888,7 @@ JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env,
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *env, jobject obj, jstring jprompt,
                                                                       jobjectArray documents) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     if (!ctx_server->params_base.embedding || ctx_server->params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
         env->ThrowNew(c_llama_error,
@@ -933,8 +935,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e
 }
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_applyTemplate(JNIEnv *env, jobject obj, jstring jparams) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     json data = parse_json_params(env, jparams);
 
@@ -947,8 +948,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_applyTemplate(JNIEnv *
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleChatCompletions(JNIEnv *env, jobject obj,
                                                                                 jstring jparams) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     json body = parse_json_params(env, jparams);
 
@@ -967,8 +967,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleChatCompletions(
 
 JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestChatCompletion(JNIEnv *env, jobject obj,
                                                                              jstring jparams) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return 0;
+    REQUIRE_SERVER_CONTEXT(0);
 
     json body = parse_json_params(env, jparams);
 
@@ -987,8 +986,7 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestChatCompletion(JNI
 }
 
 JNIEXPORT jintArray JNICALL Java_de_kherud_llama_LlamaModel_encode(JNIEnv *env, jobject obj, jstring jprompt) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     const std::string c_prompt = parse_jstring(env, jprompt);
 
@@ -1023,8 +1021,7 @@ static std::string detokenize(const server_context *ctx_server,
 
 JNIEXPORT jbyteArray JNICALL Java_de_kherud_llama_LlamaModel_decodeBytes(JNIEnv *env, jobject obj,
                                                                          jintArray java_tokens) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     const auto tokens = jint_array_to_tokens(env, java_tokens);
     return parse_jbytes(env, detokenize(ctx_server, tokens));
@@ -1058,8 +1055,7 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_delete(JNIEnv *env, jobje
 }
 
 JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_cancelCompletion(JNIEnv *env, jobject obj, jint id_task) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return;
+    REQUIRE_SERVER_CONTEXT();
     std::unordered_set<int> id_tasks = {id_task};
     ctx_server->cancel_tasks(id_tasks);
     ctx_server->queue_results.remove_waiting_task_id(id_task);
@@ -1100,8 +1096,7 @@ JNIEXPORT jbyteArray JNICALL Java_de_kherud_llama_LlamaModel_jsonSchemaToGrammar
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleCompletions(JNIEnv *env, jobject obj,
                                                                             jstring jparams) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     json data = parse_json_params(env, jparams);
 
@@ -1117,8 +1112,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleCompletions(JNIE
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleCompletionsOai(JNIEnv *env, jobject obj,
                                                                                jstring jparams) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     json body = parse_json_params(env, jparams);
 
@@ -1150,8 +1144,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleCompletionsOai(J
 }
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleInfill(JNIEnv *env, jobject obj, jstring jparams) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     if (!check_infill_support(env, ctx_server)) return nullptr;
 
@@ -1185,8 +1178,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleInfill(JNIEnv *e
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEnv *env, jobject obj,
                                                                            jstring jparams, jboolean joaiCompat) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     if (!ctx_server->params_base.embedding) {
         env->ThrowNew(c_llama_error,
@@ -1263,8 +1255,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleTokenize(JNIEnv *env, jobject obj, jstring jcontent,
                                                                          jboolean jaddSpecial,
                                                                          jboolean jwithPieces) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     const std::string content = parse_jstring(env, jcontent);
     const bool add_special = jaddSpecial;
@@ -1301,8 +1292,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleTokenize(JNIEnv
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleDetokenize(JNIEnv *env, jobject obj,
                                                                            jintArray jtokens) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     const auto tokens = jint_array_to_tokens(env, jtokens);
     json data = format_detokenized_response(detokenize(ctx_server, tokens));
@@ -1312,8 +1302,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleDetokenize(JNIEn
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleSlotAction(JNIEnv *env, jobject obj, jint action,
                                                                            jint slotId, jstring jfilename) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return nullptr;
+    REQUIRE_SERVER_CONTEXT(nullptr);
 
     switch (action) {
     case 0: { // LIST — get slot info via metrics (priority post)
@@ -1344,8 +1333,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleSlotAction(JNIEn
 
 JNIEXPORT jboolean JNICALL Java_de_kherud_llama_LlamaModel_configureParallelInference(JNIEnv *env, jobject obj,
                                                                                       jstring jconfig) {
-    auto *ctx_server = get_server_context(env, obj);
-    if (!ctx_server) return JNI_FALSE;
+    REQUIRE_SERVER_CONTEXT(JNI_FALSE);
 
     json config = parse_json_params(env, jconfig);
 

From c35b18ad003bc16948f1004ec69d8623e87cd011 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 5 Apr 2026 19:25:43 +0000
Subject: [PATCH 6/8] Fix append_task: take prompt_tokens by value to satisfy
 server_tokens ctor

server_tokens(llama_tokens &, bool) requires a non-const lvalue reference,
so passing const llama_tokens & failed to compile.  Taking by value gives
a local non-const copy that binds correctly.

https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7
---
 src/main/cpp/jni_server_helpers.hpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/main/cpp/jni_server_helpers.hpp b/src/main/cpp/jni_server_helpers.hpp
index 41878279..5e2aaa39 100644
--- a/src/main/cpp/jni_server_helpers.hpp
+++ b/src/main/cpp/jni_server_helpers.hpp
@@ -253,15 +253,16 @@
 //   tasks.push_back(std::move(task));
 //
 // The caller is responsible for pre-computing `prompt_tokens` (e.g. via
-// format_rerank() for rerank tasks).  `oaicompat` defaults to NONE so the
-// rerank call site needs no explicit argument.
+// format_rerank() for rerank tasks).  Taken by value because server_tokens
+// constructor requires a non-const lvalue reference.  `oaicompat` defaults
+// to NONE so the rerank call site needs no explicit argument.
 //
 // Unit-testable without JNI: takes only C++ objects, no JNIEnv calls.
 // ---------------------------------------------------------------------------
 inline void append_task(server_context           *ctx_server,
                         std::vector<server_task> &tasks,
                         server_task_type          type,
-                        const llama_tokens       &prompt_tokens,
+                        llama_tokens              prompt_tokens,
                         size_t                    index,
                         oaicompat_type            oaicompat = OAICOMPAT_TYPE_NONE) {
     server_task task(type);

From 17ab8a9eb446214676a5b23025cb3179589c5ef7 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 5 Apr 2026 19:30:33 +0000
Subject: [PATCH 7/8] Extract rerank_results_to_json from handleRerank

The per-result JSON construction (index/score extraction, document lookup)
had no JNI dependencies and was untestable inline.  Moved to a pure
function in jni_server_helpers.hpp that takes only result pointers and
document strings, making it unit-testable without a JVM or llama model.

https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7
---
 src/main/cpp/jllama.cpp             | 14 +-------------
 src/main/cpp/jni_server_helpers.hpp | 30 ++++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index 37bb1292..793596a0 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -919,19 +919,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e
     results.reserve(task_ids.size());
     if (!collect_task_results(env, ctx_server, task_ids, results)) return nullptr;
 
-    json results_json = json::array();
-    for (const auto &result : results) {
-        const auto out_res = result->to_json();
-        int index = out_res["index"].get<int>();
-        float score = out_res["score"].get<float>();
-        results_json.push_back({
-            {"document", document_vector[index]},
-            {"index", index},
-            {"score", score}
-        });
-    }
-
-    return json_to_jstring(env, results_json);
+    return json_to_jstring(env, rerank_results_to_json(results, document_vector));
 }
 
 JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_applyTemplate(JNIEnv *env, jobject obj, jstring jparams) {
diff --git a/src/main/cpp/jni_server_helpers.hpp b/src/main/cpp/jni_server_helpers.hpp
index 5e2aaa39..9cde32af 100644
--- a/src/main/cpp/jni_server_helpers.hpp
+++ b/src/main/cpp/jni_server_helpers.hpp
@@ -32,7 +32,8 @@
 //   6. results_to_json_impl        — no dependencies on helpers above
 //   7. results_to_jstring_impl     — uses 2 + 6
 //   8. check_infill_support_impl   — no dependencies on helpers above
-//   9. append_task                 — no dependencies on helpers above
+//   9. rerank_results_to_json      — no dependencies on helpers above
+//  10. append_task                 — no dependencies on helpers above
 
 #include "jni.h"
 
@@ -238,6 +239,33 @@
 // Extracted from the 10-line compatibility block in handleInfill so it can
 // be unit-tested independently of the JNI dispatch layer.
 // ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+// rerank_results_to_json
+//
+// Converts a collected vector of rerank task results to a JSON array.
+// Each element contains the original document text (looked up via the
+// result's "index" field), the index, and the relevance score.
+//
+// Pure computation — no JNI calls, no llama context required.
+// Unit-testable with any vector of server_task_result_ptr and strings.
+// ---------------------------------------------------------------------------
+[[nodiscard]] inline json rerank_results_to_json(
+        const std::vector<server_task_result_ptr> &results,
+        const std::vector<std::string>            &documents) {
+    json arr = json::array();
+    for (const auto &result : results) {
+        const auto out = result->to_json();
+        int   index = out["index"].get<int>();
+        float score = out["score"].get<float>();
+        arr.push_back({
+            {"document", documents[index]},
+            {"index",    index},
+            {"score",    score}
+        });
+    }
+    return arr;
+}
+
 // ---------------------------------------------------------------------------
 // append_task
 //

From 57167c39ab50c36958d982a3d64cabcf9dd07780 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 5 Apr 2026 19:48:02 +0000
Subject: [PATCH 8/8] Upgrade llama.cpp from b8665 to b8668
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror the one C++ change from the b8665→b8668 patch: replace the
n_threads/hardware_concurrency log lines with a build_info log line,
matching the upstream tools/server/server.cpp change.  Workflow-only
changes in the patch do not affect this project.

https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7
---
 CLAUDE.md               | 2 +-
 CMakeLists.txt          | 2 +-
 README.md               | 2 +-
 src/main/cpp/jllama.cpp | 5 +----
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index acceeddf..a62ca232 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.
 
-Current llama.cpp pinned version: **b8665**
+Current llama.cpp pinned version: **b8668**
 
 ## Upgrading CUDA Version
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 98efd0a0..19e45d52 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,7 +90,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE)
 FetchContent_Declare(
 	llama.cpp
 	GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-	GIT_TAG        b8665
+	GIT_TAG        b8668
 )
 FetchContent_MakeAvailable(llama.cpp)
 
diff --git a/README.md b/README.md
index bbd8b7e0..aece96bd 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
-[![llama.cpp b8665](https://img.shields.io/badge/llama.cpp-%23b8665-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8665)
+[![llama.cpp b8668](https://img.shields.io/badge/llama.cpp-%23b8668-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8668)
 
 # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
 
diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index 793596a0..3e252ab0 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -695,11 +695,8 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
 
     llama_numa_init(params.numa);
 
-    LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads,
-            params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
-    LOG_INF("\n");
+    LOG_INF("build_info: %s\n", build_info.c_str());
     LOG_INF("%s\n", common_params_get_system_info(params).c_str());
-    LOG_INF("\n");
 
     std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};