From c846dde08b5d1307d56221b2c212edadeb74937d Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 5 Apr 2026 19:16:57 +0000 Subject: [PATCH 1/8] Extract append_task helper to replace 3 near-identical task-construction blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 5–6-line pattern of constructing a server_task (assign id, index, prompt_tokens, oaicompat, push) appeared verbatim in embed, handleRerank, and handleEmbeddings. A single append_task() helper in jni_server_helpers.hpp covers all three; callers that need format_rerank() pass the result as the prompt_tokens argument. The helper is unit-testable without JNI. https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7 --- src/main/cpp/jllama.cpp | 28 ++++------------------ src/main/cpp/jni_server_helpers.hpp | 36 +++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 04304abd..8f7696e3 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -831,16 +831,7 @@ JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env, auto tokens = tokenize_mixed(ctx_server->vocab, prompt, true, true); std::vector tasks; - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); - - task.id = ctx_server->queue_tasks.get_new_id(); - task.index = 0; - task.prompt_tokens = server_tokens(tokens, false); - - // OAI-compat - task.params.oaicompat = OAICOMPAT_TYPE_NONE; - - tasks.push_back(std::move(task)); + append_task(ctx_server, tasks, SERVER_TASK_TYPE_EMBEDDING, tokens, 0); const auto task_ids = dispatch_tasks(ctx_server, tasks); const auto id_task = *task_ids.begin(); @@ -917,12 +908,8 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e tasks.reserve(tokenized_docs.size()); for (size_t i = 0; i < tokenized_docs.size(); i++) { - auto task = server_task(SERVER_TASK_TYPE_RERANK); - task.id = 
ctx_server->queue_tasks.get_new_id(); - task.index = i; - auto tokens = format_rerank(ctx_server->vocab, tokenized_query, tokenized_docs[i]); - task.prompt_tokens = server_tokens(tokens, false); - tasks.push_back(std::move(task)); + append_task(ctx_server, tasks, SERVER_TASK_TYPE_RERANK, + format_rerank(ctx_server->vocab, tokenized_query, tokenized_docs[i]), i); } const auto task_ids = dispatch_tasks(ctx_server, tasks); @@ -1259,14 +1246,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn tasks.reserve(tokenized_prompts.size()); for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); - - task.id = ctx_server->queue_tasks.get_new_id(); - task.index = i; - task.prompt_tokens = server_tokens(tokenized_prompts[i], false); - task.params.oaicompat = oaicompat; - - tasks.push_back(std::move(task)); + append_task(ctx_server, tasks, SERVER_TASK_TYPE_EMBEDDING, tokenized_prompts[i], i, oaicompat); } const auto task_ids = dispatch_tasks(ctx_server, tasks); diff --git a/src/main/cpp/jni_server_helpers.hpp b/src/main/cpp/jni_server_helpers.hpp index dd404a55..41878279 100644 --- a/src/main/cpp/jni_server_helpers.hpp +++ b/src/main/cpp/jni_server_helpers.hpp @@ -11,6 +11,7 @@ // build_completion_tasks_impl — tokenise and build a server_task vector // collect_task_results_impl — drain results from the response queue // recv_slot_task_result_impl — recv + check a single slot-action result +// append_task — construct and push a single server_task // // All parameters are explicit (no module-level globals) so each function can // be exercised in unit tests using local server objects and a mock JNIEnv. @@ -31,6 +32,7 @@ // 6. results_to_json_impl — no dependencies on helpers above // 7. results_to_jstring_impl — uses 2 + 6 // 8. check_infill_support_impl — no dependencies on helpers above +// 9. 
append_task — no dependencies on helpers above #include "jni.h" @@ -236,6 +238,40 @@ // Extracted from the 10-line compatibility block in handleInfill so it can // be unit-tested independently of the JNI dispatch layer. // --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- +// append_task +// +// Constructs a server_task of the given type and appends it to `tasks`. +// Captures the repeated 5–6-line block that appears in embed (single task), +// handleEmbeddings (loop), and handleRerank (loop): +// +// server_task task(type); +// task.id = ctx_server->queue_tasks.get_new_id(); +// task.index = index; +// task.prompt_tokens = server_tokens(prompt_tokens, false); +// task.params.oaicompat = oaicompat; +// tasks.push_back(std::move(task)); +// +// The caller is responsible for pre-computing `prompt_tokens` (e.g. via +// format_rerank() for rerank tasks). `oaicompat` defaults to NONE so the +// rerank call site needs no explicit argument. +// +// Unit-testable without JNI: takes only C++ objects, no JNIEnv calls. 
+// --------------------------------------------------------------------------- +inline void append_task(server_context *ctx_server, + std::vector &tasks, + server_task_type type, + const llama_tokens &prompt_tokens, + size_t index, + oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE) { + server_task task(type); + task.id = ctx_server->queue_tasks.get_new_id(); + task.index = index; + task.prompt_tokens = server_tokens(prompt_tokens, false); + task.params.oaicompat = oaicompat; + tasks.push_back(std::move(task)); +} + [[nodiscard]] inline bool check_infill_support_impl(JNIEnv *env, const llama_vocab *vocab, jclass error_class) { From 7885ced5fd0037bada542d73ec66ef86df361c1e Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 5 Apr 2026 19:17:37 +0000 Subject: [PATCH 2/8] Use get_result_error_message in handleEmbeddings error path The raw result->to_json()["message"].get<std::string>() expression was still used in handleEmbeddings despite get_result_error_message() being extracted for exactly this purpose. All other call sites already use the helper; this brings handleEmbeddings into line. 
https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7 --- src/main/cpp/jllama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 8f7696e3..960362a3 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -1258,8 +1258,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn if (result->is_error()) { ctx_server->queue_results.remove_waiting_task_ids(task_ids); - std::string error_msg = result->to_json()["message"].get(); - env->ThrowNew(c_llama_error, error_msg.c_str()); + env->ThrowNew(c_llama_error, get_result_error_message(result).c_str()); return nullptr; } From 31772065e1c95fae4b9915168f48b810b97fa136 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 5 Apr 2026 19:19:20 +0000 Subject: [PATCH 3/8] Replace duplicate n_threads/n_threads_batch blocks with local lambda The two near-identical if-blocks in configureParallelInference differed only in JSON key and target field. A local lambda captures env, config, and c_llama_error, leaving only the varying key and target as parameters. 
https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7 --- src/main/cpp/jllama.cpp | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 960362a3..dab359d6 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -1372,23 +1372,18 @@ JNIEXPORT jboolean JNICALL Java_de_kherud_llama_LlamaModel_configureParallelInfe ctx_server->slot_prompt_similarity = similarity; } - if (config.contains("n_threads")) { - int n_threads = config["n_threads"].get(); - if (n_threads <= 0) { - env->ThrowNew(c_llama_error, "n_threads must be greater than 0"); - return JNI_FALSE; - } - ctx_server->params_base.cpuparams.n_threads = n_threads; - } - - if (config.contains("n_threads_batch")) { - int n_threads_batch = config["n_threads_batch"].get(); - if (n_threads_batch <= 0) { - env->ThrowNew(c_llama_error, "n_threads_batch must be greater than 0"); - return JNI_FALSE; + auto apply_thread_count = [&](const char *key, int &target) -> bool { + if (!config.contains(key)) return true; + int v = config[key].get(); + if (v <= 0) { + env->ThrowNew(c_llama_error, (std::string(key) + " must be greater than 0").c_str()); + return false; } - ctx_server->params_base.cpuparams_batch.n_threads = n_threads_batch; - } + target = v; + return true; + }; + if (!apply_thread_count("n_threads", ctx_server->params_base.cpuparams.n_threads)) return JNI_FALSE; + if (!apply_thread_count("n_threads_batch", ctx_server->params_base.cpuparams_batch.n_threads)) return JNI_FALSE; return JNI_TRUE; } From 49062fd20863b1d2e3df42b0df3cd88c8a606ed2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 5 Apr 2026 19:24:23 +0000 Subject: [PATCH 4/8] Use collect_task_results in handleRerank and handleEmbeddings Both functions manually duplicated the drain-and-error-check loop that collect_task_results already encapsulates. 
Replace each with a call to the helper followed by a second-pass loop over the collected results for per-result processing (index/score extraction for rerank, to_json() for embeddings). https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7 --- src/main/cpp/jllama.cpp | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index dab359d6..153f090a 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -913,20 +913,15 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e } const auto task_ids = dispatch_tasks(ctx_server, tasks); - json results_json = json::array(); - - for (size_t i = 0; i < task_ids.size(); i++) { - server_task_result_ptr result = ctx_server->queue_results.recv(task_ids); - if (result->is_error()) { - ctx_server->queue_results.remove_waiting_task_ids(task_ids); - env->ThrowNew(c_llama_error, get_result_error_message(result).c_str()); - return nullptr; - } + std::vector results; + results.reserve(task_ids.size()); + if (!collect_task_results(env, ctx_server, task_ids, results)) return nullptr; + json results_json = json::array(); + for (const auto &result : results) { const auto out_res = result->to_json(); int index = out_res["index"].get(); float score = out_res["score"].get(); - results_json.push_back({ {"document", document_vector[index]}, {"index", index}, @@ -934,8 +929,6 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e }); } - ctx_server->queue_results.remove_waiting_task_ids(task_ids); - return json_to_jstring(env, results_json); } @@ -1251,22 +1244,15 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn const auto task_ids = dispatch_tasks(ctx_server, tasks); - json responses = json::array(); - - for (size_t i = 0; i < task_ids.size(); i++) { - server_task_result_ptr result = ctx_server->queue_results.recv(task_ids); - - if 
(result->is_error()) { - ctx_server->queue_results.remove_waiting_task_ids(task_ids); - env->ThrowNew(c_llama_error, get_result_error_message(result).c_str()); - return nullptr; - } + std::vector results; + results.reserve(task_ids.size()); + if (!collect_task_results(env, ctx_server, task_ids, results)) return nullptr; + json responses = json::array(); + for (const auto &result : results) { responses.push_back(result->to_json()); } - ctx_server->queue_results.remove_waiting_task_ids(task_ids); - json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses); From 3f35a2e4e7c7daaf86b666981eae0eef3d9361d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 5 Apr 2026 19:25:03 +0000 Subject: [PATCH 5/8] Collapse repeated get_server_context guard into REQUIRE_SERVER_CONTEXT macro Every JNI entry point opened with the same two-line pattern: auto *ctx_server = get_server_context(env, obj); if (!ctx_server) return <sentinel>; A variadic macro covers all four sentinel variants (nullptr, 0, JNI_FALSE, and bare return for void functions) while declaring ctx_server in the caller's scope for use in the rest of the function body. https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7 --- src/main/cpp/jllama.cpp | 64 +++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 153f090a..37bb1292 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -457,6 +457,13 @@ void log_callback_trampoline(ggml_log_level level, const char *text, void *user_ } } // namespace +// Validates the server_context at every JNI entry point. Declares `ctx_server` +// in the caller's scope and returns the given sentinel (omit for void functions) +// if the model is not loaded. +#define REQUIRE_SERVER_CONTEXT(...) 
\ + auto *ctx_server = get_server_context(env, obj); \ + if (!ctx_server) return __VA_ARGS__ + /** * The VM calls JNI_OnLoad when the native library is loaded (for example, through `System.loadLibrary`). * `JNI_OnLoad` must return the JNI version needed by the native library. @@ -763,8 +770,7 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo } JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv *env, jobject obj, jstring jparams) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return 0; + REQUIRE_SERVER_CONTEXT(0); json data = parse_json_params(env, jparams); @@ -786,15 +792,13 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv } JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_releaseTask(JNIEnv *env, jobject obj, jint id_task) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return; + REQUIRE_SERVER_CONTEXT(); ctx_server->queue_results.remove_waiting_task_id(id_task); } JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletionJson(JNIEnv *env, jobject obj, jint id_task) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); server_task_result_ptr result = ctx_server->queue_results.recv(id_task); @@ -815,8 +819,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_receiveCompletionJson( } JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env, jobject obj, jstring jprompt) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); if (!ctx_server->params_base.embedding) { env->ThrowNew(c_llama_error, @@ -885,8 +888,7 @@ JNIEXPORT jfloatArray JNICALL Java_de_kherud_llama_LlamaModel_embed(JNIEnv *env, JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *env, jobject obj, jstring jprompt, jobjectArray documents) { - 
auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); if (!ctx_server->params_base.embedding || ctx_server->params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { env->ThrowNew(c_llama_error, @@ -933,8 +935,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e } JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_applyTemplate(JNIEnv *env, jobject obj, jstring jparams) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); json data = parse_json_params(env, jparams); @@ -947,8 +948,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_applyTemplate(JNIEnv * JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleChatCompletions(JNIEnv *env, jobject obj, jstring jparams) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); json body = parse_json_params(env, jparams); @@ -967,8 +967,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleChatCompletions( JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestChatCompletion(JNIEnv *env, jobject obj, jstring jparams) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return 0; + REQUIRE_SERVER_CONTEXT(0); json body = parse_json_params(env, jparams); @@ -987,8 +986,7 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestChatCompletion(JNI } JNIEXPORT jintArray JNICALL Java_de_kherud_llama_LlamaModel_encode(JNIEnv *env, jobject obj, jstring jprompt) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); const std::string c_prompt = parse_jstring(env, jprompt); @@ -1023,8 +1021,7 @@ static std::string detokenize(const server_context *ctx_server, JNIEXPORT jbyteArray JNICALL Java_de_kherud_llama_LlamaModel_decodeBytes(JNIEnv *env, jobject obj, 
jintArray java_tokens) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); const auto tokens = jint_array_to_tokens(env, java_tokens); return parse_jbytes(env, detokenize(ctx_server, tokens)); @@ -1058,8 +1055,7 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_delete(JNIEnv *env, jobje } JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_cancelCompletion(JNIEnv *env, jobject obj, jint id_task) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return; + REQUIRE_SERVER_CONTEXT(); std::unordered_set id_tasks = {id_task}; ctx_server->cancel_tasks(id_tasks); ctx_server->queue_results.remove_waiting_task_id(id_task); @@ -1100,8 +1096,7 @@ JNIEXPORT jbyteArray JNICALL Java_de_kherud_llama_LlamaModel_jsonSchemaToGrammar JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleCompletions(JNIEnv *env, jobject obj, jstring jparams) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); json data = parse_json_params(env, jparams); @@ -1117,8 +1112,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleCompletions(JNIE JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleCompletionsOai(JNIEnv *env, jobject obj, jstring jparams) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); json body = parse_json_params(env, jparams); @@ -1150,8 +1144,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleCompletionsOai(J } JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleInfill(JNIEnv *env, jobject obj, jstring jparams) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); if (!check_infill_support(env, ctx_server)) return nullptr; @@ -1185,8 +1178,7 @@ JNIEXPORT jstring JNICALL 
Java_de_kherud_llama_LlamaModel_handleInfill(JNIEnv *e JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEnv *env, jobject obj, jstring jparams, jboolean joaiCompat) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); if (!ctx_server->params_base.embedding) { env->ThrowNew(c_llama_error, @@ -1263,8 +1255,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleTokenize(JNIEnv *env, jobject obj, jstring jcontent, jboolean jaddSpecial, jboolean jwithPieces) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); const std::string content = parse_jstring(env, jcontent); const bool add_special = jaddSpecial; @@ -1301,8 +1292,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleTokenize(JNIEnv JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleDetokenize(JNIEnv *env, jobject obj, jintArray jtokens) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); const auto tokens = jint_array_to_tokens(env, jtokens); json data = format_detokenized_response(detokenize(ctx_server, tokens)); @@ -1312,8 +1302,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleDetokenize(JNIEn JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleSlotAction(JNIEnv *env, jobject obj, jint action, jint slotId, jstring jfilename) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return nullptr; + REQUIRE_SERVER_CONTEXT(nullptr); switch (action) { case 0: { // LIST — get slot info via metrics (priority post) @@ -1344,8 +1333,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleSlotAction(JNIEn JNIEXPORT jboolean JNICALL Java_de_kherud_llama_LlamaModel_configureParallelInference(JNIEnv *env, 
jobject obj, jstring jconfig) { - auto *ctx_server = get_server_context(env, obj); - if (!ctx_server) return JNI_FALSE; + REQUIRE_SERVER_CONTEXT(JNI_FALSE); json config = parse_json_params(env, jconfig); From c35b18ad003bc16948f1004ec69d8623e87cd011 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 5 Apr 2026 19:25:43 +0000 Subject: [PATCH 6/8] Fix append_task: take prompt_tokens by value to satisfy server_tokens ctor server_tokens(llama_tokens &, bool) requires a non-const lvalue reference, so passing const llama_tokens & failed to compile. Taking by value gives a local non-const copy that binds correctly. https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7 --- src/main/cpp/jni_server_helpers.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/jni_server_helpers.hpp b/src/main/cpp/jni_server_helpers.hpp index 41878279..5e2aaa39 100644 --- a/src/main/cpp/jni_server_helpers.hpp +++ b/src/main/cpp/jni_server_helpers.hpp @@ -253,15 +253,16 @@ // tasks.push_back(std::move(task)); // // The caller is responsible for pre-computing `prompt_tokens` (e.g. via -// format_rerank() for rerank tasks). `oaicompat` defaults to NONE so the -// rerank call site needs no explicit argument. +// format_rerank() for rerank tasks). Taken by value because server_tokens +// constructor requires a non-const lvalue reference. `oaicompat` defaults +// to NONE so the rerank call site needs no explicit argument. // // Unit-testable without JNI: takes only C++ objects, no JNIEnv calls. 
// --------------------------------------------------------------------------- inline void append_task(server_context *ctx_server, std::vector &tasks, server_task_type type, - const llama_tokens &prompt_tokens, + llama_tokens prompt_tokens, size_t index, oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE) { server_task task(type); From 17ab8a9eb446214676a5b23025cb3179589c5ef7 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 5 Apr 2026 19:30:33 +0000 Subject: [PATCH 7/8] Extract rerank_results_to_json from handleRerank The per-result JSON construction (index/score extraction, document lookup) had no JNI dependencies and was untestable inline. Moved to a pure function in jni_server_helpers.hpp that takes only result pointers and document strings, making it unit-testable without a JVM or llama model. https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7 --- src/main/cpp/jllama.cpp | 14 +------------- src/main/cpp/jni_server_helpers.hpp | 30 ++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 37bb1292..793596a0 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -919,19 +919,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e results.reserve(task_ids.size()); if (!collect_task_results(env, ctx_server, task_ids, results)) return nullptr; - json results_json = json::array(); - for (const auto &result : results) { - const auto out_res = result->to_json(); - int index = out_res["index"].get(); - float score = out_res["score"].get(); - results_json.push_back({ - {"document", document_vector[index]}, - {"index", index}, - {"score", score} - }); - } - - return json_to_jstring(env, results_json); + return json_to_jstring(env, rerank_results_to_json(results, document_vector)); } JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_applyTemplate(JNIEnv *env, jobject obj, jstring jparams) { diff --git 
a/src/main/cpp/jni_server_helpers.hpp b/src/main/cpp/jni_server_helpers.hpp index 5e2aaa39..9cde32af 100644 --- a/src/main/cpp/jni_server_helpers.hpp +++ b/src/main/cpp/jni_server_helpers.hpp @@ -32,7 +32,8 @@ // 6. results_to_json_impl — no dependencies on helpers above // 7. results_to_jstring_impl — uses 2 + 6 // 8. check_infill_support_impl — no dependencies on helpers above -// 9. append_task — no dependencies on helpers above +// 9. rerank_results_to_json — no dependencies on helpers above +// 10. append_task — no dependencies on helpers above #include "jni.h" @@ -238,6 +239,33 @@ // Extracted from the 10-line compatibility block in handleInfill so it can // be unit-tested independently of the JNI dispatch layer. // --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- +// rerank_results_to_json +// +// Converts a collected vector of rerank task results to a JSON array. +// Each element contains the original document text (looked up via the +// result's "index" field), the index, and the relevance score. +// +// Pure computation — no JNI calls, no llama context required. +// Unit-testable with any vector of server_task_result_ptr and strings. 
+// --------------------------------------------------------------------------- +[[nodiscard]] inline json rerank_results_to_json( + const std::vector &results, + const std::vector &documents) { + json arr = json::array(); + for (const auto &result : results) { + const auto out = result->to_json(); + int index = out["index"].get(); + float score = out["score"].get(); + arr.push_back({ + {"document", documents[index]}, + {"index", index}, + {"score", score} + }); + } + return arr; +} + // --------------------------------------------------------------------------- // append_task // From 57167c39ab50c36958d982a3d64cabcf9dd07780 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 5 Apr 2026 19:48:02 +0000 Subject: [PATCH 8/8] Upgrade llama.cpp from b8665 to b8668 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the one C++ change from the b8665→b8668 patch: replace the n_threads/hardware_concurrency log lines with a build_info log line, matching the upstream tools/server/server.cpp change. Workflow-only changes in the patch do not affect this project. https://claude.ai/code/session_01VTBYmrewrc3STtLv2JM5L7 --- CLAUDE.md | 2 +- CMakeLists.txt | 2 +- README.md | 2 +- src/main/cpp/jllama.cpp | 5 +---- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index acceeddf..a62ca232 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. 
-Current llama.cpp pinned version: **b8665** +Current llama.cpp pinned version: **b8668** ## Upgrading CUDA Version diff --git a/CMakeLists.txt b/CMakeLists.txt index 98efd0a0..19e45d52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -90,7 +90,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b8665 + GIT_TAG b8668 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/README.md b/README.md index bbd8b7e0..aece96bd 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) -[![llama.cpp b8665](https://img.shields.io/badge/llama.cpp-%23b8665-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8665) +[![llama.cpp b8668](https://img.shields.io/badge/llama.cpp-%23b8668-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8668) # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp index 793596a0..3e252ab0 100644 --- a/src/main/cpp/jllama.cpp +++ b/src/main/cpp/jllama.cpp @@ -695,11 +695,8 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo llama_numa_init(params.numa); - LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, - params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); - LOG_INF("\n"); + LOG_INF("build_info: %s\n", build_info.c_str()); LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); std::atomic state{SERVER_STATE_LOADING_MODEL};