Merged

Changes from all commits (16 commits)
62fc8dc  Upgrade llama.cpp from b8854 to b8887 (claude, Apr 22, 2026)
cc21d6a  Import server_chat_msg_diff_to_json_oaicompat from server-chat.h (claude, Apr 23, 2026)
6784206  Fix server-chat.h import: forward-declare to avoid server-common.h co… (claude, Apr 23, 2026)
8ad09cf  Revert to local server_chat_msg_diff_to_json_oaicompat definition (claude, Apr 23, 2026)
0077a4e  Remove 20 identical symbols from utils.hpp; switch to server-common.h… (claude, Apr 23, 2026)
29ea276  Fix base64_decode visibility and format_error_response ambiguity (claude, Apr 23, 2026)
60d9a10  Update OaicompatCompletionParams.NNotOne test for upstream behavior (claude, Apr 23, 2026)
0f56eb0  Switch to upstream tokenize_input_prompts, format_embeddings_response… (claude, Apr 23, 2026)
9f4019b  Replace local tokens_to_str template overloads with upstream const-re… (claude, Apr 23, 2026)
dfb5c81  Document base64 copy-from-upstream section with fold markers in utils… (claude, Apr 23, 2026)
42d5361  Remove local server_chat_msg_diff_to_json_oaicompat; use upstream ser… (claude, Apr 23, 2026)
4042c4b  Replace local oaicompat_parser_options with upstream server_chat_params (claude, Apr 23, 2026)
72af1e1  Fix build: store chat_templates directly in oai_parser_opt.tmpls (claude, Apr 23, 2026)
634e563  Fix remaining chat_templates references in jllama.cpp (claude, Apr 23, 2026)
7729e1a  Fix enable_thinking: mirror upstream logic using template support check (claude, Apr 23, 2026)
5f76eca  Refactor parse_lora_request to delegate JSON parsing to upstream (claude, Apr 23, 2026)
7 changes: 5 additions & 2 deletions CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co

Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.

Current llama.cpp pinned version: **b8854**
Current llama.cpp pinned version: **b8887**

## Upgrading CUDA Version

@@ -137,7 +137,7 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
`ggml/include/ggml.h`, `ggml/include/ggml-backend.h`, `ggml/include/ggml-opt.h`,
`ggml-alloc.h`, `ggml-cpu.h`, `peg-parser.h`, `base64.hpp`

**Known breaking changes by version range** (b5022 → b8841):
**Known breaking changes by version range** (b5022 → b8887):

| Version | File | Change |
|---------|------|--------|
@@ -159,6 +159,9 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
| ~b8841–b8854 | `common/common.h` | `common_params::clear_idle` renamed to `cache_idle_slots`; new `common_context_seq_rm_type` enum + `common_context_can_seq_rm()` replacing `common_speculative_is_compat()`; `get_model_endpoint()` → `common_get_model_endpoint()` |
| ~b8841–b8854 | `tools/mtmd/mtmd.h` + `mtmd-helper.h` | `mtmd_decoder_pos` gains `z` field; `mtmd_image_tokens_get_decoder_pos()` + `mtmd_helper_image_get_decoder_pos()` gain new `pos_0` parameter |
| ~b8841–b8854 | project `utils.hpp` / `server.hpp` | `server_tokens::get_text_tokens()` split: `get_tokens()` returns raw `const llama_tokens &`; new `get_text_tokens()` returns filtered copy (removes `LLAMA_TOKEN_NULL` mtmd placeholders); save/load and context-shift call sites updated to `get_tokens()` |
| ~b8854–b8887 | `common/chat.h` | `common_chat_msg_diff_to_json_oaicompat` removed; moved to `tools/server/server-chat.cpp`; project defines it locally in `server.hpp` — importing server-chat.cpp is impractical because it pulls in `convert_transcriptions_to_chatcmpl` → `get_media_marker` → `server-common.cpp` |
| ~b8854–b8887 | `common/common.h` | `common_params::reasoning_budget` and `reasoning_budget_message` moved into `common_params::sampling` sub-struct as `reasoning_budget_tokens`; update: `params_base.reasoning_budget` → `params_base.sampling.reasoning_budget_tokens` |
| ~b8854–b8887 | `common/fit.h` (new) | `llama_params_fit` and `llama_memory_breakdown_print` removed from `include/llama.h`; now `common_fit_params` / `common_memory_breakdown_print` in new `common/fit.h`; not used directly by project |

## Build Commands

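A minimal C++ sketch of the `reasoning_budget` call-site update named in the new table rows above. The field names (`sampling.reasoning_budget_tokens`) come from the table itself; the wrapper function and the `common.h` include are illustrative assumptions, not code from this PR.

```cpp
#include "common.h"  // assumption: declares common_params with the fields named in the table

// Hedged sketch of the b8854 -> b8887 call-site update described in the table's "update" note.
static bool reasoning_enabled(const common_params &params) {
    // pre-b8887: return params.reasoning_budget != 0;
    return params.sampling.reasoning_budget_tokens != 0;  // budget moved into the sampling sub-struct
}
```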
19 changes: 15 additions & 4 deletions CMakeLists.txt
@@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE)
FetchContent_Declare(
llama.cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
GIT_TAG b8854
GIT_TAG b8887
)
FetchContent_MakeAvailable(llama.cpp)

@@ -208,10 +208,19 @@ if(NOT JNI_INCLUDE_DIRS)
endif()
endif()

add_library(jllama SHARED src/main/cpp/jllama.cpp src/main/cpp/server.hpp src/main/cpp/utils.hpp)
add_library(jllama SHARED
src/main/cpp/jllama.cpp
src/main/cpp/server.hpp
src/main/cpp/utils.hpp
${llama.cpp_SOURCE_DIR}/tools/server/server-common.cpp
${llama.cpp_SOURCE_DIR}/tools/server/server-chat.cpp)

set_target_properties(jllama PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(jllama PRIVATE src/main/cpp ${JNI_INCLUDE_DIRS} ${llama.cpp_SOURCE_DIR}/tools/mtmd)
target_include_directories(jllama PRIVATE
src/main/cpp
${JNI_INCLUDE_DIRS}
${llama.cpp_SOURCE_DIR}/tools/mtmd
${llama.cpp_SOURCE_DIR}/tools/server)
target_link_libraries(jllama PRIVATE llama-common mtmd llama nlohmann_json)
target_compile_features(jllama PRIVATE cxx_std_11)

@@ -258,14 +267,16 @@ if(BUILD_TESTING)
src/test/cpp/test_server.cpp
src/test/cpp/test_jni_helpers.cpp
src/test/cpp/test_json_helpers.cpp
)
${llama.cpp_SOURCE_DIR}/tools/server/server-common.cpp
${llama.cpp_SOURCE_DIR}/tools/server/server-chat.cpp)

target_include_directories(jllama_test PRIVATE
src/main/cpp
# mtmd.h is not always propagated transitively — add it explicitly
${llama.cpp_SOURCE_DIR}/tools/mtmd
# jni.h / jni_md.h needed by jni_helpers.hpp (mock JNI tests, no JVM required)
${JNI_INCLUDE_DIRS}
${llama.cpp_SOURCE_DIR}/tools/server
)
target_link_libraries(jllama_test PRIVATE llama-common mtmd llama nlohmann_json GTest::gtest_main)
target_compile_features(jllama_test PRIVATE cxx_std_17)
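The two upstream translation units added to both targets are what make the deletions in server.hpp below possible. A hedged C++ sketch of the symbols they are expected to provide; the declarations are assumed from the include names and call sites in this PR (including that the `json` alias now comes in through these headers, which the removal of the local `using json = nlohmann::ordered_json;` suggests) and are not verified against upstream.

```cpp
#include "server-common.h"  // assumed: error_type enum and format_error_response()
#include "server-chat.h"    // assumed: server_chat_msg_diff_to_json_oaicompat()

// With server-common.cpp and server-chat.cpp compiled into the jllama target,
// the bindings can call these helpers instead of keeping local copies.
static json example_error() {
    // same JSON shape the removed local format_error_response produced:
    // {"code": 400, "message": ..., "type": "invalid_request_error"}
    return format_error_response("prompt must not be empty", ERROR_TYPE_INVALID_REQUEST);
}

static json example_delta(const common_chat_msg_diff &diff) {
    // streaming chat delta in OpenAI-compatible form, as used in server.hpp below
    return server_chat_msg_diff_to_json_oaicompat(diff);
}
```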
2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
[![llama.cpp b8854](https://img.shields.io/badge/llama.cpp-%23b8854-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8854)
[![llama.cpp b8887](https://img.shields.io/badge/llama.cpp-%23b8887-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8887)

# Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)

22 changes: 11 additions & 11 deletions src/main/cpp/jllama.cpp
@@ -779,8 +779,8 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo

// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
common_chat_templates_source(ctx_server->chat_templates.get()).c_str(),
common_chat_format_example(ctx_server->chat_templates.get(), ctx_server->params_base.use_jinja, ctx_server->params_base.default_template_kwargs).c_str());
common_chat_templates_source(ctx_server->oai_parser_opt.tmpls.get()).c_str(),
common_chat_format_example(ctx_server->oai_parser_opt.tmpls.get(), ctx_server->params_base.use_jinja, ctx_server->params_base.default_template_kwargs).c_str());

ctx_server->queue_tasks.on_new_task(
std::bind(&server_context::process_single_task, ctx_server, std::placeholders::_1));
@@ -912,12 +912,12 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleRerank(JNIEnv *e
auto document_vector = std::vector<std::string>(document_array, document_array + amount_documents);
free_string_array(document_array, amount_documents);

std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server->vocab, document_vector, true, true);
std::vector<server_tokens> tokenized_docs = tokenize_input_prompts(ctx_server->vocab, nullptr, document_vector, true, true);

tasks.reserve(tokenized_docs.size());
for (size_t i = 0; i < tokenized_docs.size(); i++) {
append_task(ctx_server, tasks, SERVER_TASK_TYPE_RERANK,
format_rerank(ctx_server->vocab, tokenized_query, tokenized_docs[i]), i);
format_rerank(ctx_server->vocab, tokenized_query, tokenized_docs[i].get_tokens()), i);
}
std::vector<server_task_result_ptr> results;
if (!dispatch_and_collect(env, ctx_server, std::move(tasks), results)) return nullptr;
@@ -983,9 +983,9 @@ JNIEXPORT jintArray JNICALL Java_de_kherud_llama_LlamaModel_encode(JNIEnv *env,
static std::string detokenize(const server_context *ctx_server,
const std::vector<llama_token> &tokens) {
if (!ctx_server->is_vocab_only()) {
return tokens_to_str(ctx_server->ctx, tokens.cbegin(), tokens.cend());
return tokens_to_str(ctx_server->ctx, tokens);
}
return tokens_to_str(ctx_server->vocab, tokens.cbegin(), tokens.cend());
return tokens_to_str(ctx_server->vocab, tokens);
}

JNIEXPORT jbyteArray JNICALL Java_de_kherud_llama_LlamaModel_decodeBytes(JNIEnv *env, jobject obj,
@@ -1115,13 +1115,13 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleInfill(JNIEnv *e

// Format the infill prompt
std::string prompt = json_value(data, "prompt", std::string());
std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server->vocab, prompt, false, true);
std::vector<server_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server->vocab, nullptr, prompt, false, true);

data["prompt"] = format_infill(ctx_server->vocab, data.at("input_prefix"), data.at("input_suffix"),
data.at("input_extra"), ctx_server->params_base.n_batch,
ctx_server->params_base.n_predict, ctx_server->slots[0].n_ctx,
ctx_server->params_base.spm_infill,
tokenized_prompts.empty() ? llama_tokens() : tokenized_prompts[0]);
tokenized_prompts.empty() ? llama_tokens() : tokenized_prompts[0].get_tokens());

return dispatch_completion_and_serialize(env, ctx_server, data,
SERVER_TASK_TYPE_INFILL, OAICOMPAT_TYPE_NONE);
@@ -1155,10 +1155,10 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn
}
if (force_no_oaicompat) oaicompat = OAICOMPAT_TYPE_NONE;

std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server->vocab, prompt, true, true);
std::vector<server_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server->vocab, nullptr, prompt, true, true);

for (const auto &tokens : tokenized_prompts) {
if (tokens.empty()) {
if (tokens.get_tokens().empty()) {
env->ThrowNew(c_llama_error, "Input content cannot be empty");
return nullptr;
}
@@ -1168,7 +1168,7 @@ JNIEXPORT jstring JNICALL Java_de_kherud_llama_LlamaModel_handleEmbeddings(JNIEn
tasks.reserve(tokenized_prompts.size());

for (size_t i = 0; i < tokenized_prompts.size(); i++) {
append_task(ctx_server, tasks, SERVER_TASK_TYPE_EMBEDDING, tokenized_prompts[i], i, oaicompat);
append_task(ctx_server, tasks, SERVER_TASK_TYPE_EMBEDDING, tokenized_prompts[i].get_tokens(), i, oaicompat);
}

std::vector<server_task_result_ptr> results;
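The recurring pattern in the jllama.cpp changes above is the new `tokenize_input_prompts` shape: it now returns `std::vector<server_tokens>` and takes an extra second argument, passed as `nullptr` at every call site in this PR. A hedged migration sketch; the helper function is illustrative and the signature is inferred from these call sites, not from upstream headers.

```cpp
// Sketch only; mirrors the call-site updates above.
static llama_tokens first_prompt_as_raw_tokens(server_context *ctx_server, const json &prompt) {
    // pre-b8887:
    //   std::vector<llama_tokens> prompts = tokenize_input_prompts(ctx_server->vocab, prompt, true, true);
    std::vector<server_tokens> prompts =
        tokenize_input_prompts(ctx_server->vocab, nullptr, prompt, true, true);
    if (prompts.empty()) {
        return llama_tokens();
    }
    // server_tokens wraps the raw token list; get_tokens() exposes it as const llama_tokens &
    return prompts[0].get_tokens();
}
```

The same pattern appears in jni_helpers.hpp below, where each `server_tokens` is moved into `task.prompt_tokens` instead of being re-wrapped.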
6 changes: 3 additions & 3 deletions src/main/cpp/jni_helpers.hpp
@@ -219,16 +219,16 @@ struct jllama_context {
try {
const auto &prompt = data.at("prompt"); // throws before ctx_server is touched

std::vector<llama_tokens> tokenized_prompts =
tokenize_input_prompts(ctx_server->vocab, prompt, true, true);
std::vector<server_tokens> tokenized_prompts =
tokenize_input_prompts(ctx_server->vocab, nullptr, prompt, true, true);

tasks.reserve(tokenized_prompts.size());
for (size_t i = 0; i < tokenized_prompts.size(); i++) {
server_task task = server_task(task_type);
task.id = ctx_server->queue_tasks.get_new_id();
task.index = i;

task.prompt_tokens = server_tokens(tokenized_prompts[i], false);
task.prompt_tokens = std::move(tokenized_prompts[i]);
task.params = server_task::params_from_json_cmpl(
ctx_server->ctx, ctx_server->params_base, data);
task.id_selected_slot = json_value(data, "id_slot", -1);
2 changes: 1 addition & 1 deletion src/main/cpp/json_helpers.hpp
@@ -121,7 +121,7 @@
responses.push_back(result->to_json());
}
if (oaicompat == OAICOMPAT_TYPE_EMBEDDING) {
return format_embeddings_response_oaicompat(body, responses, use_base64);
return format_embeddings_response_oaicompat(body, json_value(body, "model", std::string(DEFAULT_OAICOMPAT_MODEL)), responses, use_base64);
}
return responses;
}
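The embeddings helper now receives the model name as an explicit argument. A brief sketch of the updated call, mirroring the one-line change above; the wrapper function is illustrative, and the argument order is taken from the diff rather than from the upstream declaration.

```cpp
// Sketch only; same call as in json_helpers.hpp above, lifted into a helper.
static json embeddings_to_oai(const json &body, const json &responses, bool use_base64) {
    const std::string model = json_value(body, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
    return format_embeddings_response_oaicompat(body, model, responses, use_base64);
}
```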
96 changes: 24 additions & 72 deletions src/main/cpp/server.hpp
@@ -1,4 +1,5 @@
#include "chat.h"
#include "server-chat.h"
#include "utils.hpp"

#include "arg.h"
@@ -25,8 +26,6 @@
#include <unordered_map>
#include <unordered_set>

using json = nlohmann::ordered_json;

constexpr int HTTP_POLLING_SECONDS = 1;

enum stop_type {
@@ -72,16 +71,7 @@ enum oaicompat_type {
OAICOMPAT_TYPE_EMBEDDING,
};

// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type {
ERROR_TYPE_INVALID_REQUEST,
ERROR_TYPE_AUTHENTICATION,
ERROR_TYPE_SERVER,
ERROR_TYPE_NOT_FOUND,
ERROR_TYPE_PERMISSION,
ERROR_TYPE_UNAVAILABLE, // custom error
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
// error_type enum provided by server-common.h (via utils.hpp)

static bool server_task_type_need_embd(server_task_type task_type) {
switch (task_type) {
@@ -614,6 +604,7 @@ inline std::string oaicompat_finish_reason(stop_type stop, bool has_tool_calls =
return "length";
}


struct completion_token_output {
llama_token tok;
float prob;
@@ -821,7 +812,7 @@ struct server_task_result_cmpl_final : server_task_result {
json{
{"finish_reason", nullptr},
{"index", index},
{"delta", common_chat_msg_diff_to_json_oaicompat(diff)},
{"delta", server_chat_msg_diff_to_json_oaicompat(diff)},
},
})},
{"created", t},
@@ -989,7 +980,7 @@ struct server_task_result_cmpl_partial : server_task_result {
}

for (const auto &diff : oaicompat_msg_diffs) {
add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
add_delta(server_chat_msg_diff_to_json_oaicompat(diff));
}

if (!deltas.empty()) {
@@ -1058,46 +1049,7 @@ struct server_task_result_rerank : server_task_result {
}
};

// this function maybe used outside of server_task_result_error
static json format_error_response(const std::string &message, const enum error_type type) {
std::string type_str;
int code = 500;
switch (type) {
case ERROR_TYPE_INVALID_REQUEST:
type_str = "invalid_request_error";
code = 400;
break;
case ERROR_TYPE_AUTHENTICATION:
type_str = "authentication_error";
code = 401;
break;
case ERROR_TYPE_NOT_FOUND:
type_str = "not_found_error";
code = 404;
break;
case ERROR_TYPE_SERVER:
type_str = "server_error";
code = 500;
break;
case ERROR_TYPE_PERMISSION:
type_str = "permission_error";
code = 403;
break;
case ERROR_TYPE_NOT_SUPPORTED:
type_str = "not_supported_error";
code = 501;
break;
case ERROR_TYPE_UNAVAILABLE:
type_str = "unavailable_error";
code = 503;
break;
}
return json{
{"code", code},
{"message", message},
{"type", type_str},
};
}
// format_error_response is provided by server-common.h / server-common.cpp

struct server_task_result_error : server_task_result {
int index = 0;
@@ -1848,8 +1800,7 @@ struct server_context {
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;

common_chat_templates_ptr chat_templates;
oaicompat_parser_options oai_parser_opt;
server_chat_params oai_parser_opt;

// Returns true when the model was loaded in vocab-only mode:
// the vocabulary is available but no inference context was created.
@@ -1955,15 +1906,15 @@ struct server_context {
params_base.speculative.cparams_dft = common_context_params_to_llama(params_dft);
}

chat_templates = common_chat_templates_init(model, params_base.chat_template);
oai_parser_opt.tmpls = common_chat_templates_init(model, params_base.chat_template);
try {
common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
common_chat_format_example(oai_parser_opt.tmpls.get(), params.use_jinja, params.default_template_kwargs);
} catch (const std::exception &e) {
SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. "
"This may cause the model to output suboptimal responses\n",
__func__);
chat_templates = common_chat_templates_init(model, "chatml");
oai_parser_opt.tmpls = common_chat_templates_init(model, "chatml");
}

std::string &mmproj_path = params_base.mmproj.path;
@@ -2058,15 +2009,14 @@ struct server_context {

metrics.init();

oai_parser_opt = {
/* use_jinja */ params_base.use_jinja,
/* prefill_assistant */ params_base.prefill_assistant,
/* reasoning_format */ params_base.reasoning_format,
/* common_chat_templates */ chat_templates.get(),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio(mctx) : false,
/* enable_thinking */ params_base.reasoning_budget != 0,
};
oai_parser_opt.use_jinja = params_base.use_jinja;
oai_parser_opt.prefill_assistant = params_base.prefill_assistant;
oai_parser_opt.reasoning_format = params_base.reasoning_format;
oai_parser_opt.allow_image = mctx ? mtmd_support_vision(mctx) : false;
oai_parser_opt.allow_audio = mctx ? mtmd_support_audio(mctx) : false;
oai_parser_opt.enable_thinking = params_base.enable_reasoning != 0 &&
params_base.use_jinja &&
common_chat_templates_support_enable_thinking(oai_parser_opt.tmpls.get());
}

server_slot *get_slot_by_id(int id) {
@@ -3267,9 +3217,11 @@ struct server_context {
// check if we should process the image
if (slot.n_past < slot.n_prompt_tokens && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
// process the image
int32_t new_n_past;
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
int32_t n_pos = new_n_past - slot.n_past;
size_t n_tokens_out;
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, static_cast<size_t>(slot.n_past),
static_cast<llama_pos>(slot.n_past),
slot.id, n_tokens_out);
int32_t n_pos = static_cast<int32_t>(n_tokens_out);

if (res != 0) {
SLT_ERR(slot, "failed to process image, res = %d\n", res);
Expand All @@ -3280,7 +3232,7 @@ struct server_context {

// add the image chunk to cache
{
const auto &chunk = slot.prompt_tokens.find_chunk(slot.n_past);
const auto &chunk = slot.prompt_tokens.find_chunk(static_cast<size_t>(slot.n_past));
slot.cache_tokens.push_back(chunk.get()); // copy
}

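The last server.hpp hunk reworks the multimodal `process_chunk()` call: the helper now takes the chunk's index and decode position separately (both `slot.n_past` at this call site) and reports how many tokens the chunk covered, instead of returning a new `n_past`. A hedged, annotated sketch of the same call; the parameter meanings are inferred only from the casts at this call site, and the wrapper function is illustrative.

```cpp
// Sketch only; mirrors the call site in the hunk above.
static int32_t process_image_chunk(server_slot &slot, llama_context *ctx, mtmd_context *mctx) {
    size_t n_tokens_out = 0;
    const int32_t res = slot.prompt_tokens.process_chunk(
        ctx, mctx,
        static_cast<size_t>(slot.n_past),     // index into the prompt token list (inferred)
        static_cast<llama_pos>(slot.n_past),  // decode position of the chunk (inferred)
        slot.id,
        n_tokens_out);                        // out: tokens covered by the image chunk
    if (res != 0) {
        return -1;  // caller logs the failure and releases the slot, as in the hunk above
    }
    // pre-b8887 equivalent: n_pos = new_n_past - slot.n_past
    return static_cast<int32_t>(n_tokens_out);
}
```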