Merged
5 changes: 4 additions & 1 deletion CLAUDE.md
@@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co

Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI.

-Current llama.cpp pinned version: **b8841**
+Current llama.cpp pinned version: **b8854**

## Upgrading CUDA Version

@@ -156,6 +156,9 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren
| ~b8808–b8831 | `common/common.h` → new `common/build-info.h` | `build_info` `std::string` removed; replaced by `llama_build_info()` (`const char*`) in new `build-info.h`; add `#include "build-info.h"` in `server.hpp` and `utils.hpp`; call sites: `std::string(llama_build_info())` in `server.hpp` (6×), `llama_build_info()` in `jllama.cpp` (1×) and `utils.hpp` (1×) |
| ~b8808–b8831 | `ggml/src/ggml.c` | New `ggml_graph_next_uid()` calls `_InterlockedIncrement64` via `<intrin.h>` on x86; intrinsic unavailable on 32-bit MSVC; fix: `src/main/cpp/compat/ggml_x86_compat.c` provides `__cdecl _InterlockedIncrement64` via `InterlockedIncrement64` (CMPXCHG8B), added to `ggml-base` via `target_sources` guarded by `MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 4` |
| ~b8838–b8841 | `src/llama-model.h` | Attention bias fields renamed: `bq`→`wq_b`, `bk`→`wk_b`, `bv`→`wv_b`, `bo`→`wo_b`, `bqkv`→`wqkv_b`; internal to llama.cpp, no impact on this project |
+| ~b8841–b8854 | `common/common.h` | `common_params::clear_idle` renamed to `cache_idle_slots`; new `common_context_seq_rm_type` enum + `common_context_can_seq_rm()` replacing `common_speculative_is_compat()`; `get_model_endpoint()` → `common_get_model_endpoint()` |
+| ~b8841–b8854 | `tools/mtmd/mtmd.h` + `mtmd-helper.h` | `mtmd_decoder_pos` gains `z` field; `mtmd_image_tokens_get_decoder_pos()` + `mtmd_helper_image_get_decoder_pos()` gain new `pos_0` parameter |
+| ~b8841–b8854 | project `utils.hpp` / `server.hpp` | `server_tokens::get_text_tokens()` split: `get_tokens()` returns raw `const llama_tokens &`; new `get_text_tokens()` returns filtered copy (removes `LLAMA_TOKEN_NULL` mtmd placeholders); save/load and context-shift call sites updated to `get_tokens()` |

## Build Commands

2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE)
FetchContent_Declare(
llama.cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-  GIT_TAG b8841
+  GIT_TAG b8854
)
FetchContent_MakeAvailable(llama.cpp)

2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
![Java 8+](https://img.shields.io/badge/Java-8%2B-informational)
-[![llama.cpp b8841](https://img.shields.io/badge/llama.cpp-%23b8841-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8841)
+[![llama.cpp b8854](https://img.shields.io/badge/llama.cpp-%23b8854-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8854)

# Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)

2 changes: 1 addition & 1 deletion pom.xml
@@ -68,7 +68,7 @@
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
-  <version>2.19.0</version>
+  <version>2.21.2</version>
</dependency>
</dependencies>

6 changes: 3 additions & 3 deletions src/main/cpp/server.hpp
@@ -2799,7 +2799,7 @@ struct server_context {
std::string filename = task.slot_action.filename;
std::string filepath = task.slot_action.filepath;

-    const llama_tokens &tokens = slot->cache_tokens.get_text_tokens();
+    const llama_tokens &tokens = slot->cache_tokens.get_tokens();
const size_t nwrite =
llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);

@@ -2962,7 +2962,7 @@ struct server_context {

// add generated tokens to cache
{
-        llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy
+        llama_tokens new_tokens = slot.cache_tokens.get_tokens(); // copy
for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) {
new_tokens[i - n_discard] = new_tokens[i];
}
@@ -3123,7 +3123,7 @@ struct server_context {
const int erased_blocks =
(slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;

-    const llama_tokens &curr_tokens = slot.prompt_tokens.get_text_tokens();
+    const llama_tokens &curr_tokens = slot.prompt_tokens.get_tokens();
llama_tokens new_tokens(curr_tokens.begin(), curr_tokens.begin() + slot.params.n_keep);

new_tokens.insert(new_tokens.end(),
14 changes: 13 additions & 1 deletion src/main/cpp/utils.hpp
@@ -1216,11 +1216,23 @@ struct server_tokens {
}

// for compatibility with speculative decoding, ctx shift, slot save/load
-  const llama_tokens &get_text_tokens() const {
+  const llama_tokens &get_tokens() const {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
return tokens;
}

+  // returns a copy with LLAMA_TOKEN_NULL entries filtered out (mtmd image placeholders)
+  llama_tokens get_text_tokens() const {
+    llama_tokens res;
+    res.reserve(tokens.size());
+    for (llama_token t : tokens) {
+      if (t != LLAMA_TOKEN_NULL) {
+        res.push_back(t);
+      }
+    }
+    return res;
+  }

// for compatibility with speculative decoding
void set_token(llama_pos pos, llama_token id) {
GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled