diff --git a/CLAUDE.md b/CLAUDE.md index e7accd52..56b75e38 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,7 +6,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co Java bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) via JNI, providing a high-level API for LLM inference in Java. The Java layer communicates with a native C++ library through JNI. -Current llama.cpp pinned version: **b8841** +Current llama.cpp pinned version: **b8854** ## Upgrading CUDA Version @@ -156,6 +156,9 @@ Also review the project `CMakeLists.txt` for build-system-level breaks (e.g. ren | ~b8808–b8831 | `common/common.h` → new `common/build-info.h` | `build_info` `std::string` removed; replaced by `llama_build_info()` (`const char*`) in new `build-info.h`; add `#include "build-info.h"` in `server.hpp` and `utils.hpp`; call sites: `std::string(llama_build_info())` in `server.hpp` (6×), `llama_build_info()` in `jllama.cpp` (1×) and `utils.hpp` (1×) | | ~b8808–b8831 | `ggml/src/ggml.c` | New `ggml_graph_next_uid()` calls `_InterlockedIncrement64` via `` on x86; intrinsic unavailable on 32-bit MSVC; fix: `src/main/cpp/compat/ggml_x86_compat.c` provides `__cdecl _InterlockedIncrement64` via `InterlockedIncrement64` (CMPXCHG8B), added to `ggml-base` via `target_sources` guarded by `MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 4` | | ~b8838–b8841 | `src/llama-model.h` | Attention bias fields renamed: `bq`→`wq_b`, `bk`→`wk_b`, `bv`→`wv_b`, `bo`→`wo_b`, `bqkv`→`wqkv_b`; internal to llama.cpp, no impact on this project | +| ~b8841–b8854 | `common/common.h` | `common_params::clear_idle` renamed to `cache_idle_slots`; new `common_context_seq_rm_type` enum + `common_context_can_seq_rm()` replacing `common_speculative_is_compat()`; `get_model_endpoint()` → `common_get_model_endpoint()` | +| ~b8841–b8854 | `tools/mtmd/mtmd.h` + `mtmd-helper.h` | `mtmd_decoder_pos` gains `z` field; `mtmd_image_tokens_get_decoder_pos()` + `mtmd_helper_image_get_decoder_pos()` gain new 
`pos_0` parameter | +| ~b8841–b8854 | project `utils.hpp` / `server.hpp` | `server_tokens::get_text_tokens()` split: `get_tokens()` returns raw `const llama_tokens &`; new `get_text_tokens()` returns filtered copy (removes `LLAMA_TOKEN_NULL` mtmd placeholders); save/load and context-shift call sites updated to `get_tokens()` | ## Build Commands diff --git a/CMakeLists.txt b/CMakeLists.txt index e10ad6fe..c5ebdd14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ set(GGML_AVX512 OFF CACHE BOOL "" FORCE) FetchContent_Declare( llama.cpp GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b8841 + GIT_TAG b8854 ) FetchContent_MakeAvailable(llama.cpp) diff --git a/README.md b/README.md index 66c84a66..1dcf44a8 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ ![Java 8+](https://img.shields.io/badge/Java-8%2B-informational) -[![llama.cpp b8841](https://img.shields.io/badge/llama.cpp-%23b8841-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8841) +[![llama.cpp b8854](https://img.shields.io/badge/llama.cpp-%23b8854-informational)](https://github.com/ggml-org/llama.cpp/releases/tag/b8854) # Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) diff --git a/pom.xml b/pom.xml index d84cfd2c..7d6b0925 100644 --- a/pom.xml +++ b/pom.xml @@ -68,7 +68,7 @@ com.fasterxml.jackson.core jackson-databind - 2.19.0 + 2.21.2 diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp index 31013c0c..d3c19f06 100644 --- a/src/main/cpp/server.hpp +++ b/src/main/cpp/server.hpp @@ -2799,7 +2799,7 @@ struct server_context { std::string filename = task.slot_action.filename; std::string filepath = task.slot_action.filepath; - const llama_tokens &tokens = slot->cache_tokens.get_text_tokens(); + const llama_tokens &tokens = slot->cache_tokens.get_tokens(); const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count); @@ -2962,7 +2962,7 @@ struct server_context { // add 
generated tokens to cache { - llama_tokens new_tokens = slot.cache_tokens.get_text_tokens(); // copy + llama_tokens new_tokens = slot.cache_tokens.get_tokens(); // copy for (size_t i = n_keep + n_discard; i < new_tokens.size(); i++) { new_tokens[i - n_discard] = new_tokens[i]; } @@ -3123,7 +3123,7 @@ struct server_context { const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; - const llama_tokens &curr_tokens = slot.prompt_tokens.get_text_tokens(); + const llama_tokens &curr_tokens = slot.prompt_tokens.get_tokens(); llama_tokens new_tokens(curr_tokens.begin(), curr_tokens.begin() + slot.params.n_keep); new_tokens.insert(new_tokens.end(), diff --git a/src/main/cpp/utils.hpp b/src/main/cpp/utils.hpp index 4dfc7bb0..0b60fe52 100644 --- a/src/main/cpp/utils.hpp +++ b/src/main/cpp/utils.hpp @@ -1216,11 +1216,23 @@ struct server_tokens { } // for compatibility with speculative decoding, ctx shift, slot save/load - const llama_tokens &get_text_tokens() const { + const llama_tokens &get_tokens() const { GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled return tokens; } + // returns a copy with LLAMA_TOKEN_NULL entries (mtmd image placeholders) filtered out; unlike get_tokens(), has no !has_mtmd assert, so it is the variant to use when mtmd is enabled + llama_tokens get_text_tokens() const { + llama_tokens res; + res.reserve(tokens.size()); + for (llama_token t : tokens) { + if (t != LLAMA_TOKEN_NULL) { + res.push_back(t); + } + } + return res; + } + // for compatibility with speculative decoding void set_token(llama_pos pos, llama_token id) { GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled