Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
594 commits
Select commit Hold shift + click to select a range
7623de1
tests: test-backend-ops -j <N> to run tests in parallel (#23637)
jeffbolznv May 26, 2026
581d020
SYCL: implement ggml_sycl_pool_vmm (#22862)
sanmai May 26, 2026
6fe90de
models : Attach Mistral3 NVFP4 weight scales (#23629)
michaelw9999 May 26, 2026
dbe9c0c
convert : support Gemma4ForCausalLM architecture (#23682)
aoleg May 26, 2026
3dc7684
ci : reduce (disable SYCL and CANN builds/releases) (#23705)
ggerganov May 26, 2026
ef41a69
ci : move sanitizer jobs to self-hosted runners (#23713)
ggerganov May 26, 2026
678d43d
ci : move more CPU jobs to self-hosted runners (#23715)
ggerganov May 26, 2026
ef66bfa
hexagon: add support for CONCAT op (#23648)
max-krasnyansky May 26, 2026
3a3ed15
ci : remove vulkan SDK dep from webgpu job (#23718)
ggerganov May 26, 2026
7799d31
vulkan: optimize conv2d and implement coopmat1 support (#22620)
jeffbolznv May 26, 2026
5190c2e
ci : move macos jobs to the apple workflow + fix names (#23721)
ggerganov May 26, 2026
35a74c8
ci : add `[no release]` keyword + fix sanitizer builds (#23728)
ggerganov May 26, 2026
08bc21b
ci : move [no release] check to dedicated check_release job (#23734)
ggerganov May 26, 2026
0d18aaa
ci : do not allocate ccache for 3rd-party hosted runners (#23730)
ggerganov May 26, 2026
b4c0549
ggml-zendnn : fixed naming of matmul function (#20964)
truecoder34 May 26, 2026
7085492
server : fix the log message when using SSL (#23393)
rgerganov May 27, 2026
9777256
convert: add MiniCPM5 tokenizer support (#23384)
zhangtao2-1 May 27, 2026
1d971bb
docs : fix duplicated "the" in granitevision and model-conversion doc…
quyentonndbs May 27, 2026
0d227ec
ci : add ccache to server builds + fix undefined sanitizer build (#23…
ggerganov May 27, 2026
4d8cc0c
vulkan: avoid preferring transfer queue on AMD UMA devices (#22455)
winstonma May 27, 2026
b3a739c
ci : remove wasm test (#23733)
CISC May 27, 2026
9f0e4b1
ci : fix windows ccaches (#23777)
ggerganov May 27, 2026
6b4e4bd
common : fix env names to all have LLAMA_ARG_ prefix (#23778)
ggerganov May 27, 2026
2d0656f
ci : bump cuda release to 13.3 (#23749)
CISC May 27, 2026
fda8528
CUDA: restrict PDL to CTK >= 12.3 due to MSVC issues (#23742)
ORippler May 27, 2026
87b0a60
pyproject : add conversion folder and update dependencies (#23746)
CISC May 27, 2026
617255d
vendor : update cpp-httplib to 0.46.0 (#23650)
cabelo May 27, 2026
ba4dd0b
ci : move ARM jobs to self-hosted + disable kleidiai mac release (#23…
ggerganov May 27, 2026
837bb6b
vulkan: add REPEAT op support for f16 to f16. (#23298)
l8bloom May 27, 2026
b36eefc
vulkan: use GL_NV_cooperative_matrix_decode_vector for faster matmul …
jeffbolznv May 27, 2026
c6e4088
vulkan: Switch MUL_MAT_VEC to 4 K per iteration for F16/32 (#22887)
TheBlueMatt May 27, 2026
c40006a
ggml-webgpu: Fix how to dispatch WG to some ops (#23750)
yomaytk May 27, 2026
aa50b2c
hexagon: add support for Q4_1 in MUL_MAT and MUL_MAT_ID (#23647)
max-krasnyansky May 27, 2026
f12cc6d
ggml-webgpu: remove legacy constants (#23672)
reeselevine May 27, 2026
8ad8aef
opencl: OP_GATED_DELTA_NET (#23312)
ymcki May 28, 2026
939a7dd
Hexagon: OP_GATED_DELTA_NET K>1 support (#23531)
ymcki May 28, 2026
491c4d7
ci : refactor (#23789)
ggerganov May 28, 2026
e31cdaa
ggml: fixed Arm SVE usage bug in vec.h, vec.cpp (#22841)
martin-klacer-arm May 28, 2026
c522908
convert : add FP8 to Q8 conversion (#23250)
ynankani May 28, 2026
48e7eae
perplexity : fix format specifier in LOG_ERR (#23788)
angt May 28, 2026
09e7b76
cuda : fix KQ mask offset integer overflow in fattn MMA kernel (#23610)
fairydreaming May 28, 2026
e8d2567
docker : add ZenDNN Dockerfile (#23716)
z-sachin May 28, 2026
d205df6
server, ui : Add support for HTTP ETags in llama-server (#23701)
mtavenrath May 28, 2026
91eb8f4
vulkan: Fix memory logger unsafe iterator access (#23667)
winstonma May 28, 2026
7c48fb8
vulkan: fix wrong index variable in inner loop (#23665)
winstonma May 28, 2026
bb771cb
chat : add Granite 4.1 chat template (#23518)
jesus-talavera-ibm May 28, 2026
48e7078
vulkan: fast path for walsh-hadamard transform (#23687)
jeffbolznv May 28, 2026
a919001
hexagon: minor refresh for HMX FA and MM (#23796)
max-krasnyansky May 28, 2026
0b24686
server: minor tweaks to use more cpp features (#23785)
mfuntowicz May 28, 2026
bc81d47
CUDA: route batch>=4 quantized matmul to MMQ on AMD MFMA hardware (#2…
jadenmach2 May 28, 2026
d7be461
mmvq Optim: add MMVQ_PARAMETERS_TURING(mmvq_parameter_table_id) for …
yaohengxu May 28, 2026
30af6e2
ggml: auto apply iGPU flag CUDA/HIP if integrated device (#23007)
fl0rianr May 28, 2026
d374e71
test-llama-archs: fix table format [no release] (#23810)
JohannesGaessler May 28, 2026
7fb1e70
arg: Add LLAMA_ARG_API_KEY_FILE environment variable for --api-key-fi…
kucharskim May 28, 2026
dd15579
ci : change Vulkan builds to Release to reduce ccache (#23820)
ggerganov May 28, 2026
d6be315
mtmd: fix gemma 4 audio rms norm eps (#23815)
ngxson May 28, 2026
0b56d28
mtmd: n_head_kv defaults to n_head (#23782)
sfallah May 28, 2026
479a9a1
app : improve help output (#23805)
angt May 28, 2026
445b7ce
ci : releases use Github-hosted builds for the UI (#23823)
ggerganov May 28, 2026
2f6c815
ui: fix audio and video modality detection (#23756)
ValdikSS May 28, 2026
3ef2369
ci : run ui publish on ubuntu-slim (#23818)
CISC May 28, 2026
408ae2b
opencl: move backend info printing into its own function (#23702)
lhez May 28, 2026
c8914ad
mtmd: fix gemma 4 projector pre_norm (#23822)
ngxson May 28, 2026
751ebd1
mtmd-debug: add color and rainbow mode (#23829)
ngxson May 28, 2026
19e92c3
hexagon: basic/generic op fusion support and RMS_NORM+MUL fusion (#23…
max-krasnyansky May 28, 2026
33c718d
meta : Add missing `buffer` set in allreduce fallback !COMPUTE clear …
TheBlueMatt May 29, 2026
241cbd4
cuda : disables launch_fattn PDL enrollment due to compiler bug (#23825)
aendk May 29, 2026
98e480a
app : move licences to llama-app (#23824)
angt May 29, 2026
eef59a7
llama: add llm_graph_input_mtp (#23643)
am17an May 29, 2026
b000431
ngram-mod : Add missing include (#23857)
oazizi000 May 29, 2026
ea02bc3
ggml : bump version to 0.13.1 (ggml/1523)
ggerganov May 29, 2026
fe12e42
sync : ggml
ggerganov May 29, 2026
031ddb2
llama: use f16 mask for FA to save VRAM (#23764)
am17an May 29, 2026
1f0aa2a
model : support for DeepseekV32ForCausalLM with generic DeepSeek Spar…
fairydreaming May 29, 2026
cb47092
server: bump timeout to 3600s (#23842)
ngxson May 29, 2026
6ed481e
CUDA: Check PTX version on host side to guard PDL dispatch (#23530)
ORippler May 29, 2026
da3f990
mtmd: Add DeepSeekOCR 2 Support (#20975)
sfallah May 29, 2026
06d26df
download: add option to skip_download (#23059)
ngxson May 29, 2026
dc71236
ci : update macos release to use macos-26 runner (#23878)
ggerganov May 29, 2026
b5f5228
server: remove obsolete scripts (#23870)
ngxson May 29, 2026
764f1e6
graph : ensure DS32 kq_mask_lid is F32 (#23864)
CISC May 29, 2026
2084434
vocab : support tokenizer for LFM2.5-8B-A1B (#23826)
tdakhran May 29, 2026
22d66b5
ui: handle audio/vnd.wave as audio WAV file (#23754)
ValdikSS May 29, 2026
5a46b46
app: add llama update self updater (#23865)
ServeurpersoCom May 29, 2026
689a9a4
server-bench : add speed-bench for speculative decoding benchmarking …
ruixiang63 May 29, 2026
b22da25
ggml-webgpu: add q4_0/q8_0 SET_ROWS (#23760)
reeselevine May 29, 2026
151f3a9
ggml-webgpu: Check earlier for WebGPU required features (#23879)
reeselevine May 29, 2026
0821c5f
server: in SSE mode, send HTTP headers when slot starts (#23884)
ngxson May 29, 2026
1738129
llama : do not skip iGPU when only RPC devices are present (#23868)
rgerganov May 30, 2026
d4204b0
ci : clear cache instead of "no timestamp" keys + fix macos (#23895)
ggerganov May 30, 2026
3375285
ci : fix s390x release job (#23898)
ggerganov May 30, 2026
6e093b8
vulkan: add Flash Attention support for BFloat16 KV cache (#23420)
0cc4m May 30, 2026
d48a56e
ggml : add some lsx support (#23798)
MQ-mengqing May 30, 2026
4c4e91b
ci : update ios-xcode release job to macos-26 (#23906)
ggerganov May 30, 2026
e674b12
test: (test-llama-archs) log the config name first (#23885)
ngxson May 30, 2026
2d9b7c8
metal : restore im2col implementation for large kernels (#23901)
ggerganov May 30, 2026
8b0e0db
TP: fix granularity for Qwen 3.5/3.6 + 3 GPUs (#23843)
JohannesGaessler May 30, 2026
d38d50e
ui: exclude generated build dirs from prettier and eslint so lint err…
ServeurpersoCom May 30, 2026
d6588da
opencl: support bf16 by converting to f16 (#23839)
lhez May 30, 2026
aa46bda
Support `-fa auto` in llama-bench (#23714)
gaugarg-nv May 30, 2026
d749821
webui: add custom CSS injection via config (#23904)
ServeurpersoCom May 30, 2026
22cadc1
llama: only use one iGPU device by default (#23897)
0cc4m May 31, 2026
e6123e2
docs : update ZenDNN docs for Q8 support (#23791)
truecoder34 May 31, 2026
3292da0
ui: fix ETag truncation with MSVC compiler (#23917)
EZForever May 31, 2026
d4c8e2c
vocab : add tokenizer support for jina-embeddings-v2-base-zh (#18756)
o7si May 31, 2026
399739d
ci : limit trigger paths for the CPU workflow (#23938)
ggerganov May 31, 2026
6f165c1
server : handle If-None-Match weak ETags (#23916)
EZForever May 31, 2026
af6528e
ci: remove redundant or duplicate jobs (#23927)
netrunnereve Jun 1, 2026
44e211c
sycl : Optimize Q3_K mul_mat by reorder (#23725)
arthw Jun 1, 2026
4162522
[SYCL] Add more types in GET_ROWS OP (#23710)
arthw Jun 1, 2026
a511424
[SYCL] Support Q4_1, Q5_0, Q5_1 in Flash-attention (#23812)
arthw Jun 1, 2026
e22b0de
ci : add missing Linux label to cpu-x64-high-perf runner (#23958)
ggerganov Jun 1, 2026
5254a79
common : support manually triggering the reasoning budget end sequenc…
aldehir Jun 1, 2026
f8c0a19
vulkan: Removed unused functions (#23175)
winstonma Jun 1, 2026
1962000
vulkan: Block-load Q3_K/Q6_K block data and subtract on 32b ints (#23…
TheBlueMatt Jun 1, 2026
48b88c3
model: Add EXAONE 4.5 implementations (#21733)
nuxlear Jun 1, 2026
02a5701
security : disable private disclosures (#23963)
ggerganov Jun 1, 2026
8e6fff8
TP: quantized KV cache support (#23792)
JohannesGaessler Jun 1, 2026
5aba536
vocab: add normalizer.lowercase support to WPM (#23899)
o7si Jun 1, 2026
bef69f1
vulkan: reduce host memory lock contention (#23376)
winstonma Jun 1, 2026
55ac090
vulkan: don't hold the device mutex while compiling pipelines (#23641)
jeffbolznv Jun 1, 2026
95b8b8e
metal: template GLU kernels to support f16/f32 (#23882)
shrivasshankar Jun 1, 2026
de6f727
llama: limit max outputs of `llama_context` (#23861)
am17an Jun 1, 2026
335abed
vendor : update cpp-httplib to 0.46.1 (#23980)
angt Jun 1, 2026
27d9ed8
opencl: add basic support for q5_0 and q5_1 (#23548)
shaofeiqi Jun 1, 2026
5aa3a64
nix : add nix-nodejs facilities to build Web UI (#23846)
choener Jun 1, 2026
5dcb711
speculative : fix n_outputs_max and remove draft-simple auto-enable (…
ggerganov Jun 1, 2026
b8275a8
revert to using global_invocation_id for cpy shader (#23955)
yomaytk Jun 1, 2026
210a657
opencl: fix compiler warnings for non-adreno path (#23922)
lhez Jun 2, 2026
1fd5f48
clean up unused variables warnings (#23975)
anavp-nvidia Jun 2, 2026
354ebac
server: real-time reasoning interruption via control endpoint (#23971)
ServeurpersoCom Jun 2, 2026
d178a11
hexagon: add gelu_quick (#24007)
tboinovski1 Jun 2, 2026
8f7f3bf
hexagon: MUL_MAT, MUL_MAT_ID, FLASH_ATTN and GDN cleanup and optimiza…
max-krasnyansky Jun 2, 2026
4f3a4be
llama : deprecate `llama_set_warmup` (#24009)
ggerganov Jun 2, 2026
f7a0777
convert : support Step3.7-Flash (#23845)
forforever73 Jun 2, 2026
2365315
kv-cache : SWA checkpoints store only non-masked cells (#23981)
ggerganov Jun 2, 2026
f8e67fc
ui: Add Thinking mode toggle with reasoning effort levels + improveme…
allozaur Jun 2, 2026
69cea5b
ui: simplify network error handling (#23431)
socram8888 Jun 2, 2026
d5ab083
docs : update HOWTO-add-model.md (#23883)
Xarbirus Jun 2, 2026
a468b89
ci : reduce self-hosted server workflow jobs (#24012)
ggerganov Jun 2, 2026
60130d1
server: add SSE ping interval (#24013)
ngxson Jun 2, 2026
0b71540
common : fix state save in common_prompt_batch_decode (#23468)
danbev Jun 2, 2026
2187e00
StepFun 3.5 MTP (#23274)
pwilkin Jun 2, 2026
bfb4308
model : support granite multilingual embeddings R2 (ibm-granite/grani…
hansolosan Jun 2, 2026
4fb16ec
model: add Mellum architecture (#23966)
Xarbirus Jun 2, 2026
5c394fd
hexagon: profiler output fix and script updates (#24042)
max-krasnyansky Jun 2, 2026
63e66fd
opencl: use flat variants of q4_K and q6_K gemv for very large M (#24…
lhez Jun 2, 2026
e366626
arg : removed unecesary mmproj download when users pass --no-mmproj (…
ryan-mangeno Jun 3, 2026
4da6370
ci : disable ccache for msvc windows release jobs (#23911)
ggerganov Jun 3, 2026
d545a2a
update BoringSSL to 0.20260526.0 (#23794)
cabelo Jun 3, 2026
06938ac
tests : add support for qwen3 SSM archs (#24031)
ggerganov Jun 3, 2026
f8f0a47
cuda: reserve space for quantize kv-cache at startup (#23907)
am17an Jun 3, 2026
3571fa5
ggml-cpu: use runtime SVE width in FWHT (#24059)
chaxu01 Jun 3, 2026
9e58d4d
Avoid PDL race conditions by disabling __restrict__ when PDL is used …
aendk Jun 3, 2026
ee4cf70
ui: Mermaid Diagrams in chat + interactive preview (#24032)
allozaur Jun 3, 2026
a731805
mtmd, model: allow skip build_vit() (#24077)
ngxson Jun 3, 2026
c8d6a00
mtmd: enable non-causal vision for gemma 4 unified (#24082)
ngxson Jun 3, 2026
166fe29
qwen35: use post-norm hidden state for MTP (#24025)
am17an Jun 3, 2026
94a220c
mtmd: fix Gemma 4 unified FPE (#24088)
abetlen Jun 3, 2026
f478f1b
sycl : Improve SYCL doc (#23025)
malsbat Jun 4, 2026
3c7450c
ggml-cpu: extend RVV quantization vec dot to higher VLENs (#22754)
rehan-10xengineer Jun 4, 2026
e8c5489
ggml-webgpu: FlashAttention refactor + standardize quantization suppo…
reeselevine Jun 4, 2026
3d19986
metal : reduce rset heartbeat from 500ms -> 5ms (#24074)
ggerganov Jun 4, 2026
65ef50a
tests : refactor test-save-load-state to accept token input (#24073)
ggerganov Jun 4, 2026
6ddc943
readme : add status badges (#24104)
ggerganov Jun 4, 2026
e3ba22d
fix(mtmd): handle Gemma 4 audio projector embedding size (#24091)
abetlen Jun 4, 2026
7ac5a42
cmake: skip cvector-generator and export-lora when CPU backend is dis…
arichiardi Jun 4, 2026
0066404
server : add header to tools/server/server-http.h (#24089)
abawany Jun 4, 2026
4d74287
build : use umbrella Headers directory for XCFramework module map (#2…
gmarzjr Jun 4, 2026
4586479
webui: fix tool selector toggle/counter, key tools by stable identity…
ServeurpersoCom Jun 4, 2026
a121232
agents: refactor, include more guidelines (#24111)
ngxson Jun 4, 2026
6f3a9f3
server: avoid unnecessary checkpoint restore when new tokens are pres…
Abioy Jun 4, 2026
4c51309
ggml: vectorize ggml_vec_dot_q4_1_q8_1 with WASM SIMD128 (#22209)
sirohikartik Jun 4, 2026
e802356
convert: Fix Gemma 4 Unified conversion (#24118)
pcuenca Jun 4, 2026
0dbfa66
return filter to save memory (#24125)
forforever73 Jun 4, 2026
5269770
ui: added single line reasoning preview (#23601)
gugugiyu Jun 4, 2026
21444c8
ui: Fixed packages (#24119)
allozaur Jun 4, 2026
e7bcf1c
Move duplicated imatrix code into single common imatrix-loader.cpp (#…
bartowski1182 Jun 4, 2026
42b2d60
webui: [a11y] fix keyboard navigation issues in chat interface and si…
vignesh191 Jun 4, 2026
260862b
arg: fix double mtp downloads (#24128)
ngxson Jun 4, 2026
7c158fb
server : disable on-device spec checkpoints (#24108)
ggerganov Jun 4, 2026
7fe2ae4
sycl : port multi-column MMVQ from CUDA backend (#21845)
masonmilby Jun 5, 2026
46fa662
ci : build-msys job slimming [no ci] (#24157)
danbev Jun 5, 2026
2154a0f
CUDA: enroll mul_mat_vec_q_moe into pdl (#24087)
ORippler Jun 5, 2026
3ecfb15
kleidiai : dynamic chunck-based scheduling for hybrid execution (#23819)
chaxu01 Jun 5, 2026
7acb4e8
hparams : refactor `hparams.n_layer` (#24060)
ggerganov Jun 5, 2026
59917d3
minor : fix lint issues (#24165)
ggerganov Jun 5, 2026
ad1b88c
docs: Update quantization readme (#24133)
pcuenca Jun 5, 2026
cc7bef3
ui: add ignore-scripts=true to npmrc (#24149)
ngxson Jun 5, 2026
9c955c4
Fix link to available UI settings (#24169)
wariuccio Jun 5, 2026
2016bf2
ui: run npm install when package-lock.json is newer than node_modules…
ServeurpersoCom Jun 5, 2026
96fbe00
model : fix llama_model::n_gpu_layers() (#24188)
ggerganov Jun 5, 2026
86591c7
cli: fix model params not propagated (#23893)
therealkenc Jun 5, 2026
6effcec
TP: round up granularity to 128 (#24180)
JohannesGaessler Jun 5, 2026
64086f2
model, mtmd: Granite4 Vision (#23545)
gabe-l-hart Jun 5, 2026
c4a278d
model: fix build failed (#24193)
ngxson Jun 5, 2026
e82beaa
vulkan: add fwht support for Intel with shmem reduction (#23964)
0cc4m Jun 5, 2026
da87e9b
common/chat : unify and fix LFM2/LFM2.5 tool parser (#24178)
tdakhran Jun 5, 2026
308f61c
opencl: improve get_rows, cpy, concat and q6_k flat gemv (#24160)
lhez Jun 5, 2026
603300b
context : fix off-by-one comparisons to n_gpu_layers (#24208)
CISC Jun 6, 2026
5343f45
model : rename local n_layer_all variable (#24209)
CISC Jun 6, 2026
5a69c97
vulkan: check coopmat2 features before reporting support (#24186)
0cc4m Jun 6, 2026
f5c6ae1
mtmd, server: add "placeholder bitmap" for counting tokens , add */in…
ngxson Jun 6, 2026
588f0dc
completion : fix format specifier in LOG_INF (#24213)
angt Jun 6, 2026
6b80c74
completion : remove useless statics (#24226)
angt Jun 6, 2026
31e8249
mtmd: support "frame merge" for qwen-vl-based models (#21858)
ngxson Jun 6, 2026
98d5e8b
common/chat : fix LFM2/LFM2.5 reasoning round-trip and <think> leak (…
tdakhran Jun 6, 2026
3f7c79d
docker : bump cuda13 to 13.3.0 (#24228)
CISC Jun 7, 2026
f71af35
convert : fix Gemma4 with no audio encoder (#24242)
CISC Jun 7, 2026
465b1f0
arg: Skip mmproj download when user supplied mmproj (#24239)
konradmb Jun 7, 2026
8a091c4
spec : fix vocab compatibility check (#24256)
CISC Jun 7, 2026
04eb4c4
llama : add Gemma4 MTP (#23398)
am17an Jun 7, 2026
f0156d1
kv-cache: follow the source cache size when sharing cells (#24267)
ServeurpersoCom Jun 7, 2026
379ac66
kv-cache : avoid kv cells copies (#24277)
ggerganov Jun 7, 2026
8a963fc
convert : fix conversion for Mistral-Medium-3.5-128B (#24268)
dfriehs Jun 7, 2026
9e3b928
common : relax sampler name matching (#23744)
ddh0 Jun 7, 2026
d403f00
[SYCL] Update compute runtime version to 26.x in docker (#24070)
arthw Jun 8, 2026
daf6bc9
metal : fix im2col 1D case (audio models) (#24220)
ngxson Jun 8, 2026
19bba67
HIP: add gfx1152 and gfx1153 to RDNA3.5 (#24129)
harkgill-amd Jun 8, 2026
0f7fada
cuda: reset cuda context after reading memory size (#23935)
0cc4m Jun 8, 2026
c74759a
vulkan: Use cm2 decode_vector for mul_mat_id B matrix loads (#23991)
jeffbolznv Jun 8, 2026
715b86a
cli: fix spinner not show during prompt processing (#24283)
ngxson Jun 8, 2026
6a1de6f
ggml : bump version to 0.14.0 (ggml/1533)
ggerganov Jun 8, 2026
c2b1518
sync : ggml
ggerganov Jun 8, 2026
8f83d6c
mtmd : add video input support (#24269)
ngxson Jun 8, 2026
3ebe862
docker: install ffmpeg in the released image (#24302)
ngxson Jun 8, 2026
3b3da01
[ggml-webgpu] Implement 2D workgroups for scale, binary, and unary op…
nikhilJain17 Jun 8, 2026
1705d43
[ggml-webgpu] Handle buffer overlap / buffer aliasing for concat oper…
nikhilJain17 Jun 8, 2026
a66d505
graph: guard iswa kq_mask on its own buffer (#24294)
ServeurpersoCom Jun 8, 2026
42a0afd
server : do not parse when flushing http headers (#24281)
aldehir Jun 8, 2026
7d2b45b
mtp: support for gemma-4 E2B and E4B assistants (#24282)
max-krasnyansky Jun 8, 2026
1e1aca0
ggml-webgpu: Improve prefill speeds for k-quants + refactor matmul fo…
yomaytk Jun 8, 2026
3ac3c20
ggml-webgpu: Add clang-format job (#24308)
reeselevine Jun 9, 2026
e3471b3
Remove case for GGML_TYPE_Q4_K in mvvq.cu (#23528)
ravel7524 Jun 9, 2026
fd3271e
ggml-cpu : fix rms_norm_back wrong output under in-place aliasing (#2…
devYRPauli Jun 9, 2026
f0152ef
models : fix plamo2 attention_key/value_length regression (#24317)
CISC Jun 9, 2026
961e9a3
server : do not clear slots without unified KV cache (#24190)
fiesh Jun 9, 2026
2602169
ggml : add GGML_OP_COL2IM_1D (#24206)
ServeurpersoCom Jun 9, 2026
efbacf8
ui: fix mobile chat form overflow and bust stale bundle cache (#24158)
ServeurpersoCom Jun 9, 2026
1e91256
server: log prompts to directory (#22031)
jacekpoplawski Jun 9, 2026
9682e35
mtmd: refactor video subproc handling (#24316)
ngxson Jun 9, 2026
ae735b1
ui: Fix excessive style recalculation on hover (#24243)
ntowle Jun 9, 2026
b4e3dc6
vulkan: add `v_dot2_f32_f16` support in matrix-matrix multiplication …
0cc4m Jun 9, 2026
d6d0ce8
vulkan: reduce iq1 shared memory usage for mul_mm (#24287)
jeffbolznv Jun 9, 2026
49f3542
mtmd: build_vit batching (#24352)
sfallah Jun 9, 2026
4836095
ui: add opt-in run_javascript frontend tool (#24244)
ServeurpersoCom Jun 9, 2026
e25a32e
ci : fix windows release (#24369)
CISC Jun 9, 2026
d73cd07
graph: Fix granite speech model inference by applying embedding scale…
arnu515 Jun 9, 2026
2f3ea40
server: improve Responses API compliance and Codex CLI compatibility
krystophny Mar 30, 2026
43fb8c0
server: persist context checkpoints across slot save/restore
European-tech Mar 30, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
17 changes: 17 additions & 0 deletions .devops/cann.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
# Define the CANN base image for easier version updates later
ARG CHIP_TYPE=910b
ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A

# ==============================================================================
# BUILD STAGE
Expand Down Expand Up @@ -55,6 +58,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full && \
cp build/bin/* /app/full/ && \
cp *.py /app/full/ && \
cp -r conversion /app/full/ && \
cp -r gguf-py /app/full/ && \
cp -r requirements /app/full/ && \
cp requirements.txt /app/full/
Expand All @@ -67,6 +71,19 @@ RUN mkdir -p /app/full && \
# ==============================================================================
FROM ${CANN_BASE_IMAGE} AS base

ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.version=$APP_VERSION \
org.opencontainers.image.revision=$APP_REVISION \
org.opencontainers.image.title="llama.cpp" \
org.opencontainers.image.description="LLM inference in C/C++" \
org.opencontainers.image.url=$IMAGE_URL \
org.opencontainers.image.source=$IMAGE_SOURCE

# -- Install runtime dependencies --
RUN yum install -y libgomp curl && \
yum clean all && \
Expand Down
19 changes: 18 additions & 1 deletion .devops/cpu.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
ARG UBUNTU_VERSION=24.04
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A

FROM ubuntu:$UBUNTU_VERSION AS build

Expand Down Expand Up @@ -27,6 +30,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand All @@ -35,8 +39,21 @@ RUN mkdir -p /app/full \
## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.version=$APP_VERSION \
org.opencontainers.image.revision=$APP_REVISION \
org.opencontainers.image.title="llama.cpp" \
org.opencontainers.image.description="LLM inference in C/C++" \
org.opencontainers.image.url=$IMAGE_URL \
org.opencontainers.image.source=$IMAGE_SOURCE

RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
Expand Down
20 changes: 19 additions & 1 deletion .devops/cuda.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER

ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
Expand All @@ -32,6 +36,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand All @@ -40,8 +45,21 @@ RUN mkdir -p /app/full \
## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.version=$APP_VERSION \
org.opencontainers.image.revision=$APP_REVISION \
org.opencontainers.image.title="llama.cpp" \
org.opencontainers.image.description="LLM inference in C/C++" \
org.opencontainers.image.url=$IMAGE_URL \
org.opencontainers.image.source=$IMAGE_SOURCE

RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
Expand Down
49 changes: 41 additions & 8 deletions .devops/intel.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A

## Build Image

FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
ARG LEVEL_ZERO_VERSION=1.28.2
ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
RUN apt-get update && \
apt-get install -y git libssl-dev
apt-get install -y git libssl-dev wget ca-certificates && \
cd /tmp && \
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb

WORKDIR /app

Expand All @@ -26,18 +36,42 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
&& cp .devops/tools.sh /app/full/tools.sh

FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGDGMM_VERSION=22.9.0
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.version=$APP_VERSION \
org.opencontainers.image.revision=$APP_REVISION \
org.opencontainers.image.title="llama.cpp" \
org.opencontainers.image.description="LLM inference in C/C++" \
org.opencontainers.image.url=$IMAGE_URL \
org.opencontainers.image.source=$IMAGE_SOURCE

# The following versions are for multi-GPU setups, since 26.x has a known issue; see:
#   https://github.com/ggml-org/llama.cpp/issues/21747
#   https://github.com/intel/compute-runtime/issues/921
#ARG IGC_VERSION=v2.20.5
#ARG IGC_VERSION_FULL=2_2.20.5+19972
#ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
#ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
#ARG IGDGMM_VERSION=22.8.2


ARG IGC_VERSION=v2.34.4
ARG IGC_VERSION_FULL=2_2.34.4+21428
ARG COMPUTE_RUNTIME_VERSION=26.18.38308.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.18.38308.1-0
ARG IGDGMM_VERSION=22.10.0
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
&& wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
Expand All @@ -51,7 +85,7 @@ RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
&& dpkg --install *.deb

RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
Expand Down Expand Up @@ -109,4 +143,3 @@ WORKDIR /app
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

17 changes: 17 additions & 0 deletions .devops/llama-cli-cann.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A

FROM ascendai/cann:$ASCEND_VERSION AS build

Expand Down Expand Up @@ -28,6 +31,20 @@ RUN echo "Building with static libs" && \

# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime

ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.version=$APP_VERSION \
org.opencontainers.image.revision=$APP_REVISION \
org.opencontainers.image.title="llama.cpp" \
org.opencontainers.image.description="LLM inference in C/C++" \
org.opencontainers.image.url=$IMAGE_URL \
org.opencontainers.image.source=$IMAGE_SOURCE

COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

ENV LC_ALL=C.utf8
Expand Down
20 changes: 19 additions & 1 deletion .devops/musa.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V

ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

# MUSA architecture to build for (defaults to all supported archs)
Expand Down Expand Up @@ -37,6 +41,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/bin/* /app/full \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand All @@ -45,8 +50,21 @@ RUN mkdir -p /app/full \
## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base

ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.version=$APP_VERSION \
org.opencontainers.image.revision=$APP_REVISION \
org.opencontainers.image.title="llama.cpp" \
org.opencontainers.image.description="LLM inference in C/C++" \
org.opencontainers.image.url=$IMAGE_URL \
org.opencontainers.image.source=$IMAGE_SOURCE

RUN apt-get update \
&& apt-get install -y libgomp1 curl \
&& apt-get install -y libgomp1 curl ffmpeg \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
Expand Down
31 changes: 29 additions & 2 deletions .devops/nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
glibc,
config,
stdenv,
stdenvNoCC,
runCommand,
cmake,
ninja,
Expand All @@ -19,6 +20,8 @@
openssl,
shaderc,
spirv-headers,
nodejs,
importNpmLock,
useBlas ?
builtins.all (x: !x) [
useCuda
Expand Down Expand Up @@ -103,6 +106,7 @@ let
vulkan-headers
vulkan-loader
shaderc
spirv-headers
];
in

Expand All @@ -129,7 +133,31 @@ effectiveStdenv.mkDerivation (finalAttrs: {
src = lib.cleanSource ../../.;
};

postPatch = ''
# Builds the webui locally, taking care not to require updating any sha256 hash.
#
# This is a separate, compiler-less (stdenvNoCC) derivation whose source is the
# tools/ui directory. Its npm dependencies are supplied through importNpmLock
# rather than a fixed-output fetch, which is why no dependency hash appears here.
webui = stdenvNoCC.mkDerivation {
pname = "webui";
version = llamaVersion;
src = lib.cleanSource ../../tools/ui;

# nodejs supplies the `npm` used in installPhase below.
# NOTE(review): linkNodeModulesHook presumably links the node_modules tree
# built from `npmDeps` into the build directory — confirm against nixpkgs docs.
nativeBuildInputs = [
nodejs
importNpmLock.linkNodeModulesHook
];

# no sha256 required when using buildNodeModules
# npmRoot points at the same directory as `src`, and the same nodejs is
# inherited so the dependencies are built for the interpreter used above.
npmDeps = importNpmLock.buildNodeModules {
npmRoot = ../../tools/ui;
inherit nodejs;
};

# The build runs directly in installPhase, with LLAMA_UI_OUT_DIR set to $out.
# NOTE(review): presumably the UI build script writes its output into
# LLAMA_UI_OUT_DIR — verify against the tools/ui build configuration.
# `--offline` is passed to npm for this invocation.
installPhase = ''
LLAMA_UI_OUT_DIR=$out npm run build --offline
'';
};

postPatch = lib.optionalString useWebUi ''
cp -r ${finalAttrs.webui} tools/ui/dist
chmod -R u+w tools/ui/dist
'';

# With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
Expand All @@ -146,7 +174,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
ninja
pkg-config
git
spirv-headers
]
++ optionals useCuda [
cudaPackages.cuda_nvcc
Expand Down
19 changes: 18 additions & 1 deletion .devops/openvino.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
ARG http_proxy=
ARG https_proxy=

ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A

## Build Image
FROM ubuntu:${UBUNTU_VERSION} AS build

Expand Down Expand Up @@ -77,6 +81,7 @@ RUN mkdir -p /app/lib && \
RUN mkdir -p /app/full \
&& cp build/ReleaseOV/bin/* /app/full/ \
&& cp *.py /app/full \
&& cp -r conversion /app/full \
&& cp -r gguf-py /app/full \
&& cp -r requirements /app/full \
&& cp requirements.txt /app/full \
Expand All @@ -88,9 +93,21 @@ FROM ubuntu:${UBUNTU_VERSION} AS base
# Pass proxy args to runtime stage
ARG http_proxy
ARG https_proxy
ARG BUILD_DATE=N/A
ARG APP_VERSION=N/A
ARG APP_REVISION=N/A
ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
LABEL org.opencontainers.image.created=$BUILD_DATE \
org.opencontainers.image.version=$APP_VERSION \
org.opencontainers.image.revision=$APP_REVISION \
org.opencontainers.image.title="llama.cpp" \
org.opencontainers.image.description="LLM inference in C/C++" \
org.opencontainers.image.url=$IMAGE_URL \
org.opencontainers.image.source=$IMAGE_SOURCE

RUN apt-get update \
&& apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
&& apt-get install -y libgomp1 libtbb12 curl wget ffmpeg ocl-icd-libopencl1 \
&& apt autoremove -y \
&& apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \
Expand Down
Loading