Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions .github/workflows/build-virtgpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
name: CI (virtgpu)

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    paths: [
      '.github/workflows/build-virtgpu.yml',
      '**/CMakeLists.txt',
      # '**/.cmake' would only match files literally named ".cmake";
      # '**/*.cmake' matches cmake module/include files anywhere in the tree
      '**/*.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp'
    ]

  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/build-virtgpu.yml',
      'ggml/src/ggml-virtgpu/**'
    ]

# cancel in-flight runs of the same PR branch; fall back to run_id so
# pushes to master never cancel each other
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  ubuntu-24-virtgpu:
    # the previous expression ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }} always
    # evaluated to the first (truthy) literal — state the runner directly
    runs-on: ubuntu-24.04-arm

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
              -DGGML_VIRTGPU=ON \
              -DGGML_VIRTGPU_BACKEND=ON
          cmake --build build --config Release -j $(nproc)
26 changes: 25 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2176,7 +2176,8 @@ def __init__(self, *args, **kwargs):
text_config = {
k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
}
self.n_embd_text = text_config.get("hidden_dim", 0)
# mistral native params.json: "dim" is the text hidden size ("hidden_dim" is the FFN intermediate size)
self.n_embd_text = text_config.get("dim", 0)

assert self.n_embd_text > 0, "n_embd not found in hparams"

Expand Down Expand Up @@ -3137,6 +3138,11 @@ def __init__(self, *args, **kwargs):
assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
if self.use_break_tok:
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])

# params.json may ship -1 placeholders (Mistral Medium 3.5)
# resolve the real id from the bundled tokenizer in that case
if self.img_break_tok_id < 0:
self.img_break_tok_id = self.get_mistral_token_id("[IMG_BREAK]")
else:
raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
logger.info(f"Image break token id: {self.img_break_tok_id}")
Expand All @@ -3156,6 +3162,24 @@ def get_token_id(self, token: str) -> int:
return int(token_data["id"])
raise ValueError(f"Token '{token}' not found in tokenizer config.")

def get_mistral_token_id(self, token: str) -> int:
    """Resolve a special token's id from Mistral-native tokenizer files.

    Mistral checkpoints ship either ``tekken.json`` or a versioned spm
    tokenizer; ``tekken.json`` is consulted first, then ``tokenizer.json``'s
    ``added_tokens`` as a fallback. Raises ``ValueError`` when the token is
    not present in either file.
    """
    # (file path, list key, field matched against `token`, field holding the id)
    sources = (
        (self.dir_model / "tekken.json", "special_tokens", "token_str", "rank"),
        (self.dir_model / "tokenizer.json", "added_tokens", "content", "id"),
    )
    for path, list_key, match_key, id_key in sources:
        if not path.is_file():
            continue
        with open(path, "r", encoding="utf-8") as fh:
            entries = json.load(fh).get(list_key, [])
        for entry in entries:
            if entry.get(match_key) == token:
                return int(entry[id_key])
    raise ValueError(f"Token '{token}' not found in mistral tokenizer files.")

def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
Expand Down
6 changes: 3 additions & 3 deletions examples/speculative-simple/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Demonstration of basic greedy speculative decoding
./bin/llama-speculative-simple \
-m ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
-md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
-f test.txt -c 0 -ngl 99 --color \
--sampling-seq k --top-k 1 -fa --temp 0.0 \
-ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
-f test.txt -c 0 -ngl 99 --color on \
--sampling-seq k --top-k 1 -fa on --temp 0.0 \
-ngld 99 --spec-draft-n-max 16 --spec-draft-n-draft-min 5 --draft-p-min 0.9
```
1 change: 1 addition & 0 deletions ggml/src/ggml-cuda/argsort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# include <cub/cub.cuh>
# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 1)
# define STRIDED_ITERATOR_AVAILABLE
# include <cuda/iterator>
# endif
using namespace cub;
#endif // GGML_CUDA_USE_CUB
Expand Down
253 changes: 110 additions & 143 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp

Large diffs are not rendered by default.

176 changes: 72 additions & 104 deletions ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include "types.glsl"
#include "flash_attn_base.glsl"
#include "flash_attn_dequant.glsl"

const uint32_t HSK_per_thread = HSK / D_split;
const uint32_t HSV_per_thread = HSV / D_split;
Expand Down Expand Up @@ -128,18 +129,20 @@ void main() {

Qf[buf_ib].qs[buf_iqs] = pack32(i8vec4(vals));

#if defined(DATA_A_Q8_0) || defined(DATA_A_IQ4_NL)
if (buf_iqs == 0) {
Qf[buf_ib].ds = FLOAT_TYPEV2(qd, 0.0);
}
#else // Q4_0, Q4_1, Q5_0, Q5_1
const FLOAT_TYPE thread_sum = vals.x + vals.y + vals.z + vals.w;
const FLOAT_TYPE sum = subgroupClusteredAdd(thread_sum, 8);
// Q8_0 K only needs (qd, _); the asymmetric Q4_*/Q5_* family also stores
// the row-sum scaled by qd, used in k_dot_correction.
if (FaTypeK == FA_TYPE_Q8_0) {
if (buf_iqs == 0) {
Qf[buf_ib].ds = FLOAT_TYPEV2(qd, 0.0);
}
} else {
const FLOAT_TYPE thread_sum = vals.x + vals.y + vals.z + vals.w;
const FLOAT_TYPE sum = subgroupClusteredAdd(thread_sum, 8);

if (buf_iqs == 0) {
Qf[buf_ib].ds = FLOAT_TYPEV2(qd, sum * qd);
if (buf_iqs == 0) {
Qf[buf_ib].ds = FLOAT_TYPEV2(qd, sum * qd);
}
}
#endif
#endif
}
barrier();
Expand Down Expand Up @@ -177,13 +180,9 @@ void main() {
// mo_offset will point to the tile starting at row i*Br and col 0
uint32_t mo_offset = mo_stride * i;

#if BLOCK_SIZE > 1
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / BLOCK_BYTE_SIZE;
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / BLOCK_BYTE_SIZE;
#else
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
#endif
// FaBlockBytesK/V == 2 for f16, 16 for f32, ggml block byte size for quants.
uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / FaBlockBytesK;
uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / FaBlockBytesV;
uint32_t m_offset = gqa_iq1*KV;
if (p.nem2 != 1 || p.nem3 != 1) {
m_offset += ((iq3 % p.nem3) * p.nem2 + (iq2 % p.nem2)) * p.nem1 * KV;
Expand Down Expand Up @@ -257,21 +256,21 @@ void main() {
if (idx + gl_WorkGroupSize.x <= Bc * HSK / 4 || c < Bc) {
FLOAT_TYPEV4 K_Tf = FLOAT_TYPEV4(0);
if (!KV_bounds_check || j * Bc + c < KV) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE + 4 * d;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
#else
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
#endif
if (USE_DECODE_K) {
uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE_K + 4 * d;
uint ib = coord / BLOCK_SIZE_K;
uint iqs = (coord % BLOCK_SIZE_K);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
} else {
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]);
}
}

kvsh[c * kvsh_stride + d] = K_Tf;
}
}
#else // MMQ
const uint ints_per_block = 8 / QUANT_R_MMQ;
const uint ints_per_block = 8u / fa_quant_r_mmq(FaTypeK);
const uint quant_iters = Bc * HSK / 32 * ints_per_block;
[[unroll]] for (uint32_t idx = 0; idx < quant_iters; idx += gl_WorkGroupSize.x) {
const uint32_t iqs = (idx + tid) % ints_per_block;
Expand Down Expand Up @@ -310,15 +309,13 @@ void main() {
FLOAT_TYPEV4 K_Tf;
if (SHMEM_STAGING != 0) {
K_Tf = kvsh[(c * cols_per_iter + col_tid) * kvsh_stride + (d * D_split + d_tid)];
} else {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
} else if (USE_DECODE_K) {
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE_K + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE_K;
uint iqs = (coord % BLOCK_SIZE_K);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
#else
} else {
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
#endif
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Sf[r][c] += dot(ACC_TYPEV4(Q_cache[r]), ACC_TYPEV4(K_Tf));
Expand All @@ -335,15 +332,13 @@ void main() {
FLOAT_TYPEV4 K_Tf;
if (SHMEM_STAGING != 0) {
K_Tf = kvsh[(c * cols_per_iter + col_tid) * kvsh_stride + (d * D_split + d_tid)];
} else {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
} else if (USE_DECODE_K) {
uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE_K + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE_K;
uint iqs = (coord % BLOCK_SIZE_K);
K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
#else
} else {
K_Tf = FLOAT_TYPEV4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
#endif
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Sf[r][c] += dot(ACC_TYPEV4(Qf[tile_row(r) * qf_stride + d * D_split + d_tid]), ACC_TYPEV4(K_Tf));
Expand All @@ -366,72 +361,47 @@ void main() {
int32_t k_quants[d_per_step];
ACC_TYPEV2 k_dm;

// Q4_*/Q5_* take the block-8 fast path when one step covers a full
// block; Q8_0 always goes through the per-int get_k_qs* helpers
// (its qs is byte-packed, not nibble-packed).
const bool block8_fast = (d_per_step == 8) && (FaTypeK != FA_TYPE_Q8_0);

if (SHMEM_STAGING != 0) {
const uint k_block_idx = (d_tid * (HSK_per_thread / 4) + d_block) / 8;
const uint buf_ib = (c * cols_per_iter + col_tid) * qf_stride + k_block_idx;
#if QUANT_AUXF == 1
k_dm = ACC_TYPEV2(kblocksh[buf_ib].dm, 0.0);
#else
k_dm = ACC_TYPEV2(kblocksh[buf_ib].dm);
#endif

#if defined(DATA_A_Q4_0) || defined(DATA_A_Q4_1) || defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
if (d_per_step == 8) {
if (block8_fast) {
const bool has_qh = (FaTypeK == FA_TYPE_Q5_0) || (FaTypeK == FA_TYPE_Q5_1);
[[unroll]] for (uint32_t d = 0; d < 4; d++) {
uint vui = kblocksh[buf_ib].qs[d];
k_quants[d ] = int32_t( vui & 0x0F0F0F0F);
k_quants[d + 4] = int32_t((vui >> 4) & 0x0F0F0F0F);
#if defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
uint qh_lo = (kblocksh[buf_ib].qh >> (d * 4)) & 0xF;
uint qh_hi = (kblocksh[buf_ib].qh >> (d * 4 + 16)) & 0xF;
k_quants[d ] |= int32_t((qh_lo * 0x02040810u) & 0x10101010u);
k_quants[d + 4] |= int32_t((qh_hi * 0x02040810u) & 0x10101010u);
#endif
if (has_qh) {
uint qh_lo = (kblocksh[buf_ib].qh >> (d * 4)) & 0xF;
uint qh_hi = (kblocksh[buf_ib].qh >> (d * 4 + 16)) & 0xF;
k_quants[d ] |= int32_t((qh_lo * 0x02040810u) & 0x10101010u);
k_quants[d + 4] |= int32_t((qh_hi * 0x02040810u) & 0x10101010u);
}
}
} else
#endif
{
} else {
[[unroll]] for (uint32_t d = 0; d < d_per_step; d++) {
k_quants[d] = get_k_qs_shmem(buf_ib, (d_tid * (HSK_per_thread / 4) + d_block) % 8 + d);
}
}
} else {
const uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d_tid * (HSK_per_thread / 4) + d_block);
const uint ib = coord / BLOCK_SIZE;
const uint iqs = (coord % BLOCK_SIZE);
const uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE_K + 4 * (d_tid * (HSK_per_thread / 4) + d_block);
const uint ib = coord / BLOCK_SIZE_K;
const uint iqs = (coord % BLOCK_SIZE_K);

#if QUANT_AUXF == 1
k_dm = ACC_TYPEV2(get_k_d(ib, k_offset), 0.0);
#else
k_dm = ACC_TYPEV2(get_k_dm(ib, k_offset));
#endif
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q4_1) || defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
if (d_per_step == 8) {
#if defined(DATA_A_Q5_0)
uint qh = pack32(u16vec2(k_packed.k_data_packed16[k_offset + ib].qh[0],
k_packed.k_data_packed16[k_offset + ib].qh[1]));
#elif defined(DATA_A_Q5_1)
uint qh = k_packed.k_data_packed16[k_offset + ib].qh;
#endif
[[unroll]] for (uint32_t d = 0; d < 4; d++) {
#if defined(A_TYPE_PACKED32)
uint vui = k_packed32.k_data_packed32[k_offset + ib].qs[d];
#else
uint vui = pack32(u16vec2(k_packed.k_data_packed16[k_offset + ib].qs[iqs / 2 + d * 2 + 0],
k_packed.k_data_packed16[k_offset + ib].qs[iqs / 2 + d * 2 + 1]));
#endif
k_quants[d ] = int32_t( vui & 0x0F0F0F0F);
k_quants[d + 4] = int32_t((vui >> 4) & 0x0F0F0F0F);
#if defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
uint qh_lo = (qh >> (d * 4)) & 0xF;
uint qh_hi = (qh >> (d * 4 + 16)) & 0xF;
k_quants[d ] |= int32_t((qh_lo * 0x02040810u) & 0x10101010u);
k_quants[d + 4] |= int32_t((qh_hi * 0x02040810u) & 0x10101010u);
#endif
k_dm = ACC_TYPEV2(get_k_scale(ib, k_offset));

if (block8_fast) {
fa_k_qs_block8 blk = get_k_qs_block8(ib, k_offset);
[[unroll]] for (uint32_t d = 0; d < 8; d++) {
k_quants[d] = blk.qs[d];
}
} else
#endif
{
} else {
[[unroll]] for (uint32_t d = 0; d < d_per_step; d++) {
k_quants[d] = get_k_qs(ib, iqs + d * 4, k_offset);
}
Expand Down Expand Up @@ -516,14 +486,14 @@ void main() {
if (idx + gl_WorkGroupSize.x <= Bc * HSV / 4 || c < Bc) {
FLOAT_TYPEV4 V_Tf = FLOAT_TYPEV4(0);
if (!KV_bounds_check || j * Bc + c < KV) {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE + 4 * d;
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
#else
V_Tf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
#endif
if (USE_DECODE_V) {
uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE_V + 4 * d;
uint ib = coord / BLOCK_SIZE_V;
uint iqs = (coord % BLOCK_SIZE_V);
V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
} else {
V_Tf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]);
}
}

kvsh[c * kvsh_stride + d] = V_Tf;
Expand All @@ -547,15 +517,13 @@ void main() {
FLOAT_TYPEV4 Vf;
if (SHMEM_STAGING != 0) {
Vf = kvsh[(c * cols_per_iter + col_tid) * kvsh_stride + (d * D_split + d_tid)];
} else {
#if BLOCK_SIZE > 1
uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE;
uint iqs = (coord % BLOCK_SIZE);
} else if (USE_DECODE_V) {
uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE_V + 4 * (d * D_split + d_tid);
uint ib = coord / BLOCK_SIZE_V;
uint iqs = (coord % BLOCK_SIZE_V);
Vf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
#else
} else {
Vf = FLOAT_TYPEV4(data_vv4[v_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * v_stride / 4 + d * D_split + d_tid]);
#endif
}
[[unroll]] for (uint32_t r = 0; r < rows_per_thread; ++r) {
Of[r][d] += FLOAT_TYPEV4(Pf[r] * Vf);
Expand Down
Loading
Loading