diff --git a/ds4.c b/ds4.c index 1f8349b6..da320104 100644 --- a/ds4.c +++ b/ds4.c @@ -2220,6 +2220,29 @@ static void tensor_expect_plain_layout( tensor_expect_layout(t, t->type, ndim, d0, d1, d2); } +/* Tensors that flow through the polymorphic dense matvec/matmul dispatch + * (matvec_any / ds4_gpu_matmul_dispatch_tensor) accept any of F16, F32, or + * Q8_0. Used for the small projections antirez ships at F16 but stock + * llama.cpp converters emit at Q8_0 (HC fn weights, compressors, indexer + * projections) or F32 (router gate). */ +static void tensor_expect_dispatch_layout( + const ds4_tensor *t, + uint32_t ndim, + uint64_t d0, + uint64_t d1, + uint64_t d2) { + if (!t) ds4_die("internal error: missing tensor while validating layout"); + if (t->type != DS4_TENSOR_F16 && t->type != DS4_TENSOR_F32 && t->type != DS4_TENSOR_Q8_0) { + fprintf(stderr, + "ds4: tensor %.*s has type %s, expected F16, F32, or Q8_0\n", + (int)t->name.len, + t->name.ptr, + tensor_type_name(t->type)); + exit(1); + } + tensor_expect_layout(t, t->type, ndim, d0, d1, d2); +} + static bool tensor_is_routed_expert_type(uint32_t type) { return type == DS4_TENSOR_IQ2_XXS || type == DS4_TENSOR_Q2_K || @@ -2288,9 +2311,17 @@ static void weights_validate_layout(const ds4_weights *w) { const uint64_t q_dim = (uint64_t)DS4_N_HEAD * DS4_N_HEAD_DIM; const uint64_t out_low_dim = (uint64_t)DS4_N_OUT_GROUP * DS4_N_LORA_O; - tensor_expect_layout(w->token_embd, DS4_TENSOR_F16, 2, DS4_N_EMBD, DS4_N_VOCAB, 0); + /* token_embd may be F16 (antirez recipe) or Q8_0 (stock llama.cpp recipe used + * by e.g. cyberneurova converts). Both have CPU and Metal kernels above. */ + if (w->token_embd->type != DS4_TENSOR_F16 && w->token_embd->type != DS4_TENSOR_Q8_0) { + fprintf(stderr, + "ds4: tensor token_embd.weight has type %s, expected F16 or Q8_0\n", + tensor_type_name(w->token_embd->type)); + exit(1); + } + tensor_expect_layout(w->token_embd, w->token_embd->type, 2, DS4_N_EMBD, DS4_N_VOCAB, 0); tensor_expect_layout(w->output_hc_base, DS4_TENSOR_F32, 1, DS4_N_HC, 0, 0); - tensor_expect_layout(w->output_hc_fn, DS4_TENSOR_F16, 2, hc_dim, DS4_N_HC, 0); + tensor_expect_dispatch_layout(w->output_hc_fn, 2, hc_dim, DS4_N_HC, 0); tensor_expect_layout(w->output_hc_scale, DS4_TENSOR_F32, 1, 1, 0, 0); tensor_expect_layout(w->output_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); tensor_expect_layout(w->output, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_VOCAB, 0); @@ -2299,7 +2330,7 @@ static void weights_validate_layout(const ds4_weights *w) { const ds4_layer_weights *l = &w->layer[il]; const uint32_t ratio = ds4_layer_compress_ratio(il); - tensor_expect_layout(l->hc_attn_fn, DS4_TENSOR_F16, 2, hc_dim, hc_mix_dim, 0); + tensor_expect_dispatch_layout(l->hc_attn_fn, 2, hc_dim, hc_mix_dim, 0); tensor_expect_layout(l->hc_attn_scale, DS4_TENSOR_F32, 1, 3, 0, 0); tensor_expect_layout(l->hc_attn_base, DS4_TENSOR_F32, 1, hc_mix_dim, 0, 0); tensor_expect_layout(l->attn_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); @@ -2315,27 +2346,27 @@ static void weights_validate_layout(const ds4_weights *w) { if (ratio != 0) { const uint32_t coff = ratio == 4 ? 
2u : 1u; const uint64_t comp_width = (uint64_t)coff * DS4_N_HEAD_DIM; - tensor_expect_layout(l->attn_compressor_ape, DS4_TENSOR_F16, 2, comp_width, ratio, 0); - tensor_expect_layout(l->attn_compressor_kv, DS4_TENSOR_F16, 2, DS4_N_EMBD, comp_width, 0); - tensor_expect_layout(l->attn_compressor_gate, DS4_TENSOR_F16, 2, DS4_N_EMBD, comp_width, 0); + tensor_expect_dispatch_layout(l->attn_compressor_ape, 2, comp_width, ratio, 0); + tensor_expect_dispatch_layout(l->attn_compressor_kv, 2, DS4_N_EMBD, comp_width, 0); + tensor_expect_dispatch_layout(l->attn_compressor_gate, 2, DS4_N_EMBD, comp_width, 0); tensor_expect_layout(l->attn_compressor_norm, DS4_TENSOR_F32, 1, DS4_N_HEAD_DIM, 0, 0); } if (ratio == 4) { const uint64_t index_q_dim = (uint64_t)DS4_N_INDEXER_HEAD * DS4_N_INDEXER_HEAD_DIM; const uint64_t index_width = 2u * DS4_N_INDEXER_HEAD_DIM; - tensor_expect_layout(l->indexer_attn_q_b, DS4_TENSOR_F16, 2, DS4_N_LORA_Q, index_q_dim, 0); - tensor_expect_layout(l->indexer_proj, DS4_TENSOR_F16, 2, DS4_N_EMBD, DS4_N_INDEXER_HEAD, 0); - tensor_expect_layout(l->indexer_compressor_ape, DS4_TENSOR_F16, 2, index_width, ratio, 0); - tensor_expect_layout(l->indexer_compressor_kv, DS4_TENSOR_F16, 2, DS4_N_EMBD, index_width, 0); - tensor_expect_layout(l->indexer_compressor_gate, DS4_TENSOR_F16, 2, DS4_N_EMBD, index_width, 0); + tensor_expect_dispatch_layout(l->indexer_attn_q_b, 2, DS4_N_LORA_Q, index_q_dim, 0); + tensor_expect_dispatch_layout(l->indexer_proj, 2, DS4_N_EMBD, DS4_N_INDEXER_HEAD, 0); + tensor_expect_dispatch_layout(l->indexer_compressor_ape, 2, index_width, ratio, 0); + tensor_expect_dispatch_layout(l->indexer_compressor_kv, 2, DS4_N_EMBD, index_width, 0); + tensor_expect_dispatch_layout(l->indexer_compressor_gate, 2, DS4_N_EMBD, index_width, 0); tensor_expect_layout(l->indexer_compressor_norm, DS4_TENSOR_F32, 1, DS4_N_INDEXER_HEAD_DIM, 0, 0); } - tensor_expect_layout(l->hc_ffn_fn, DS4_TENSOR_F16, 2, hc_dim, hc_mix_dim, 0); + tensor_expect_dispatch_layout(l->hc_ffn_fn, 2, hc_dim, hc_mix_dim, 0); tensor_expect_layout(l->hc_ffn_scale, DS4_TENSOR_F32, 1, 3, 0, 0); tensor_expect_layout(l->hc_ffn_base, DS4_TENSOR_F32, 1, hc_mix_dim, 0, 0); tensor_expect_layout(l->ffn_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); - tensor_expect_layout(l->ffn_gate_inp, DS4_TENSOR_F16, 2, DS4_N_EMBD, DS4_N_EXPERT, 0); + tensor_expect_dispatch_layout(l->ffn_gate_inp, 2, DS4_N_EMBD, DS4_N_EXPERT, 0); tensor_expect_optional(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0); tensor_expect_routed_expert(l->ffn_gate_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT); tensor_expect_routed_expert(l->ffn_up_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT); @@ -2680,7 +2711,8 @@ static void weights_free(ds4_weights *w) { memset(w, 0, sizeof(*w)); } -/* Load one token embedding row and expand it to float activations. */ +/* Load one token embedding row and expand it to float activations. + * The CPU reference path mirrors the Metal kernel_get_rows_{f16,q8_0} dispatch. */ static void embed_token_f16(const ds4_model *m, const ds4_weights *w, int token, float *out) { ds4_tensor *te = w->token_embd; if (token < 0 || (uint64_t)token >= te->dim[1]) { @@ -2696,6 +2728,37 @@ static void embed_token_f16(const ds4_model *m, const ds4_weights *w, int token, } } +/* Q8_0 token embedding: source row is `(stride / 32)` block_q8_0 records, + * each [uint16_t scale][int8_t qs[32]]; dequantize to float on the fly. 
*/ +static void embed_token_q8_0(const ds4_model *m, const ds4_weights *w, int token, float *out) { + ds4_tensor *te = w->token_embd; + if (token < 0 || (uint64_t)token >= te->dim[1]) { + ds4_die("token id is outside the embedding table"); + } + const uint64_t stride = te->dim[0]; + if ((stride % 32) != 0) ds4_die("Q8_0 token embedding stride is not 32-aligned"); + const uint64_t blocks = stride / 32; + const uint8_t *base = tensor_data(m, te); + const uint8_t *row = base + (uint64_t)token * blocks * 34; + for (uint64_t b = 0; b < blocks; b++) { + uint16_t scale_bits; + memcpy(&scale_bits, row + b * 34, sizeof(scale_bits)); + const int8_t *qs = (const int8_t *)(row + b * 34 + 2); + const float scale = f16_to_f32(scale_bits); + for (int i = 0; i < 32; i++) { + out[b * 32 + i] = (float)qs[i] * scale; + } + } +} + +static void embed_token(const ds4_model *m, const ds4_weights *w, int token, float *out) { + switch (w->token_embd->type) { + case DS4_TENSOR_F16: embed_token_f16(m, w, token, out); return; + case DS4_TENSOR_Q8_0: embed_token_q8_0(m, w, token, out); return; + default: ds4_die("unsupported token_embd tensor type"); + } +} + /* RMSNorm without a learned scale, used by hyper-connection control vectors. */ static void rms_norm_no_weight(float *out, const float *x, uint64_t n, float eps) { double ss = 0.0; @@ -3706,6 +3769,17 @@ static void matvec_any(float *out, const ds4_model *m, const ds4_tensor *w, cons } } +/* Serial dispatcher; for tiny matrices where parallelization overhead exceeds the gain. */ +static void matvec_any_serial(float *out, const ds4_model *m, const ds4_tensor *w, const float *x) { + switch (w->type) { + case 1: matvec_f16_serial(out, m, w, x); return; + case 0: + case 8: matvec_any(out, m, w, x); return; /* no specialized serial path; parallel is cheap here */ + default: + ds4_die("unsupported tensor type for dense matvec_serial"); + } +} + static float tensor_1d_value(const ds4_model *m, const ds4_tensor *t, uint64_t i) { if (i >= t->elements) ds4_die("tensor scalar index is out of bounds"); if (t->type == 0) { @@ -4300,9 +4374,9 @@ static void hc_pre_from_state_one_scratch( rms_norm_no_weight(flat, residual_hc, hc_dim, DS4_RMS_EPS); if (serial_fn) { - matvec_f16_serial(mix, model, fn, flat); + matvec_any_serial(mix, model, fn, flat); } else { - matvec_f16(mix, model, fn, flat); + matvec_any(mix, model, fn, flat); } const float *scale = tensor_data(model, scale_tensor); @@ -5173,7 +5247,7 @@ static void layer_router_probs_one( const float * x) { float logits[DS4_N_EXPERT]; - matvec_f16(logits, model, layer->ffn_gate_inp, x); + matvec_any(logits, model, layer->ffn_gate_inp, x); for (int i = 0; i < DS4_N_EXPERT; i++) { probs[i] = sqrtf(softplus_stable(logits[i])); } @@ -7625,7 +7699,7 @@ static void forward_token_raw_swa_cpu_decode_scratch( float *cur = scratch->cur; float *next = scratch->next; - embed_token_f16(model, weights, token, scratch->plain); + embed_token(model, weights, token, scratch->plain); hc_from_plain_embedding(cur, scratch->plain, DS4_N_EMBD, DS4_N_HC); for (uint32_t il = 0; il < DS4_N_LAYER; il++) { @@ -7700,7 +7774,7 @@ static void prefill_layer_major_cpu( } for (uint64_t t = 0; t < n_tok; t++) { - embed_token_f16(model, weights, prompt->v[t], plain); + embed_token(model, weights, prompt->v[t], plain); hc_from_plain_embedding(cur + t * hc_dim, plain, DS4_N_EMBD, DS4_N_HC); } @@ -7897,7 +7971,7 @@ static void forward_first_token_cpu( float *cur = xmalloc((size_t)DS4_N_HC * DS4_N_EMBD * sizeof(cur[0])); float *next = xmalloc((size_t)DS4_N_HC * 
DS4_N_EMBD * sizeof(next[0])); - embed_token_f16(model, weights, token, plain); + embed_token(model, weights, token, plain); hc_from_plain_embedding(cur, plain, DS4_N_EMBD, DS4_N_HC); for (uint32_t il = 0; il < DS4_N_LAYER; il++) { @@ -7928,7 +8002,7 @@ static void output_hc_head_one( float *w = xmalloc((size_t)n_hc * sizeof(w[0])); rms_norm_no_weight(flat, inp_hc, hc_dim, DS4_RMS_EPS); - matvec_f16(pre, model, weights->output_hc_fn, flat); + matvec_any(pre, model, weights->output_hc_fn, flat); const float *scale = tensor_data(model, weights->output_hc_scale); const float *base = tensor_data(model, weights->output_hc_base); @@ -7972,7 +8046,7 @@ static void output_logits_one_decode_scratch( const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * n_hc; rms_norm_no_weight(scratch->output_flat, inp_hc, hc_dim, DS4_RMS_EPS); - matvec_f16(scratch->output_pre, model, weights->output_hc_fn, scratch->output_flat); + matvec_any(scratch->output_pre, model, weights->output_hc_fn, scratch->output_flat); const float *scale = tensor_data(model, weights->output_hc_scale); const float *base = tensor_data(model, weights->output_hc_base); @@ -9036,6 +9110,16 @@ static bool metal_graph_matmul_plain_tensor( uint64_t out_dim, const ds4_gpu_tensor *x, uint64_t n_tok); +static bool metal_graph_matmul_pair_plain_tensor( + ds4_gpu_tensor *out_a, + ds4_gpu_tensor *out_b, + const ds4_model *model, + const ds4_tensor *w_a, + const ds4_tensor *w_b, + uint64_t in_dim, + uint64_t out_dim, + const ds4_gpu_tensor *x, + uint64_t n_tok); static bool metal_graph_encode_decode_layer( ds4_gpu_graph *g, @@ -9233,13 +9317,14 @@ static bool metal_graph_encode_decode_layer( const bool emit = ((pos + 1u) % ratio) == 0u; if (!layer->attn_compressor_kv || !layer->attn_compressor_gate || !layer->attn_compressor_ape || !layer->attn_compressor_norm || - layer->attn_compressor_kv->type != DS4_TENSOR_F16 || - layer->attn_compressor_gate->type != DS4_TENSOR_F16 || + (layer->attn_compressor_kv->type != DS4_TENSOR_F16 && + layer->attn_compressor_kv->type != DS4_TENSOR_Q8_0) || + layer->attn_compressor_kv->type != layer->attn_compressor_gate->type || layer->attn_compressor_kv->dim[0] != DS4_N_EMBD || layer->attn_compressor_gate->dim[0] != DS4_N_EMBD || layer->attn_compressor_kv->dim[1] != comp_width || layer->attn_compressor_gate->dim[1] != comp_width) { - fprintf(stderr, "ds4: Metal graph compressor expects paired F16 compressor projections\n"); + fprintf(stderr, "ds4: Metal graph compressor expects paired F16 or Q8_0 compressor projections\n"); ok = false; } if (ok && emit && g->layer_n_comp[il] >= g->comp_cap) { @@ -9247,25 +9332,10 @@ static bool metal_graph_encode_decode_layer( ok = false; } if (ok && !metal_graph_use_reference_compressor_pair_proj()) { - ok = ds4_gpu_matmul_f16_pair_tensor(g->comp_kv_cur, - g->comp_sc_cur, - model->map, - model->size, - layer->attn_compressor_kv->abs_offset, - layer->attn_compressor_gate->abs_offset, - DS4_N_EMBD, - comp_width, - g->attn_norm, - 1) != 0; + ok = metal_graph_matmul_pair_plain_tensor(g->comp_kv_cur, g->comp_sc_cur, model, layer->attn_compressor_kv, layer->attn_compressor_gate, DS4_N_EMBD, comp_width, g->attn_norm, 1); } else { - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->comp_kv_cur, model->map, model->size, - layer->attn_compressor_kv->abs_offset, - DS4_N_EMBD, comp_width, - g->attn_norm, 1) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->comp_sc_cur, model->map, model->size, - layer->attn_compressor_gate->abs_offset, - DS4_N_EMBD, comp_width, - g->attn_norm, 1) != 0; + if (ok) ok = 
metal_graph_matmul_plain_tensor(g->comp_kv_cur, model, layer->attn_compressor_kv, DS4_N_EMBD, comp_width, g->attn_norm, 1); + if (ok) ok = metal_graph_matmul_plain_tensor(g->comp_sc_cur, model, layer->attn_compressor_gate, DS4_N_EMBD, comp_width, g->attn_norm, 1); } const uint32_t comp_row = g->layer_n_comp[il]; if (ok) ok = ds4_gpu_compressor_update_tensor(g->comp_kv_cur, @@ -9313,13 +9383,14 @@ static bool metal_graph_encode_decode_layer( const uint32_t index_width = coff * DS4_N_INDEXER_HEAD_DIM; if (!layer->indexer_compressor_kv || !layer->indexer_compressor_gate || !layer->indexer_compressor_ape || !layer->indexer_compressor_norm || - layer->indexer_compressor_kv->type != DS4_TENSOR_F16 || - layer->indexer_compressor_gate->type != DS4_TENSOR_F16 || + (layer->indexer_compressor_kv->type != DS4_TENSOR_F16 && + layer->indexer_compressor_kv->type != DS4_TENSOR_Q8_0) || + layer->indexer_compressor_kv->type != layer->indexer_compressor_gate->type || layer->indexer_compressor_kv->dim[0] != DS4_N_EMBD || layer->indexer_compressor_gate->dim[0] != DS4_N_EMBD || layer->indexer_compressor_kv->dim[1] != index_width || layer->indexer_compressor_gate->dim[1] != index_width) { - fprintf(stderr, "ds4: Metal graph indexer compressor expects paired F16 projections\n"); + fprintf(stderr, "ds4: Metal graph indexer compressor expects paired F16 or Q8_0 projections\n"); ok = false; } if (ok && emit && g->layer_n_index_comp[il] >= g->comp_cap) { @@ -9327,25 +9398,10 @@ static bool metal_graph_encode_decode_layer( ok = false; } if (ok && !metal_graph_use_reference_compressor_pair_proj()) { - ok = ds4_gpu_matmul_f16_pair_tensor(g->comp_kv_cur, - g->comp_sc_cur, - model->map, - model->size, - layer->indexer_compressor_kv->abs_offset, - layer->indexer_compressor_gate->abs_offset, - DS4_N_EMBD, - index_width, - g->attn_norm, - 1) != 0; + ok = metal_graph_matmul_pair_plain_tensor(g->comp_kv_cur, g->comp_sc_cur, model, layer->indexer_compressor_kv, layer->indexer_compressor_gate, DS4_N_EMBD, index_width, g->attn_norm, 1); } else { - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->comp_kv_cur, model->map, model->size, - layer->indexer_compressor_kv->abs_offset, - DS4_N_EMBD, index_width, - g->attn_norm, 1) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->comp_sc_cur, model->map, model->size, - layer->indexer_compressor_gate->abs_offset, - DS4_N_EMBD, index_width, - g->attn_norm, 1) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->comp_kv_cur, model, layer->indexer_compressor_kv, DS4_N_EMBD, index_width, g->attn_norm, 1); + if (ok) ok = metal_graph_matmul_plain_tensor(g->comp_sc_cur, model, layer->indexer_compressor_gate, DS4_N_EMBD, index_width, g->attn_norm, 1); } const uint32_t index_row = g->layer_n_index_comp[il]; if (ok) ok = ds4_gpu_compressor_update_tensor(g->comp_kv_cur, @@ -9377,23 +9433,22 @@ static bool metal_graph_encode_decode_layer( if (ok && g->layer_n_comp[il] > decode_top_k) { const uint64_t indexer_q_dim = (uint64_t)DS4_N_INDEXER_HEAD * DS4_N_INDEXER_HEAD_DIM; if (!layer->indexer_attn_q_b || - layer->indexer_attn_q_b->type != DS4_TENSOR_F16 || + (layer->indexer_attn_q_b->type != DS4_TENSOR_F16 && + layer->indexer_attn_q_b->type != DS4_TENSOR_Q8_0) || layer->indexer_attn_q_b->dim[0] != q_rank || layer->indexer_attn_q_b->dim[1] != indexer_q_dim) { - fprintf(stderr, "ds4: Metal graph indexer q projection expects F16 weights\n"); + fprintf(stderr, "ds4: Metal graph indexer q projection expects F16 or Q8_0 weights\n"); ok = false; } if (ok && (!layer->indexer_proj || - layer->indexer_proj->type != 
DS4_TENSOR_F16 || + (layer->indexer_proj->type != DS4_TENSOR_F16 && + layer->indexer_proj->type != DS4_TENSOR_Q8_0) || layer->indexer_proj->dim[0] != DS4_N_EMBD || layer->indexer_proj->dim[1] != DS4_N_INDEXER_HEAD)) { - fprintf(stderr, "ds4: Metal graph indexer weight projection expects F16 weights\n"); + fprintf(stderr, "ds4: Metal graph indexer weight projection expects F16 or Q8_0 weights\n"); ok = false; } - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->indexer_q, model->map, model->size, - layer->indexer_attn_q_b->abs_offset, - q_rank, indexer_q_dim, - g->qr_norm, 1) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->indexer_q, model, layer->indexer_attn_q_b, q_rank, indexer_q_dim, g->qr_norm, 1); if (ok) ok = ds4_gpu_rope_tail_tensor(g->indexer_q, 1, DS4_N_INDEXER_HEAD, DS4_N_INDEXER_HEAD_DIM, @@ -9407,10 +9462,7 @@ static bool metal_graph_encode_decode_layer( attn_factor, DS4_ROPE_YARN_BETA_FAST, DS4_ROPE_YARN_BETA_SLOW) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->indexer_weights, model->map, model->size, - layer->indexer_proj->abs_offset, - DS4_N_EMBD, DS4_N_INDEXER_HEAD, - g->attn_norm, 1) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->indexer_weights, model, layer->indexer_proj, DS4_N_EMBD, DS4_N_INDEXER_HEAD, g->attn_norm, 1); const float index_scale = 1.0f / sqrtf((float)(DS4_N_INDEXER_HEAD_DIM * DS4_N_INDEXER_HEAD)); if (ok && decode_index_stage_profile) { ok = metal_graph_indexer_stage_profile_boundary(NULL, @@ -9793,14 +9845,7 @@ static bool metal_graph_encode_output_head( uint64_t vocab_dim) { const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; bool ok = ds4_gpu_rms_norm_plain_tensor(g->flat_hc, g->cur_hc, (uint32_t)hc_dim, DS4_RMS_EPS) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->output_pre, - model->map, - model->size, - weights->output_hc_fn->abs_offset, - hc_dim, - DS4_N_HC, - g->flat_hc, - 1) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->output_pre, model, weights->output_hc_fn, hc_dim, DS4_N_HC, g->flat_hc, 1); if (ok) { metal_graph_debug_dump_tensor("result_hc_pre", g->output_pre, DS4_N_HC, DS4_N_LAYER, 0); } @@ -9893,14 +9938,7 @@ static bool metal_graph_encode_output_head_batch( (uint32_t)hc_dim, n_tokens, DS4_RMS_EPS) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(output_pre, - model->map, - model->size, - weights->output_hc_fn->abs_offset, - hc_dim, - DS4_N_HC, - g->batch_flat_hc, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(output_pre, model, weights->output_hc_fn, hc_dim, DS4_N_HC, g->batch_flat_hc, n_tokens); if (ok) ok = ds4_gpu_output_hc_weights_tensor(output_weights, output_pre, model->map, @@ -9947,16 +9985,54 @@ static bool metal_graph_matmul_plain_tensor( uint64_t out_dim, const ds4_gpu_tensor *x, uint64_t n_tok) { + int ok = 0; const char *kind = "?"; if (w->type == DS4_TENSOR_F16) { - return ds4_gpu_matmul_f16_tensor(out, model->map, model->size, - w->abs_offset, in_dim, out_dim, x, n_tok) != 0; + kind = "f16"; + ok = ds4_gpu_matmul_f16_tensor(out, model->map, model->size, + w->abs_offset, in_dim, out_dim, x, n_tok); + } else if (w->type == DS4_TENSOR_F32) { + kind = "f32"; + ok = ds4_gpu_matmul_f32_tensor(out, model->map, model->size, + w->abs_offset, in_dim, out_dim, x, n_tok); + } else if (w->type == DS4_TENSOR_Q8_0) { + kind = "q8_0"; + ok = ds4_gpu_matmul_q8_0_tensor(out, model->map, model->size, + w->abs_offset, in_dim, out_dim, x, n_tok); + } else { + fprintf(stderr, "ds4: Metal plain matmul does not support %s\n", tensor_type_name(w->type)); + return false; } - if (w->type == 
DS4_TENSOR_F32) { - return ds4_gpu_matmul_f32_tensor(out, model->map, model->size, - w->abs_offset, in_dim, out_dim, x, n_tok) != 0; + if (!ok) { + fprintf(stderr, "ds4: PLAIN_MATMUL FAIL kind=%s tensor=%.*s in_dim=%llu out_dim=%llu n_tok=%llu\n", + kind, (int)w->name.len, w->name.ptr, + (unsigned long long)in_dim, (unsigned long long)out_dim, (unsigned long long)n_tok); } - fprintf(stderr, "ds4: Metal plain matmul does not support %s\n", tensor_type_name(w->type)); - return false; + return ok != 0; +} + +/* Pair variant of metal_graph_matmul_plain_tensor: fused F16-pair kernel when + * both tensors are F16, otherwise dispatch to two single-tensor matmuls. */ +static bool metal_graph_matmul_pair_plain_tensor( + ds4_gpu_tensor *out_a, + ds4_gpu_tensor *out_b, + const ds4_model *model, + const ds4_tensor *w_a, + const ds4_tensor *w_b, + uint64_t in_dim, + uint64_t out_dim, + const ds4_gpu_tensor *x, + uint64_t n_tok) { + if (w_a->type != w_b->type) { + fprintf(stderr, "ds4: Metal plain pair matmul: paired tensors must share type\n"); + return false; + } + if (w_a->type == DS4_TENSOR_F16) { + return ds4_gpu_matmul_f16_pair_tensor(out_a, out_b, model->map, model->size, + w_a->abs_offset, w_b->abs_offset, + in_dim, out_dim, x, n_tok) != 0; + } + if (!metal_graph_matmul_plain_tensor(out_a, model, w_a, in_dim, out_dim, x, n_tok)) return false; + return metal_graph_matmul_plain_tensor(out_b, model, w_b, in_dim, out_dim, x, n_tok); } static bool metal_graph_encode_output_head_mtp( @@ -10286,7 +10362,7 @@ static int metal_graph_decode_test( int selected[DS4_N_EXPERT_USED]; float expert_weight[DS4_N_EXPERT_USED]; - embed_token_f16(model, weights, token, plain); + embed_token(model, weights, token, plain); hc_from_plain_embedding(cpu_hc, plain, DS4_N_EMBD, DS4_N_HC); hc_pre_from_state_one(model, layer->hc_attn_fn, @@ -10348,7 +10424,8 @@ static int metal_graph_decode_test( (uint32_t)weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; if (ok) ok = metal_graph_encode_decode_layer(&g, model, layer, @@ -10497,7 +10574,7 @@ static int metal_graph_first_token_full_test( float *cpu_cur = xmalloc((size_t)hc_dim * sizeof(float)); float *cpu_next = xmalloc((size_t)hc_dim * sizeof(float)); - embed_token_f16(model, weights, token, plain); + embed_token(model, weights, token, plain); hc_from_plain_embedding(cpu_cur, plain, DS4_N_EMBD, DS4_N_HC); ok = ds4_gpu_begin_commands() != 0; if (ok) ok = ds4_gpu_embed_token_hc_tensor(g.cur_hc, @@ -10507,7 +10584,8 @@ static int metal_graph_first_token_full_test( (uint32_t)weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; if (ok) ok = ds4_gpu_end_commands() != 0; for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { @@ -10556,7 +10634,8 @@ static int metal_graph_first_token_full_test( (uint32_t)weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { ok = metal_graph_encode_decode_layer(&g, model, &weights->layer[il], @@ -10637,7 +10716,8 @@ static bool metal_graph_encode_token_raw_swa( (uint32_t)weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; /* * Start executing the prefix of the decode graph while the CPU is still @@ -10741,24 +10821,10 @@ static bool metal_graph_refresh_ratio4_compressor_state( 4ull * DS4_N_EMBD * sizeof(float)); bool ok = 
tail_hc != NULL; if (ok) { - ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_kv, - model->map, - model->size, - kv_weight->abs_offset, - DS4_N_EMBD, - width, - tail_hc, - 4) != 0; + ok = metal_graph_matmul_plain_tensor(g->batch_comp_kv, model, kv_weight, DS4_N_EMBD, width, tail_hc, 4); } if (ok) { - ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_sc, - model->map, - model->size, - score_weight->abs_offset, - DS4_N_EMBD, - width, - tail_hc, - 4) != 0; + ok = metal_graph_matmul_plain_tensor(g->batch_comp_sc, model, score_weight, DS4_N_EMBD, width, tail_hc, 4); } if (ok) { ok = ds4_gpu_compressor_prefill_state_ratio4_tensor(state_kv, @@ -10793,7 +10859,7 @@ static bool metal_graph_upload_prompt_embeddings_hc_cpu( float *plain = xmalloc((size_t)DS4_N_EMBD * sizeof(plain[0])); for (uint32_t t = 0; t < n_tokens; t++) { - embed_token_f16(model, weights, prompt->v[pos0 + t], plain); + embed_token(model, weights, prompt->v[pos0 + t], plain); float *dst = hc + (uint64_t)t * hc_dim; for (uint32_t h = 0; h < DS4_N_HC; h++) { memcpy(dst + (uint64_t)h * DS4_N_EMBD, @@ -10838,7 +10904,8 @@ static bool metal_graph_upload_prompt_embeddings_hc( (uint32_t)weights->token_embd->dim[1], n_tokens, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; } return metal_graph_upload_prompt_embeddings_hc_cpu(out_hc, @@ -10869,14 +10936,7 @@ static bool metal_graph_warmup_prefill_kernels( bool ok = ds4_gpu_begin_commands() != 0; if (ok) { - ok = ds4_gpu_matmul_f16_tensor(g->batch_hc_mix, - model->map, - model->size, - weights->layer[0].hc_attn_fn->abs_offset, - hc_dim, - mix_hc, - g->batch_flat_hc, - n_tokens) != 0; + ok = metal_graph_matmul_plain_tensor(g->batch_hc_mix, model, weights->layer[0].hc_attn_fn, hc_dim, mix_hc, g->batch_flat_hc, n_tokens); } if (ok) ok = ds4_gpu_end_commands() != 0; if (!ok) { @@ -11016,14 +11076,7 @@ static bool metal_graph_encode_layer_attention_batch( (uint32_t)hc_dim, n_tokens, DS4_RMS_EPS) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(hc_mix_view, - model->map, - model->size, - layer->hc_attn_fn->abs_offset, - hc_dim, - mix_hc, - g->batch_flat_hc, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(hc_mix_view, model, layer->hc_attn_fn, hc_dim, mix_hc, g->batch_flat_hc, n_tokens); if (metal_graph_use_reference_hc_decode()) { if (ok) ok = ds4_gpu_hc_split_sinkhorn_tensor(hc_split_view, hc_mix_view, @@ -11307,27 +11360,13 @@ static bool metal_graph_encode_layer_attention_batch( fprintf(stderr, "ds4: Metal layer-major prefill needs attention compressor weights\n"); ok = false; } - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_kv, - model->map, - model->size, - layer->attn_compressor_kv->abs_offset, - DS4_N_EMBD, - comp_width, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_comp_kv, model, layer->attn_compressor_kv, DS4_N_EMBD, comp_width, g->batch_attn_norm, n_tokens); if (ok) metal_graph_debug_dump_tensor("attn_comp_kv_raw", g->batch_comp_kv, (uint64_t)comp_width * n_tokens, il, pos0); - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_sc, - model->map, - model->size, - layer->attn_compressor_gate->abs_offset, - DS4_N_EMBD, - comp_width, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_comp_sc, model, layer->attn_compressor_gate, DS4_N_EMBD, comp_width, g->batch_attn_norm, n_tokens); if (ok) metal_graph_debug_dump_tensor("attn_comp_score_raw", g->batch_comp_sc, (uint64_t)comp_width * n_tokens, @@ -11585,40 +11624,22 @@ static bool 
metal_graph_encode_layer_attention_batch( fprintf(stderr, "ds4: Metal layer-major prefill needs indexer weights\n"); ok = false; } - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_kv, - model->map, - model->size, - layer->indexer_compressor_kv->abs_offset, - DS4_N_EMBD, - index_width, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_comp_kv, model, layer->indexer_compressor_kv, DS4_N_EMBD, index_width, g->batch_attn_norm, n_tokens); if (ok) metal_graph_debug_dump_tensor("indexer_comp_kv_raw", g->batch_comp_kv, (uint64_t)index_width * n_tokens, il, pos0); - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_sc, - model->map, - model->size, - layer->indexer_compressor_gate->abs_offset, - DS4_N_EMBD, - index_width, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_comp_sc, model, layer->indexer_compressor_gate, DS4_N_EMBD, index_width, g->batch_attn_norm, n_tokens); if (ok) metal_graph_debug_dump_tensor("indexer_comp_score_raw", g->batch_comp_sc, (uint64_t)index_width * n_tokens, il, pos0); - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_indexer_q, - model->map, - model->size, - layer->indexer_attn_q_b->abs_offset, - q_rank, - (uint64_t)DS4_N_INDEXER_HEAD * DS4_N_INDEXER_HEAD_DIM, - g->batch_qr_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_indexer_q, model, layer->indexer_attn_q_b, + q_rank, + (uint64_t)DS4_N_INDEXER_HEAD * DS4_N_INDEXER_HEAD_DIM, + g->batch_qr_norm, n_tokens); if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_indexer_q, n_tokens, DS4_N_INDEXER_HEAD, @@ -11633,14 +11654,7 @@ static bool metal_graph_encode_layer_attention_batch( attn_factor, DS4_ROPE_YARN_BETA_FAST, DS4_ROPE_YARN_BETA_SLOW) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_indexer_weights, - model->map, - model->size, - layer->indexer_proj->abs_offset, - DS4_N_EMBD, - DS4_N_INDEXER_HEAD, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_indexer_weights, model, layer->indexer_proj, DS4_N_EMBD, DS4_N_INDEXER_HEAD, g->batch_attn_norm, n_tokens); if (zero_prefix) { if (ok && n_comp > g->comp_cap) { fprintf(stderr, "ds4: Metal layer-major indexer cache capacity exceeded at layer %u\n", il); @@ -12298,14 +12312,7 @@ static bool metal_graph_encode_layer_ffn_batch( (uint32_t)hc_dim, n_tokens, DS4_RMS_EPS) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(hc_mix_view, - model->map, - model->size, - layer->hc_ffn_fn->abs_offset, - hc_dim, - mix_hc, - g->batch_flat_hc, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(hc_mix_view, model, layer->hc_ffn_fn, hc_dim, mix_hc, g->batch_flat_hc, n_tokens); if (metal_graph_use_reference_hc_decode()) { if (ok) ok = ds4_gpu_hc_split_sinkhorn_tensor(hc_split_view, hc_mix_view, @@ -12353,14 +12360,7 @@ static bool metal_graph_encode_layer_ffn_batch( (uint64_t)n_tokens * DS4_N_EMBD, il, pos0); } DS4_METAL_PROFILE_FFN_STAGE("norm"); - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_router_logits, - model->map, - model->size, - layer->ffn_gate_inp->abs_offset, - DS4_N_EMBD, - DS4_N_EXPERT, - g->batch_ffn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_router_logits, model, layer->ffn_gate_inp, DS4_N_EMBD, DS4_N_EXPERT, g->batch_ffn_norm, n_tokens); if (ok) ok = ds4_gpu_router_select_batch_tensor(g->batch_router_selected, g->batch_router_weights, @@ -12523,7 +12523,9 @@ static bool metal_graph_encode_layer_batch( uint32_t pos0, uint32_t n_tokens) { bool ok = 
metal_graph_encode_layer_attention_batch(g, model, layer, il, pos0, n_tokens); - if (ok) ok = metal_graph_encode_layer_ffn_batch(g, model, layer, il, pos0, n_tokens); + if (!ok) { fprintf(stderr, "ds4: TRACE attn_batch FAILED il=%u pos0=%u n_tokens=%u\n", il, pos0, n_tokens); return false; } + ok = metal_graph_encode_layer_ffn_batch(g, model, layer, il, pos0, n_tokens); + if (!ok) { fprintf(stderr, "ds4: TRACE ffn_batch FAILED il=%u pos0=%u n_tokens=%u\n", il, pos0, n_tokens); return false; } if (ok) { ds4_gpu_tensor *tmp = g->batch_cur_hc; g->batch_cur_hc = g->batch_next_hc; @@ -12639,7 +12641,8 @@ static bool metal_graph_eval_mtp_draft_from_hc( (uint32_t)base_weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - 1) != 0; + 1, + base_weights->token_embd->type) != 0; if (ok) ok = ds4_gpu_rms_norm_weight_tensor(g->mtp_enorm, g->mtp_embed, mtp_model->map, @@ -13360,7 +13363,8 @@ static bool metal_graph_verify_decode2_exact( (uint32_t)weights->token_embd->dim[1], (uint32_t)token0, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; if (ok) ok = ds4_gpu_embed_token_hc_tensor(cur1, model->map, model->size, @@ -13368,7 +13372,8 @@ static bool metal_graph_verify_decode2_exact( (uint32_t)weights->token_embd->dim[1], (uint32_t)token1, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; ds4_gpu_tensor *saved_cur = g->cur_hc; ds4_gpu_tensor *saved_after = g->after_ffn_hc; @@ -13765,7 +13770,7 @@ static void embed_prompt( uint32_t n_embd, float * out) { for (int i = 0; i < tokens->len; i++) { - embed_token_f16(model, weights, tokens->v[i], out + (uint64_t)i * n_embd); + embed_token(model, weights, tokens->v[i], out + (uint64_t)i * n_embd); } } diff --git a/ds4_gpu.h b/ds4_gpu.h index 799065a8..3ae48c13 100644 --- a/ds4_gpu.h +++ b/ds4_gpu.h @@ -50,6 +50,7 @@ void ds4_gpu_print_memory_report(const char *label); * compressed-attention indexer that chooses visible compressed rows. */ +/* `weight_type` is the GGUF tensor type code (1 for F16, 8 for Q8_0). */ int ds4_gpu_embed_token_hc_tensor( ds4_gpu_tensor *out_hc, const void *model_map, @@ -58,7 +59,8 @@ int ds4_gpu_embed_token_hc_tensor( uint32_t n_vocab, uint32_t token, uint32_t n_embd, - uint32_t n_hc); + uint32_t n_hc, + uint32_t weight_type); int ds4_gpu_embed_tokens_hc_tensor( ds4_gpu_tensor *out_hc, @@ -69,7 +71,8 @@ int ds4_gpu_embed_tokens_hc_tensor( uint32_t n_vocab, uint32_t n_tokens, uint32_t n_embd, - uint32_t n_hc); + uint32_t n_hc, + uint32_t weight_type); int ds4_gpu_indexer_score_one_tensor( ds4_gpu_tensor *scores, diff --git a/ds4_metal.m b/ds4_metal.m index 6870bf9a..b87963d1 100644 --- a/ds4_metal.m +++ b/ds4_metal.m @@ -26,6 +26,8 @@ */ enum { + DS4_METAL_TENSOR_F16 = 1, + DS4_METAL_TENSOR_Q8_0 = 8, DS4_METAL_TENSOR_Q2_K = 10, DS4_METAL_TENSOR_Q4_K = 12, DS4_METAL_TENSOR_IQ2_XXS = 16, @@ -40,12 +42,14 @@ static id g_set_rows_f32_i32_pipeline; static id g_get_rows_f32_pipeline; static id g_get_rows_f16_pipeline; +static id g_get_rows_q8_0_pipeline; static id g_get_rows_i32_pipeline; static id g_repeat_f32_pipeline; static id g_concat_pipeline; static id g_cpy_f32_f32_pipeline; static id g_cpy_f32_f16_pipeline; static id g_cpy_f16_f32_pipeline; +static id g_cpy_q8_0_f32_pipeline; static id g_swiglu_pipeline; static id g_add_pipeline; static id g_mul_pipeline; @@ -1401,6 +1405,69 @@ static float ds4_gpu_negative_infinity(void) { return v.f; } +/* Total bytes for `n_elems` of an APE-style weight tensor of `ape_type`. 
+ * F32 (0) = 4 bytes/elem; F16 (1) = 2 bytes/elem; Q8_0 (8) = 34 bytes per + * QK8_0=32 elements (caller must ensure n_elems is 32-aligned for Q8_0). */ +static uint64_t ds4_gpu_ape_bytes(uint32_t ape_type, uint64_t n_elems) { + if (ape_type == 8u) return (n_elems / 32u) * 34u; + return n_elems * (ape_type == 1u ? 2u : 4u); +} + +/* Convert IEEE-754 half (bits) to float on the CPU. Used by the Q8_0 ape + * CPU-side dequant; the Metal-side `kernel_cpy_q8_0_f32` path turned out to + * produce silently wrong output on M5 Max for the compressor APE shapes we + * hit, so the pre-add scratch is filled from the host instead. */ +static float ds4_gpu_half_bits_to_float(uint16_t bits) { + const uint32_t sign = (uint32_t)(bits >> 15) & 0x1u; + const uint32_t exph = (uint32_t)(bits >> 10) & 0x1fu; + const uint32_t mant = (uint32_t)bits & 0x3ffu; + uint32_t f32bits; + if (exph == 0u) { + if (mant == 0u) { + f32bits = sign << 31; + } else { + /* subnormal half: value = mant * 2^-24; count shifts to normalize */ + uint32_t e = 0u, m = mant; + while ((m & 0x400u) == 0u) { m <<= 1; e++; } + m &= 0x3ffu; + f32bits = (sign << 31) | ((127u - 15u - e + 1u) << 23) | (m << 13); + } + } else if (exph == 0x1fu) { + f32bits = (sign << 31) | (0xffu << 23) | (mant << 13); + } else { + f32bits = (sign << 31) | ((exph + 127u - 15u) << 23) | (mant << 13); + } + float v; + memcpy(&v, &f32bits, sizeof(v)); + return v; +} + +/* Dequantize `row_count` rows of `row_elems` Q8_0-stored values from `src` + * into contiguous F32 at `dst`. `row_elems` must be a multiple of QK8_0=32. + * Source layout is contiguous block_q8_0 records (no per-row padding), so + * one row is `row_elems / 32` blocks of 34 bytes each. */ +static void ds4_gpu_cpu_dequant_q8_0_rows( + float *dst, + const uint8_t *src, + uint32_t row_count, + uint32_t row_elems) { + const uint32_t blocks_per_row = row_elems / 32u; + for (uint32_t r = 0; r < row_count; r++) { + const uint8_t *row = src + (uint64_t)r * (uint64_t)blocks_per_row * 34u; + float *out = dst + (uint64_t)r * row_elems; + for (uint32_t b = 0; b < blocks_per_row; b++) { + const uint8_t *blk = row + (uint64_t)b * 34u; + uint16_t scale_bits; + memcpy(&scale_bits, blk, sizeof(scale_bits)); + const float scale = ds4_gpu_half_bits_to_float(scale_bits); + const int8_t *qs = (const int8_t *)(blk + 2); + float *out_blk = out + (uint64_t)b * 32u; + for (uint32_t k = 0; k < 32u; k++) { + out_blk[k] = (float)qs[k] * scale; + } + } + } +} + static float ds4_gpu_positive_infinity(void) { union { uint32_t u; float f; } v = { 0x7f800000u }; return v.f; @@ -2742,6 +2809,23 @@ int ds4_gpu_init(void) { return 0; } + fn = [library newFunctionWithName:@"kernel_get_rows_q8_0"]; + if (!fn) { + fprintf(stderr, "ds4: Metal kernel_get_rows_q8_0 function not found\n"); + g_queue = nil; + g_device = nil; + return 0; + } + + g_get_rows_q8_0_pipeline = [g_device newComputePipelineStateWithFunction:fn error:&error]; + if (!g_get_rows_q8_0_pipeline) { + fprintf(stderr, "ds4: Metal kernel_get_rows_q8_0 pipeline failed: %s\n", + [[error localizedDescription] UTF8String]); + g_queue = nil; + g_device = nil; + return 0; + } + fn = [library newFunctionWithName:@"kernel_get_rows_i32"]; if (!fn) { fprintf(stderr, "ds4: Metal kernel_get_rows_i32 function not found\n"); @@ -2861,6 +2945,22 @@ int ds4_gpu_init(void) { return 0; } + fn = [library newFunctionWithName:@"kernel_cpy_q8_0_f32"]; + if (!fn) { + fprintf(stderr, "ds4: Metal kernel_cpy_q8_0_f32 function not found\n"); + g_queue = nil; + g_device = nil; + return 0; + } + g_cpy_q8_0_f32_pipeline = [g_device newComputePipelineStateWithFunction:fn 
error:&error]; + if (!g_cpy_q8_0_f32_pipeline) { + fprintf(stderr, "ds4: Metal kernel_cpy_q8_0_f32 pipeline failed: %s\n", + [[error localizedDescription] UTF8String]); + g_queue = nil; + g_device = nil; + return 0; + } + fn = [library newFunctionWithName:@"kernel_dsv4_fp8_kv_quantize_f32"]; if (!fn) { fprintf(stderr, "ds4: Metal kernel_dsv4_fp8_kv_quantize_f32 function not found\n"); @@ -3955,12 +4055,14 @@ void ds4_gpu_cleanup(void) { g_set_rows_f32_i32_pipeline = nil; g_get_rows_f32_pipeline = nil; g_get_rows_f16_pipeline = nil; + g_get_rows_q8_0_pipeline = nil; g_get_rows_i32_pipeline = nil; g_repeat_f32_pipeline = nil; g_concat_pipeline = nil; g_cpy_f32_f32_pipeline = nil; g_cpy_f32_f16_pipeline = nil; g_cpy_f16_f32_pipeline = nil; + g_cpy_q8_0_f32_pipeline = nil; g_swiglu_pipeline = nil; g_add_pipeline = nil; g_mul_pipeline = nil; @@ -4090,7 +4192,9 @@ void ds4_gpu_cleanup(void) { } } -static int ds4_gpu_encode_get_rows_f16( +static int ds4_gpu_embed_row_layout(uint32_t, uint32_t, uint64_t *, id *); + +static int ds4_gpu_encode_get_rows( id cb, id weight, NSUInteger weight_offset, @@ -4100,12 +4204,16 @@ static int ds4_gpu_encode_get_rows_f16( NSUInteger out_offset, uint32_t n_vocab, uint32_t n_tokens, - uint32_t n_embd) { + uint32_t n_embd, + uint32_t weight_type) { if (!cb || !weight || !tokens || !out || n_vocab == 0 || n_tokens == 0 || n_embd == 0) { return 0; } - const uint64_t src_row_bytes = (uint64_t)n_embd * sizeof(uint16_t); + uint64_t src_row_bytes = 0; + id pipeline = nil; + if (!ds4_gpu_embed_row_layout(weight_type, n_embd, &src_row_bytes, &pipeline)) return 0; + const uint64_t dst_row_bytes = (uint64_t)n_embd * sizeof(float); const uint64_t token_bytes = (uint64_t)n_tokens * sizeof(int32_t); ds4_gpu_get_rows_args args = { @@ -4124,13 +4232,13 @@ static int ds4_gpu_encode_get_rows_f16( }; NSUInteger nth = (NSUInteger)n_embd; - const NSUInteger max_threads = g_get_rows_f16_pipeline.maxTotalThreadsPerThreadgroup; + const NSUInteger max_threads = pipeline.maxTotalThreadsPerThreadgroup; if (nth > max_threads) nth = max_threads; if (nth == 0) nth = 1; const NSUInteger nw0 = ((NSUInteger)n_embd + nth - 1u) / nth; id enc = ds4_gpu_compute_encoder(cb); - [enc setComputePipelineState:g_get_rows_f16_pipeline]; + [enc setComputePipelineState:pipeline]; [enc setBytes:&args length:sizeof(args) atIndex:0]; [enc setBuffer:weight offset:weight_offset atIndex:1]; [enc setBuffer:tokens offset:tokens_offset atIndex:2]; @@ -4188,6 +4296,33 @@ static int ds4_gpu_encode_repeat_hc_embedding( return 1; } +/* Per-row stride and gather pipeline for a token-embedding tensor type. + * Returns 0 (and a NULL pipeline) if the type is not supported. */ +static int ds4_gpu_embed_row_layout( + uint32_t weight_type, + uint32_t n_embd, + uint64_t *out_src_row_bytes, + id *out_pipeline) { + switch (weight_type) { + case DS4_METAL_TENSOR_F16: + *out_src_row_bytes = (uint64_t)n_embd * sizeof(uint16_t); + *out_pipeline = g_get_rows_f16_pipeline; + return 1; + case DS4_METAL_TENSOR_Q8_0: + if ((n_embd % 32) != 0) { + fprintf(stderr, "ds4: Q8_0 token embedding stride %u is not 32-aligned\n", n_embd); + return 0; + } + /* block_q8_0 is sizeof(half) + 32*sizeof(int8_t) = 34 bytes per QK8_0 elements. 
*/ + *out_src_row_bytes = (uint64_t)(n_embd / 32) * 34; + *out_pipeline = g_get_rows_q8_0_pipeline; + return 1; + default: + fprintf(stderr, "ds4: token embedding type %u not supported by Metal embed\n", weight_type); + return 0; + } +} + int ds4_gpu_embed_token_hc_tensor( ds4_gpu_tensor *out_hc, const void *model_map, @@ -4196,7 +4331,8 @@ int ds4_gpu_embed_token_hc_tensor( uint32_t n_vocab, uint32_t token, uint32_t n_embd, - uint32_t n_hc) { + uint32_t n_hc, + uint32_t weight_type) { if (!g_initialized && !ds4_gpu_init()) return 0; if (!out_hc || !model_map || n_vocab == 0 || token >= n_vocab || n_embd == 0 || n_hc == 0) { return 0; @@ -4210,7 +4346,11 @@ int ds4_gpu_embed_token_hc_tensor( return 0; } - const uint64_t weight_bytes = (uint64_t)n_vocab * n_embd * sizeof(uint16_t); + uint64_t src_row_bytes = 0; + id pipeline = nil; + if (!ds4_gpu_embed_row_layout(weight_type, n_embd, &src_row_bytes, &pipeline)) return 0; + + const uint64_t weight_bytes = (uint64_t)n_vocab * src_row_bytes; if (weight_offset > model_size || weight_bytes > model_size - weight_offset) { fprintf(stderr, "ds4: Metal graph embedding range is outside the mapped model\n"); return 0; @@ -4233,7 +4373,6 @@ int ds4_gpu_embed_token_hc_tensor( if (!cb) return 0; const int32_t token_i32 = (int32_t)token; - const uint64_t src_row_bytes = (uint64_t)n_embd * sizeof(uint16_t); const uint64_t dst_row_bytes = (uint64_t)n_embd * sizeof(float); ds4_gpu_get_rows_args args = { .ne00t = (int32_t)n_embd, @@ -4250,12 +4389,12 @@ int ds4_gpu_embed_token_hc_tensor( .nb3 = dst_row_bytes, }; NSUInteger nth = (NSUInteger)n_embd; - const NSUInteger max_threads = g_get_rows_f16_pipeline.maxTotalThreadsPerThreadgroup; + const NSUInteger max_threads = pipeline.maxTotalThreadsPerThreadgroup; if (nth > max_threads) nth = max_threads; if (nth == 0) nth = 1; const NSUInteger nw0 = ((NSUInteger)n_embd + nth - 1u) / nth; id enc = ds4_gpu_compute_encoder(cb); - [enc setComputePipelineState:g_get_rows_f16_pipeline]; + [enc setComputePipelineState:pipeline]; [enc setBytes:&args length:sizeof(args) atIndex:0]; [enc setBuffer:wbuf offset:(NSUInteger)inner_offset atIndex:1]; [enc setBytes:&token_i32 length:sizeof(token_i32) atIndex:2]; @@ -4290,7 +4429,8 @@ int ds4_gpu_embed_tokens_hc_tensor( uint32_t n_vocab, uint32_t n_tokens, uint32_t n_embd, - uint32_t n_hc) { + uint32_t n_hc, + uint32_t weight_type) { if (!g_initialized && !ds4_gpu_init()) return 0; if (!out_hc || !tokens || !model_map || n_vocab == 0 || n_tokens == 0 || n_embd == 0 || n_hc == 0) { return 0; @@ -4308,7 +4448,11 @@ int ds4_gpu_embed_tokens_hc_tensor( return 0; } - const uint64_t weight_bytes = (uint64_t)n_vocab * n_embd * sizeof(uint16_t); + uint64_t src_row_bytes = 0; + id _row_pipeline = nil; + if (!ds4_gpu_embed_row_layout(weight_type, n_embd, &src_row_bytes, &_row_pipeline)) return 0; + + const uint64_t weight_bytes = (uint64_t)n_vocab * src_row_bytes; if (weight_offset > model_size || weight_bytes > model_size - weight_offset) { fprintf(stderr, "ds4: Metal graph batched embedding range is outside the mapped model\n"); return 0; @@ -4330,16 +4474,17 @@ int ds4_gpu_embed_tokens_hc_tensor( id cb = ds4_gpu_command_buffer(&owned); if (!cb) return 0; - if (!ds4_gpu_encode_get_rows_f16(cb, - wbuf, - (NSUInteger)inner_offset, - tokbuf, - ds4_gpu_tensor_offset(tokens), - g_embed_rows_buffer, - 0, - n_vocab, - n_tokens, - n_embd) || + if (!ds4_gpu_encode_get_rows(cb, + wbuf, + (NSUInteger)inner_offset, + tokbuf, + ds4_gpu_tensor_offset(tokens), + g_embed_rows_buffer, + 0, + n_vocab, + 
n_tokens, + n_embd, + weight_type) || !ds4_gpu_encode_repeat_hc_embedding(cb, g_embed_rows_buffer, 0, @@ -5353,13 +5498,13 @@ int ds4_gpu_matmul_f32_tensor( const ds4_gpu_tensor *x, uint64_t n_tok) { if (!g_initialized && !ds4_gpu_init()) return 0; - if (in_dim > UINT32_MAX || out_dim > UINT32_MAX || n_tok > UINT32_MAX || n_tok != 1) return 0; + if (in_dim > UINT32_MAX || out_dim > UINT32_MAX || n_tok > UINT32_MAX) return 0; @autoreleasepool { id xbuf = ds4_gpu_tensor_buffer(x); id outbuf = ds4_gpu_tensor_buffer(out); - const uint64_t x_bytes = in_dim * sizeof(float); - const uint64_t out_bytes = out_dim * sizeof(float); + const uint64_t x_bytes = n_tok * in_dim * sizeof(float); + const uint64_t out_bytes = n_tok * out_dim * sizeof(float); if (!xbuf || !outbuf || ds4_gpu_tensor_bytes(x) < x_bytes || ds4_gpu_tensor_bytes(out) < out_bytes) { @@ -5382,6 +5527,36 @@ int ds4_gpu_matmul_f32_tensor( id cb = ds4_gpu_command_buffer(&owned); if (!cb) return 0; + if (n_tok > 1) { + /* Multi-token prefill path: kernel_mul_mm_f32_f32 mirrors the + * existing F16/Q8_0 mul_mm_t instantiations in dense.metal. */ + const bool bc_inp = (in_dim % 32u) != 0; + const bool bc_out = (out_dim % 64u) != 0 || (n_tok % 32u) != 0; + id pipeline = + ds4_gpu_get_mul_mm_pipeline("kernel_mul_mm_f32_f32", bc_inp, bc_out); + if (!pipeline) return 0; + + ds4_gpu_mul_mm_args args = ds4_gpu_make_mm_args(in_dim, out_dim, n_tok, row_bytes); + + id enc = ds4_gpu_compute_encoder(cb); + [enc setComputePipelineState:pipeline]; + [enc setBytes:&args length:sizeof(args) atIndex:0]; + [enc setBuffer:wbuf offset:(NSUInteger)inner_offset atIndex:1]; + [enc setBuffer:xbuf offset:ds4_gpu_tensor_offset(x) atIndex:2]; + [enc setBuffer:outbuf offset:ds4_gpu_tensor_offset(out) atIndex:3]; + [enc setThreadgroupMemoryLength:(bc_out ? 8192u : 6144u) atIndex:0]; + [enc dispatchThreadgroups:MTLSizeMake(((NSUInteger)n_tok + 31u) / 32u, + ((NSUInteger)out_dim + 63u) / 64u, + 1) + threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + ds4_gpu_end_compute_encoder(cb, enc); + + if (!ds4_gpu_finish_command_buffer(cb, owned, "F32 tensor matmul")) { + return 0; + } + return 1; + } + ds4_gpu_q8_0_matvec_args mv_args = ds4_gpu_make_f32_mv_args(in_dim, out_dim, 1); ds4_gpu_mv_dispatch mv_dispatch = ds4_gpu_make_plain_mv_dispatch(in_dim, 1); mv_args.nr0 = mv_dispatch.nr0; @@ -6171,7 +6346,15 @@ static int ds4_gpu_encode_compressor_score_with_ape( uint32_t n_tokens) { if (!cb || !score_src || !score_dst || !apebuf || width == 0 || ratio == 0 || n_tokens == 0 || - (ape_type != 0u && ape_type != 1u)) { + (ape_type != 0u && ape_type != 1u && ape_type != 8u)) { + return 0; + } + /* Q8_0 ape rows must be QK8_0-aligned (the dequant kernel walks whole + * blocks and src_off must land on a block boundary). */ + if (ape_type == 8u && (width % 32u) != 0u) { + fprintf(stderr, + "ds4: Metal compressor APE Q8_0 width %u is not 32-aligned\n", + width); return 0; } @@ -6189,34 +6372,93 @@ static int ds4_gpu_encode_compressor_score_with_ape( return 0; } - const uint64_t elem_ape = ape_type == 1u ? 
2u : 4u; - uint32_t copied_rows = 0; - uint32_t pos_mod = pos0 % ratio; - while (copied_rows < n_tokens) { - uint32_t seg_rows = ratio - pos_mod; - if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; - const uint32_t seg_elems = seg_rows * width; - const NSUInteger src_off = ape_offset + (NSUInteger)pos_mod * width * elem_ape; - const NSUInteger dst_off = (NSUInteger)copied_rows * width * sizeof(float); - int ok; - if (ape_type == 1u) { - ok = ds4_gpu_encode_cpy_f16_f32_1d(cb, - apebuf, - src_off, - g_compressor_store_ape_buffer, - dst_off, - seg_elems); - } else { - ok = ds4_gpu_encode_cpy_f32_f32_1d(cb, - apebuf, - src_off, - g_compressor_store_ape_buffer, - dst_off, - seg_elems); + /* Per-row source stride. F16/F32 use natural element bytes; Q8_0 uses + * (width/32) * sizeof(block_q8_0) = (width/32) * 34 bytes. */ + const uint64_t row_src_bytes = ape_type == 8u + ? (uint64_t)(width / 32u) * 34u + : (uint64_t)width * (ape_type == 1u ? 2u : 4u); + + /* For Q8_0 ape, dequant on CPU into a *per-call* private buffer. The + * Metal-side `kernel_cpy_q8_0_f32` path produced silently wrong output on + * M5 Max for compressor APE shapes; using the shared g_compressor_store_ape_buffer + * with CPU writes also produces wrong output because multiple CPU writes + * to the same scratch in one command buffer collapse to the last write + * at execute time (Metal kernels run in encode order, but CPU writes + * don't participate in that ordering when the same scratch is reused). + * + * The local buffer is retained until cb completes via addCompletedHandler + * because Metal does NOT strongly retain buffers bound to encoders. */ + if (ape_type == 8u) { + const uint8_t *apebytes = (const uint8_t *) [apebuf contents]; + if (!apebytes) { + fprintf(stderr, "ds4: Metal compressor APE Q8_0: source buffer has no CPU contents\n"); + return 0; + } + const NSUInteger ape_call_bytes = (NSUInteger)total_elems * sizeof(float); + id ape_call_buf = [g_device newBufferWithLength:ape_call_bytes + options:MTLResourceStorageModeShared]; + if (!ape_call_buf) { + fprintf(stderr, "ds4: Metal compressor APE Q8_0: per-call scratch alloc failed\n"); + return 0; + } + float *scratch = (float *) [ape_call_buf contents]; + uint32_t copied_rows = 0; + uint32_t pos_mod = pos0 % ratio; + while (copied_rows < n_tokens) { + uint32_t seg_rows = ratio - pos_mod; + if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; + const uint64_t src_off = (uint64_t)ape_offset + (uint64_t)pos_mod * row_src_bytes; + ds4_gpu_cpu_dequant_q8_0_rows(scratch + (uint64_t)copied_rows * width, + apebytes + src_off, + seg_rows, + width); + copied_rows += seg_rows; + pos_mod = 0; + } + const int add_ok = ds4_gpu_encode_add_f32_1d(cb, + score_src, + score_src_offset, + ape_call_buf, + 0, + score_dst, + score_dst_offset, + total_elems); + if (add_ok) { + /* Keep ape_call_buf alive until the GPU is done reading it. 
*/ + [cb addCompletedHandler:^(id _Nonnull __unused done) { + (void) ape_call_buf; + }]; + } + return add_ok; + } else { + uint32_t copied_rows = 0; + uint32_t pos_mod = pos0 % ratio; + while (copied_rows < n_tokens) { + uint32_t seg_rows = ratio - pos_mod; + if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; + const uint32_t seg_elems = seg_rows * width; + const NSUInteger src_off = ape_offset + (NSUInteger)pos_mod * row_src_bytes; + const NSUInteger dst_off = (NSUInteger)copied_rows * width * sizeof(float); + int ok; + if (ape_type == 1u) { + ok = ds4_gpu_encode_cpy_f16_f32_1d(cb, + apebuf, + src_off, + g_compressor_store_ape_buffer, + dst_off, + seg_elems); + } else { + ok = ds4_gpu_encode_cpy_f32_f32_1d(cb, + apebuf, + src_off, + g_compressor_store_ape_buffer, + dst_off, + seg_elems); + } + if (!ok) return 0; + copied_rows += seg_rows; + pos_mod = 0; } - if (!ok) return 0; - copied_rows += seg_rows; - pos_mod = 0; } return ds4_gpu_encode_add_f32_1d(cb, @@ -6302,7 +6544,7 @@ static int ds4_gpu_compressor_store_one_tensor( uint32_t ratio, uint32_t pos) { if (!kv || !sc || !state_kv || !state_score || !model_map || - width == 0 || ratio == 0 || (ape_type != 0u && ape_type != 1u)) { + width == 0 || ratio == 0 || (ape_type != 0u && ape_type != 1u && ape_type != 8u)) { return 0; } @@ -6312,10 +6554,9 @@ static int ds4_gpu_compressor_store_one_tensor( if (!pipeline) return 0; const uint32_t state_rows = ratio == 4u ? 2u * ratio : ratio; - const uint64_t elem_ape = ape_type == 1u ? 2u : 4u; const uint64_t row_bytes = (uint64_t)width * sizeof(float); const uint64_t state_bytes = (uint64_t)state_rows * row_bytes; - const uint64_t ape_bytes = (uint64_t)width * ratio * elem_ape; + const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width * ratio); if (ape_offset > model_size || ape_bytes > model_size - ape_offset || ds4_gpu_tensor_bytes(kv) < row_bytes || ds4_gpu_tensor_bytes(sc) < row_bytes || @@ -6377,7 +6618,7 @@ int ds4_gpu_compressor_store_batch_tensor( if (!g_initialized && !ds4_gpu_init()) return 0; if (!kv || !sc || !state_kv || !state_score || !model_map || head_dim == 0 || ratio == 0 || n_tokens == 0 || - (ape_type != 0u && ape_type != 1u)) { + (ape_type != 0u && ape_type != 1u && ape_type != 8u)) { return 0; } @@ -6385,10 +6626,9 @@ int ds4_gpu_compressor_store_batch_tensor( const uint32_t coff = ratio == 4u ? 2u : 1u; const uint32_t width = coff * head_dim; const uint32_t state_rows = coff * ratio; - const uint64_t elem_ape = ape_type == 1u ? 
2u : 4u; const uint64_t kv_bytes = (uint64_t)n_tokens * width * sizeof(float); const uint64_t state_bytes = (uint64_t)state_rows * width * sizeof(float); - const uint64_t ape_bytes = (uint64_t)width * ratio * elem_ape; + const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width * ratio); if (ape_offset > model_size || ape_bytes > model_size - ape_offset) { fprintf(stderr, "ds4: Metal compressor batch APE range is outside the mapped model\n"); @@ -6450,44 +6690,83 @@ int ds4_gpu_compressor_store_batch_tensor( } int ok = 1; - uint32_t copied_rows = 0; - uint32_t pos_mod = pos0 % ratio; - while (ok && copied_rows < n_tokens) { - uint32_t seg_rows = ratio - pos_mod; - if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; - const uint32_t seg_elems = seg_rows * width; - const NSUInteger src_off = (NSUInteger)ape_inner + - (NSUInteger)pos_mod * width * elem_ape; - const NSUInteger dst_off = (NSUInteger)copied_rows * width * sizeof(float); - if (ape_type == 1u) { - ok = ds4_gpu_encode_cpy_f16_f32_1d(cb, - apebuf, - src_off, - g_compressor_store_ape_buffer, - dst_off, - seg_elems); - } else { - ok = ds4_gpu_encode_cpy_f32_f32_1d(cb, - apebuf, - src_off, - g_compressor_store_ape_buffer, - dst_off, - seg_elems); + const uint64_t row_src_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width); + id ape_call_buf = nil; + id ape_consume_buf = g_compressor_store_ape_buffer; + NSUInteger ape_consume_off = 0; + if (ape_type == 8u) { + /* CPU-side dequant for Q8_0: write to a fresh per-call shared + * buffer so the data survives encode-time race vs subsequent + * kernel reads (see ds4_gpu_encode_compressor_score_with_ape). */ + const uint8_t *apebytes = (const uint8_t *) [apebuf contents]; + const NSUInteger ape_call_bytes = (NSUInteger)total_elems * sizeof(float); + ape_call_buf = [g_device newBufferWithLength:ape_call_bytes + options:MTLResourceStorageModeShared]; + float *scratch = ape_call_buf ? 
(float *) [ape_call_buf contents] : NULL; + if (!apebytes || !scratch) { + fprintf(stderr, "ds4: Metal compressor APE Q8_0: per-call scratch alloc failed\n"); + ok = 0; + } + ape_consume_buf = ape_call_buf; + uint32_t copied_rows = 0; + uint32_t pos_mod = pos0 % ratio; + while (ok && copied_rows < n_tokens) { + uint32_t seg_rows = ratio - pos_mod; + if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; + const uint64_t src_off = (uint64_t)ape_inner + (uint64_t)pos_mod * row_src_bytes; + ds4_gpu_cpu_dequant_q8_0_rows(scratch + (uint64_t)copied_rows * width, + apebytes + src_off, + seg_rows, + width); + copied_rows += seg_rows; + pos_mod = 0; + } + } else { + uint32_t copied_rows = 0; + uint32_t pos_mod = pos0 % ratio; + while (ok && copied_rows < n_tokens) { + uint32_t seg_rows = ratio - pos_mod; + if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; + const uint32_t seg_elems = seg_rows * width; + const NSUInteger src_off = (NSUInteger)ape_inner + + (NSUInteger)pos_mod * row_src_bytes; + const NSUInteger dst_off = (NSUInteger)copied_rows * width * sizeof(float); + if (ape_type == 1u) { + ok = ds4_gpu_encode_cpy_f16_f32_1d(cb, + apebuf, + src_off, + g_compressor_store_ape_buffer, + dst_off, + seg_elems); + } else { + ok = ds4_gpu_encode_cpy_f32_f32_1d(cb, + apebuf, + src_off, + g_compressor_store_ape_buffer, + dst_off, + seg_elems); + } + copied_rows += seg_rows; + pos_mod = 0; } - copied_rows += seg_rows; - pos_mod = 0; } if (ok) { ok = ds4_gpu_encode_add_f32_1d(cb, scbuf, ds4_gpu_tensor_offset(sc), - g_compressor_store_ape_buffer, - 0, + ape_consume_buf, + ape_consume_off, g_compressor_store_score_buffer, 0, total_elems); } + if (ok && ape_call_buf) { + /* Retain the per-call ape buffer until the GPU is done. */ + [cb addCompletedHandler:^(id _Nonnull __unused done) { + (void) ape_call_buf; + }]; + } if (ok) { ok = ds4_gpu_encode_set_rows_f32_i32(cb, state_kv, @@ -7020,7 +7299,7 @@ int ds4_gpu_compressor_prefill_tensor( if (!comp_cache || !state_kv || !state_score || !kv || !sc || !model_map || head_dim == 0 || ratio == 0 || n_tokens == 0 || n_rot > head_dim || (n_rot & 1u) != 0 || - (ape_type != 0u && ape_type != 1u) || + (ape_type != 0u && ape_type != 1u && ape_type != 8u) || norm_type != 0u) { return 0; } @@ -7032,11 +7311,10 @@ int ds4_gpu_compressor_prefill_tensor( const uint32_t n_comp = n_tokens / ratio; const uint32_t cutoff = n_comp * ratio; const uint32_t rem = n_tokens - cutoff; - const uint64_t elem_ape = ape_type == 1u ? 
@@ -7375,7 +7653,7 @@ int ds4_gpu_compressor_prefill_ratio4_replay_tensor(
     if (!comp_cache || !state_kv || !state_score || !kv || !sc || !model_map ||
         head_dim == 0 || n_tokens == 0 || (n_tokens & 3u) != 0 || (pos0 & 3u) != 0 ||
         n_rot > head_dim || (n_rot & 1u) != 0 ||
-        (ape_type != 0u && ape_type != 1u) ||
+        (ape_type != 0u && ape_type != 1u && ape_type != 8u) ||
         norm_type != 0u) {
         return 0;
     }
@@ -7385,11 +7663,10 @@
     const uint32_t width = 2u * head_dim;
     const uint32_t state_rows = 8u;
     const uint32_t n_comp = n_tokens / ratio;
-    const uint64_t elem_ape = ape_type == 1u ? 2u : 4u;
     const uint64_t kv_bytes = (uint64_t)n_tokens * width * sizeof(float);
     const uint64_t state_bytes = (uint64_t)state_rows * width * sizeof(float);
     const uint64_t comp_bytes = (uint64_t)n_comp * head_dim * sizeof(float);
-    const uint64_t ape_bytes = (uint64_t)width * ratio * elem_ape;
+    const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width * ratio);
     const uint64_t norm_bytes = (uint64_t)head_dim * sizeof(float);
 
     if (ape_offset > model_size || ape_bytes > model_size - ape_offset ||
@@ -7669,7 +7946,7 @@ int ds4_gpu_compressor_prefill_state_ratio4_tensor(
     uint32_t pos0) {
     if (!g_initialized && !ds4_gpu_init()) return 0;
     if (!state_kv || !state_score || !kv_tail || !sc_tail || !model_map ||
-        head_dim == 0 || (ape_type != 0u && ape_type != 1u)) {
+        head_dim == 0 || (ape_type != 0u && ape_type != 1u && ape_type != 8u)) {
         return 0;
     }
 
@@ -7677,10 +7954,9 @@
     const uint32_t ratio = 4u;
     const uint32_t width = 2u * head_dim;
     const uint32_t state_rows = 8u;
-    const uint64_t elem_ape = ape_type == 1u ? 2u : 4u;
     const uint64_t tail_bytes = (uint64_t)ratio * width * sizeof(float);
     const uint64_t state_bytes = (uint64_t)state_rows * width * sizeof(float);
-    const uint64_t ape_bytes = (uint64_t)ratio * width * elem_ape;
+    const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)ratio * width);
     if (ape_offset > model_size || ape_bytes > model_size - ape_offset) {
         fprintf(stderr,
             "ds4: Metal compressor prefill-state APE range is outside the mapped model\n");
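The ape_type codes these entry points now accept (0, 1, 8) line up with the ggml/GGUF tensor type ids (GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1, GGML_TYPE_Q8_0 = 8), which is presumably where they come from. A hypothetical sketch of how a caller might derive the argument from the compressor APE tensor's storage type; the function name and the mapping itself are assumptions, while ds4_die and the DS4_TENSOR_* constants are from ds4.c.

    /* Hypothetical mapping from the tensor's storage type to the ape_type code
     * the Metal compressor entry points accept. */
    static uint32_t ape_type_for_tensor(const ds4_tensor *t) {
        switch (t->type) {
        case DS4_TENSOR_F32:  return 0u;
        case DS4_TENSOR_F16:  return 1u;
        case DS4_TENSOR_Q8_0: return 8u;
        default:
            ds4_die("compressor APE tensor must be F16, F32, or Q8_0");
            return 0u; /* not reached */
        }
    }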
@@ -7783,7 +8059,7 @@ int ds4_gpu_compressor_update_tensor(
     if (!kv_cur || !sc_cur || !state_kv || !state_score || !comp_cache || !model_map ||
         head_dim == 0 || ratio == 0 ||
         n_rot > head_dim || (n_rot & 1u) != 0 ||
-        (ape_type != 0u && ape_type != 1u) ||
+        (ape_type != 0u && ape_type != 1u && ape_type != 8u) ||
         norm_type != 0u) {
         return 0;
     }
@@ -7793,11 +8069,10 @@
     const uint32_t width = coff * head_dim;
     const uint32_t state_rows = coff * ratio;
     const uint32_t emit = ((pos + 1u) % ratio) == 0u ? 1u : 0u;
-    const uint64_t elem_ape = ape_type == 1u ? 2u : 4u;
     const uint64_t kv_bytes = (uint64_t)width * sizeof(float);
     const uint64_t state_bytes = (uint64_t)state_rows * width * sizeof(float);
     const uint64_t comp_bytes = (uint64_t)(comp_row + (emit ? 1u : 0u)) * head_dim * sizeof(float);
-    const uint64_t ape_bytes = (uint64_t)width * ratio * elem_ape;
+    const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width * ratio);
     const uint64_t norm_bytes = (uint64_t)head_dim * sizeof(float);
 
     if (ape_offset > model_size || ape_bytes > model_size - ape_offset ||
diff --git a/metal/cpy.metal b/metal/cpy.metal
index 3aa00ac1..b2558bab 100644
--- a/metal/cpy.metal
+++ b/metal/cpy.metal
@@ -55,3 +55,41 @@ typedef decltype(kernel_cpy_t_t) kernel_cpy_t;
 template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy_t_t;
 template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy_t_t;
 template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy_t_t;
+
+// Q8_0 -> F32 dequantizing 1D copy. Used by the compressor APE path so
+// stock-recipe GGUFs that ship `*.compressor_ape.weight` as Q8_0 can be
+// read through the same byte-strided copy that the F16/F32 ape paths use.
+// args.ne00 is the total element count (must be divisible by QK8_0 = 32);
+// src is a packed Q8_0 region (sizeof(block_q8_0) = 34 bytes per QK8_0 elements)
+// and dst is contiguous F32.
+//
+// Uses explicit byte arithmetic instead of `block_q8_0 *` indexing because
+// the GGUF byte stride (34) does not match Metal's natural struct alignment
+// for `block_q8_0` (which would be padded to a multiple of `alignof(half)`
+// in some cases). Each thread handles one output element and re-reads the
+// half scale from its block's first two bytes; that's redundant but cheap
+// and the compressor APE total element count is tiny (a few thousand).
+kernel void kernel_cpy_q8_0_f32(
+        constant ds4_metal_args_cpy & args,
+        device const char * src0,
+        device char * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort3 ntg[[threads_per_threadgroup]]) {
+    const int n = (int) args.ne00;
+    const int gid = (int)(tgpig.x * ntg.x + tiitg);
+    if (gid >= n) return;
+
+    constexpr int BLOCK_BYTES = 34;
+    const int blk = gid / QK8_0;
+    const int idx = gid - blk * QK8_0;
+    device const char *bp = src0 + (uint64_t)blk * BLOCK_BYTES;
+    half d_h;
+    /* half scale lives at the first 2 bytes of the block */
+    thread char *dp = (thread char *) &d_h;
+    dp[0] = bp[0]; dp[1] = bp[1];
+    const float d = (float) d_h;
+    const int8_t q = (int8_t) bp[2 + idx];
+    device float *out = (device float *) dst;
+    out[gid] = (float) q * d;
+}
diff --git a/metal/dense.metal b/metal/dense.metal
index a84927e9..fa922b45 100644
--- a/metal/dense.metal
+++ b/metal/dense.metal
@@ -1116,6 +1116,7 @@ kernel void kernel_mul_mm(
 typedef decltype(kernel_mul_mm) mul_mm_t;
 
-// Host-visible prefill matmul variants for F16 and Q8_0 weights.
+// Host-visible prefill matmul variants for F16, Q8_0, and F32 weights.
 template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm;
 template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm;
+template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm;
 
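kernel_cpy_q8_0_f32 gives the GPU an alternative to the CPU-side dequant used in ds4_gpu_compressor_store_batch_tensor, but its host-side encode wrapper is not part of this excerpt. Below is a sketch of how such a wrapper could be shaped, mirroring the existing ds4_gpu_encode_cpy_*_1d helpers; everything here except the kernel's behaviour and the standard Metal API is an assumption (pipeline lookup, args struct layout, buffer indices).

    #import <Metal/Metal.h>

    /* Hypothetical encode helper for the Q8_0 -> F32 copy. `pso` is assumed to be
     * the cached pipeline state for "kernel_cpy_q8_0_f32"; the args struct below
     * stands in for ds4_metal_args_cpy, of which this kernel only needs ne00. */
    static int ds4_gpu_encode_cpy_q8_0_f32_1d_sketch(id<MTLCommandBuffer> cb,
                                                     id<MTLComputePipelineState> pso,
                                                     id<MTLBuffer> src, NSUInteger src_off,
                                                     id<MTLBuffer> dst, NSUInteger dst_off,
                                                     uint32_t n_elems) {
        if (n_elems == 0u || (n_elems % 32u) != 0u) return 0;   /* Q8_0 needs whole blocks */
        id<MTLComputeCommandEncoder> enc = [cb computeCommandEncoder];
        if (!enc) return 0;
        struct { uint32_t ne00; } args = { n_elems };
        [enc setComputePipelineState:pso];
        [enc setBytes:&args length:sizeof(args) atIndex:0];
        [enc setBuffer:src offset:src_off atIndex:1];
        [enc setBuffer:dst offset:dst_off atIndex:2];
        NSUInteger tptg = pso.maxTotalThreadsPerThreadgroup;
        if (tptg > 256) tptg = 256;
        /* One thread per output element; the kernel bounds-checks gid >= ne00. */
        const NSUInteger ntg = ((NSUInteger)n_elems + tptg - 1) / tptg;
        [enc dispatchThreadgroups:MTLSizeMake(ntg, 1, 1)
            threadsPerThreadgroup:MTLSizeMake(tptg, 1, 1)];
        [enc endEncoding];
        return 1;
    }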
diff --git a/metal/dsv4_kv.metal b/metal/dsv4_kv.metal
index 89bd7d3a..80d5846c 100644
--- a/metal/dsv4_kv.metal
+++ b/metal/dsv4_kv.metal
@@ -218,6 +218,15 @@ kernel void kernel_dsv4_compressor_store_one(
         float ape_v;
         if (args.ape_type == 1u) {
             ape_v = (float)(((device const half *)ape)[ape_i]);
+        } else if (args.ape_type == 8u) {
+            /* Q8_0: 32 elements per 34-byte block (uint16_t scale + 32 int8 quants). */
+            const uint blk = ape_i / 32u;
+            const uint idx = ape_i - blk * 32u;
+            device const uchar *bp = (device const uchar *)ape + (uint64_t)blk * 34u;
+            const ushort sb = (ushort)bp[0] | ((ushort)bp[1] << 8);
+            const half d = as_type<half>(sb);
+            const int q = (int)(int8_t)bp[2u + idx];
+            ape_v = (float)q * (float)d;
         } else {
             ape_v = ((device const float *)ape)[ape_i];
         }
diff --git a/metal/get_rows.metal b/metal/get_rows.metal
index 31d7aab6..34ef0098 100644
--- a/metal/get_rows.metal
+++ b/metal/get_rows.metal
@@ -52,3 +52,37 @@ typedef decltype(kernel_get_rows_f) get_rows_f_t;
 template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f;
 template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f;
 template [[host_name("kernel_get_rows_i32")]] kernel get_rows_f_t kernel_get_rows_f;
+
+// Q8_0 token-embedding gather: dequantize blocks while reading the row.
+// Source row is `(ne00 / QK8_0)` `block_q8_0` records ([half scale][int8 qs[32]]);
+// destination is `ne00` floats. Layout dispatch matches kernel_get_rows_f.
+kernel void kernel_get_rows_q8_0(
+        constant ds4_metal_args_get_rows & args,
+        device const char * src0,
+        device const char * src1,
+        device char * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort3 ntg [[threads_per_threadgroup]]) {
+    const int32_t iw0 = tgpig.x/args.ne10;
+    const int32_t i10 = tgpig.x%args.ne10;
+    const int32_t i11 = tgpig.y;
+    const int32_t i12 = tgpig.z;
+
+    const int32_t r = ((const device int32_t *) (src1 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10))[0];
+
+    const int32_t i02 = i11;
+    const int32_t i03 = i12;
+
+    auto psrc = (device const block_q8_0 *) (src0 + i03*args.nb03 + i02*args.nb02 + r*args.nb01);
+    auto pdst = ( device float *) (dst + i12*args.nb3 + i11*args.nb2 + i10*args.nb1);
+
+    for (int ind = iw0*ntg.x + tiitg; ind < args.ne00t;) {
+        const int blk = ind / QK8_0;
+        const int idx = ind - blk * QK8_0;
+        const float d = (float) psrc[blk].d;
+        pdst[ind] = (float) psrc[blk].qs[idx] * d;
+
+        break;
+    }
+}
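All three Metal additions (kernel_cpy_q8_0_f32, the ape_type == 8u branch in kernel_dsv4_compressor_store_one, and kernel_get_rows_q8_0) assume the same packed layout: 34-byte blocks of a little-endian half scale followed by 32 int8 quants. A small self-contained check of that layout, handy when validating a converter's output against these kernels on a little-endian host; the block contents and the cut-down half converter are made up for the example.

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Normal/zero halves only; enough for a typical Q8_0 scale and purely a
     * stand-in for ds4's real f16 helper. */
    static float half_bits_to_float(uint16_t h) {
        const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
        const uint32_t exp  = (h >> 10) & 0x1fu;
        const uint32_t man  = h & 0x3ffu;
        uint32_t bits = sign;
        if (exp != 0u && exp != 0x1fu) bits |= ((exp + 112u) << 23) | (man << 13);
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main(void) {
        /* One synthetic block: scale 0.5 (0x3800 as IEEE half), quants -16..15. */
        uint8_t blk[34];
        const uint16_t half_scale = 0x3800u;
        memcpy(blk, &half_scale, 2);            /* little-endian scale comes first */
        for (int i = 0; i < 32; i++) blk[2 + i] = (uint8_t)(int8_t)(i - 16);

        for (int i = 0; i < 32; i++) {
            uint16_t sb;
            memcpy(&sb, blk, 2);                /* the same two bytes the kernels read */
            const float d = half_bits_to_float(sb);
            const float v = (float)(int8_t)blk[2 + i] * d;
            assert(v == (float)(i - 16) * 0.5f);
        }
        return 0;
    }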