diff --git a/ds4.c b/ds4.c index 1f8349b6..da320104 100644 --- a/ds4.c +++ b/ds4.c @@ -2220,6 +2220,29 @@ static void tensor_expect_plain_layout( tensor_expect_layout(t, t->type, ndim, d0, d1, d2); } +/* Tensors that flow through the polymorphic dense matvec/matmul dispatch + * (matvec_any / ds4_gpu_matmul_dispatch_tensor) accept any of F16, F32, or + * Q8_0. Used for the small projections antirez ships at F16 but stock + * llama.cpp converters emit at Q8_0 (HC fn weights, compressors, indexer + * projections) or F32 (router gate). */ +static void tensor_expect_dispatch_layout( + const ds4_tensor *t, + uint32_t ndim, + uint64_t d0, + uint64_t d1, + uint64_t d2) { + if (!t) ds4_die("internal error: missing tensor while validating layout"); + if (t->type != DS4_TENSOR_F16 && t->type != DS4_TENSOR_F32 && t->type != DS4_TENSOR_Q8_0) { + fprintf(stderr, + "ds4: tensor %.*s has type %s, expected F16, F32, or Q8_0\n", + (int)t->name.len, + t->name.ptr, + tensor_type_name(t->type)); + exit(1); + } + tensor_expect_layout(t, t->type, ndim, d0, d1, d2); +} + static bool tensor_is_routed_expert_type(uint32_t type) { return type == DS4_TENSOR_IQ2_XXS || type == DS4_TENSOR_Q2_K || @@ -2288,9 +2311,17 @@ static void weights_validate_layout(const ds4_weights *w) { const uint64_t q_dim = (uint64_t)DS4_N_HEAD * DS4_N_HEAD_DIM; const uint64_t out_low_dim = (uint64_t)DS4_N_OUT_GROUP * DS4_N_LORA_O; - tensor_expect_layout(w->token_embd, DS4_TENSOR_F16, 2, DS4_N_EMBD, DS4_N_VOCAB, 0); + /* token_embd may be F16 (antirez recipe) or Q8_0 (stock llama.cpp recipe used + * by e.g. cyberneurova converts). Both have CPU and Metal kernels above. */ + if (w->token_embd->type != DS4_TENSOR_F16 && w->token_embd->type != DS4_TENSOR_Q8_0) { + fprintf(stderr, + "ds4: tensor token_embd.weight has type %s, expected F16 or Q8_0\n", + tensor_type_name(w->token_embd->type)); + exit(1); + } + tensor_expect_layout(w->token_embd, w->token_embd->type, 2, DS4_N_EMBD, DS4_N_VOCAB, 0); tensor_expect_layout(w->output_hc_base, DS4_TENSOR_F32, 1, DS4_N_HC, 0, 0); - tensor_expect_layout(w->output_hc_fn, DS4_TENSOR_F16, 2, hc_dim, DS4_N_HC, 0); + tensor_expect_dispatch_layout(w->output_hc_fn, 2, hc_dim, DS4_N_HC, 0); tensor_expect_layout(w->output_hc_scale, DS4_TENSOR_F32, 1, 1, 0, 0); tensor_expect_layout(w->output_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); tensor_expect_layout(w->output, DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_VOCAB, 0); @@ -2299,7 +2330,7 @@ static void weights_validate_layout(const ds4_weights *w) { const ds4_layer_weights *l = &w->layer[il]; const uint32_t ratio = ds4_layer_compress_ratio(il); - tensor_expect_layout(l->hc_attn_fn, DS4_TENSOR_F16, 2, hc_dim, hc_mix_dim, 0); + tensor_expect_dispatch_layout(l->hc_attn_fn, 2, hc_dim, hc_mix_dim, 0); tensor_expect_layout(l->hc_attn_scale, DS4_TENSOR_F32, 1, 3, 0, 0); tensor_expect_layout(l->hc_attn_base, DS4_TENSOR_F32, 1, hc_mix_dim, 0, 0); tensor_expect_layout(l->attn_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); @@ -2315,27 +2346,27 @@ static void weights_validate_layout(const ds4_weights *w) { if (ratio != 0) { const uint32_t coff = ratio == 4 ? 
2u : 1u; const uint64_t comp_width = (uint64_t)coff * DS4_N_HEAD_DIM; - tensor_expect_layout(l->attn_compressor_ape, DS4_TENSOR_F16, 2, comp_width, ratio, 0); - tensor_expect_layout(l->attn_compressor_kv, DS4_TENSOR_F16, 2, DS4_N_EMBD, comp_width, 0); - tensor_expect_layout(l->attn_compressor_gate, DS4_TENSOR_F16, 2, DS4_N_EMBD, comp_width, 0); + tensor_expect_dispatch_layout(l->attn_compressor_ape, 2, comp_width, ratio, 0); + tensor_expect_dispatch_layout(l->attn_compressor_kv, 2, DS4_N_EMBD, comp_width, 0); + tensor_expect_dispatch_layout(l->attn_compressor_gate, 2, DS4_N_EMBD, comp_width, 0); tensor_expect_layout(l->attn_compressor_norm, DS4_TENSOR_F32, 1, DS4_N_HEAD_DIM, 0, 0); } if (ratio == 4) { const uint64_t index_q_dim = (uint64_t)DS4_N_INDEXER_HEAD * DS4_N_INDEXER_HEAD_DIM; const uint64_t index_width = 2u * DS4_N_INDEXER_HEAD_DIM; - tensor_expect_layout(l->indexer_attn_q_b, DS4_TENSOR_F16, 2, DS4_N_LORA_Q, index_q_dim, 0); - tensor_expect_layout(l->indexer_proj, DS4_TENSOR_F16, 2, DS4_N_EMBD, DS4_N_INDEXER_HEAD, 0); - tensor_expect_layout(l->indexer_compressor_ape, DS4_TENSOR_F16, 2, index_width, ratio, 0); - tensor_expect_layout(l->indexer_compressor_kv, DS4_TENSOR_F16, 2, DS4_N_EMBD, index_width, 0); - tensor_expect_layout(l->indexer_compressor_gate, DS4_TENSOR_F16, 2, DS4_N_EMBD, index_width, 0); + tensor_expect_dispatch_layout(l->indexer_attn_q_b, 2, DS4_N_LORA_Q, index_q_dim, 0); + tensor_expect_dispatch_layout(l->indexer_proj, 2, DS4_N_EMBD, DS4_N_INDEXER_HEAD, 0); + tensor_expect_dispatch_layout(l->indexer_compressor_ape, 2, index_width, ratio, 0); + tensor_expect_dispatch_layout(l->indexer_compressor_kv, 2, DS4_N_EMBD, index_width, 0); + tensor_expect_dispatch_layout(l->indexer_compressor_gate, 2, DS4_N_EMBD, index_width, 0); tensor_expect_layout(l->indexer_compressor_norm, DS4_TENSOR_F32, 1, DS4_N_INDEXER_HEAD_DIM, 0, 0); } - tensor_expect_layout(l->hc_ffn_fn, DS4_TENSOR_F16, 2, hc_dim, hc_mix_dim, 0); + tensor_expect_dispatch_layout(l->hc_ffn_fn, 2, hc_dim, hc_mix_dim, 0); tensor_expect_layout(l->hc_ffn_scale, DS4_TENSOR_F32, 1, 3, 0, 0); tensor_expect_layout(l->hc_ffn_base, DS4_TENSOR_F32, 1, hc_mix_dim, 0, 0); tensor_expect_layout(l->ffn_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0); - tensor_expect_layout(l->ffn_gate_inp, DS4_TENSOR_F16, 2, DS4_N_EMBD, DS4_N_EXPERT, 0); + tensor_expect_dispatch_layout(l->ffn_gate_inp, 2, DS4_N_EMBD, DS4_N_EXPERT, 0); tensor_expect_optional(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0); tensor_expect_routed_expert(l->ffn_gate_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT); tensor_expect_routed_expert(l->ffn_up_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT); @@ -2680,7 +2711,8 @@ static void weights_free(ds4_weights *w) { memset(w, 0, sizeof(*w)); } -/* Load one token embedding row and expand it to float activations. */ +/* Load one token embedding row and expand it to float activations. + * The CPU reference path mirrors the Metal kernel_get_rows_{f16,q8_0} dispatch. */ static void embed_token_f16(const ds4_model *m, const ds4_weights *w, int token, float *out) { ds4_tensor *te = w->token_embd; if (token < 0 || (uint64_t)token >= te->dim[1]) { @@ -2696,6 +2728,37 @@ static void embed_token_f16(const ds4_model *m, const ds4_weights *w, int token, } } +/* Q8_0 token embedding: source row is `(stride / 32)` block_q8_0 records, + * each [uint16_t scale][int8_t qs[32]]; dequantize to float on the fly. 
*/ +static void embed_token_q8_0(const ds4_model *m, const ds4_weights *w, int token, float *out) { + ds4_tensor *te = w->token_embd; + if (token < 0 || (uint64_t)token >= te->dim[1]) { + ds4_die("token id is outside the embedding table"); + } + const uint64_t stride = te->dim[0]; + if ((stride % 32) != 0) ds4_die("Q8_0 token embedding stride is not 32-aligned"); + const uint64_t blocks = stride / 32; + const uint8_t *base = tensor_data(m, te); + const uint8_t *row = base + (uint64_t)token * blocks * 34; + for (uint64_t b = 0; b < blocks; b++) { + uint16_t scale_bits; + memcpy(&scale_bits, row + b * 34, sizeof(scale_bits)); + const int8_t *qs = (const int8_t *)(row + b * 34 + 2); + const float scale = f16_to_f32(scale_bits); + for (int i = 0; i < 32; i++) { + out[b * 32 + i] = (float)qs[i] * scale; + } + } +} + +static void embed_token(const ds4_model *m, const ds4_weights *w, int token, float *out) { + switch (w->token_embd->type) { + case DS4_TENSOR_F16: embed_token_f16(m, w, token, out); return; + case DS4_TENSOR_Q8_0: embed_token_q8_0(m, w, token, out); return; + default: ds4_die("unsupported token_embd tensor type"); + } +} + /* RMSNorm without a learned scale, used by hyper-connection control vectors. */ static void rms_norm_no_weight(float *out, const float *x, uint64_t n, float eps) { double ss = 0.0; @@ -3706,6 +3769,17 @@ static void matvec_any(float *out, const ds4_model *m, const ds4_tensor *w, cons } } +/* Serial dispatcher; for tiny matrices where parallelization overhead exceeds the gain. */ +static void matvec_any_serial(float *out, const ds4_model *m, const ds4_tensor *w, const float *x) { + switch (w->type) { + case 1: matvec_f16_serial(out, m, w, x); return; + case 0: + case 8: matvec_any(out, m, w, x); return; /* no specialized serial path; parallel is cheap here */ + default: + ds4_die("unsupported tensor type for dense matvec_serial"); + } +} + static float tensor_1d_value(const ds4_model *m, const ds4_tensor *t, uint64_t i) { if (i >= t->elements) ds4_die("tensor scalar index is out of bounds"); if (t->type == 0) { @@ -4300,9 +4374,9 @@ static void hc_pre_from_state_one_scratch( rms_norm_no_weight(flat, residual_hc, hc_dim, DS4_RMS_EPS); if (serial_fn) { - matvec_f16_serial(mix, model, fn, flat); + matvec_any_serial(mix, model, fn, flat); } else { - matvec_f16(mix, model, fn, flat); + matvec_any(mix, model, fn, flat); } const float *scale = tensor_data(model, scale_tensor); @@ -5173,7 +5247,7 @@ static void layer_router_probs_one( const float * x) { float logits[DS4_N_EXPERT]; - matvec_f16(logits, model, layer->ffn_gate_inp, x); + matvec_any(logits, model, layer->ffn_gate_inp, x); for (int i = 0; i < DS4_N_EXPERT; i++) { probs[i] = sqrtf(softplus_stable(logits[i])); } @@ -7625,7 +7699,7 @@ static void forward_token_raw_swa_cpu_decode_scratch( float *cur = scratch->cur; float *next = scratch->next; - embed_token_f16(model, weights, token, scratch->plain); + embed_token(model, weights, token, scratch->plain); hc_from_plain_embedding(cur, scratch->plain, DS4_N_EMBD, DS4_N_HC); for (uint32_t il = 0; il < DS4_N_LAYER; il++) { @@ -7700,7 +7774,7 @@ static void prefill_layer_major_cpu( } for (uint64_t t = 0; t < n_tok; t++) { - embed_token_f16(model, weights, prompt->v[t], plain); + embed_token(model, weights, prompt->v[t], plain); hc_from_plain_embedding(cur + t * hc_dim, plain, DS4_N_EMBD, DS4_N_HC); } @@ -7897,7 +7971,7 @@ static void forward_first_token_cpu( float *cur = xmalloc((size_t)DS4_N_HC * DS4_N_EMBD * sizeof(cur[0])); float *next = xmalloc((size_t)DS4_N_HC * 
DS4_N_EMBD * sizeof(next[0])); - embed_token_f16(model, weights, token, plain); + embed_token(model, weights, token, plain); hc_from_plain_embedding(cur, plain, DS4_N_EMBD, DS4_N_HC); for (uint32_t il = 0; il < DS4_N_LAYER; il++) { @@ -7928,7 +8002,7 @@ static void output_hc_head_one( float *w = xmalloc((size_t)n_hc * sizeof(w[0])); rms_norm_no_weight(flat, inp_hc, hc_dim, DS4_RMS_EPS); - matvec_f16(pre, model, weights->output_hc_fn, flat); + matvec_any(pre, model, weights->output_hc_fn, flat); const float *scale = tensor_data(model, weights->output_hc_scale); const float *base = tensor_data(model, weights->output_hc_base); @@ -7972,7 +8046,7 @@ static void output_logits_one_decode_scratch( const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * n_hc; rms_norm_no_weight(scratch->output_flat, inp_hc, hc_dim, DS4_RMS_EPS); - matvec_f16(scratch->output_pre, model, weights->output_hc_fn, scratch->output_flat); + matvec_any(scratch->output_pre, model, weights->output_hc_fn, scratch->output_flat); const float *scale = tensor_data(model, weights->output_hc_scale); const float *base = tensor_data(model, weights->output_hc_base); @@ -9036,6 +9110,16 @@ static bool metal_graph_matmul_plain_tensor( uint64_t out_dim, const ds4_gpu_tensor *x, uint64_t n_tok); +static bool metal_graph_matmul_pair_plain_tensor( + ds4_gpu_tensor *out_a, + ds4_gpu_tensor *out_b, + const ds4_model *model, + const ds4_tensor *w_a, + const ds4_tensor *w_b, + uint64_t in_dim, + uint64_t out_dim, + const ds4_gpu_tensor *x, + uint64_t n_tok); static bool metal_graph_encode_decode_layer( ds4_gpu_graph *g, @@ -9233,13 +9317,14 @@ static bool metal_graph_encode_decode_layer( const bool emit = ((pos + 1u) % ratio) == 0u; if (!layer->attn_compressor_kv || !layer->attn_compressor_gate || !layer->attn_compressor_ape || !layer->attn_compressor_norm || - layer->attn_compressor_kv->type != DS4_TENSOR_F16 || - layer->attn_compressor_gate->type != DS4_TENSOR_F16 || + (layer->attn_compressor_kv->type != DS4_TENSOR_F16 && + layer->attn_compressor_kv->type != DS4_TENSOR_Q8_0) || + layer->attn_compressor_kv->type != layer->attn_compressor_gate->type || layer->attn_compressor_kv->dim[0] != DS4_N_EMBD || layer->attn_compressor_gate->dim[0] != DS4_N_EMBD || layer->attn_compressor_kv->dim[1] != comp_width || layer->attn_compressor_gate->dim[1] != comp_width) { - fprintf(stderr, "ds4: Metal graph compressor expects paired F16 compressor projections\n"); + fprintf(stderr, "ds4: Metal graph compressor expects paired F16 or Q8_0 compressor projections\n"); ok = false; } if (ok && emit && g->layer_n_comp[il] >= g->comp_cap) { @@ -9247,25 +9332,10 @@ static bool metal_graph_encode_decode_layer( ok = false; } if (ok && !metal_graph_use_reference_compressor_pair_proj()) { - ok = ds4_gpu_matmul_f16_pair_tensor(g->comp_kv_cur, - g->comp_sc_cur, - model->map, - model->size, - layer->attn_compressor_kv->abs_offset, - layer->attn_compressor_gate->abs_offset, - DS4_N_EMBD, - comp_width, - g->attn_norm, - 1) != 0; + ok = metal_graph_matmul_pair_plain_tensor(g->comp_kv_cur, g->comp_sc_cur, model, layer->attn_compressor_kv, layer->attn_compressor_gate, DS4_N_EMBD, comp_width, g->attn_norm, 1); } else { - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->comp_kv_cur, model->map, model->size, - layer->attn_compressor_kv->abs_offset, - DS4_N_EMBD, comp_width, - g->attn_norm, 1) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->comp_sc_cur, model->map, model->size, - layer->attn_compressor_gate->abs_offset, - DS4_N_EMBD, comp_width, - g->attn_norm, 1) != 0; + if (ok) ok = 
metal_graph_matmul_plain_tensor(g->comp_kv_cur, model, layer->attn_compressor_kv, DS4_N_EMBD, comp_width, g->attn_norm, 1); + if (ok) ok = metal_graph_matmul_plain_tensor(g->comp_sc_cur, model, layer->attn_compressor_gate, DS4_N_EMBD, comp_width, g->attn_norm, 1); } const uint32_t comp_row = g->layer_n_comp[il]; if (ok) ok = ds4_gpu_compressor_update_tensor(g->comp_kv_cur, @@ -9313,13 +9383,14 @@ static bool metal_graph_encode_decode_layer( const uint32_t index_width = coff * DS4_N_INDEXER_HEAD_DIM; if (!layer->indexer_compressor_kv || !layer->indexer_compressor_gate || !layer->indexer_compressor_ape || !layer->indexer_compressor_norm || - layer->indexer_compressor_kv->type != DS4_TENSOR_F16 || - layer->indexer_compressor_gate->type != DS4_TENSOR_F16 || + (layer->indexer_compressor_kv->type != DS4_TENSOR_F16 && + layer->indexer_compressor_kv->type != DS4_TENSOR_Q8_0) || + layer->indexer_compressor_kv->type != layer->indexer_compressor_gate->type || layer->indexer_compressor_kv->dim[0] != DS4_N_EMBD || layer->indexer_compressor_gate->dim[0] != DS4_N_EMBD || layer->indexer_compressor_kv->dim[1] != index_width || layer->indexer_compressor_gate->dim[1] != index_width) { - fprintf(stderr, "ds4: Metal graph indexer compressor expects paired F16 projections\n"); + fprintf(stderr, "ds4: Metal graph indexer compressor expects paired F16 or Q8_0 projections\n"); ok = false; } if (ok && emit && g->layer_n_index_comp[il] >= g->comp_cap) { @@ -9327,25 +9398,10 @@ static bool metal_graph_encode_decode_layer( ok = false; } if (ok && !metal_graph_use_reference_compressor_pair_proj()) { - ok = ds4_gpu_matmul_f16_pair_tensor(g->comp_kv_cur, - g->comp_sc_cur, - model->map, - model->size, - layer->indexer_compressor_kv->abs_offset, - layer->indexer_compressor_gate->abs_offset, - DS4_N_EMBD, - index_width, - g->attn_norm, - 1) != 0; + ok = metal_graph_matmul_pair_plain_tensor(g->comp_kv_cur, g->comp_sc_cur, model, layer->indexer_compressor_kv, layer->indexer_compressor_gate, DS4_N_EMBD, index_width, g->attn_norm, 1); } else { - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->comp_kv_cur, model->map, model->size, - layer->indexer_compressor_kv->abs_offset, - DS4_N_EMBD, index_width, - g->attn_norm, 1) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->comp_sc_cur, model->map, model->size, - layer->indexer_compressor_gate->abs_offset, - DS4_N_EMBD, index_width, - g->attn_norm, 1) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->comp_kv_cur, model, layer->indexer_compressor_kv, DS4_N_EMBD, index_width, g->attn_norm, 1); + if (ok) ok = metal_graph_matmul_plain_tensor(g->comp_sc_cur, model, layer->indexer_compressor_gate, DS4_N_EMBD, index_width, g->attn_norm, 1); } const uint32_t index_row = g->layer_n_index_comp[il]; if (ok) ok = ds4_gpu_compressor_update_tensor(g->comp_kv_cur, @@ -9377,23 +9433,22 @@ static bool metal_graph_encode_decode_layer( if (ok && g->layer_n_comp[il] > decode_top_k) { const uint64_t indexer_q_dim = (uint64_t)DS4_N_INDEXER_HEAD * DS4_N_INDEXER_HEAD_DIM; if (!layer->indexer_attn_q_b || - layer->indexer_attn_q_b->type != DS4_TENSOR_F16 || + (layer->indexer_attn_q_b->type != DS4_TENSOR_F16 && + layer->indexer_attn_q_b->type != DS4_TENSOR_Q8_0) || layer->indexer_attn_q_b->dim[0] != q_rank || layer->indexer_attn_q_b->dim[1] != indexer_q_dim) { - fprintf(stderr, "ds4: Metal graph indexer q projection expects F16 weights\n"); + fprintf(stderr, "ds4: Metal graph indexer q projection expects F16 or Q8_0 weights\n"); ok = false; } if (ok && (!layer->indexer_proj || - layer->indexer_proj->type != 
DS4_TENSOR_F16 || + (layer->indexer_proj->type != DS4_TENSOR_F16 && + layer->indexer_proj->type != DS4_TENSOR_Q8_0) || layer->indexer_proj->dim[0] != DS4_N_EMBD || layer->indexer_proj->dim[1] != DS4_N_INDEXER_HEAD)) { - fprintf(stderr, "ds4: Metal graph indexer weight projection expects F16 weights\n"); + fprintf(stderr, "ds4: Metal graph indexer weight projection expects F16 or Q8_0 weights\n"); ok = false; } - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->indexer_q, model->map, model->size, - layer->indexer_attn_q_b->abs_offset, - q_rank, indexer_q_dim, - g->qr_norm, 1) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->indexer_q, model, layer->indexer_attn_q_b, q_rank, indexer_q_dim, g->qr_norm, 1); if (ok) ok = ds4_gpu_rope_tail_tensor(g->indexer_q, 1, DS4_N_INDEXER_HEAD, DS4_N_INDEXER_HEAD_DIM, @@ -9407,10 +9462,7 @@ static bool metal_graph_encode_decode_layer( attn_factor, DS4_ROPE_YARN_BETA_FAST, DS4_ROPE_YARN_BETA_SLOW) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->indexer_weights, model->map, model->size, - layer->indexer_proj->abs_offset, - DS4_N_EMBD, DS4_N_INDEXER_HEAD, - g->attn_norm, 1) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->indexer_weights, model, layer->indexer_proj, DS4_N_EMBD, DS4_N_INDEXER_HEAD, g->attn_norm, 1); const float index_scale = 1.0f / sqrtf((float)(DS4_N_INDEXER_HEAD_DIM * DS4_N_INDEXER_HEAD)); if (ok && decode_index_stage_profile) { ok = metal_graph_indexer_stage_profile_boundary(NULL, @@ -9793,14 +9845,7 @@ static bool metal_graph_encode_output_head( uint64_t vocab_dim) { const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; bool ok = ds4_gpu_rms_norm_plain_tensor(g->flat_hc, g->cur_hc, (uint32_t)hc_dim, DS4_RMS_EPS) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->output_pre, - model->map, - model->size, - weights->output_hc_fn->abs_offset, - hc_dim, - DS4_N_HC, - g->flat_hc, - 1) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->output_pre, model, weights->output_hc_fn, hc_dim, DS4_N_HC, g->flat_hc, 1); if (ok) { metal_graph_debug_dump_tensor("result_hc_pre", g->output_pre, DS4_N_HC, DS4_N_LAYER, 0); } @@ -9893,14 +9938,7 @@ static bool metal_graph_encode_output_head_batch( (uint32_t)hc_dim, n_tokens, DS4_RMS_EPS) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(output_pre, - model->map, - model->size, - weights->output_hc_fn->abs_offset, - hc_dim, - DS4_N_HC, - g->batch_flat_hc, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(output_pre, model, weights->output_hc_fn, hc_dim, DS4_N_HC, g->batch_flat_hc, n_tokens); if (ok) ok = ds4_gpu_output_hc_weights_tensor(output_weights, output_pre, model->map, @@ -9947,16 +9985,54 @@ static bool metal_graph_matmul_plain_tensor( uint64_t out_dim, const ds4_gpu_tensor *x, uint64_t n_tok) { + int ok = 0; const char *kind = "?"; if (w->type == DS4_TENSOR_F16) { - return ds4_gpu_matmul_f16_tensor(out, model->map, model->size, - w->abs_offset, in_dim, out_dim, x, n_tok) != 0; + kind = "f16"; + ok = ds4_gpu_matmul_f16_tensor(out, model->map, model->size, + w->abs_offset, in_dim, out_dim, x, n_tok); + } else if (w->type == DS4_TENSOR_F32) { + kind = "f32"; + ok = ds4_gpu_matmul_f32_tensor(out, model->map, model->size, + w->abs_offset, in_dim, out_dim, x, n_tok); + } else if (w->type == DS4_TENSOR_Q8_0) { + kind = "q8_0"; + ok = ds4_gpu_matmul_q8_0_tensor(out, model->map, model->size, + w->abs_offset, in_dim, out_dim, x, n_tok); + } else { + fprintf(stderr, "ds4: Metal plain matmul does not support %s\n", tensor_type_name(w->type)); + return false; } - if (w->type == 
DS4_TENSOR_F32) { - return ds4_gpu_matmul_f32_tensor(out, model->map, model->size, - w->abs_offset, in_dim, out_dim, x, n_tok) != 0; + if (!ok) { + fprintf(stderr, "ds4: PLAIN_MATMUL FAIL kind=%s tensor=%.*s in_dim=%llu out_dim=%llu n_tok=%llu\n", + kind, (int)w->name.len, w->name.ptr, + (unsigned long long)in_dim, (unsigned long long)out_dim, (unsigned long long)n_tok); } - fprintf(stderr, "ds4: Metal plain matmul does not support %s\n", tensor_type_name(w->type)); - return false; + return ok != 0; +} + +/* Pair variant of metal_graph_matmul_plain_tensor: fused F16-pair kernel when + * both tensors are F16, otherwise dispatch to two single-tensor matmuls. */ +static bool metal_graph_matmul_pair_plain_tensor( + ds4_gpu_tensor *out_a, + ds4_gpu_tensor *out_b, + const ds4_model *model, + const ds4_tensor *w_a, + const ds4_tensor *w_b, + uint64_t in_dim, + uint64_t out_dim, + const ds4_gpu_tensor *x, + uint64_t n_tok) { + if (w_a->type != w_b->type) { + fprintf(stderr, "ds4: Metal plain pair matmul: paired tensors must share type\n"); + return false; + } + if (w_a->type == DS4_TENSOR_F16) { + return ds4_gpu_matmul_f16_pair_tensor(out_a, out_b, model->map, model->size, + w_a->abs_offset, w_b->abs_offset, + in_dim, out_dim, x, n_tok) != 0; + } + if (!metal_graph_matmul_plain_tensor(out_a, model, w_a, in_dim, out_dim, x, n_tok)) return false; + return metal_graph_matmul_plain_tensor(out_b, model, w_b, in_dim, out_dim, x, n_tok); } static bool metal_graph_encode_output_head_mtp( @@ -10286,7 +10362,7 @@ static int metal_graph_decode_test( int selected[DS4_N_EXPERT_USED]; float expert_weight[DS4_N_EXPERT_USED]; - embed_token_f16(model, weights, token, plain); + embed_token(model, weights, token, plain); hc_from_plain_embedding(cpu_hc, plain, DS4_N_EMBD, DS4_N_HC); hc_pre_from_state_one(model, layer->hc_attn_fn, @@ -10348,7 +10424,8 @@ static int metal_graph_decode_test( (uint32_t)weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; if (ok) ok = metal_graph_encode_decode_layer(&g, model, layer, @@ -10497,7 +10574,7 @@ static int metal_graph_first_token_full_test( float *cpu_cur = xmalloc((size_t)hc_dim * sizeof(float)); float *cpu_next = xmalloc((size_t)hc_dim * sizeof(float)); - embed_token_f16(model, weights, token, plain); + embed_token(model, weights, token, plain); hc_from_plain_embedding(cpu_cur, plain, DS4_N_EMBD, DS4_N_HC); ok = ds4_gpu_begin_commands() != 0; if (ok) ok = ds4_gpu_embed_token_hc_tensor(g.cur_hc, @@ -10507,7 +10584,8 @@ static int metal_graph_first_token_full_test( (uint32_t)weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; if (ok) ok = ds4_gpu_end_commands() != 0; for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { @@ -10556,7 +10634,8 @@ static int metal_graph_first_token_full_test( (uint32_t)weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { ok = metal_graph_encode_decode_layer(&g, model, &weights->layer[il], @@ -10637,7 +10716,8 @@ static bool metal_graph_encode_token_raw_swa( (uint32_t)weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; /* * Start executing the prefix of the decode graph while the CPU is still @@ -10741,24 +10821,10 @@ static bool metal_graph_refresh_ratio4_compressor_state( 4ull * DS4_N_EMBD * sizeof(float)); bool ok = 
tail_hc != NULL; if (ok) { - ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_kv, - model->map, - model->size, - kv_weight->abs_offset, - DS4_N_EMBD, - width, - tail_hc, - 4) != 0; + ok = metal_graph_matmul_plain_tensor(g->batch_comp_kv, model, kv_weight, DS4_N_EMBD, width, tail_hc, 4); } if (ok) { - ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_sc, - model->map, - model->size, - score_weight->abs_offset, - DS4_N_EMBD, - width, - tail_hc, - 4) != 0; + ok = metal_graph_matmul_plain_tensor(g->batch_comp_sc, model, score_weight, DS4_N_EMBD, width, tail_hc, 4); } if (ok) { ok = ds4_gpu_compressor_prefill_state_ratio4_tensor(state_kv, @@ -10793,7 +10859,7 @@ static bool metal_graph_upload_prompt_embeddings_hc_cpu( float *plain = xmalloc((size_t)DS4_N_EMBD * sizeof(plain[0])); for (uint32_t t = 0; t < n_tokens; t++) { - embed_token_f16(model, weights, prompt->v[pos0 + t], plain); + embed_token(model, weights, prompt->v[pos0 + t], plain); float *dst = hc + (uint64_t)t * hc_dim; for (uint32_t h = 0; h < DS4_N_HC; h++) { memcpy(dst + (uint64_t)h * DS4_N_EMBD, @@ -10838,7 +10904,8 @@ static bool metal_graph_upload_prompt_embeddings_hc( (uint32_t)weights->token_embd->dim[1], n_tokens, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; } return metal_graph_upload_prompt_embeddings_hc_cpu(out_hc, @@ -10869,14 +10936,7 @@ static bool metal_graph_warmup_prefill_kernels( bool ok = ds4_gpu_begin_commands() != 0; if (ok) { - ok = ds4_gpu_matmul_f16_tensor(g->batch_hc_mix, - model->map, - model->size, - weights->layer[0].hc_attn_fn->abs_offset, - hc_dim, - mix_hc, - g->batch_flat_hc, - n_tokens) != 0; + ok = metal_graph_matmul_plain_tensor(g->batch_hc_mix, model, weights->layer[0].hc_attn_fn, hc_dim, mix_hc, g->batch_flat_hc, n_tokens); } if (ok) ok = ds4_gpu_end_commands() != 0; if (!ok) { @@ -11016,14 +11076,7 @@ static bool metal_graph_encode_layer_attention_batch( (uint32_t)hc_dim, n_tokens, DS4_RMS_EPS) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(hc_mix_view, - model->map, - model->size, - layer->hc_attn_fn->abs_offset, - hc_dim, - mix_hc, - g->batch_flat_hc, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(hc_mix_view, model, layer->hc_attn_fn, hc_dim, mix_hc, g->batch_flat_hc, n_tokens); if (metal_graph_use_reference_hc_decode()) { if (ok) ok = ds4_gpu_hc_split_sinkhorn_tensor(hc_split_view, hc_mix_view, @@ -11307,27 +11360,13 @@ static bool metal_graph_encode_layer_attention_batch( fprintf(stderr, "ds4: Metal layer-major prefill needs attention compressor weights\n"); ok = false; } - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_kv, - model->map, - model->size, - layer->attn_compressor_kv->abs_offset, - DS4_N_EMBD, - comp_width, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_comp_kv, model, layer->attn_compressor_kv, DS4_N_EMBD, comp_width, g->batch_attn_norm, n_tokens); if (ok) metal_graph_debug_dump_tensor("attn_comp_kv_raw", g->batch_comp_kv, (uint64_t)comp_width * n_tokens, il, pos0); - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_sc, - model->map, - model->size, - layer->attn_compressor_gate->abs_offset, - DS4_N_EMBD, - comp_width, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_comp_sc, model, layer->attn_compressor_gate, DS4_N_EMBD, comp_width, g->batch_attn_norm, n_tokens); if (ok) metal_graph_debug_dump_tensor("attn_comp_score_raw", g->batch_comp_sc, (uint64_t)comp_width * n_tokens, @@ -11585,40 +11624,22 @@ static bool 
metal_graph_encode_layer_attention_batch( fprintf(stderr, "ds4: Metal layer-major prefill needs indexer weights\n"); ok = false; } - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_kv, - model->map, - model->size, - layer->indexer_compressor_kv->abs_offset, - DS4_N_EMBD, - index_width, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_comp_kv, model, layer->indexer_compressor_kv, DS4_N_EMBD, index_width, g->batch_attn_norm, n_tokens); if (ok) metal_graph_debug_dump_tensor("indexer_comp_kv_raw", g->batch_comp_kv, (uint64_t)index_width * n_tokens, il, pos0); - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_comp_sc, - model->map, - model->size, - layer->indexer_compressor_gate->abs_offset, - DS4_N_EMBD, - index_width, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_comp_sc, model, layer->indexer_compressor_gate, DS4_N_EMBD, index_width, g->batch_attn_norm, n_tokens); if (ok) metal_graph_debug_dump_tensor("indexer_comp_score_raw", g->batch_comp_sc, (uint64_t)index_width * n_tokens, il, pos0); - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_indexer_q, - model->map, - model->size, - layer->indexer_attn_q_b->abs_offset, - q_rank, - (uint64_t)DS4_N_INDEXER_HEAD * DS4_N_INDEXER_HEAD_DIM, - g->batch_qr_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_indexer_q, model, layer->indexer_attn_q_b, + q_rank, + (uint64_t)DS4_N_INDEXER_HEAD * DS4_N_INDEXER_HEAD_DIM, + g->batch_qr_norm, n_tokens); if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_indexer_q, n_tokens, DS4_N_INDEXER_HEAD, @@ -11633,14 +11654,7 @@ static bool metal_graph_encode_layer_attention_batch( attn_factor, DS4_ROPE_YARN_BETA_FAST, DS4_ROPE_YARN_BETA_SLOW) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_indexer_weights, - model->map, - model->size, - layer->indexer_proj->abs_offset, - DS4_N_EMBD, - DS4_N_INDEXER_HEAD, - g->batch_attn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_indexer_weights, model, layer->indexer_proj, DS4_N_EMBD, DS4_N_INDEXER_HEAD, g->batch_attn_norm, n_tokens); if (zero_prefix) { if (ok && n_comp > g->comp_cap) { fprintf(stderr, "ds4: Metal layer-major indexer cache capacity exceeded at layer %u\n", il); @@ -12298,14 +12312,7 @@ static bool metal_graph_encode_layer_ffn_batch( (uint32_t)hc_dim, n_tokens, DS4_RMS_EPS) != 0; - if (ok) ok = ds4_gpu_matmul_f16_tensor(hc_mix_view, - model->map, - model->size, - layer->hc_ffn_fn->abs_offset, - hc_dim, - mix_hc, - g->batch_flat_hc, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(hc_mix_view, model, layer->hc_ffn_fn, hc_dim, mix_hc, g->batch_flat_hc, n_tokens); if (metal_graph_use_reference_hc_decode()) { if (ok) ok = ds4_gpu_hc_split_sinkhorn_tensor(hc_split_view, hc_mix_view, @@ -12353,14 +12360,7 @@ static bool metal_graph_encode_layer_ffn_batch( (uint64_t)n_tokens * DS4_N_EMBD, il, pos0); } DS4_METAL_PROFILE_FFN_STAGE("norm"); - if (ok) ok = ds4_gpu_matmul_f16_tensor(g->batch_router_logits, - model->map, - model->size, - layer->ffn_gate_inp->abs_offset, - DS4_N_EMBD, - DS4_N_EXPERT, - g->batch_ffn_norm, - n_tokens) != 0; + if (ok) ok = metal_graph_matmul_plain_tensor(g->batch_router_logits, model, layer->ffn_gate_inp, DS4_N_EMBD, DS4_N_EXPERT, g->batch_ffn_norm, n_tokens); if (ok) ok = ds4_gpu_router_select_batch_tensor(g->batch_router_selected, g->batch_router_weights, @@ -12523,7 +12523,9 @@ static bool metal_graph_encode_layer_batch( uint32_t pos0, uint32_t n_tokens) { bool ok = 
metal_graph_encode_layer_attention_batch(g, model, layer, il, pos0, n_tokens); - if (ok) ok = metal_graph_encode_layer_ffn_batch(g, model, layer, il, pos0, n_tokens); + if (!ok) { fprintf(stderr, "ds4: TRACE attn_batch FAILED il=%u pos0=%u n_tokens=%u\n", il, pos0, n_tokens); return false; } + ok = metal_graph_encode_layer_ffn_batch(g, model, layer, il, pos0, n_tokens); + if (!ok) { fprintf(stderr, "ds4: TRACE ffn_batch FAILED il=%u pos0=%u n_tokens=%u\n", il, pos0, n_tokens); return false; } if (ok) { ds4_gpu_tensor *tmp = g->batch_cur_hc; g->batch_cur_hc = g->batch_next_hc; @@ -12639,7 +12641,8 @@ static bool metal_graph_eval_mtp_draft_from_hc( (uint32_t)base_weights->token_embd->dim[1], (uint32_t)token, DS4_N_EMBD, - 1) != 0; + 1, + base_weights->token_embd->type) != 0; if (ok) ok = ds4_gpu_rms_norm_weight_tensor(g->mtp_enorm, g->mtp_embed, mtp_model->map, @@ -13360,7 +13363,8 @@ static bool metal_graph_verify_decode2_exact( (uint32_t)weights->token_embd->dim[1], (uint32_t)token0, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; if (ok) ok = ds4_gpu_embed_token_hc_tensor(cur1, model->map, model->size, @@ -13368,7 +13372,8 @@ static bool metal_graph_verify_decode2_exact( (uint32_t)weights->token_embd->dim[1], (uint32_t)token1, DS4_N_EMBD, - DS4_N_HC) != 0; + DS4_N_HC, + weights->token_embd->type) != 0; ds4_gpu_tensor *saved_cur = g->cur_hc; ds4_gpu_tensor *saved_after = g->after_ffn_hc; @@ -13765,7 +13770,7 @@ static void embed_prompt( uint32_t n_embd, float * out) { for (int i = 0; i < tokens->len; i++) { - embed_token_f16(model, weights, tokens->v[i], out + (uint64_t)i * n_embd); + embed_token(model, weights, tokens->v[i], out + (uint64_t)i * n_embd); } } diff --git a/ds4_gpu.h b/ds4_gpu.h index 799065a8..3ae48c13 100644 --- a/ds4_gpu.h +++ b/ds4_gpu.h @@ -50,6 +50,7 @@ void ds4_gpu_print_memory_report(const char *label); * compressed-attention indexer that chooses visible compressed rows. */ +/* `weight_type` is the GGUF tensor type code (1 for F16, 8 for Q8_0). */ int ds4_gpu_embed_token_hc_tensor( ds4_gpu_tensor *out_hc, const void *model_map, @@ -58,7 +59,8 @@ int ds4_gpu_embed_token_hc_tensor( uint32_t n_vocab, uint32_t token, uint32_t n_embd, - uint32_t n_hc); + uint32_t n_hc, + uint32_t weight_type); int ds4_gpu_embed_tokens_hc_tensor( ds4_gpu_tensor *out_hc, @@ -69,7 +71,8 @@ int ds4_gpu_embed_tokens_hc_tensor( uint32_t n_vocab, uint32_t n_tokens, uint32_t n_embd, - uint32_t n_hc); + uint32_t n_hc, + uint32_t weight_type); int ds4_gpu_indexer_score_one_tensor( ds4_gpu_tensor *scores, diff --git a/ds4_metal.m b/ds4_metal.m index 6870bf9a..b87963d1 100644 --- a/ds4_metal.m +++ b/ds4_metal.m @@ -26,6 +26,8 @@ */ enum { + DS4_METAL_TENSOR_F16 = 1, + DS4_METAL_TENSOR_Q8_0 = 8, DS4_METAL_TENSOR_Q2_K = 10, DS4_METAL_TENSOR_Q4_K = 12, DS4_METAL_TENSOR_IQ2_XXS = 16, @@ -40,12 +42,14 @@ static id g_set_rows_f32_i32_pipeline; static id g_get_rows_f32_pipeline; static id g_get_rows_f16_pipeline; +static id g_get_rows_q8_0_pipeline; static id g_get_rows_i32_pipeline; static id g_repeat_f32_pipeline; static id g_concat_pipeline; static id g_cpy_f32_f32_pipeline; static id g_cpy_f32_f16_pipeline; static id g_cpy_f16_f32_pipeline; +static id g_cpy_q8_0_f32_pipeline; static id g_swiglu_pipeline; static id g_add_pipeline; static id g_mul_pipeline; @@ -1401,6 +1405,69 @@ static float ds4_gpu_negative_infinity(void) { return v.f; } +/* Total bytes for `n_elems` of an APE-style weight tensor of `ape_type`. 
+ * F32 (0) = 4 bytes/elem; F16 (1) = 2 bytes/elem; Q8_0 (8) = 34 bytes per + * QK8_0=32 elements (caller must ensure n_elems is 32-aligned for Q8_0). */ +static uint64_t ds4_gpu_ape_bytes(uint32_t ape_type, uint64_t n_elems) { + if (ape_type == 8u) return (n_elems / 32u) * 34u; + return n_elems * (ape_type == 1u ? 2u : 4u); +} + +/* Convert IEEE-754 half (bits) to float on the CPU. Used by the Q8_0 ape + * CPU-side dequant; the Metal-side `kernel_cpy_q8_0_f32` path turned out to + * produce silently wrong output on M5 Max for the compressor APE shapes we + * hit, so the pre-add scratch is filled from the host instead. */ +static float ds4_gpu_half_bits_to_float(uint16_t bits) { + const uint32_t sign = (uint32_t)(bits >> 15) & 0x1u; + const uint32_t exph = (uint32_t)(bits >> 10) & 0x1fu; + const uint32_t mant = (uint32_t)bits & 0x3ffu; + uint32_t f32bits; + if (exph == 0u) { + if (mant == 0u) { + f32bits = sign << 31; + } else { + /* subnormal half: value = mant * 2^-24; count shifts to normalize */ + uint32_t e = 0u, m = mant; + while ((m & 0x400u) == 0u) { m <<= 1; e++; } + m &= 0x3ffu; + f32bits = (sign << 31) | ((127u - 15u - e + 1u) << 23) | (m << 13); + } + } else if (exph == 0x1fu) { + f32bits = (sign << 31) | (0xffu << 23) | (mant << 13); + } else { + f32bits = (sign << 31) | ((exph + 127u - 15u) << 23) | (mant << 13); + } + float v; + memcpy(&v, &f32bits, sizeof(v)); + return v; +} + +/* Dequantize `row_count` rows of `row_elems` Q8_0-stored values from `src` + * into contiguous F32 at `dst`. `row_elems` must be a multiple of QK8_0=32. + * Source layout is contiguous block_q8_0 records (no per-row padding), so + * one row is `row_elems / 32` blocks of 34 bytes each. */ +static void ds4_gpu_cpu_dequant_q8_0_rows( + float *dst, + const uint8_t *src, + uint32_t row_count, + uint32_t row_elems) { + const uint32_t blocks_per_row = row_elems / 32u; + for (uint32_t r = 0; r < row_count; r++) { + const uint8_t *row = src + (uint64_t)r * (uint64_t)blocks_per_row * 34u; + float *out = dst + (uint64_t)r * row_elems; + for (uint32_t b = 0; b < blocks_per_row; b++) { + const uint8_t *blk = row + (uint64_t)b * 34u; + uint16_t scale_bits; + memcpy(&scale_bits, blk, sizeof(scale_bits)); + const float scale = ds4_gpu_half_bits_to_float(scale_bits); + const int8_t *qs = (const int8_t *)(blk + 2); + float *out_blk = out + (uint64_t)b * 32u; + for (uint32_t k = 0; k < 32u; k++) { + out_blk[k] = (float)qs[k] * scale; + } + } + } +} + static float ds4_gpu_positive_infinity(void) { union { uint32_t u; float f; } v = { 0x7f800000u }; return v.f; @@ -2742,6 +2809,23 @@ int ds4_gpu_init(void) { return 0; } + fn = [library newFunctionWithName:@"kernel_get_rows_q8_0"]; + if (!fn) { + fprintf(stderr, "ds4: Metal kernel_get_rows_q8_0 function not found\n"); + g_queue = nil; + g_device = nil; + return 0; + } + + g_get_rows_q8_0_pipeline = [g_device newComputePipelineStateWithFunction:fn error:&error]; + if (!g_get_rows_q8_0_pipeline) { + fprintf(stderr, "ds4: Metal kernel_get_rows_q8_0 pipeline failed: %s\n", + [[error localizedDescription] UTF8String]); + g_queue = nil; + g_device = nil; + return 0; + } + fn = [library newFunctionWithName:@"kernel_get_rows_i32"]; if (!fn) { fprintf(stderr, "ds4: Metal kernel_get_rows_i32 function not found\n"); @@ -2861,6 +2945,22 @@ int ds4_gpu_init(void) { return 0; } + fn = [library newFunctionWithName:@"kernel_cpy_q8_0_f32"]; + if (!fn) { + fprintf(stderr, "ds4: Metal kernel_cpy_q8_0_f32 function not found\n"); + g_queue = nil; + g_device = nil; + return 0; + } + g_cpy_q8_0_f32_pipeline = [g_device newComputePipelineStateWithFunction:fn 
error:&error]; + if (!g_cpy_q8_0_f32_pipeline) { + fprintf(stderr, "ds4: Metal kernel_cpy_q8_0_f32 pipeline failed: %s\n", + [[error localizedDescription] UTF8String]); + g_queue = nil; + g_device = nil; + return 0; + } + fn = [library newFunctionWithName:@"kernel_dsv4_fp8_kv_quantize_f32"]; if (!fn) { fprintf(stderr, "ds4: Metal kernel_dsv4_fp8_kv_quantize_f32 function not found\n"); @@ -3955,12 +4055,14 @@ void ds4_gpu_cleanup(void) { g_set_rows_f32_i32_pipeline = nil; g_get_rows_f32_pipeline = nil; g_get_rows_f16_pipeline = nil; + g_get_rows_q8_0_pipeline = nil; g_get_rows_i32_pipeline = nil; g_repeat_f32_pipeline = nil; g_concat_pipeline = nil; g_cpy_f32_f32_pipeline = nil; g_cpy_f32_f16_pipeline = nil; g_cpy_f16_f32_pipeline = nil; + g_cpy_q8_0_f32_pipeline = nil; g_swiglu_pipeline = nil; g_add_pipeline = nil; g_mul_pipeline = nil; @@ -4090,7 +4192,9 @@ void ds4_gpu_cleanup(void) { } } -static int ds4_gpu_encode_get_rows_f16( +static int ds4_gpu_embed_row_layout(uint32_t, uint32_t, uint64_t *, id *); + +static int ds4_gpu_encode_get_rows( id cb, id weight, NSUInteger weight_offset, @@ -4100,12 +4204,16 @@ static int ds4_gpu_encode_get_rows_f16( NSUInteger out_offset, uint32_t n_vocab, uint32_t n_tokens, - uint32_t n_embd) { + uint32_t n_embd, + uint32_t weight_type) { if (!cb || !weight || !tokens || !out || n_vocab == 0 || n_tokens == 0 || n_embd == 0) { return 0; } - const uint64_t src_row_bytes = (uint64_t)n_embd * sizeof(uint16_t); + uint64_t src_row_bytes = 0; + id pipeline = nil; + if (!ds4_gpu_embed_row_layout(weight_type, n_embd, &src_row_bytes, &pipeline)) return 0; + const uint64_t dst_row_bytes = (uint64_t)n_embd * sizeof(float); const uint64_t token_bytes = (uint64_t)n_tokens * sizeof(int32_t); ds4_gpu_get_rows_args args = { @@ -4124,13 +4232,13 @@ static int ds4_gpu_encode_get_rows_f16( }; NSUInteger nth = (NSUInteger)n_embd; - const NSUInteger max_threads = g_get_rows_f16_pipeline.maxTotalThreadsPerThreadgroup; + const NSUInteger max_threads = pipeline.maxTotalThreadsPerThreadgroup; if (nth > max_threads) nth = max_threads; if (nth == 0) nth = 1; const NSUInteger nw0 = ((NSUInteger)n_embd + nth - 1u) / nth; id enc = ds4_gpu_compute_encoder(cb); - [enc setComputePipelineState:g_get_rows_f16_pipeline]; + [enc setComputePipelineState:pipeline]; [enc setBytes:&args length:sizeof(args) atIndex:0]; [enc setBuffer:weight offset:weight_offset atIndex:1]; [enc setBuffer:tokens offset:tokens_offset atIndex:2]; @@ -4188,6 +4296,33 @@ static int ds4_gpu_encode_repeat_hc_embedding( return 1; } +/* Per-row stride and gather pipeline for a token-embedding tensor type. + * Returns 0 (and a NULL pipeline) if the type is not supported. */ +static int ds4_gpu_embed_row_layout( + uint32_t weight_type, + uint32_t n_embd, + uint64_t *out_src_row_bytes, + id *out_pipeline) { + switch (weight_type) { + case DS4_METAL_TENSOR_F16: + *out_src_row_bytes = (uint64_t)n_embd * sizeof(uint16_t); + *out_pipeline = g_get_rows_f16_pipeline; + return 1; + case DS4_METAL_TENSOR_Q8_0: + if ((n_embd % 32) != 0) { + fprintf(stderr, "ds4: Q8_0 token embedding stride %u is not 32-aligned\n", n_embd); + return 0; + } + /* block_q8_0 is sizeof(half) + 32*sizeof(int8_t) = 34 bytes per QK8_0 elements. 
*/ + *out_src_row_bytes = (uint64_t)(n_embd / 32) * 34; + *out_pipeline = g_get_rows_q8_0_pipeline; + return 1; + default: + fprintf(stderr, "ds4: token embedding type %u not supported by Metal embed\n", weight_type); + return 0; + } +} + int ds4_gpu_embed_token_hc_tensor( ds4_gpu_tensor *out_hc, const void *model_map, @@ -4196,7 +4331,8 @@ int ds4_gpu_embed_token_hc_tensor( uint32_t n_vocab, uint32_t token, uint32_t n_embd, - uint32_t n_hc) { + uint32_t n_hc, + uint32_t weight_type) { if (!g_initialized && !ds4_gpu_init()) return 0; if (!out_hc || !model_map || n_vocab == 0 || token >= n_vocab || n_embd == 0 || n_hc == 0) { return 0; @@ -4210,7 +4346,11 @@ int ds4_gpu_embed_token_hc_tensor( return 0; } - const uint64_t weight_bytes = (uint64_t)n_vocab * n_embd * sizeof(uint16_t); + uint64_t src_row_bytes = 0; + id pipeline = nil; + if (!ds4_gpu_embed_row_layout(weight_type, n_embd, &src_row_bytes, &pipeline)) return 0; + + const uint64_t weight_bytes = (uint64_t)n_vocab * src_row_bytes; if (weight_offset > model_size || weight_bytes > model_size - weight_offset) { fprintf(stderr, "ds4: Metal graph embedding range is outside the mapped model\n"); return 0; @@ -4233,7 +4373,6 @@ int ds4_gpu_embed_token_hc_tensor( if (!cb) return 0; const int32_t token_i32 = (int32_t)token; - const uint64_t src_row_bytes = (uint64_t)n_embd * sizeof(uint16_t); const uint64_t dst_row_bytes = (uint64_t)n_embd * sizeof(float); ds4_gpu_get_rows_args args = { .ne00t = (int32_t)n_embd, @@ -4250,12 +4389,12 @@ int ds4_gpu_embed_token_hc_tensor( .nb3 = dst_row_bytes, }; NSUInteger nth = (NSUInteger)n_embd; - const NSUInteger max_threads = g_get_rows_f16_pipeline.maxTotalThreadsPerThreadgroup; + const NSUInteger max_threads = pipeline.maxTotalThreadsPerThreadgroup; if (nth > max_threads) nth = max_threads; if (nth == 0) nth = 1; const NSUInteger nw0 = ((NSUInteger)n_embd + nth - 1u) / nth; id enc = ds4_gpu_compute_encoder(cb); - [enc setComputePipelineState:g_get_rows_f16_pipeline]; + [enc setComputePipelineState:pipeline]; [enc setBytes:&args length:sizeof(args) atIndex:0]; [enc setBuffer:wbuf offset:(NSUInteger)inner_offset atIndex:1]; [enc setBytes:&token_i32 length:sizeof(token_i32) atIndex:2]; @@ -4290,7 +4429,8 @@ int ds4_gpu_embed_tokens_hc_tensor( uint32_t n_vocab, uint32_t n_tokens, uint32_t n_embd, - uint32_t n_hc) { + uint32_t n_hc, + uint32_t weight_type) { if (!g_initialized && !ds4_gpu_init()) return 0; if (!out_hc || !tokens || !model_map || n_vocab == 0 || n_tokens == 0 || n_embd == 0 || n_hc == 0) { return 0; @@ -4308,7 +4448,11 @@ int ds4_gpu_embed_tokens_hc_tensor( return 0; } - const uint64_t weight_bytes = (uint64_t)n_vocab * n_embd * sizeof(uint16_t); + uint64_t src_row_bytes = 0; + id _row_pipeline = nil; + if (!ds4_gpu_embed_row_layout(weight_type, n_embd, &src_row_bytes, &_row_pipeline)) return 0; + + const uint64_t weight_bytes = (uint64_t)n_vocab * src_row_bytes; if (weight_offset > model_size || weight_bytes > model_size - weight_offset) { fprintf(stderr, "ds4: Metal graph batched embedding range is outside the mapped model\n"); return 0; @@ -4330,16 +4474,17 @@ int ds4_gpu_embed_tokens_hc_tensor( id cb = ds4_gpu_command_buffer(&owned); if (!cb) return 0; - if (!ds4_gpu_encode_get_rows_f16(cb, - wbuf, - (NSUInteger)inner_offset, - tokbuf, - ds4_gpu_tensor_offset(tokens), - g_embed_rows_buffer, - 0, - n_vocab, - n_tokens, - n_embd) || + if (!ds4_gpu_encode_get_rows(cb, + wbuf, + (NSUInteger)inner_offset, + tokbuf, + ds4_gpu_tensor_offset(tokens), + g_embed_rows_buffer, + 0, + n_vocab, + 
n_tokens, + n_embd, + weight_type) || !ds4_gpu_encode_repeat_hc_embedding(cb, g_embed_rows_buffer, 0, @@ -5353,13 +5498,13 @@ int ds4_gpu_matmul_f32_tensor( const ds4_gpu_tensor *x, uint64_t n_tok) { if (!g_initialized && !ds4_gpu_init()) return 0; - if (in_dim > UINT32_MAX || out_dim > UINT32_MAX || n_tok > UINT32_MAX || n_tok != 1) return 0; + if (in_dim > UINT32_MAX || out_dim > UINT32_MAX || n_tok > UINT32_MAX) return 0; @autoreleasepool { id xbuf = ds4_gpu_tensor_buffer(x); id outbuf = ds4_gpu_tensor_buffer(out); - const uint64_t x_bytes = in_dim * sizeof(float); - const uint64_t out_bytes = out_dim * sizeof(float); + const uint64_t x_bytes = n_tok * in_dim * sizeof(float); + const uint64_t out_bytes = n_tok * out_dim * sizeof(float); if (!xbuf || !outbuf || ds4_gpu_tensor_bytes(x) < x_bytes || ds4_gpu_tensor_bytes(out) < out_bytes) { @@ -5382,6 +5527,36 @@ int ds4_gpu_matmul_f32_tensor( id cb = ds4_gpu_command_buffer(&owned); if (!cb) return 0; + if (n_tok > 1) { + /* Multi-token prefill path: kernel_mul_mm_f32_f32 mirrors the + * existing F16/Q8_0 mul_mm_t instantiations in dense.metal. */ + const bool bc_inp = (in_dim % 32u) != 0; + const bool bc_out = (out_dim % 64u) != 0 || (n_tok % 32u) != 0; + id pipeline = + ds4_gpu_get_mul_mm_pipeline("kernel_mul_mm_f32_f32", bc_inp, bc_out); + if (!pipeline) return 0; + + ds4_gpu_mul_mm_args args = ds4_gpu_make_mm_args(in_dim, out_dim, n_tok, row_bytes); + + id enc = ds4_gpu_compute_encoder(cb); + [enc setComputePipelineState:pipeline]; + [enc setBytes:&args length:sizeof(args) atIndex:0]; + [enc setBuffer:wbuf offset:(NSUInteger)inner_offset atIndex:1]; + [enc setBuffer:xbuf offset:ds4_gpu_tensor_offset(x) atIndex:2]; + [enc setBuffer:outbuf offset:ds4_gpu_tensor_offset(out) atIndex:3]; + [enc setThreadgroupMemoryLength:(bc_out ? 8192u : 6144u) atIndex:0]; + [enc dispatchThreadgroups:MTLSizeMake(((NSUInteger)n_tok + 31u) / 32u, + ((NSUInteger)out_dim + 63u) / 64u, + 1) + threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + ds4_gpu_end_compute_encoder(cb, enc); + + if (!ds4_gpu_finish_command_buffer(cb, owned, "F32 tensor matmul")) { + return 0; + } + return 1; + } + ds4_gpu_q8_0_matvec_args mv_args = ds4_gpu_make_f32_mv_args(in_dim, out_dim, 1); ds4_gpu_mv_dispatch mv_dispatch = ds4_gpu_make_plain_mv_dispatch(in_dim, 1); mv_args.nr0 = mv_dispatch.nr0; @@ -6171,7 +6346,15 @@ static int ds4_gpu_encode_compressor_score_with_ape( uint32_t n_tokens) { if (!cb || !score_src || !score_dst || !apebuf || width == 0 || ratio == 0 || n_tokens == 0 || - (ape_type != 0u && ape_type != 1u)) { + (ape_type != 0u && ape_type != 1u && ape_type != 8u)) { + return 0; + } + /* Q8_0 ape rows must be QK8_0-aligned (the dequant kernel walks whole + * blocks and src_off must land on a block boundary). */ + if (ape_type == 8u && (width % 32u) != 0u) { + fprintf(stderr, + "ds4: Metal compressor APE Q8_0 width %u is not 32-aligned\n", + width); return 0; } @@ -6189,34 +6372,93 @@ static int ds4_gpu_encode_compressor_score_with_ape( return 0; } - const uint64_t elem_ape = ape_type == 1u ? 
2u : 4u; - uint32_t copied_rows = 0; - uint32_t pos_mod = pos0 % ratio; - while (copied_rows < n_tokens) { - uint32_t seg_rows = ratio - pos_mod; - if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; - const uint32_t seg_elems = seg_rows * width; - const NSUInteger src_off = ape_offset + (NSUInteger)pos_mod * width * elem_ape; - const NSUInteger dst_off = (NSUInteger)copied_rows * width * sizeof(float); - int ok; - if (ape_type == 1u) { - ok = ds4_gpu_encode_cpy_f16_f32_1d(cb, - apebuf, - src_off, - g_compressor_store_ape_buffer, - dst_off, - seg_elems); - } else { - ok = ds4_gpu_encode_cpy_f32_f32_1d(cb, - apebuf, - src_off, - g_compressor_store_ape_buffer, - dst_off, - seg_elems); + /* Per-row source stride. F16/F32 use natural element bytes; Q8_0 uses + * (width/32) * sizeof(block_q8_0) = (width/32) * 34 bytes. */ + const uint64_t row_src_bytes = ape_type == 8u + ? (uint64_t)(width / 32u) * 34u + : (uint64_t)width * (ape_type == 1u ? 2u : 4u); + + /* For Q8_0 ape, dequant on CPU into a *per-call* private buffer. The + * Metal-side `kernel_cpy_q8_0_f32` path produced silently wrong output on + * M5 Max for compressor APE shapes; using the shared g_compressor_store_ape_buffer + * with CPU writes also produces wrong output because multiple CPU writes + * to the same scratch in one command buffer collapse to the last write + * at execute time (Metal kernels run in encode order, but CPU writes + * don't participate in that ordering when the same scratch is reused). + * + * The local buffer is retained until cb completes via addCompletedHandler + * because Metal does NOT strongly retain buffers bound to encoders. */ + if (ape_type == 8u) { + const uint8_t *apebytes = (const uint8_t *) [apebuf contents]; + if (!apebytes) { + fprintf(stderr, "ds4: Metal compressor APE Q8_0: source buffer has no CPU contents\n"); + return 0; + } + const NSUInteger ape_call_bytes = (NSUInteger)total_elems * sizeof(float); + id ape_call_buf = [g_device newBufferWithLength:ape_call_bytes + options:MTLResourceStorageModeShared]; + if (!ape_call_buf) { + fprintf(stderr, "ds4: Metal compressor APE Q8_0: per-call scratch alloc failed\n"); + return 0; + } + float *scratch = (float *) [ape_call_buf contents]; + uint32_t copied_rows = 0; + uint32_t pos_mod = pos0 % ratio; + while (copied_rows < n_tokens) { + uint32_t seg_rows = ratio - pos_mod; + if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; + const uint64_t src_off = (uint64_t)ape_offset + (uint64_t)pos_mod * row_src_bytes; + ds4_gpu_cpu_dequant_q8_0_rows(scratch + (uint64_t)copied_rows * width, + apebytes + src_off, + seg_rows, + width); + copied_rows += seg_rows; + pos_mod = 0; + } + const int add_ok = ds4_gpu_encode_add_f32_1d(cb, + score_src, + score_src_offset, + ape_call_buf, + 0, + score_dst, + score_dst_offset, + total_elems); + if (add_ok) { + /* Keep ape_call_buf alive until the GPU is done reading it. 
*/ + [cb addCompletedHandler:^(id _Nonnull __unused done) { + (void) ape_call_buf; + }]; + } + return add_ok; + } else { + uint32_t copied_rows = 0; + uint32_t pos_mod = pos0 % ratio; + while (copied_rows < n_tokens) { + uint32_t seg_rows = ratio - pos_mod; + if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; + const uint32_t seg_elems = seg_rows * width; + const NSUInteger src_off = ape_offset + (NSUInteger)pos_mod * row_src_bytes; + const NSUInteger dst_off = (NSUInteger)copied_rows * width * sizeof(float); + int ok; + if (ape_type == 1u) { + ok = ds4_gpu_encode_cpy_f16_f32_1d(cb, + apebuf, + src_off, + g_compressor_store_ape_buffer, + dst_off, + seg_elems); + } else { + ok = ds4_gpu_encode_cpy_f32_f32_1d(cb, + apebuf, + src_off, + g_compressor_store_ape_buffer, + dst_off, + seg_elems); + } + if (!ok) return 0; + copied_rows += seg_rows; + pos_mod = 0; } - if (!ok) return 0; - copied_rows += seg_rows; - pos_mod = 0; } return ds4_gpu_encode_add_f32_1d(cb, @@ -6302,7 +6544,7 @@ static int ds4_gpu_compressor_store_one_tensor( uint32_t ratio, uint32_t pos) { if (!kv || !sc || !state_kv || !state_score || !model_map || - width == 0 || ratio == 0 || (ape_type != 0u && ape_type != 1u)) { + width == 0 || ratio == 0 || (ape_type != 0u && ape_type != 1u && ape_type != 8u)) { return 0; } @@ -6312,10 +6554,9 @@ static int ds4_gpu_compressor_store_one_tensor( if (!pipeline) return 0; const uint32_t state_rows = ratio == 4u ? 2u * ratio : ratio; - const uint64_t elem_ape = ape_type == 1u ? 2u : 4u; const uint64_t row_bytes = (uint64_t)width * sizeof(float); const uint64_t state_bytes = (uint64_t)state_rows * row_bytes; - const uint64_t ape_bytes = (uint64_t)width * ratio * elem_ape; + const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width * ratio); if (ape_offset > model_size || ape_bytes > model_size - ape_offset || ds4_gpu_tensor_bytes(kv) < row_bytes || ds4_gpu_tensor_bytes(sc) < row_bytes || @@ -6377,7 +6618,7 @@ int ds4_gpu_compressor_store_batch_tensor( if (!g_initialized && !ds4_gpu_init()) return 0; if (!kv || !sc || !state_kv || !state_score || !model_map || head_dim == 0 || ratio == 0 || n_tokens == 0 || - (ape_type != 0u && ape_type != 1u)) { + (ape_type != 0u && ape_type != 1u && ape_type != 8u)) { return 0; } @@ -6385,10 +6626,9 @@ int ds4_gpu_compressor_store_batch_tensor( const uint32_t coff = ratio == 4u ? 2u : 1u; const uint32_t width = coff * head_dim; const uint32_t state_rows = coff * ratio; - const uint64_t elem_ape = ape_type == 1u ? 
2u : 4u; const uint64_t kv_bytes = (uint64_t)n_tokens * width * sizeof(float); const uint64_t state_bytes = (uint64_t)state_rows * width * sizeof(float); - const uint64_t ape_bytes = (uint64_t)width * ratio * elem_ape; + const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width * ratio); if (ape_offset > model_size || ape_bytes > model_size - ape_offset) { fprintf(stderr, "ds4: Metal compressor batch APE range is outside the mapped model\n"); @@ -6450,44 +6690,83 @@ int ds4_gpu_compressor_store_batch_tensor( } int ok = 1; - uint32_t copied_rows = 0; - uint32_t pos_mod = pos0 % ratio; - while (ok && copied_rows < n_tokens) { - uint32_t seg_rows = ratio - pos_mod; - if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; - const uint32_t seg_elems = seg_rows * width; - const NSUInteger src_off = (NSUInteger)ape_inner + - (NSUInteger)pos_mod * width * elem_ape; - const NSUInteger dst_off = (NSUInteger)copied_rows * width * sizeof(float); - if (ape_type == 1u) { - ok = ds4_gpu_encode_cpy_f16_f32_1d(cb, - apebuf, - src_off, - g_compressor_store_ape_buffer, - dst_off, - seg_elems); - } else { - ok = ds4_gpu_encode_cpy_f32_f32_1d(cb, - apebuf, - src_off, - g_compressor_store_ape_buffer, - dst_off, - seg_elems); + const uint64_t row_src_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width); + id ape_call_buf = nil; + id ape_consume_buf = g_compressor_store_ape_buffer; + NSUInteger ape_consume_off = 0; + if (ape_type == 8u) { + /* CPU-side dequant for Q8_0: write to a fresh per-call shared + * buffer so the data survives encode-time race vs subsequent + * kernel reads (see ds4_gpu_encode_compressor_score_with_ape). */ + const uint8_t *apebytes = (const uint8_t *) [apebuf contents]; + const NSUInteger ape_call_bytes = (NSUInteger)total_elems * sizeof(float); + ape_call_buf = [g_device newBufferWithLength:ape_call_bytes + options:MTLResourceStorageModeShared]; + float *scratch = ape_call_buf ? 
(float *) [ape_call_buf contents] : NULL; + if (!apebytes || !scratch) { + fprintf(stderr, "ds4: Metal compressor APE Q8_0: per-call scratch alloc failed\n"); + ok = 0; + } + ape_consume_buf = ape_call_buf; + uint32_t copied_rows = 0; + uint32_t pos_mod = pos0 % ratio; + while (ok && copied_rows < n_tokens) { + uint32_t seg_rows = ratio - pos_mod; + if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; + const uint64_t src_off = (uint64_t)ape_inner + (uint64_t)pos_mod * row_src_bytes; + ds4_gpu_cpu_dequant_q8_0_rows(scratch + (uint64_t)copied_rows * width, + apebytes + src_off, + seg_rows, + width); + copied_rows += seg_rows; + pos_mod = 0; + } + } else { + uint32_t copied_rows = 0; + uint32_t pos_mod = pos0 % ratio; + while (ok && copied_rows < n_tokens) { + uint32_t seg_rows = ratio - pos_mod; + if (seg_rows > n_tokens - copied_rows) seg_rows = n_tokens - copied_rows; + const uint32_t seg_elems = seg_rows * width; + const NSUInteger src_off = (NSUInteger)ape_inner + + (NSUInteger)pos_mod * row_src_bytes; + const NSUInteger dst_off = (NSUInteger)copied_rows * width * sizeof(float); + if (ape_type == 1u) { + ok = ds4_gpu_encode_cpy_f16_f32_1d(cb, + apebuf, + src_off, + g_compressor_store_ape_buffer, + dst_off, + seg_elems); + } else { + ok = ds4_gpu_encode_cpy_f32_f32_1d(cb, + apebuf, + src_off, + g_compressor_store_ape_buffer, + dst_off, + seg_elems); + } + copied_rows += seg_rows; + pos_mod = 0; } - copied_rows += seg_rows; - pos_mod = 0; } if (ok) { ok = ds4_gpu_encode_add_f32_1d(cb, scbuf, ds4_gpu_tensor_offset(sc), - g_compressor_store_ape_buffer, - 0, + ape_consume_buf, + ape_consume_off, g_compressor_store_score_buffer, 0, total_elems); } + if (ok && ape_call_buf) { + /* Retain the per-call ape buffer until the GPU is done. */ + [cb addCompletedHandler:^(id _Nonnull __unused done) { + (void) ape_call_buf; + }]; + } if (ok) { ok = ds4_gpu_encode_set_rows_f32_i32(cb, state_kv, @@ -7020,7 +7299,7 @@ int ds4_gpu_compressor_prefill_tensor( if (!comp_cache || !state_kv || !state_score || !kv || !sc || !model_map || head_dim == 0 || ratio == 0 || n_tokens == 0 || n_rot > head_dim || (n_rot & 1u) != 0 || - (ape_type != 0u && ape_type != 1u) || + (ape_type != 0u && ape_type != 1u && ape_type != 8u) || norm_type != 0u) { return 0; } @@ -7032,11 +7311,10 @@ int ds4_gpu_compressor_prefill_tensor( const uint32_t n_comp = n_tokens / ratio; const uint32_t cutoff = n_comp * ratio; const uint32_t rem = n_tokens - cutoff; - const uint64_t elem_ape = ape_type == 1u ? 
@@ -7375,7 +7653,7 @@ int ds4_gpu_compressor_prefill_ratio4_replay_tensor(
     if (!comp_cache || !state_kv || !state_score || !kv || !sc || !model_map ||
         head_dim == 0 || n_tokens == 0 || (n_tokens & 3u) != 0 || (pos0 & 3u) != 0 ||
         n_rot > head_dim || (n_rot & 1u) != 0 ||
-        (ape_type != 0u && ape_type != 1u) ||
+        (ape_type != 0u && ape_type != 1u && ape_type != 8u) ||
         norm_type != 0u) {
         return 0;
     }
@@ -7385,11 +7663,10 @@
     const uint32_t width = 2u * head_dim;
     const uint32_t state_rows = 8u;
     const uint32_t n_comp = n_tokens / ratio;
-    const uint64_t elem_ape = ape_type == 1u ? 2u : 4u;
     const uint64_t kv_bytes = (uint64_t)n_tokens * width * sizeof(float);
     const uint64_t state_bytes = (uint64_t)state_rows * width * sizeof(float);
     const uint64_t comp_bytes = (uint64_t)n_comp * head_dim * sizeof(float);
-    const uint64_t ape_bytes = (uint64_t)width * ratio * elem_ape;
+    const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width * ratio);
     const uint64_t norm_bytes = (uint64_t)head_dim * sizeof(float);
 
     if (ape_offset > model_size || ape_bytes > model_size - ape_offset ||
@@ -7669,7 +7946,7 @@ int ds4_gpu_compressor_prefill_state_ratio4_tensor(
     uint32_t pos0) {
     if (!g_initialized && !ds4_gpu_init()) return 0;
     if (!state_kv || !state_score || !kv_tail || !sc_tail || !model_map ||
-        head_dim == 0 || (ape_type != 0u && ape_type != 1u)) {
+        head_dim == 0 || (ape_type != 0u && ape_type != 1u && ape_type != 8u)) {
         return 0;
     }
 
@@ -7677,10 +7954,9 @@
     const uint32_t ratio = 4u;
     const uint32_t width = 2u * head_dim;
     const uint32_t state_rows = 8u;
-    const uint64_t elem_ape = ape_type == 1u ? 2u : 4u;
     const uint64_t tail_bytes = (uint64_t)ratio * width * sizeof(float);
     const uint64_t state_bytes = (uint64_t)state_rows * width * sizeof(float);
-    const uint64_t ape_bytes = (uint64_t)ratio * width * elem_ape;
+    const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)ratio * width);
     if (ape_offset > model_size || ape_bytes > model_size - ape_offset) {
         fprintf(stderr,
             "ds4: Metal compressor prefill-state APE range is outside the mapped model\n");
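The ape_type codes these entry points now accept (0, 1, 8) line up with the ggml/GGUF tensor type ids (GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1, GGML_TYPE_Q8_0 = 8), which is presumably where they come from. A hypothetical sketch of how a caller might derive the argument from the compressor APE tensor's storage type; the function name and the mapping itself are assumptions, while ds4_die and the DS4_TENSOR_* constants are from ds4.c.

    /* Hypothetical mapping from the tensor's storage type to the ape_type code
     * the Metal compressor entry points accept. */
    static uint32_t ape_type_for_tensor(const ds4_tensor *t) {
        switch (t->type) {
        case DS4_TENSOR_F32:  return 0u;
        case DS4_TENSOR_F16:  return 1u;
        case DS4_TENSOR_Q8_0: return 8u;
        default:
            ds4_die("compressor APE tensor must be F16, F32, or Q8_0");
            return 0u; /* not reached */
        }
    }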
@@ -7783,7 +8059,7 @@ int ds4_gpu_compressor_update_tensor(
     if (!kv_cur || !sc_cur || !state_kv || !state_score || !comp_cache || !model_map ||
         head_dim == 0 || ratio == 0 ||
         n_rot > head_dim || (n_rot & 1u) != 0 ||
-        (ape_type != 0u && ape_type != 1u) ||
+        (ape_type != 0u && ape_type != 1u && ape_type != 8u) ||
         norm_type != 0u) {
         return 0;
     }
@@ -7793,11 +8069,10 @@
     const uint32_t width = coff * head_dim;
     const uint32_t state_rows = coff * ratio;
     const uint32_t emit = ((pos + 1u) % ratio) == 0u ? 1u : 0u;
-    const uint64_t elem_ape = ape_type == 1u ? 2u : 4u;
     const uint64_t kv_bytes = (uint64_t)width * sizeof(float);
     const uint64_t state_bytes = (uint64_t)state_rows * width * sizeof(float);
     const uint64_t comp_bytes = (uint64_t)(comp_row + (emit ? 1u : 0u)) * head_dim * sizeof(float);
-    const uint64_t ape_bytes = (uint64_t)width * ratio * elem_ape;
+    const uint64_t ape_bytes = ds4_gpu_ape_bytes(ape_type, (uint64_t)width * ratio);
     const uint64_t norm_bytes = (uint64_t)head_dim * sizeof(float);
 
     if (ape_offset > model_size || ape_bytes > model_size - ape_offset ||
diff --git a/metal/cpy.metal b/metal/cpy.metal
index 3aa00ac1..b2558bab 100644
--- a/metal/cpy.metal
+++ b/metal/cpy.metal
@@ -55,3 +55,41 @@ typedef decltype(kernel_cpy_t_t) kernel_cpy_t;
 template [[host_name("kernel_cpy_f32_f32")]] kernel kernel_cpy_t kernel_cpy_t_t;
 template [[host_name("kernel_cpy_f32_f16")]] kernel kernel_cpy_t kernel_cpy_t_t;
 template [[host_name("kernel_cpy_f16_f32")]] kernel kernel_cpy_t kernel_cpy_t_t;
+
+// Q8_0 -> F32 dequantizing 1D copy. Used by the compressor APE path so
+// stock-recipe GGUFs that ship `*.compressor_ape.weight` as Q8_0 can be
+// read through the same byte-strided copy that the F16/F32 ape paths use.
+// args.ne00 is the total element count (must be divisible by QK8_0 = 32);
+// src is a packed Q8_0 region (sizeof(block_q8_0) = 34 bytes per QK8_0 elements)
+// and dst is contiguous F32.
+//
+// Uses explicit byte arithmetic instead of `block_q8_0 *` indexing because
+// the GGUF byte stride (34) does not match Metal's natural struct alignment
+// for `block_q8_0` (which would be padded to a multiple of `alignof(half)`
+// in some cases). Each thread handles one output element and re-reads the
+// half scale from its block's first two bytes; that's redundant but cheap
+// and the compressor APE total element count is tiny (a few thousand).
+kernel void kernel_cpy_q8_0_f32(
+        constant ds4_metal_args_cpy & args,
+        device const char * src0,
+        device char * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort3 ntg[[threads_per_threadgroup]]) {
+    const int n = (int) args.ne00;
+    const int gid = (int)(tgpig.x * ntg.x + tiitg);
+    if (gid >= n) return;
+
+    constexpr int BLOCK_BYTES = 34;
+    const int blk = gid / QK8_0;
+    const int idx = gid - blk * QK8_0;
+    device const char *bp = src0 + (uint64_t)blk * BLOCK_BYTES;
+    half d_h;
+    /* half scale lives at the first 2 bytes of the block */
+    thread char *dp = (thread char *) &d_h;
+    dp[0] = bp[0]; dp[1] = bp[1];
+    const float d = (float) d_h;
+    const int8_t q = (int8_t) bp[2 + idx];
+    device float *out = (device float *) dst;
+    out[gid] = (float) q * d;
+}
diff --git a/metal/dense.metal b/metal/dense.metal
index a84927e9..fa922b45 100644
--- a/metal/dense.metal
+++ b/metal/dense.metal
@@ -1116,6 +1116,7 @@ kernel void kernel_mul_mm(
 typedef decltype(kernel_mul_mm) mul_mm_t;
 
-// Host-visible prefill matmul variants for F16 and Q8_0 weights.
+// Host-visible prefill matmul variants for F16, Q8_0, and F32 weights.
 template [[host_name("kernel_mul_mm_f16_f32")]] kernel mul_mm_t kernel_mul_mm;
 template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mul_mm_t kernel_mul_mm;
+template [[host_name("kernel_mul_mm_f32_f32")]] kernel mul_mm_t kernel_mul_mm;
 
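kernel_cpy_q8_0_f32 gives the GPU an alternative to the CPU-side dequant used in ds4_gpu_compressor_store_batch_tensor, but its host-side encode wrapper is not part of this excerpt. Below is a sketch of how such a wrapper could be shaped, mirroring the existing ds4_gpu_encode_cpy_*_1d helpers; everything here except the kernel's behaviour and the standard Metal API is an assumption (pipeline lookup, args struct layout, buffer indices).

    #import <Metal/Metal.h>

    /* Hypothetical encode helper for the Q8_0 -> F32 copy. `pso` is assumed to be
     * the cached pipeline state for "kernel_cpy_q8_0_f32"; the args struct below
     * stands in for ds4_metal_args_cpy, of which this kernel only needs ne00. */
    static int ds4_gpu_encode_cpy_q8_0_f32_1d_sketch(id<MTLCommandBuffer> cb,
                                                     id<MTLComputePipelineState> pso,
                                                     id<MTLBuffer> src, NSUInteger src_off,
                                                     id<MTLBuffer> dst, NSUInteger dst_off,
                                                     uint32_t n_elems) {
        if (n_elems == 0u || (n_elems % 32u) != 0u) return 0;   /* Q8_0 needs whole blocks */
        id<MTLComputeCommandEncoder> enc = [cb computeCommandEncoder];
        if (!enc) return 0;
        struct { uint32_t ne00; } args = { n_elems };
        [enc setComputePipelineState:pso];
        [enc setBytes:&args length:sizeof(args) atIndex:0];
        [enc setBuffer:src offset:src_off atIndex:1];
        [enc setBuffer:dst offset:dst_off atIndex:2];
        NSUInteger tptg = pso.maxTotalThreadsPerThreadgroup;
        if (tptg > 256) tptg = 256;
        /* One thread per output element; the kernel bounds-checks gid >= ne00. */
        const NSUInteger ntg = ((NSUInteger)n_elems + tptg - 1) / tptg;
        [enc dispatchThreadgroups:MTLSizeMake(ntg, 1, 1)
            threadsPerThreadgroup:MTLSizeMake(tptg, 1, 1)];
        [enc endEncoding];
        return 1;
    }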
diff --git a/metal/dsv4_kv.metal b/metal/dsv4_kv.metal
index 89bd7d3a..80d5846c 100644
--- a/metal/dsv4_kv.metal
+++ b/metal/dsv4_kv.metal
@@ -218,6 +218,15 @@ kernel void kernel_dsv4_compressor_store_one(
         float ape_v;
         if (args.ape_type == 1u) {
             ape_v = (float)(((device const half *)ape)[ape_i]);
+        } else if (args.ape_type == 8u) {
+            /* Q8_0: 32 elements per 34-byte block (uint16_t scale + 32 int8 quants). */
+            const uint blk = ape_i / 32u;
+            const uint idx = ape_i - blk * 32u;
+            device const uchar *bp = (device const uchar *)ape + (uint64_t)blk * 34u;
+            const ushort sb = (ushort)bp[0] | ((ushort)bp[1] << 8);
+            const half d = as_type<half>(sb);
+            const int q = (int)(int8_t)bp[2u + idx];
+            ape_v = (float)q * (float)d;
         } else {
             ape_v = ((device const float *)ape)[ape_i];
         }
diff --git a/metal/get_rows.metal b/metal/get_rows.metal
index 31d7aab6..34ef0098 100644
--- a/metal/get_rows.metal
+++ b/metal/get_rows.metal
@@ -52,3 +52,37 @@ typedef decltype(kernel_get_rows_f) get_rows_f_t;
 template [[host_name("kernel_get_rows_f32")]] kernel get_rows_f_t kernel_get_rows_f;
 template [[host_name("kernel_get_rows_f16")]] kernel get_rows_f_t kernel_get_rows_f;
 template [[host_name("kernel_get_rows_i32")]] kernel get_rows_f_t kernel_get_rows_f;
+
+// Q8_0 token-embedding gather: dequantize blocks while reading the row.
+// Source row is `(ne00 / QK8_0)` `block_q8_0` records ([half scale][int8 qs[32]]);
+// destination is `ne00` floats. Layout dispatch matches kernel_get_rows_f.
+kernel void kernel_get_rows_q8_0(
+        constant ds4_metal_args_get_rows & args,
+        device const char * src0,
+        device const char * src1,
+        device char * dst,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        ushort tiitg[[thread_index_in_threadgroup]],
+        ushort3 ntg [[threads_per_threadgroup]]) {
+    const int32_t iw0 = tgpig.x/args.ne10;
+    const int32_t i10 = tgpig.x%args.ne10;
+    const int32_t i11 = tgpig.y;
+    const int32_t i12 = tgpig.z;
+
+    const int32_t r = ((const device int32_t *) (src1 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10))[0];
+
+    const int32_t i02 = i11;
+    const int32_t i03 = i12;
+
+    auto psrc = (device const block_q8_0 *) (src0 + i03*args.nb03 + i02*args.nb02 + r*args.nb01);
+    auto pdst = ( device float *) (dst + i12*args.nb3 + i11*args.nb2 + i10*args.nb1);
+
+    for (int ind = iw0*ntg.x + tiitg; ind < args.ne00t;) {
+        const int blk = ind / QK8_0;
+        const int idx = ind - blk * QK8_0;
+        const float d = (float) psrc[blk].d;
+        pdst[ind] = (float) psrc[blk].qs[idx] * d;
+
+        break;
+    }
+}
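All three Metal additions (kernel_cpy_q8_0_f32, the ape_type == 8u branch in kernel_dsv4_compressor_store_one, and kernel_get_rows_q8_0) assume the same packed layout: 34-byte blocks of a little-endian half scale followed by 32 int8 quants. A small self-contained check of that layout, handy when validating a converter's output against these kernels on a little-endian host; the block contents and the cut-down half converter are made up for the example.

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Normal/zero halves only; enough for a typical Q8_0 scale and purely a
     * stand-in for ds4's real f16 helper. */
    static float half_bits_to_float(uint16_t h) {
        const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
        const uint32_t exp  = (h >> 10) & 0x1fu;
        const uint32_t man  = h & 0x3ffu;
        uint32_t bits = sign;
        if (exp != 0u && exp != 0x1fu) bits |= ((exp + 112u) << 23) | (man << 13);
        float f;
        memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main(void) {
        /* One synthetic block: scale 0.5 (0x3800 as IEEE half), quants -16..15. */
        uint8_t blk[34];
        const uint16_t half_scale = 0x3800u;
        memcpy(blk, &half_scale, 2);            /* little-endian scale comes first */
        for (int i = 0; i < 32; i++) blk[2 + i] = (uint8_t)(int8_t)(i - 16);

        for (int i = 0; i < 32; i++) {
            uint16_t sb;
            memcpy(&sb, blk, 2);                /* the same two bytes the kernels read */
            const float d = half_bits_to_float(sb);
            const float v = (float)(int8_t)blk[2 + i] * d;
            assert(v == (float)(i - 16) * 0.5f);
        }
        return 0;
    }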