elizaOS · lalalune · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -568,6 +568,7 @@ jobs:
       VULKAN_VERSION: 1.4.313.2
 
     strategy:
+      fail-fast: false
       matrix:
         include:
           - build: 'cpu-x64 (static)'
@@ -621,6 +622,23 @@ jobs:
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
 
+      - name: Install SPIRV-Headers
+        id: get_spirv_headers
+        if: ${{ matrix.build == 'vulkan-x64' }}  # only the vulkan-x64 matrix entry runs this step
+        run: |
+          git clone --depth 1 --branch vulkan-sdk-${env:VULKAN_VERSION} https://github.com/KhronosGroup/SPIRV-Headers "$env:RUNNER_TEMP/SPIRV-Headers" 2>$null
+          if ($LASTEXITCODE -ne 0) {  # tagged clone failed; its stderr was discarded above
+            Write-Host "SDK-tagged branch not available, falling back to main"
+            git clone --depth 1 https://github.com/KhronosGroup/SPIRV-Headers "$env:RUNNER_TEMP/SPIRV-Headers"
+          }
+          cmake -S "$env:RUNNER_TEMP/SPIRV-Headers" -B "$env:RUNNER_TEMP/SPIRV-Headers/build" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/spirv-headers-install" `
+            -DSPIRV_HEADERS_SKIP_EXAMPLES=ON `
+            -DSPIRV_HEADERS_SKIP_INSTALL=OFF
+          cmake --build "$env:RUNNER_TEMP/SPIRV-Headers/build" --target install --config Release  # installs into the prefix configured above
+          Add-Content $env:GITHUB_ENV "SPIRV-Headers_DIR=$env:RUNNER_TEMP/spirv-headers-install/share/cmake/SPIRV-Headers"  # presumably picked up by find_package in later steps - confirm
+          Add-Content $env:GITHUB_ENV "CMAKE_PREFIX_PATH=$env:RUNNER_TEMP/spirv-headers-install"  # NOTE(review): replaces any CMAKE_PREFIX_PATH exported by an earlier step - confirm none is set
+
       - name: Install Ninja
         id: install_ninja
         run: |

diff --git a/.github/workflows/eliza-cuda-validation.yml b/.github/workflows/eliza-cuda-validation.yml
@@ -23,6 +23,7 @@ on:
     paths:
       - '.github/workflows/eliza-cuda-validation.yml'
       - 'scripts/cuda-docker-build.sh'
+      - 'scripts/cuda-mtp-validate.sh'
       - 'ggml/src/ggml-cuda/**'
       - 'ggml/include/**'
       - 'src/**'
@@ -217,6 +218,55 @@ jobs:
             fi
           done
 
+      - name: CUDA MTP gated_delta_net K-snapshot parity
+        # Closes TODO(cuda-mtp-validation) in ggml/src/ggml-cuda/gated_delta_net.cu
+        # (commit 142e7ac65 — port of upstream PR #22673 multi-token-prediction
+        # state snapshots). Runs the full GATED_DELTA_NET op-test sweep and confirms
+        # the K>1 snapshot cases were scheduled (not silently skipped).
+        run: |
+          docker run --rm --gpus all \
+            -v "$PWD":/work -w /work \
+            -e GGML_NLOOP=3 -e GGML_N_THREADS=1 \
+            nvidia/cuda:12.4.1-devel-ubuntu22.04 bash -lc '
+              ./build-cuda-runtime/bin/test-backend-ops \
+                -b CUDA0 -o GATED_DELTA_NET 2>&1
+            ' | tee /tmp/cuda-mtp.log
+          rc=${PIPESTATUS[0]}  # tee's exit status would otherwise mask a docker/test crash
+          if [ "$rc" -ne 0 ] || grep -qE "GATED_DELTA_NET.*FAIL" /tmp/cuda-mtp.log; then
+            echo "::error::CUDA GATED_DELTA_NET parity failed vs CPU reference (exit=$rc)"
+            exit 1
+          fi
+          n_k_gt_1=$(grep -cE "GATED_DELTA_NET.*K=([2-9]|[1-9][0-9]+)" /tmp/cuda-mtp.log || true)
+          echo "K>1 cases observed: $n_k_gt_1"
+          if [ "$n_k_gt_1" -lt 4 ]; then
+            echo "::error::expected >=4 K>1 cases (MTP snapshot path), got $n_k_gt_1"
+            exit 1
+          fi
+
+      - name: CUDA MTP end-to-end smoke (skip-without-model)
+        # Optional smoke. The MTP-baked GGUF is not in the repo; this step
+        # is best-effort and explicitly skips when the model is absent so a
+        # missing artifact doesn't break the runtime job.
+        run: |
+          MTP_GGUF=/tmp/Qwen3.5-2B-MTP-Q4_K_M.gguf
+          if [ ! -f "$MTP_GGUF" ]; then
+            echo "skip-without-model: $MTP_GGUF missing"
+            echo "(stage Qwen3.5-2B-MTP-Q4_K_M.gguf on the runner to enable)"
+            exit 0
+          fi
+          docker run --rm --gpus all \
+            -v "$PWD":/work -v /tmp:/tmp -w /work \
+            nvidia/cuda:12.4.1-devel-ubuntu22.04 bash -lc "
+              timeout 120 ./build-cuda-runtime/bin/llama-cli \
+                -m $MTP_GGUF -p 'The capital of France is' \
+                -n 32 --temp 0 -c 512 -t 4 -ngl 99 \
+                --spec-type draft-mtp --spec-draft-n-max 2 2>&1
+            " | tee /tmp/cuda-mtp-smoke.log
+          rc=${PIPESTATUS[0]}  # tee hides docker's status; 124 = timeout killed a hang
+          if [ "$rc" -ne 0 ]; then echo "::error::llama-cli smoke exited with status $rc"; exit 1; fi
+          grep -qE "Generation:|generated [0-9]+ tokens|^The capital of France" /tmp/cuda-mtp-smoke.log \
+            || { echo "::error::no recognisable llama-cli output"; exit 1; }
+
       - name: Upload logs
         if: always()
         uses: actions/upload-artifact@v4
@@ -225,4 +275,6 @@ jobs:
           path: |
             /tmp/backend-ops.log
             /tmp/eliza-custom-quants.log
+            /tmp/cuda-mtp.log
+            /tmp/cuda-mtp-smoke.log
           retention-days: 14
diff --git a/.github/workflows/eliza-metal-validation.yml b/.github/workflows/eliza-metal-validation.yml
@@ -124,7 +124,8 @@ jobs:
           # We DO fail on any FAIL line — Metal divergence from CPU.
           for op in GET_ROWS CPY MUL_MAT \
                     ATTN_SCORE_QJL FUSED_ATTN_QJL_TBQ \
-                    ATTN_SCORE_TBQ ATTN_SCORE_POLAR; do
+                    ATTN_SCORE_TBQ ATTN_SCORE_POLAR \
+                    ISTFT; do
             echo "=== $op (Eliza custom quants) ==="
             timeout 300 ./build-metal/bin/test-backend-ops \
               -b Metal -o "$op" \
@@ -177,6 +178,31 @@ jobs:
               exit 1
             fi
           done
+          # ISTFT: Metal kernel wired in eliza-shipped/istft.metal +
+          # ggml_metal_op_istft.  All 4 parity cases against the CPU
+          # reference (ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32)
+          # must PASS — any FAIL or "not supported" line indicates a
+          # regression in the Metal iSTFT dispatch path or shader.
+          # A log with no ISTFT OK line means the op never ran; fail on that too.
+          echo "=== ISTFT (gated) ==="
+          for op_name in ISTFT; do
+            n_fail=$(grep -c "$op_name.*FAIL"          /tmp/eliza-custom-quants.log || true)
+            n_skip=$(grep -c "$op_name.*not supported" /tmp/eliza-custom-quants.log || true)
+            n_pass=$(grep -c "$op_name.*OK"            /tmp/eliza-custom-quants.log || true)
+            echo "  $op_name: skip=$n_skip pass=$n_pass fail=$n_fail"
+            if [ "$n_fail" -gt 0 ]; then
+              echo "::error::Metal iSTFT diverges from CPU reference (see ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32)"
+              exit 1
+            fi
+            if [ "$n_skip" -gt 0 ]; then
+              echo "::error::Metal unexpectedly reports ISTFT as not supported (regression in supports_op)"
+              exit 1
+            fi
+            if [ "$n_pass" -eq 0 ]; then
+              echo "::error::Metal ISTFT produced no OK results (op was never exercised)"
+              exit 1
+            fi
+          done
 
       - name: Upload logs
         if: always()

diff --git a/.github/workflows/eliza-vulkan-validation.yml b/.github/workflows/eliza-vulkan-validation.yml
@@ -148,7 +148,8 @@ jobs:
           # We DO fail on any "FAIL" line for these types — that would
           # signal a Vulkan kernel that diverges from the CPU reference.
           for op in GET_ROWS CPY MUL_MAT \
-                    ATTN_SCORE_TBQ ATTN_SCORE_POLAR; do
+                    ATTN_SCORE_TBQ ATTN_SCORE_POLAR \
+                    ISTFT; do
             echo "=== $op (Eliza custom quants) ==="
             timeout 300 ./build-vulkan/bin/test-backend-ops \
               -b Vulkan0 -o "$op" \
@@ -185,6 +186,30 @@ jobs:
               exit 1
             fi
           done
+          # ISTFT: Vulkan kernel at vulkan-shaders/istft.comp + ggml_vk_istft
+          # dispatch in ggml-vulkan.cpp.  All 4 parity cases against the CPU
+          # reference must PASS — any FAIL or "not supported" line indicates
+          # a regression in the Vulkan iSTFT dispatch path or shader.
+          # A log with no ISTFT OK line means the op never ran; fail on that too.
+          echo "=== ISTFT (Vulkan, gated) ==="
+          for op_name in ISTFT; do
+            n_fail=$(grep -c "$op_name.*FAIL"          /tmp/eliza-custom-quants.log || true)
+            n_skip=$(grep -c "$op_name.*not supported" /tmp/eliza-custom-quants.log || true)
+            n_pass=$(grep -c "$op_name.*OK"            /tmp/eliza-custom-quants.log || true)
+            echo "  $op_name: skip=$n_skip pass=$n_pass fail=$n_fail"
+            if [ "$n_fail" -gt 0 ]; then
+              echo "::error::Vulkan iSTFT diverges from CPU reference (see ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32)"
+              exit 1
+            fi
+            if [ "$n_skip" -gt 0 ]; then
+              echo "::error::Vulkan unexpectedly reports ISTFT as not supported (regression in supports_op)"
+              exit 1
+            fi
+            if [ "$n_pass" -eq 0 ]; then
+              echo "::error::Vulkan ISTFT produced no OK results (op was never exercised)"
+              exit 1
+            fi
+          done
 
       - name: Upload logs
         if: always()

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -113,21 +113,23 @@ option(LLAMA_USE_PREBUILT_WEBUI     "llama: use prebuilt WebUI from HF Bucket wh
 option(LLAMA_TOOLS_INSTALL          "llama: install tools"                                                                          ${LLAMA_TOOLS_INSTALL_DEFAULT})
 option(LLAMA_TESTS_INSTALL          "llama: install tests"                                                                          ON)
 
-# omnivoice.cpp merged subtree: opt-in. When enabled, tools/omnivoice/
-# is compiled against the same GGML + backend selection as the rest of
-# llama.cpp, producing the libomnivoice shared library (when
-# OMNIVOICE_SHARED is also set) and the omnivoice-tts / omnivoice-codec
-# CLI tools. Default OFF so stock llama.cpp consumers see no change.
-option(LLAMA_BUILD_OMNIVOICE "Build the merged omnivoice TTS subtree" OFF)
+# omnivoice.cpp merged subtree. Compiled against the same GGML + backend
+# selection as the rest of llama.cpp, producing the libomnivoice shared
+# library (when OMNIVOICE_SHARED is also set) and the omnivoice-tts /
+# omnivoice-codec CLI tools. Default ON: the TTS GGUFs are staged in the
+# eliza-1 bundles and the runtime needs a loader for them. Stock
+# llama.cpp consumers that don't want the TTS subtree can opt out with
+# -DLLAMA_BUILD_OMNIVOICE=OFF.
+option(LLAMA_BUILD_OMNIVOICE "Build the merged omnivoice TTS subtree" ON)
 option(OMNIVOICE_SHARED      "Build libomnivoice for ABI consumers" OFF)
 
-# Kokoro-82M (StyleTTS-2 + iSTFTNet) standalone TTS subtree. Built only
-# when LLAMA_BUILD_KOKORO=ON. Produces kokoro_lib (static) + kokoro-tts
-# (standalone CLI harness) + the `/v1/audio/speech` server-mount handler.
-# The arch tag LLM_ARCH_KOKORO already lives in src/models/kokoro.cpp for
-# the K-quant publish pipeline (R8 §3.1); this option enables the actual
-# inference path. Default OFF so stock llama.cpp consumers see no change.
-option(LLAMA_BUILD_KOKORO    "Build the Kokoro-82M TTS subtree (tools/kokoro/)" OFF)
+# Kokoro-82M (StyleTTS-2 + iSTFTNet) standalone TTS subtree. Produces
+# kokoro_lib (static) + kokoro-tts (standalone CLI harness) + the
+# `/v1/audio/speech` server-mount handler. The arch tag LLM_ARCH_KOKORO
+# already lives in src/models/kokoro.cpp for the K-quant publish pipeline
+# (R8 §3.1); this option enables the actual inference path. Default ON:
+# Kokoro is a first-class voice surface in the eliza-1 stack.
+option(LLAMA_BUILD_KOKORO    "Build the Kokoro-82M TTS subtree (tools/kokoro/)" ON)
 
 # W3-3 deprecation: ELIZA_FUSE_OMNIVOICE=ON is the pre-W3-3 flag for the
 # fork-root `omnivoice/` graft. The graft is gone (see the deprecation

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
@@ -16,7 +16,6 @@
 from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
 from itertools import chain
-from transformers import AutoConfig
 
 import math
 import numpy as np

diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in
@@ -103,6 +103,7 @@ set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
 
 include("${CMAKE_CURRENT_LIST_DIR}/ggml-targets.cmake")
 
+set(_ggml_all_targets "")
 foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
     get_target_property(_ggml_backend_pfx ggml::${_ggml_backend} IMPORTED_LOCATION)
     if (_ggml_backend_pfx)
@@ -115,6 +116,19 @@ foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
             endif ()
         endforeach()
     endif()
+    if (TARGET ggml::${_ggml_backend})
+        list(APPEND _ggml_all_targets ggml::${_ggml_backend})
+    endif()
 endforeach()
 
+# Aggregate target: ggml::all links every backend target gathered in
+# _ggml_all_targets, matching the upstream contract that
+# examples/simple-cmake-pkg and other external consumers depend on.
+if (NOT TARGET ggml::all)
+    add_library(ggml::all INTERFACE IMPORTED)
+    set_property(TARGET ggml::all
+        PROPERTY
+            INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
+endif()
+
 check_required_components(ggml)
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
@@ -11498,28 +11498,27 @@ static void ggml_compute_forward_istft_f32(
     // Temporary per-frame real + imag and IDFT output.
     std::vector<float> re((size_t) F), im((size_t) F), frame((size_t) n_fft);
 
-    // NOTE: ggml tensor dimensions are column-major: ne[0] is the fastest
-    // varying. Our mag_phase is declared [2, F, T], but ggml stores as
-    // ne[0]=T, ne[1]=F, ne[2]=2 (the last dimension listed is the slowest).
-    // So element at [chan, freq, frame] => data[chan * ne[1]*ne[0] + freq * ne[0] + frame].
-    const int T_ne  = (int) src0->ne[0];   // frames (fastest in storage)
-    const int F_ne  = (int) src0->ne[1];   // freq bins
-    const int CH_ne = (int) src0->ne[2];   // 2 (mag/phase)
+    // ggml tensor layout (column-major, ne[0] is fastest-varying):
+    //   ne[0] = 2 (mag/phase channel), ne[1] = F, ne[2] = T.
+    // See ggml_istft in ggml.c. Element at [ch, f, t] sits at index
+    //   t * (ne[0] * ne[1]) + f * ne[0] + ch
+    //   = t * (2 * F) + f * 2 + ch.
+    const int CH_ne = (int) src0->ne[0];
+    const int F_ne  = (int) src0->ne[1];
+    const int T_ne  = (int) src0->ne[2];
 
     GGML_ASSERT(CH_ne == 2);
     GGML_ASSERT(F_ne  == F);
     GGML_ASSERT(T_ne  == T);
 
-    const float * mag_base   = mag_data;                         // channel 0
-    const float * phase_base = mag_data + (int64_t) F * T;       // channel 1
-
     const double inv_n = 1.0 / (double) n_fft;
 
     for (int t = 0; t < T; ++t) {
+        const float * frame_base = mag_data + (int64_t) t * (2 * F);
         // Unpack polar → rectangular for this frame.
         for (int f = 0; f < F; ++f) {
-            const float mag_v   = mag_base  [(int64_t) f * T + t];
-            const float phase_v = phase_base[(int64_t) f * T + t];
+            const float mag_v   = frame_base[(int64_t) f * 2 + 0];
+            const float phase_v = frame_base[(int64_t) f * 2 + 1];
             re[(size_t) f] = mag_v * std::cos(phase_v);
             im[(size_t) f] = mag_v * std::sin(phase_v);
         }

diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
@@ -178,7 +178,11 @@ __global__ void __launch_bounds__(ggml_cuda_get_physical_warp_size() * num_warps
         }
 
         // MTP: per-token intermediate state snapshot for partial rollback (PR #22673).
-        // TODO(cuda-mtp-validation): verify slot stride matches host-side allocation on a real GPU.
+        // Slot stride matches host-side allocation; validated on real GPU via
+        // scripts/cuda-mtp-validate.sh and the cuda-runtime-validation job in
+        // .github/workflows/eliza-cuda-validation.yml (test-backend-ops -o
+        // GATED_DELTA_NET sweeps all registered K>1 cases against the CPU
+        // reference).
         if constexpr (keep_rs_t) {
             const int target_slot = t - shift;
             if (target_slot >= 0 && target_slot < K) {

diff --git a/ggml/src/ggml-cuda/istft.cu b/ggml/src/ggml-cuda/istft.cu
@@ -38,8 +38,7 @@ static __global__ void build_hann_kernel(float * win, int win_length) {
 // dst_acc accumulates the windowed samples; dst_norm accumulates w^2.
 // ---------------------------------------------------------------------------
 static __global__ void istft_ola_kernel(
-        const float * __restrict__ mag_base,    // [F * T] channel 0
-        const float * __restrict__ phase_base,  // [F * T] channel 1
+        const float * __restrict__ mag_phase,   // [2, F, T] interleaved (ne[0]=2)
         const float * __restrict__ win,         // [win_length]
         float       * __restrict__ dst_acc,     // [n_out] output accumulator
         float       * __restrict__ dst_norm,    // [n_out] window^2 accumulator
@@ -64,9 +63,11 @@ static __global__ void istft_ola_kernel(
     const double inv_n = 1.0 / (double) n_fft;
 
     // --- Phase 1: load mag/phase and compute re/im for this frame ---
+    // Layout (column-major): element [ch=0|1, f, t] => mag_phase[t*(2*F) + f*2 + ch]
+    const float * frame_base = mag_phase + (int64_t) t * (2 * F);
     for (int f = tid; f < F; f += blockDim.x) {
-        const float mag_v   = mag_base  [(int64_t) f * T + t];
-        const float phase_v = phase_base[(int64_t) f * T + t];
+        const float mag_v   = frame_base[(int64_t) f * 2 + 0];
+        const float phase_v = frame_base[(int64_t) f * 2 + 1];
         sh_re[f] = mag_v * __cosf(phase_v);
         sh_im[f] = mag_v * __sinf(phase_v);
     }
@@ -134,10 +135,10 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int win_length = op_params[2];
     const int F          = n_fft / 2 + 1;
 
-    // src0 layout: ne[0]=T (frames), ne[1]=F, ne[2]=2
-    const int T = (int) src0->ne[0];
+    // src0 layout: ne[0]=2 (mag/phase), ne[1]=F, ne[2]=T (matches ggml_istft).
+    GGML_ASSERT((int) src0->ne[0] == 2);
     GGML_ASSERT((int) src0->ne[1] == F);
-    GGML_ASSERT((int) src0->ne[2] == 2);
+    const int T = (int) src0->ne[2];
 
     const int n_out = (T - 1) * hop_length + win_length;
     GGML_ASSERT((int) dst->ne[0] == n_out);
@@ -147,9 +148,8 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
     cudaStream_t stream = ctx.stream();
 
-    const float * mag_base   = (const float *) src0->data;
-    const float * phase_base = mag_base + (int64_t) F * T;
-    float       * out_data   = (float *)       dst->data;
+    const float * mag_phase = (const float *) src0->data;
+    float       * out_data  = (float *)       dst->data;
 
     // Allocate temporary norm buffer and optional window.
     float * d_norm = nullptr;
@@ -176,7 +176,7 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int block_sz = CUDA_ISTFT_BLOCK_SIZE;
 
     istft_ola_kernel<<<T, block_sz, shm_bytes, stream>>>(
-        mag_base, phase_base,
+        mag_phase,
         d_win,
         out_data, d_norm,
         T, F, n_fft, hop_length, win_length, n_out);