elizaOS · lalalune · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/.github/workflows/eliza-cuda-validation.yml b/.github/workflows/eliza-cuda-validation.yml
@@ -18,6 +18,11 @@ name: Eliza CUDA Validation
 
 on:
   workflow_dispatch:
+    inputs:
+      force_cuda_runtime:
+        description: 'Force runtime validation on a self-hosted-cuda runner. Leave unchecked on push/PR — the runtime job is skipped automatically because no GPU runner is registered yet.'
+        type: boolean
+        default: false
   pull_request:
     types: [opened, synchronize, reopened]
     paths:
@@ -135,12 +140,18 @@ jobs:
 
   cuda-runtime-validation:
     # Gated: requires a self-hosted runner with a real NVIDIA GPU and
-    # the `self-hosted-cuda` label. The job is skipped automatically
-    # by GitHub Actions if no such runner is online.
+    # the `self-hosted-cuda` label. No such runner is registered in the
+    # elizaOS org yet, so this job is gated behind the
+    # `force_cuda_runtime` workflow_dispatch input — running it on
+    # every push/PR would queue forever and never reach a runner.
+    #
+    # To run it: register a runner with label `self-hosted-cuda`, then
+    #   gh workflow run eliza-cuda-validation.yml -f force_cuda_runtime=true
+    # See docs/eliza-cuda-runtime-validation.md for runner setup.
     needs: ubuntu-cuda-build
     runs-on: [self-hosted, self-hosted-cuda]
     timeout-minutes: 120
-    if: github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request'
+    if: ${{ github.event_name == 'workflow_dispatch' && inputs.force_cuda_runtime }}
 
     steps:
       - name: Clone

diff --git a/docs/eliza-cuda-runtime-validation.md b/docs/eliza-cuda-runtime-validation.md
@@ -0,0 +1,85 @@
+# Eliza CUDA Runtime Validation
+
+`.github/workflows/eliza-cuda-validation.yml` has two jobs:
+
+1. **`ubuntu-cuda-build`** — compile-only. Runs on every push and PR on
+   a GitHub-hosted `ubuntu-24.04` runner, inside the
+   `nvidia/cuda:12.4.1-devel-ubuntu22.04` Docker container. No GPU needed.
+   Catches compile/link regressions for the Eliza custom CUDA kernels
+   (`turbo-tcq`, `polarquant`, `qjl`, `fused-attn-qjl-tbq`, and the
+   `fattn-vec-instance-tbq*` template instances).
+
+2. **`cuda-runtime-validation`** — runs `test-backend-ops` and the
+   CUDA MTP `gated_delta_net` K-snapshot parity sweep on a real NVIDIA
+   GPU. Requires a self-hosted runner with the `self-hosted-cuda`
+   label. **Gated behind a workflow_dispatch input** so it does not
+   queue indefinitely while no runner is registered.
+
+## Triggering the runtime job
+
+```
+gh workflow run eliza-cuda-validation.yml \
+  -R elizaOS/llama.cpp \
+  --ref <branch> \
+  -f force_cuda_runtime=true
+```
+
+The job is skipped automatically on push and PR events, and on
+`workflow_dispatch` runs where `force_cuda_runtime` is left at its
+default `false`. It executes only when explicitly forced.
+
+## Runner setup
+
+### Hardware
+
+- NVIDIA GPU with **≥ 24 GB VRAM** (enough for the Qwen3.5-2B-MTP-Q4_K_M
+  GGUF smoke test plus the `test-backend-ops` working sets). RTX 3090 /
+  4090 / A6000 / A100 / H100 all qualify.
+- NVIDIA driver ≥ 550.x (CUDA 12.4 runtime).
+- Linux x86_64 (Ubuntu 22.04 or 24.04 preferred — matches the Docker
+  base image used by the job).
+- Docker ≥ 24 with `nvidia-container-toolkit` installed and the
+  `nvidia` Docker runtime registered (`docker info | grep -i runtime`
+  should list it).
+
+### Registering with GitHub
+
+1. Repo Settings → Actions → Runners → **New self-hosted runner** →
+   Linux / x64. Follow the install + token instructions.
+2. When prompted for labels, include both `self-hosted` (default) and
+   `self-hosted-cuda`. The workflow targets the pair `[self-hosted,
+   self-hosted-cuda]`.
+3. Install as a service:
+   ```
+   sudo ./svc.sh install
+   sudo ./svc.sh start
+   ```
+4. Optionally stage the MTP smoke GGUF at
+   `/tmp/Qwen3.5-2B-MTP-Q4_K_M.gguf` to enable the
+   `CUDA MTP end-to-end smoke` step (otherwise it self-skips).
+
+### Cloud-runner options
+
+If no in-house GPU is available, spin up an on-demand runner on
+one of these providers:
+
+| Provider     | Spec                       | ~Cost (USD/hr) |
+| ------------ | -------------------------- | -------------- |
+| Lambda Cloud | RTX 6000 Ada (48 GB)       | ~0.80          |
+| RunPod       | RTX 4090 (24 GB) spot      | ~0.30–0.50     |
+| vast.ai      | RTX 4090 (24 GB) interrupt | ~0.20–0.40     |
+| Paperspace   | A6000 (48 GB)              | ~0.76          |
+
+Bring up an instance with CUDA 12.4 + Docker, register it as a runner
+with the two labels above, run the workflow, and tear the instance
+down. The runtime job typically completes in 20–40 minutes; cost per
+validation run is well under USD 1.
+
+## Why this is gated rather than `if: false`
+
+The `if:` expression skips the job cleanly (the run shows it as
+"Skipped" rather than "Cancelled" or "Queued"). Once a runner is online,
+the gate opens just by passing `-f force_cuda_runtime=true` to
+`gh workflow run` — no workflow edit required. When runtime validation
+becomes routine enough to run on every PR, drop the input gate and switch
+the job's `if:` condition back to also run on `pull_request` events.
diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp
@@ -347,7 +347,13 @@ static FILE * get_handcrafted_file(const unsigned int seed, const enum handcraft
             name += "_" + std::to_string(i);
         }
         if (hft == HANDCRAFTED_TENSORS_BAD_NAME_SIZE) {
-            name += "_with_a_very_long_name_which_is_longer_than_what_is_allowed_for_ggml_tensors";
+            // Pad to at least GGML_MAX_NAME characters (default 64, can be raised
+            // at build time — omnivoice bumps it to 128). Keep appending until the
+            // length reaches that bound (the assert below checks >=) so the "bad name
+            // size" branch exercises the oversize-name failure path under any GGML_MAX_NAME.
+            while (name.length() < (size_t) GGML_MAX_NAME) {
+                name += "_with_a_very_long_name_which_is_longer_than_what_is_allowed_for_ggml_tensors";
+            }
             GGML_ASSERT(name.length() >= GGML_MAX_NAME);
         }
         {

diff --git a/tools/kokoro/tools/kokoro-tts.cpp b/tools/kokoro/tools/kokoro-tts.cpp
@@ -3,11 +3,11 @@
 // kokoro-tts.cpp — standalone CLI harness for the Kokoro fork inference path.
 //
 // Usage:
-//     kokoro-tts \
-//         --model    <path-to-kokoro-v1.0.gguf> \
-//         --voice    <path-to-voices/af_sam.bin> \
-//         --text     "Hello world." \
-//         --output   <out.wav> \
+//     kokoro-tts
+//         --model    <path-to-kokoro-v1.0.gguf>
+//         --voice    <path-to-voices/af_sam.bin>
+//         --text     "Hello world."
+//         --output   <out.wav>
 //         [--speed 1.0]
 //
 // Exits 0 on a non-blank WAV being written; non-zero on any failure. Used

diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt
@@ -244,25 +244,24 @@ target_link_libraries(omnivoice-test-abi-c PRIVATE omnivoice_lib)
 # dac_conv_t1d's ggml_conv_transpose_1d migration (elizaOS/eliza#7660).
 # The default run uses deterministic synthetic DAC block shapes; pass
 # `--gguf <omnivoice-tokenizer.gguf>` to exercise real conv_t1 weights.
-add_executable(omnivoice-dac-parity tests/dac-parity.cpp)
-target_compile_features(omnivoice-dac-parity PRIVATE cxx_std_17)
-target_include_directories(omnivoice-dac-parity PRIVATE
-    ${CMAKE_SOURCE_DIR}/ggml/include
-    ${CMAKE_SOURCE_DIR}/ggml/src)
-# Link ggml-cpu / ggml-base only when those CMake targets actually exist.
-# On some cross-compile configurations (Android NDK in particular) the
-# ggml-cpu target may not be defined, and naming it unconditionally in
-# target_link_libraries makes CMake fall back to a literal `-lggml-cpu`
-# link flag against a phantom library. Guard the link the same way
-# tools/kokoro/CMakeLists.txt does for kokoro_lib.
-target_link_libraries(omnivoice-dac-parity PRIVATE ggml)
-if(TARGET ggml-base)
-    target_link_libraries(omnivoice-dac-parity PRIVATE ggml-base)
-endif()
+#
+# The parity harness calls ggml_backend_cpu_init / ggml_backend_cpu_set_n_threads
+# directly, so it can only be built when the ggml-cpu CMake target is present.
+# On some cross-compile configurations (Android NDK in particular) the ggml-cpu
+# target is not defined, and the executable would fail to link. Gate the whole
+# target on ggml-cpu availability so those builds skip it cleanly.
 if(TARGET ggml-cpu)
-    target_link_libraries(omnivoice-dac-parity PRIVATE ggml-cpu)
-endif()
+    add_executable(omnivoice-dac-parity tests/dac-parity.cpp)
+    target_compile_features(omnivoice-dac-parity PRIVATE cxx_std_17)
+    target_include_directories(omnivoice-dac-parity PRIVATE
+        ${CMAKE_SOURCE_DIR}/ggml/include
+        ${CMAKE_SOURCE_DIR}/ggml/src)
+    target_link_libraries(omnivoice-dac-parity PRIVATE ggml ggml-cpu)
+    if(TARGET ggml-base)
+        target_link_libraries(omnivoice-dac-parity PRIVATE ggml-base)
+    endif()
 
-if(BUILD_TESTING)
-    add_test(NAME omnivoice-dac-parity COMMAND omnivoice-dac-parity --no-real)
+    if(BUILD_TESTING)
+        add_test(NAME omnivoice-dac-parity COMMAND omnivoice-dac-parity --no-real)
+    endif()
 endif()