Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,7 @@ jobs:
VULKAN_VERSION: 1.4.313.2

strategy:
fail-fast: false
matrix:
include:
- build: 'cpu-x64 (static)'
Expand Down Expand Up @@ -621,6 +622,23 @@ jobs:
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

- name: Install SPIRV-Headers
# Clones, configures and installs the KhronosGroup SPIRV-Headers CMake
# package into $RUNNER_TEMP, then exports its location through GITHUB_ENV.
# Only runs for the 'vulkan-x64' matrix entry.
id: get_spirv_headers
if: ${{ matrix.build == 'vulkan-x64' }}
run: |
# First attempt: shallow clone of the ref named after the pinned Vulkan SDK
# version (vulkan-sdk-<VULKAN_VERSION>). stderr is discarded so a missing
# ref does not pollute the log; success is judged by $LASTEXITCODE below.
git clone --depth 1 --branch vulkan-sdk-${env:VULKAN_VERSION} https://github.com/KhronosGroup/SPIRV-Headers "$env:RUNNER_TEMP/SPIRV-Headers" 2>$null
if ($LASTEXITCODE -ne 0) {
# Fallback: shallow clone of the repository's default branch (no --branch).
Write-Host "SDK-tagged branch not available, falling back to main"
git clone --depth 1 https://github.com/KhronosGroup/SPIRV-Headers "$env:RUNNER_TEMP/SPIRV-Headers"
}
# Configure with examples skipped and installation enabled, installing
# into $RUNNER_TEMP/spirv-headers-install.
cmake -S "$env:RUNNER_TEMP/SPIRV-Headers" -B "$env:RUNNER_TEMP/SPIRV-Headers/build" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/spirv-headers-install" `
-DSPIRV_HEADERS_SKIP_EXAMPLES=ON `
-DSPIRV_HEADERS_SKIP_INSTALL=OFF
cmake --build "$env:RUNNER_TEMP/SPIRV-Headers/build" --target install --config Release
# Export the package location for subsequent steps. NOTE(review): the second
# line sets CMAKE_PREFIX_PATH outright rather than appending to an existing
# value — confirm no earlier step in this job relies on a prior value.
Add-Content $env:GITHUB_ENV "SPIRV-Headers_DIR=$env:RUNNER_TEMP/spirv-headers-install/share/cmake/SPIRV-Headers"
Add-Content $env:GITHUB_ENV "CMAKE_PREFIX_PATH=$env:RUNNER_TEMP/spirv-headers-install"

- name: Install Ninja
id: install_ninja
run: |
Expand Down
52 changes: 52 additions & 0 deletions .github/workflows/eliza-cuda-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ on:
paths:
- '.github/workflows/eliza-cuda-validation.yml'
- 'scripts/cuda-docker-build.sh'
- 'scripts/cuda-mtp-validate.sh'
- 'ggml/src/ggml-cuda/**'
- 'ggml/include/**'
- 'src/**'
Expand Down Expand Up @@ -217,6 +218,55 @@ jobs:
fi
done

- name: CUDA MTP gated_delta_net K-snapshot parity
  # Closes TODO(cuda-mtp-validation) in
  # ggml/src/ggml-cuda/gated_delta_net.cu (commit 142e7ac65 — port of
  # upstream PR #22673 multi-token-prediction state snapshots).
  # Runs the full GATED_DELTA_NET op-test sweep and confirms the K>1
  # snapshot cases were scheduled (not silently skipped).
  #
  # The step fails when any of these holds:
  #   1. a GATED_DELTA_NET line in the log reports FAIL;
  #   2. test-backend-ops itself exits non-zero (crash/abort that never
  #      printed a FAIL line) — the pipe into `tee` would otherwise hide
  #      that status when the shell runs without `pipefail`;
  #   3. fewer than 4 log lines mention a K value greater than 1.
  run: |
    docker run --rm --gpus all \
      -v "$PWD":/work -w /work \
      -e GGML_NLOOP=3 -e GGML_N_THREADS=1 \
      nvidia/cuda:12.4.1-devel-ubuntu22.04 bash -lc '
        ./build-cuda-runtime/bin/test-backend-ops \
          -b CUDA0 -o GATED_DELTA_NET 2>&1
      ' | tee /tmp/cuda-mtp.log
    # Must be read immediately: the next command overwrites PIPESTATUS.
    # (If the shell already runs with pipefail, a failing sweep aborts the
    # step on the line above and this capture is simply never reached.)
    sweep_rc=${PIPESTATUS[0]}
    if grep -qE "GATED_DELTA_NET.*FAIL" /tmp/cuda-mtp.log; then
      echo "::error::CUDA GATED_DELTA_NET parity failed vs CPU reference"
      exit 1
    fi
    if [ "$sweep_rc" -ne 0 ]; then
      echo "::error::test-backend-ops exited with status $sweep_rc without reporting a FAIL line"
      exit 1
    fi
    # K>1 means a single digit 2-9 or any multi-digit number. The previous
    # pattern K=[2-9][0-9]* skipped every value starting with 1 (10-19, 100+).
    n_k_gt_1=$(grep -cE "GATED_DELTA_NET.*K=([2-9]|[1-9][0-9]+)" /tmp/cuda-mtp.log || true)
    echo "K>1 cases observed: $n_k_gt_1"
    if [ "$n_k_gt_1" -lt 4 ]; then
      echo "::error::expected >=4 K>1 cases (MTP snapshot path), got $n_k_gt_1"
      exit 1
    fi

- name: CUDA MTP end-to-end smoke (skip-without-model)
  # Best-effort smoke test. The MTP-baked GGUF is not checked into the
  # repo, so when it has not been staged on the runner this step reports
  # a skip and succeeds instead of failing the runtime job.
  run: |
    mtp_model=/tmp/Qwen3.5-2B-MTP-Q4_K_M.gguf
    if [ -f "$mtp_model" ]; then
      # Model is present: run a short greedy generation with MTP drafting
      # inside the CUDA container and keep the combined output as a log.
      docker run --rm --gpus all \
        -v "$PWD":/work -v /tmp:/tmp -w /work \
        nvidia/cuda:12.4.1-devel-ubuntu22.04 bash -lc "
          timeout 120 ./build-cuda-runtime/bin/llama-cli \
            -m $mtp_model \
            -p 'The capital of France is' \
            -n 32 --temp 0 -c 512 -t 4 -ngl 99 \
            --spec-type draft-mtp --spec-draft-n-max 2 2>&1
        " | tee /tmp/cuda-mtp-smoke.log
      # Any one of these markers counts as evidence that llama-cli produced output.
      if ! grep -qE "Generation:|generated [0-9]+ tokens|^The capital of France" \
          /tmp/cuda-mtp-smoke.log; then
        echo "::error::no recognisable llama-cli output"
        exit 1
      fi
    else
      echo "skip-without-model: $mtp_model missing"
      echo "(stage Qwen3.5-2B-MTP-Q4_K_M.gguf on the runner to enable)"
      exit 0
    fi

- name: Upload logs
if: always()
uses: actions/upload-artifact@v4
Expand All @@ -225,4 +275,6 @@ jobs:
path: |
/tmp/backend-ops.log
/tmp/eliza-custom-quants.log
/tmp/cuda-mtp.log
/tmp/cuda-mtp-smoke.log
retention-days: 14
23 changes: 22 additions & 1 deletion .github/workflows/eliza-metal-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ jobs:
# We DO fail on any FAIL line — Metal divergence from CPU.
for op in GET_ROWS CPY MUL_MAT \
ATTN_SCORE_QJL FUSED_ATTN_QJL_TBQ \
ATTN_SCORE_TBQ ATTN_SCORE_POLAR; do
ATTN_SCORE_TBQ ATTN_SCORE_POLAR \
ISTFT; do
echo "=== $op (Eliza custom quants) ==="
timeout 300 ./build-metal/bin/test-backend-ops \
-b Metal -o "$op" \
Expand Down Expand Up @@ -177,6 +178,26 @@ jobs:
exit 1
fi
done
# ISTFT: Metal kernel wired in eliza-shipped/istft.metal +
# ggml_metal_op_istft. All 4 parity cases against the CPU
# reference (ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32)
# must PASS — any FAIL or "not supported" line indicates a
# regression in the Metal iSTFT dispatch path or shader.
# A log with no ISTFT "OK" line at all is also rejected: it means the
# parity cases never ran, which the fail/skip counters cannot detect.
echo "=== ISTFT (gated) ==="
for op_name in ISTFT; do
  # grep -c prints 0 and exits 1 when nothing matches; `|| true` keeps the
  # count while stopping an errexit shell from aborting the step.
  n_fail=$(grep -c "$op_name.*FAIL" /tmp/eliza-custom-quants.log || true)
  n_skip=$(grep -c "$op_name.*not supported" /tmp/eliza-custom-quants.log || true)
  n_pass=$(grep -c "$op_name.*OK" /tmp/eliza-custom-quants.log || true)
  echo " $op_name: skip=$n_skip pass=$n_pass fail=$n_fail"
  if [ "$n_fail" -gt 0 ]; then
    echo "::error::Metal iSTFT diverges from CPU reference (see ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32)"
    exit 1
  fi
  if [ "$n_skip" -gt 0 ]; then
    echo "::error::Metal unexpectedly reports ISTFT as not supported (regression in supports_op)"
    exit 1
  fi
  if [ "$n_pass" -eq 0 ]; then
    echo "::error::no Metal ISTFT parity case reported OK (cases were not scheduled or produced no output)"
    exit 1
  fi
done

- name: Upload logs
if: always()
Expand Down
22 changes: 21 additions & 1 deletion .github/workflows/eliza-vulkan-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ jobs:
# We DO fail on any "FAIL" line for these types — that would
# signal a Vulkan kernel that diverges from the CPU reference.
for op in GET_ROWS CPY MUL_MAT \
ATTN_SCORE_TBQ ATTN_SCORE_POLAR; do
ATTN_SCORE_TBQ ATTN_SCORE_POLAR \
ISTFT; do
echo "=== $op (Eliza custom quants) ==="
timeout 300 ./build-vulkan/bin/test-backend-ops \
-b Vulkan0 -o "$op" \
Expand Down Expand Up @@ -185,6 +186,25 @@ jobs:
exit 1
fi
done
# ISTFT: Vulkan kernel at vulkan-shaders/istft.comp + ggml_vk_istft
# dispatch in ggml-vulkan.cpp. All 4 parity cases against the CPU
# reference must PASS — any FAIL or "not supported" line indicates
# a regression in the Vulkan iSTFT dispatch path or shader.
# A log with no ISTFT "OK" line at all is also rejected: it means the
# parity cases never ran, which the fail/skip counters cannot detect.
echo "=== ISTFT (Vulkan, gated) ==="
for op_name in ISTFT; do
  # grep -c prints 0 and exits 1 when nothing matches; `|| true` keeps the
  # count while stopping an errexit shell from aborting the step.
  n_fail=$(grep -c "$op_name.*FAIL" /tmp/eliza-custom-quants.log || true)
  n_skip=$(grep -c "$op_name.*not supported" /tmp/eliza-custom-quants.log || true)
  n_pass=$(grep -c "$op_name.*OK" /tmp/eliza-custom-quants.log || true)
  echo " $op_name: skip=$n_skip pass=$n_pass fail=$n_fail"
  if [ "$n_fail" -gt 0 ]; then
    echo "::error::Vulkan iSTFT diverges from CPU reference (see ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32)"
    exit 1
  fi
  if [ "$n_skip" -gt 0 ]; then
    echo "::error::Vulkan unexpectedly reports ISTFT as not supported (regression in supports_op)"
    exit 1
  fi
  if [ "$n_pass" -eq 0 ]; then
    echo "::error::no Vulkan ISTFT parity case reported OK (cases were not scheduled or produced no output)"
    exit 1
  fi
done

- name: Upload logs
if: always()
Expand Down
28 changes: 15 additions & 13 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,21 +113,23 @@ option(LLAMA_USE_PREBUILT_WEBUI "llama: use prebuilt WebUI from HF Bucket wh
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)

# omnivoice.cpp merged subtree: opt-in. When enabled, tools/omnivoice/
# is compiled against the same GGML + backend selection as the rest of
# llama.cpp, producing the libomnivoice shared library (when
# OMNIVOICE_SHARED is also set) and the omnivoice-tts / omnivoice-codec
# CLI tools. Default OFF so stock llama.cpp consumers see no change.
option(LLAMA_BUILD_OMNIVOICE "Build the merged omnivoice TTS subtree" OFF)
# omnivoice.cpp merged subtree. Compiled against the same GGML + backend
# selection as the rest of llama.cpp, producing the libomnivoice shared
# library (when OMNIVOICE_SHARED is also set) and the omnivoice-tts /
# omnivoice-codec CLI tools. Default ON: the TTS GGUFs are staged in the
# eliza-1 bundles and the runtime needs a loader for them. Stock
# llama.cpp consumers that don't want the TTS subtree can opt out with
# -DLLAMA_BUILD_OMNIVOICE=OFF.
option(LLAMA_BUILD_OMNIVOICE "Build the merged omnivoice TTS subtree" ON)
option(OMNIVOICE_SHARED "Build libomnivoice for ABI consumers" OFF)

# Kokoro-82M (StyleTTS-2 + iSTFTNet) standalone TTS subtree. Built only
# when LLAMA_BUILD_KOKORO=ON. Produces kokoro_lib (static) + kokoro-tts
# (standalone CLI harness) + the `/v1/audio/speech` server-mount handler.
# The arch tag LLM_ARCH_KOKORO already lives in src/models/kokoro.cpp for
# the K-quant publish pipeline (R8 §3.1); this option enables the actual
# inference path. Default OFF so stock llama.cpp consumers see no change.
option(LLAMA_BUILD_KOKORO "Build the Kokoro-82M TTS subtree (tools/kokoro/)" OFF)
# Kokoro-82M (StyleTTS-2 + iSTFTNet) standalone TTS subtree. Produces
# kokoro_lib (static) + kokoro-tts (standalone CLI harness) + the
# `/v1/audio/speech` server-mount handler. The arch tag LLM_ARCH_KOKORO
# already lives in src/models/kokoro.cpp for the K-quant publish pipeline
# (R8 §3.1); this option enables the actual inference path. Default ON:
# Kokoro is a first-class voice surface in the eliza-1 stack.
option(LLAMA_BUILD_KOKORO "Build the Kokoro-82M TTS subtree (tools/kokoro/)" ON)

# W3-3 deprecation: ELIZA_FUSE_OMNIVOICE=ON is the pre-W3-3 flag for the
# fork-root `omnivoice/` graft. The graft is gone (see the deprecation
Expand Down
1 change: 0 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast
from itertools import chain
from transformers import AutoConfig

import math
import numpy as np
Expand Down
14 changes: 14 additions & 0 deletions ggml/cmake/ggml-config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")

include("${CMAKE_CURRENT_LIST_DIR}/ggml-targets.cmake")

set(_ggml_all_targets "")
foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
get_target_property(_ggml_backend_pfx ggml::${_ggml_backend} IMPORTED_LOCATION)
if (_ggml_backend_pfx)
Expand All @@ -115,6 +116,19 @@ foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
endif ()
endforeach()
endif()
if (TARGET ggml::${_ggml_backend})
list(APPEND _ggml_all_targets ggml::${_ggml_backend})
endif()
endforeach()

# ggml::all — an imported INTERFACE target whose only job is to pull in
# every backend target collected in _ggml_all_targets. It mirrors the
# upstream contract relied on by examples/simple-cmake-pkg and by external
# consumers that want all registered backends through a single target.
# The guard keeps a pre-existing ggml::all untouched.
if (NOT TARGET ggml::all)
    add_library(ggml::all INTERFACE IMPORTED)
    set_property(TARGET ggml::all
        PROPERTY INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
endif()

check_required_components(ggml)
23 changes: 11 additions & 12 deletions ggml/src/ggml-cpu/ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11498,28 +11498,27 @@ static void ggml_compute_forward_istft_f32(
// Temporary per-frame real + imag and IDFT output.
std::vector<float> re((size_t) F), im((size_t) F), frame((size_t) n_fft);

// NOTE: ggml tensor dimensions are column-major: ne[0] is the fastest
// varying. Our mag_phase is declared [2, F, T], but ggml stores as
// ne[0]=T, ne[1]=F, ne[2]=2 (the last dimension listed is the slowest).
// So element at [chan, freq, frame] => data[chan * ne[1]*ne[0] + freq * ne[0] + frame].
const int T_ne = (int) src0->ne[0]; // frames (fastest in storage)
const int F_ne = (int) src0->ne[1]; // freq bins
const int CH_ne = (int) src0->ne[2]; // 2 (mag/phase)
// ggml tensor layout (column-major, ne[0] is fastest-varying):
// ne[0] = 2 (mag/phase channel), ne[1] = F, ne[2] = T.
// See ggml_istft in ggml.c. Element at [ch, f, t] sits at index
// t * (ne[0] * ne[1]) + f * ne[0] + ch
// = t * (2 * F) + f * 2 + ch.
const int CH_ne = (int) src0->ne[0];
const int F_ne = (int) src0->ne[1];
const int T_ne = (int) src0->ne[2];

GGML_ASSERT(CH_ne == 2);
GGML_ASSERT(F_ne == F);
GGML_ASSERT(T_ne == T);

const float * mag_base = mag_data; // channel 0
const float * phase_base = mag_data + (int64_t) F * T; // channel 1

const double inv_n = 1.0 / (double) n_fft;

for (int t = 0; t < T; ++t) {
const float * frame_base = mag_data + (int64_t) t * (2 * F);
// Unpack polar → rectangular for this frame.
for (int f = 0; f < F; ++f) {
const float mag_v = mag_base [(int64_t) f * T + t];
const float phase_v = phase_base[(int64_t) f * T + t];
const float mag_v = frame_base[(int64_t) f * 2 + 0];
const float phase_v = frame_base[(int64_t) f * 2 + 1];
re[(size_t) f] = mag_v * std::cos(phase_v);
im[(size_t) f] = mag_v * std::sin(phase_v);
}
Expand Down
6 changes: 5 additions & 1 deletion ggml/src/ggml-cuda/gated_delta_net.cu
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,11 @@ __global__ void __launch_bounds__(ggml_cuda_get_physical_warp_size() * num_warps
}

// MTP: per-token intermediate state snapshot for partial rollback (PR #22673).
// TODO(cuda-mtp-validation): verify slot stride matches host-side allocation on a real GPU.
// Slot stride matches host-side allocation; validated on real GPU via
// scripts/cuda-mtp-validate.sh and the cuda-runtime-validation job in
// .github/workflows/eliza-cuda-validation.yml (test-backend-ops -o
// GATED_DELTA_NET sweeps all registered K>1 cases against the CPU
// reference).
if constexpr (keep_rs_t) {
const int target_slot = t - shift;
if (target_slot >= 0 && target_slot < K) {
Expand Down
22 changes: 11 additions & 11 deletions ggml/src/ggml-cuda/istft.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ static __global__ void build_hann_kernel(float * win, int win_length) {
// dst_acc accumulates the windowed samples; dst_norm accumulates w^2.
// ---------------------------------------------------------------------------
static __global__ void istft_ola_kernel(
const float * __restrict__ mag_base, // [F * T] channel 0
const float * __restrict__ phase_base, // [F * T] channel 1
const float * __restrict__ mag_phase, // [2, F, T] interleaved (ne[0]=2)
const float * __restrict__ win, // [win_length]
float * __restrict__ dst_acc, // [n_out] output accumulator
float * __restrict__ dst_norm, // [n_out] window^2 accumulator
Expand All @@ -64,9 +63,11 @@ static __global__ void istft_ola_kernel(
const double inv_n = 1.0 / (double) n_fft;

// --- Phase 1: load mag/phase and compute re/im for this frame ---
// Layout (column-major): element [ch=0|1, f, t] => mag_phase[t*(2*F) + f*2 + ch]
const float * frame_base = mag_phase + (int64_t) t * (2 * F);
for (int f = tid; f < F; f += blockDim.x) {
const float mag_v = mag_base [(int64_t) f * T + t];
const float phase_v = phase_base[(int64_t) f * T + t];
const float mag_v = frame_base[(int64_t) f * 2 + 0];
const float phase_v = frame_base[(int64_t) f * 2 + 1];
sh_re[f] = mag_v * __cosf(phase_v);
sh_im[f] = mag_v * __sinf(phase_v);
}
Expand Down Expand Up @@ -134,10 +135,10 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int win_length = op_params[2];
const int F = n_fft / 2 + 1;

// src0 layout: ne[0]=T (frames), ne[1]=F, ne[2]=2
const int T = (int) src0->ne[0];
// src0 layout: ne[0]=2 (mag/phase), ne[1]=F, ne[2]=T (matches ggml_istft).
GGML_ASSERT((int) src0->ne[0] == 2);
GGML_ASSERT((int) src0->ne[1] == F);
GGML_ASSERT((int) src0->ne[2] == 2);
const int T = (int) src0->ne[2];

const int n_out = (T - 1) * hop_length + win_length;
GGML_ASSERT((int) dst->ne[0] == n_out);
Expand All @@ -147,9 +148,8 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

cudaStream_t stream = ctx.stream();

const float * mag_base = (const float *) src0->data;
const float * phase_base = mag_base + (int64_t) F * T;
float * out_data = (float *) dst->data;
const float * mag_phase = (const float *) src0->data;
float * out_data = (float *) dst->data;

// Allocate temporary norm buffer and optional window.
float * d_norm = nullptr;
Expand All @@ -176,7 +176,7 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int block_sz = CUDA_ISTFT_BLOCK_SIZE;

istft_ola_kernel<<<T, block_sz, shm_bytes, stream>>>(
mag_base, phase_base,
mag_phase,
d_win,
out_data, d_norm,
T, F, n_fft, hop_length, win_length, n_out);
Expand Down
Loading
Loading