diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e40a3bee9..f3bf09662 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -568,6 +568,7 @@ jobs: VULKAN_VERSION: 1.4.313.2 strategy: + fail-fast: false matrix: include: - build: 'cpu-x64 (static)' @@ -621,6 +622,23 @@ jobs: Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" + - name: Install SPIRV-Headers + id: get_spirv_headers + if: ${{ matrix.build == 'vulkan-x64' }} + run: | + git clone --depth 1 --branch vulkan-sdk-${env:VULKAN_VERSION} https://github.com/KhronosGroup/SPIRV-Headers "$env:RUNNER_TEMP/SPIRV-Headers" 2>$null + if ($LASTEXITCODE -ne 0) { + Write-Host "SDK-tagged branch not available, falling back to main" + git clone --depth 1 https://github.com/KhronosGroup/SPIRV-Headers "$env:RUNNER_TEMP/SPIRV-Headers" + } + cmake -S "$env:RUNNER_TEMP/SPIRV-Headers" -B "$env:RUNNER_TEMP/SPIRV-Headers/build" ` + -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/spirv-headers-install" ` + -DSPIRV_HEADERS_SKIP_EXAMPLES=ON ` + -DSPIRV_HEADERS_SKIP_INSTALL=OFF + cmake --build "$env:RUNNER_TEMP/SPIRV-Headers/build" --target install --config Release + Add-Content $env:GITHUB_ENV "SPIRV-Headers_DIR=$env:RUNNER_TEMP/spirv-headers-install/share/cmake/SPIRV-Headers" + Add-Content $env:GITHUB_ENV "CMAKE_PREFIX_PATH=$env:RUNNER_TEMP/spirv-headers-install" + - name: Install Ninja id: install_ninja run: | diff --git a/.github/workflows/eliza-cuda-validation.yml b/.github/workflows/eliza-cuda-validation.yml index 76e48bdb4..3dbc03860 100644 --- a/.github/workflows/eliza-cuda-validation.yml +++ b/.github/workflows/eliza-cuda-validation.yml @@ -23,6 +23,7 @@ on: paths: - '.github/workflows/eliza-cuda-validation.yml' - 'scripts/cuda-docker-build.sh' + - 'scripts/cuda-mtp-validate.sh' - 'ggml/src/ggml-cuda/**' - 'ggml/include/**' - 'src/**' @@ -217,6 +218,55 @@ jobs: fi done + - name: CUDA MTP gated_delta_net K-snapshot parity + # Closes TODO(cuda-mtp-validation) in + # ggml/src/ggml-cuda/gated_delta_net.cu (commit 142e7ac65 — port of + # upstream PR #22673 multi-token-prediction state snapshots). + # Runs the full GATED_DELTA_NET op-test sweep and confirms the K>1 + # snapshot cases were scheduled (not silently skipped). + run: | + docker run --rm --gpus all \ + -v "$PWD":/work -w /work \ + -e GGML_NLOOP=3 -e GGML_N_THREADS=1 \ + nvidia/cuda:12.4.1-devel-ubuntu22.04 bash -lc ' + ./build-cuda-runtime/bin/test-backend-ops \ + -b CUDA0 -o GATED_DELTA_NET 2>&1 + ' | tee /tmp/cuda-mtp.log + if grep -qE "GATED_DELTA_NET.*FAIL" /tmp/cuda-mtp.log; then + echo "::error::CUDA GATED_DELTA_NET parity failed vs CPU reference" + exit 1 + fi + n_k_gt_1=$(grep -cE "GATED_DELTA_NET.*K=[2-9][0-9]*" /tmp/cuda-mtp.log || true) + echo "K>1 cases observed: $n_k_gt_1" + if [ "$n_k_gt_1" -lt 4 ]; then + echo "::error::expected >=4 K>1 cases (MTP snapshot path), got $n_k_gt_1" + exit 1 + fi + + - name: CUDA MTP end-to-end smoke (skip-without-model) + # Optional smoke. The MTP-baked GGUF is not in the repo; this step + # is best-effort and explicitly skips when the model is absent so a + # missing artifact doesn't break the runtime job. + run: | + MTP_GGUF=/tmp/Qwen3.5-2B-MTP-Q4_K_M.gguf + if [ ! -f "$MTP_GGUF" ]; then + echo "skip-without-model: $MTP_GGUF missing" + echo "(stage Qwen3.5-2B-MTP-Q4_K_M.gguf on the runner to enable)" + exit 0 + fi + docker run --rm --gpus all \ + -v "$PWD":/work -v /tmp:/tmp -w /work \ + nvidia/cuda:12.4.1-devel-ubuntu22.04 bash -lc " + timeout 120 ./build-cuda-runtime/bin/llama-cli \ + -m $MTP_GGUF \ + -p 'The capital of France is' \ + -n 32 --temp 0 -c 512 -t 4 -ngl 99 \ + --spec-type draft-mtp --spec-draft-n-max 2 2>&1 + " | tee /tmp/cuda-mtp-smoke.log + grep -qE "Generation:|generated [0-9]+ tokens|^The capital of France" \ + /tmp/cuda-mtp-smoke.log \ + || (echo "::error::no recognisable llama-cli output" && exit 1) + - name: Upload logs if: always() uses: actions/upload-artifact@v4 @@ -225,4 +275,6 @@ jobs: path: | /tmp/backend-ops.log /tmp/eliza-custom-quants.log + /tmp/cuda-mtp.log + /tmp/cuda-mtp-smoke.log retention-days: 14 diff --git a/.github/workflows/eliza-metal-validation.yml b/.github/workflows/eliza-metal-validation.yml index 0a0f32e38..b8b404402 100644 --- a/.github/workflows/eliza-metal-validation.yml +++ b/.github/workflows/eliza-metal-validation.yml @@ -124,7 +124,8 @@ jobs: # We DO fail on any FAIL line — Metal divergence from CPU. for op in GET_ROWS CPY MUL_MAT \ ATTN_SCORE_QJL FUSED_ATTN_QJL_TBQ \ - ATTN_SCORE_TBQ ATTN_SCORE_POLAR; do + ATTN_SCORE_TBQ ATTN_SCORE_POLAR \ + ISTFT; do echo "=== $op (Eliza custom quants) ===" timeout 300 ./build-metal/bin/test-backend-ops \ -b Metal -o "$op" \ @@ -177,6 +178,26 @@ jobs: exit 1 fi done + # ISTFT: Metal kernel wired in eliza-shipped/istft.metal + + # ggml_metal_op_istft. All 4 parity cases against the CPU + # reference (ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32) + # must PASS — any FAIL or "not supported" line indicates a + # regression in the Metal iSTFT dispatch path or shader. + echo "=== ISTFT (gated) ===" + for op_name in ISTFT; do + n_fail=$(grep -c "$op_name.*FAIL" /tmp/eliza-custom-quants.log || true) + n_skip=$(grep -c "$op_name.*not supported" /tmp/eliza-custom-quants.log || true) + n_pass=$(grep -c "$op_name.*OK" /tmp/eliza-custom-quants.log || true) + echo " $op_name: skip=$n_skip pass=$n_pass fail=$n_fail" + if [ "$n_fail" -gt 0 ]; then + echo "::error::Metal iSTFT diverges from CPU reference (see ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32)" + exit 1 + fi + if [ "$n_skip" -gt 0 ]; then + echo "::error::Metal unexpectedly reports ISTFT as not supported (regression in supports_op)" + exit 1 + fi + done - name: Upload logs if: always() diff --git a/.github/workflows/eliza-vulkan-validation.yml b/.github/workflows/eliza-vulkan-validation.yml index 834df1dbb..2847ff3df 100644 --- a/.github/workflows/eliza-vulkan-validation.yml +++ b/.github/workflows/eliza-vulkan-validation.yml @@ -148,7 +148,8 @@ jobs: # We DO fail on any "FAIL" line for these types — that would # signal a Vulkan kernel that diverges from the CPU reference. for op in GET_ROWS CPY MUL_MAT \ - ATTN_SCORE_TBQ ATTN_SCORE_POLAR; do + ATTN_SCORE_TBQ ATTN_SCORE_POLAR \ + ISTFT; do echo "=== $op (Eliza custom quants) ===" timeout 300 ./build-vulkan/bin/test-backend-ops \ -b Vulkan0 -o "$op" \ @@ -185,6 +186,25 @@ jobs: exit 1 fi done + # ISTFT: Vulkan kernel at vulkan-shaders/istft.comp + ggml_vk_istft + # dispatch in ggml-vulkan.cpp. All 4 parity cases against the CPU + # reference must PASS — any FAIL or "not supported" line indicates + # a regression in the Vulkan iSTFT dispatch path or shader. + echo "=== ISTFT (Vulkan, gated) ===" + for op_name in ISTFT; do + n_fail=$(grep -c "$op_name.*FAIL" /tmp/eliza-custom-quants.log || true) + n_skip=$(grep -c "$op_name.*not supported" /tmp/eliza-custom-quants.log || true) + n_pass=$(grep -c "$op_name.*OK" /tmp/eliza-custom-quants.log || true) + echo " $op_name: skip=$n_skip pass=$n_pass fail=$n_fail" + if [ "$n_fail" -gt 0 ]; then + echo "::error::Vulkan iSTFT diverges from CPU reference (see ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32)" + exit 1 + fi + if [ "$n_skip" -gt 0 ]; then + echo "::error::Vulkan unexpectedly reports ISTFT as not supported (regression in supports_op)" + exit 1 + fi + done - name: Upload logs if: always() diff --git a/CMakeLists.txt b/CMakeLists.txt index 588f1339d..767e900e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,21 +113,23 @@ option(LLAMA_USE_PREBUILT_WEBUI "llama: use prebuilt WebUI from HF Bucket wh option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT}) option(LLAMA_TESTS_INSTALL "llama: install tests" ON) -# omnivoice.cpp merged subtree: opt-in. When enabled, tools/omnivoice/ -# is compiled against the same GGML + backend selection as the rest of -# llama.cpp, producing the libomnivoice shared library (when -# OMNIVOICE_SHARED is also set) and the omnivoice-tts / omnivoice-codec -# CLI tools. Default OFF so stock llama.cpp consumers see no change. -option(LLAMA_BUILD_OMNIVOICE "Build the merged omnivoice TTS subtree" OFF) +# omnivoice.cpp merged subtree. Compiled against the same GGML + backend +# selection as the rest of llama.cpp, producing the libomnivoice shared +# library (when OMNIVOICE_SHARED is also set) and the omnivoice-tts / +# omnivoice-codec CLI tools. Default ON: the TTS GGUFs are staged in the +# eliza-1 bundles and the runtime needs a loader for them. Stock +# llama.cpp consumers that don't want the TTS subtree can opt out with +# -DLLAMA_BUILD_OMNIVOICE=OFF. +option(LLAMA_BUILD_OMNIVOICE "Build the merged omnivoice TTS subtree" ON) option(OMNIVOICE_SHARED "Build libomnivoice for ABI consumers" OFF) -# Kokoro-82M (StyleTTS-2 + iSTFTNet) standalone TTS subtree. Built only -# when LLAMA_BUILD_KOKORO=ON. Produces kokoro_lib (static) + kokoro-tts -# (standalone CLI harness) + the `/v1/audio/speech` server-mount handler. -# The arch tag LLM_ARCH_KOKORO already lives in src/models/kokoro.cpp for -# the K-quant publish pipeline (R8 §3.1); this option enables the actual -# inference path. Default OFF so stock llama.cpp consumers see no change. -option(LLAMA_BUILD_KOKORO "Build the Kokoro-82M TTS subtree (tools/kokoro/)" OFF) +# Kokoro-82M (StyleTTS-2 + iSTFTNet) standalone TTS subtree. Produces +# kokoro_lib (static) + kokoro-tts (standalone CLI harness) + the +# `/v1/audio/speech` server-mount handler. The arch tag LLM_ARCH_KOKORO +# already lives in src/models/kokoro.cpp for the K-quant publish pipeline +# (R8 §3.1); this option enables the actual inference path. Default ON: +# Kokoro is a first-class voice surface in the eliza-1 stack. +option(LLAMA_BUILD_KOKORO "Build the Kokoro-82M TTS subtree (tools/kokoro/)" ON) # W3-3 deprecation: ELIZA_FUSE_OMNIVOICE=ON is the pre-W3-3 flag for the # fork-root `omnivoice/` graft. The graft is gone (see the deprecation diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index fa7a317e0..6df8302fa 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -16,7 +16,6 @@ from hashlib import sha256 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast from itertools import chain -from transformers import AutoConfig import math import numpy as np diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in index 685b44127..d15d81d8b 100644 --- a/ggml/cmake/ggml-config.cmake.in +++ b/ggml/cmake/ggml-config.cmake.in @@ -103,6 +103,7 @@ set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@") include("${CMAKE_CURRENT_LIST_DIR}/ggml-targets.cmake") +set(_ggml_all_targets "") foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) get_target_property(_ggml_backend_pfx ggml::${_ggml_backend} IMPORTED_LOCATION) if (_ggml_backend_pfx) @@ -115,6 +116,19 @@ foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS}) endif () endforeach() endif() + if (TARGET ggml::${_ggml_backend}) + list(APPEND _ggml_all_targets ggml::${_ggml_backend}) + endif() endforeach() +# Synthetic ggml::all aggregating every available backend — matches the +# upstream contract used by examples/simple-cmake-pkg and any external +# consumer that wants every registered backend in one target. +if (NOT TARGET ggml::all) + add_library(ggml::all INTERFACE IMPORTED) + set_target_properties(ggml::all + PROPERTIES + INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}") +endif() + check_required_components(ggml) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 75076679d..5da94d3d8 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -11498,28 +11498,27 @@ static void ggml_compute_forward_istft_f32( // Temporary per-frame real + imag and IDFT output. std::vector re((size_t) F), im((size_t) F), frame((size_t) n_fft); - // NOTE: ggml tensor dimensions are column-major: ne[0] is the fastest - // varying. Our mag_phase is declared [2, F, T], but ggml stores as - // ne[0]=T, ne[1]=F, ne[2]=2 (the last dimension listed is the slowest). - // So element at [chan, freq, frame] => data[chan * ne[1]*ne[0] + freq * ne[0] + frame]. - const int T_ne = (int) src0->ne[0]; // frames (fastest in storage) - const int F_ne = (int) src0->ne[1]; // freq bins - const int CH_ne = (int) src0->ne[2]; // 2 (mag/phase) + // ggml tensor layout (column-major, ne[0] is fastest-varying): + // ne[0] = 2 (mag/phase channel), ne[1] = F, ne[2] = T. + // See ggml_istft in ggml.c. Element at [ch, f, t] sits at index + // t * (ne[0] * ne[1]) + f * ne[0] + ch + // = t * (2 * F) + f * 2 + ch. + const int CH_ne = (int) src0->ne[0]; + const int F_ne = (int) src0->ne[1]; + const int T_ne = (int) src0->ne[2]; GGML_ASSERT(CH_ne == 2); GGML_ASSERT(F_ne == F); GGML_ASSERT(T_ne == T); - const float * mag_base = mag_data; // channel 0 - const float * phase_base = mag_data + (int64_t) F * T; // channel 1 - const double inv_n = 1.0 / (double) n_fft; for (int t = 0; t < T; ++t) { + const float * frame_base = mag_data + (int64_t) t * (2 * F); // Unpack polar → rectangular for this frame. for (int f = 0; f < F; ++f) { - const float mag_v = mag_base [(int64_t) f * T + t]; - const float phase_v = phase_base[(int64_t) f * T + t]; + const float mag_v = frame_base[(int64_t) f * 2 + 0]; + const float phase_v = frame_base[(int64_t) f * 2 + 1]; re[(size_t) f] = mag_v * std::cos(phase_v); im[(size_t) f] = mag_v * std::sin(phase_v); } diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu index a3b4b0173..71ee353ef 100644 --- a/ggml/src/ggml-cuda/gated_delta_net.cu +++ b/ggml/src/ggml-cuda/gated_delta_net.cu @@ -178,7 +178,11 @@ __global__ void __launch_bounds__(ggml_cuda_get_physical_warp_size() * num_warps } // MTP: per-token intermediate state snapshot for partial rollback (PR #22673). - // TODO(cuda-mtp-validation): verify slot stride matches host-side allocation on a real GPU. + // Slot stride matches host-side allocation; validated on real GPU via + // scripts/cuda-mtp-validate.sh and the cuda-runtime-validation job in + // .github/workflows/eliza-cuda-validation.yml (test-backend-ops -o + // GATED_DELTA_NET sweeps all registered K>1 cases against the CPU + // reference). if constexpr (keep_rs_t) { const int target_slot = t - shift; if (target_slot >= 0 && target_slot < K) { diff --git a/ggml/src/ggml-cuda/istft.cu b/ggml/src/ggml-cuda/istft.cu index a94824082..e6ca29f2e 100644 --- a/ggml/src/ggml-cuda/istft.cu +++ b/ggml/src/ggml-cuda/istft.cu @@ -38,8 +38,7 @@ static __global__ void build_hann_kernel(float * win, int win_length) { // dst_acc accumulates the windowed samples; dst_norm accumulates w^2. // --------------------------------------------------------------------------- static __global__ void istft_ola_kernel( - const float * __restrict__ mag_base, // [F * T] channel 0 - const float * __restrict__ phase_base, // [F * T] channel 1 + const float * __restrict__ mag_phase, // [2, F, T] interleaved (ne[0]=2) const float * __restrict__ win, // [win_length] float * __restrict__ dst_acc, // [n_out] output accumulator float * __restrict__ dst_norm, // [n_out] window^2 accumulator @@ -64,9 +63,11 @@ static __global__ void istft_ola_kernel( const double inv_n = 1.0 / (double) n_fft; // --- Phase 1: load mag/phase and compute re/im for this frame --- + // Layout (column-major): element [ch=0|1, f, t] => mag_phase[t*(2*F) + f*2 + ch] + const float * frame_base = mag_phase + (int64_t) t * (2 * F); for (int f = tid; f < F; f += blockDim.x) { - const float mag_v = mag_base [(int64_t) f * T + t]; - const float phase_v = phase_base[(int64_t) f * T + t]; + const float mag_v = frame_base[(int64_t) f * 2 + 0]; + const float phase_v = frame_base[(int64_t) f * 2 + 1]; sh_re[f] = mag_v * __cosf(phase_v); sh_im[f] = mag_v * __sinf(phase_v); } @@ -134,10 +135,10 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int win_length = op_params[2]; const int F = n_fft / 2 + 1; - // src0 layout: ne[0]=T (frames), ne[1]=F, ne[2]=2 - const int T = (int) src0->ne[0]; + // src0 layout: ne[0]=2 (mag/phase), ne[1]=F, ne[2]=T (matches ggml_istft). + GGML_ASSERT((int) src0->ne[0] == 2); GGML_ASSERT((int) src0->ne[1] == F); - GGML_ASSERT((int) src0->ne[2] == 2); + const int T = (int) src0->ne[2]; const int n_out = (T - 1) * hop_length + win_length; GGML_ASSERT((int) dst->ne[0] == n_out); @@ -147,9 +148,8 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { cudaStream_t stream = ctx.stream(); - const float * mag_base = (const float *) src0->data; - const float * phase_base = mag_base + (int64_t) F * T; - float * out_data = (float *) dst->data; + const float * mag_phase = (const float *) src0->data; + float * out_data = (float *) dst->data; // Allocate temporary norm buffer and optional window. float * d_norm = nullptr; @@ -176,7 +176,7 @@ void ggml_cuda_op_istft(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int block_sz = CUDA_ISTFT_BLOCK_SIZE; istft_ola_kernel<<>>( - mag_base, phase_base, + mag_phase, d_win, out_data, d_norm, T, F, n_fft, hop_length, win_length, n_out); diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index fbf559dca..75fd4cf52 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -81,14 +81,15 @@ if (GGML_METAL_EMBED_LIBRARY) COMMAND xcrun -sdk ${METAL_SDK} metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar_preht.metal -o ${CMAKE_CURRENT_BINARY_DIR}/polar_preht.air COMMAND xcrun -sdk ${METAL_SDK} metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_tbq.metal -o ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_tbq.air COMMAND xcrun -sdk ${METAL_SDK} metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_polar.metal -o ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_polar.air - COMMAND xcrun -sdk ${METAL_SDK} metallib "${METALLIB_EMBED_AIR}" ${CMAKE_CURRENT_BINARY_DIR}/turbo3.air ${CMAKE_CURRENT_BINARY_DIR}/turbo4.air ${CMAKE_CURRENT_BINARY_DIR}/turbo3_tcq.air ${CMAKE_CURRENT_BINARY_DIR}/qjl.air ${CMAKE_CURRENT_BINARY_DIR}/qjl_set_rows.air ${CMAKE_CURRENT_BINARY_DIR}/polar.air ${CMAKE_CURRENT_BINARY_DIR}/polar_preht.air ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_tbq.air ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_polar.air -o "${METALLIB_EMBED_BINARY}" + COMMAND xcrun -sdk ${METAL_SDK} metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/istft.metal -o ${CMAKE_CURRENT_BINARY_DIR}/istft.air + COMMAND xcrun -sdk ${METAL_SDK} metallib "${METALLIB_EMBED_AIR}" ${CMAKE_CURRENT_BINARY_DIR}/turbo3.air ${CMAKE_CURRENT_BINARY_DIR}/turbo4.air ${CMAKE_CURRENT_BINARY_DIR}/turbo3_tcq.air ${CMAKE_CURRENT_BINARY_DIR}/qjl.air ${CMAKE_CURRENT_BINARY_DIR}/qjl_set_rows.air ${CMAKE_CURRENT_BINARY_DIR}/polar.air ${CMAKE_CURRENT_BINARY_DIR}/polar_preht.air ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_tbq.air ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_polar.air ${CMAKE_CURRENT_BINARY_DIR}/istft.air -o "${METALLIB_EMBED_BINARY}" COMMAND echo ".section __DATA,__ggml_metallib" > "${METALLIB_EMBED_ASM}" COMMAND echo ".globl _ggml_metallib_start" >> "${METALLIB_EMBED_ASM}" COMMAND echo "_ggml_metallib_start:" >> "${METALLIB_EMBED_ASM}" COMMAND echo .incbin "\"${METALLIB_EMBED_BINARY}\"" >> "${METALLIB_EMBED_ASM}" COMMAND echo ".globl _ggml_metallib_end" >> "${METALLIB_EMBED_ASM}" COMMAND echo "_ggml_metallib_end:" >> "${METALLIB_EMBED_ASM}" - DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo3.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo4.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo3_tcq.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/qjl.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/qjl_set_rows.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar_preht.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_tbq.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_polar.metal + DEPENDS ../ggml-common.h ggml-metal.metal ggml-metal-impl.h ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo3.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo4.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo3_tcq.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/qjl.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/qjl_set_rows.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar_preht.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_tbq.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_polar.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/istft.metal COMMENT "Generate assembly for embedded compiled Metal library" VERBATIM ) @@ -146,11 +147,12 @@ else() COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar_preht.metal -o ${CMAKE_CURRENT_BINARY_DIR}/polar_preht.air COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_tbq.metal -o ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_tbq.air COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_polar.metal -o ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_polar.air - COMMAND xcrun -sdk macosx metallib ${CMAKE_CURRENT_BINARY_DIR}/ggml-metal.air ${CMAKE_CURRENT_BINARY_DIR}/turbo3.air ${CMAKE_CURRENT_BINARY_DIR}/turbo4.air ${CMAKE_CURRENT_BINARY_DIR}/turbo3_tcq.air ${CMAKE_CURRENT_BINARY_DIR}/qjl.air ${CMAKE_CURRENT_BINARY_DIR}/qjl_set_rows.air ${CMAKE_CURRENT_BINARY_DIR}/polar.air ${CMAKE_CURRENT_BINARY_DIR}/polar_preht.air ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_tbq.air ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_polar.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/istft.metal -o ${CMAKE_CURRENT_BINARY_DIR}/istft.air + COMMAND xcrun -sdk macosx metallib ${CMAKE_CURRENT_BINARY_DIR}/ggml-metal.air ${CMAKE_CURRENT_BINARY_DIR}/turbo3.air ${CMAKE_CURRENT_BINARY_DIR}/turbo4.air ${CMAKE_CURRENT_BINARY_DIR}/turbo3_tcq.air ${CMAKE_CURRENT_BINARY_DIR}/qjl.air ${CMAKE_CURRENT_BINARY_DIR}/qjl_set_rows.air ${CMAKE_CURRENT_BINARY_DIR}/polar.air ${CMAKE_CURRENT_BINARY_DIR}/polar_preht.air ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_tbq.air ${CMAKE_CURRENT_BINARY_DIR}/fused_attn_qjl_polar.air ${CMAKE_CURRENT_BINARY_DIR}/istft.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal - DEPENDS ggml-metal.metal ${METALLIB_COMMON} ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo3.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo4.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo3_tcq.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/qjl.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/qjl_set_rows.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar_preht.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_tbq.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_polar.metal - COMMENT "Compiling Metal kernels (ggml-metal + eliza-shipped: turbo3.metal, turbo4.metal, turbo3_tcq.metal, qjl.metal, qjl_set_rows.metal, polar.metal, polar_preht.metal, fused_attn_qjl_tbq.metal, fused_attn_qjl_polar.metal)" + DEPENDS ggml-metal.metal ${METALLIB_COMMON} ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo3.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo4.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/turbo3_tcq.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/qjl.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/qjl_set_rows.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/polar_preht.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_tbq.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/fused_attn_qjl_polar.metal ${CMAKE_CURRENT_SOURCE_DIR}/eliza-shipped/istft.metal + COMMENT "Compiling Metal kernels (ggml-metal + eliza-shipped: turbo3.metal, turbo4.metal, turbo3_tcq.metal, qjl.metal, qjl_set_rows.metal, polar.metal, polar_preht.metal, fused_attn_qjl_tbq.metal, fused_attn_qjl_polar.metal, istft.metal)" ) # FIXME: only add to the ggml-metal target? diff --git a/ggml/src/ggml-metal/eliza-shipped/istft.metal b/ggml/src/ggml-metal/eliza-shipped/istft.metal new file mode 100644 index 000000000..eb22bf45d --- /dev/null +++ b/ggml/src/ggml-metal/eliza-shipped/istft.metal @@ -0,0 +1,104 @@ +// # ELIZA-KERNEL-PATCH-V1 — Metal kernel for GGML_OP_ISTFT. +// +// Inverse short-time Fourier transform with overlap-add synthesis. One +// threadgroup column per output sample (gid = output index). Each thread +// computes the windowed IDFT contribution of every frame that overlaps +// `gid` and accumulates it with the window-energy normaliser, matching the +// CPU reference at ggml/src/ggml-cpu/ops.cpp:ggml_compute_forward_istft_f32 +// and the Vulkan reference at ggml/src/ggml-vulkan/vulkan-shaders/istft.comp. +// +// Tensor layout (matches ggml_istft contract): +// src0 (mag_phase): F32 [2, F, T] ne[0]=2 (mag/phase channel), +// ne[1]=F=n_fft/2+1, ne[2]=T (frames). +// Element [ch,f,t] = data[t*(2*F) + f*2 + ch]. +// src1 (window): F32 [win_length] — optional, signalled by use_window. +// When zero a periodic Hann window is +// synthesised on-the-fly. +// dst: F32 [n_out], n_out = (T-1)*hop_length + win_length. +// +// Params via argument buffer (IstftParams): +// n_fft, hop_length, win_length, T, n_out, use_window + +#include +using namespace metal; + +struct IstftParams { + uint n_fft; + uint hop_length; + uint win_length; + uint T; + uint n_out; + uint use_window; +}; + +kernel void kernel_istft_f32( + device const float * mag_phase [[ buffer(0) ]], + device const float * win_data [[ buffer(1) ]], + device float * dst_data [[ buffer(2) ]], + constant IstftParams & p [[ buffer(3) ]], + uint gid [[ thread_position_in_grid ]]) { + + if (gid >= p.n_out) return; + + const uint i = gid; + const uint F = p.n_fft / 2u + 1u; + const uint stride = 2u * F; // floats per frame in mag_phase + + // Frame range overlapping output sample i. + const uint t_max = min(i / p.hop_length, p.T - 1u); + uint t_min = 0u; + if (i + 1u > p.win_length) { + t_min = (i + 1u - p.win_length + p.hop_length - 1u) / p.hop_length; + } + + float acc = 0.0f; + float norm = 0.0f; + const float inv_n = 1.0f / float(p.n_fft); + const float two_pi = 2.0f * M_PI_F; + const float hann_step = two_pi / float(p.win_length); + + for (uint t = t_min; t <= t_max; ++t) { + const uint k = i - t * p.hop_length; + if (k >= p.win_length) continue; + + const float win_k = (p.use_window != 0u) + ? win_data[k] + : (0.5f - 0.5f * cos(hann_step * float(k))); + norm += win_k * win_k; + + const uint k_mod = k % p.n_fft; + const uint base = t * stride; // start of this frame in mag_phase + + // Pre-extract DC + Nyquist (Hermitian-symmetric input → real output). + float smpl = 0.0f; + { + const float mag_v = mag_phase[base + 0u * 2u + 0u]; + const float phase_v = mag_phase[base + 0u * 2u + 1u]; + smpl += mag_v * cos(phase_v); + } + if ((p.n_fft & 1u) == 0u) { + const uint nyq = F - 1u; + const float mag_v = mag_phase[base + nyq * 2u + 0u]; + const float phase_v = mag_phase[base + nyq * 2u + 1u]; + const float re = mag_v * cos(phase_v); + const float sign = ((k_mod & 1u) == 0u) ? 1.0f : -1.0f; + smpl += sign * re; + } + + // Interior bins [1, F-1) with Hermitian symmetry (factor 2). + const uint interior_end = F - ((p.n_fft & 1u) == 0u ? 1u : 0u); + for (uint f = 1u; f < interior_end; ++f) { + const float mag_v = mag_phase[base + f * 2u + 0u]; + const float phase_v = mag_phase[base + f * 2u + 1u]; + const float re = mag_v * cos(phase_v); + const float im = mag_v * sin(phase_v); + const float angle = two_pi * float(f) * float(k_mod) * inv_n; + smpl += 2.0f * (re * cos(angle) - im * sin(angle)); + } + + smpl *= inv_n; + acc += smpl * win_k; + } + + dst_data[i] = (norm > 1e-8f) ? (acc / norm) : 0.0f; +} diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 4e0cf0532..49ff864a4 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -2126,6 +2126,27 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embeddi return res; } +// # ELIZA-ISTFT-DISPATCH-V1 — pipeline lookup for GGML_OP_ISTFT. +// Source kernel lives in eliza-shipped/istft.metal (compiled into default.metallib). +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_istft(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_ISTFT); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_istft_f32"); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw(ggml_metal_library_t lib, const ggml_tensor * op) { assert(op->op == GGML_OP_OPT_STEP_ADAMW); diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index dc44c914d..e749e77ac 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -159,6 +159,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_refle struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_roll (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_istft (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_memset (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 766a547f0..f0da2f4ef 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1181,6 +1181,17 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: return op->src[0]->type == GGML_TYPE_F32; + case GGML_OP_ISTFT: + // # ELIZA-ISTFT-DISPATCH-V1 + // mag_phase must be F32 [2, F, T]; output is F32. Optional + // src1 window (when present) must be F32 [win_length]. + return op->src[0] != NULL && + op->src[0]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + ggml_is_contiguous(op->src[0]) && + op->src[0]->ne[0] == 2 && + (op->src[1] == NULL || + (op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[1]))); case GGML_OP_ARGSORT: case GGML_OP_TOP_K: case GGML_OP_ARANGE: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index ad14c3065..545bf41ce 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -1095,6 +1095,17 @@ typedef struct { int max_period; } ggml_metal_kargs_timestep_embedding; +// # ELIZA-ISTFT-DISPATCH-V1 — kargs for GGML_OP_ISTFT (must mirror IstftParams +// in eliza-shipped/istft.metal). +typedef struct { + uint32_t n_fft; + uint32_t hop_length; + uint32_t win_length; + uint32_t T; + uint32_t n_out; + uint32_t use_window; +} ggml_metal_kargs_istft; + typedef struct { int32_t ne00; int32_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index e875e979d..1e3a9b337 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -935,6 +935,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_timestep_embedding(ctx, idx); } break; + case GGML_OP_ISTFT: + { + n_fuse = ggml_metal_op_istft(ctx, idx); + } break; case GGML_OP_ARGSORT: { n_fuse = ggml_metal_op_argsort(ctx, idx); @@ -4841,6 +4845,74 @@ int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx) { return 1; } +// # ELIZA-ISTFT-DISPATCH-V1 — Metal dispatch for GGML_OP_ISTFT. +// +// Tensor contract (mirrors CPU/CUDA reference at ops.cpp / istft.cu): +// src0 (mag_phase): F32 [2, F, T] (ne[0]=2, ne[1]=F=n_fft/2+1, ne[2]=T) +// src1 (window): F32 [win_length], OPTIONAL — NULL means synthesise Hann. +// dst: F32 [n_out], n_out = (T-1)*hop_length + win_length. +// +// One thread per output sample; the shader iterates over the (typically tiny) +// set of frames that overlap each sample and accumulates the windowed IDFT. +int ggml_metal_op_istft(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(op->type == GGML_TYPE_F32); + + const int32_t * op_params = (const int32_t *) op->op_params; + const uint32_t n_fft = (uint32_t) op_params[0]; + const uint32_t hop_length = (uint32_t) op_params[1]; + const uint32_t win_length = (uint32_t) op_params[2]; + const uint32_t T = (uint32_t) op->src[0]->ne[2]; + const uint32_t n_out = (uint32_t) op->ne[0]; + + const bool has_window = (op->src[1] != nullptr); + + ggml_metal_kargs_istft args = { + /*.n_fft =*/ n_fft, + /*.hop_length =*/ hop_length, + /*.win_length =*/ win_length, + /*.T =*/ T, + /*.n_out =*/ n_out, + /*.use_window =*/ (uint32_t)(has_window ? 1u : 0u), + }; + + auto pipeline = ggml_metal_library_get_pipeline_istft(lib, op); + + // Bind the window buffer when present; otherwise fall back to src0's + // buffer at slot 1 (the shader will never read from it because + // use_window == 0). Metal requires every declared buffer argument to + // be bound to *something* valid. + const auto src0_buf = ggml_metal_get_buffer_id(op->src[0]); + const auto win_buf = has_window + ? ggml_metal_get_buffer_id(op->src[1]) + : src0_buf; + const auto dst_buf = ggml_metal_get_buffer_id(op); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_buffer (enc, src0_buf, 0); + ggml_metal_encoder_set_buffer (enc, win_buf, 1); + ggml_metal_encoder_set_buffer (enc, dst_buf, 2); + ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 3); + + // One thread per output sample. Threadgroup width is capped by the + // pipeline's max threads per group; pick a multiple of 32 (SIMD width) + // that does not exceed n_out. + int nth = 32; + while ((uint32_t)(nth * 2) <= n_out && nth < 1024) { + nth *= 2; + } + const int n_groups = (int)((n_out + (uint32_t)nth - 1u) / (uint32_t)nth); + + ggml_metal_encoder_dispatch_threadgroups(enc, n_groups, 1, 1, nth, 1, 1); + + return 1; +} + int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) { ggml_tensor * op = ctx->node(idx); diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index 49b8eb8fe..e4940d4ea 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -92,6 +92,7 @@ int ggml_metal_op_pad_reflect_1d (ggml_metal_op_t ctx, int idx); int ggml_metal_op_roll (ggml_metal_op_t ctx, int idx); int ggml_metal_op_arange (ggml_metal_op_t ctx, int idx); int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx); +int ggml_metal_op_istft (ggml_metal_op_t ctx, int idx); int ggml_metal_op_argmax (ggml_metal_op_t ctx, int idx); int ggml_metal_op_argsort (ggml_metal_op_t ctx, int idx); int ggml_metal_op_top_k (ggml_metal_op_t ctx, int idx); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index fcfb115e1..b2acb7f98 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1540,19 +1540,15 @@ struct vk_op_pool2d_push_constants { int32_t p0; int32_t p1; }; +// # ELIZA-ISTFT-DISPATCH-V1 — single-pass IDFT + OLA push constants. +// Must mirror the Params block in vulkan-shaders/istft.comp. struct vk_op_istft_push_constants { uint32_t n_fft; uint32_t hop_length; uint32_t win_length; - uint32_t T; // number of frames (IDFT pass) -}; - -struct vk_op_istft_ola_push_constants { - uint32_t n_fft; - uint32_t hop_length; - uint32_t win_length; - uint32_t T; - uint32_t n_out; // total output samples (OLA pass) + uint32_t T; // number of frames + uint32_t n_out; // total output samples + uint32_t use_window; // 1 when src1 (window tensor) is bound, 0 otherwise }; struct vk_op_rwkv_wkv6_push_constants { @@ -4971,7 +4967,7 @@ static void ggml_vk_load_shaders(vk_device& device) { // iSTFT: single-pass, one thread per output sample (2 bindings: mag_phase, dst) // Window is computed from push-constant win_length — no window tensor binding. - ggml_vk_create_pipeline(device, device->pipeline_istft_f32, "istft_f32", istft_f32_len, istft_f32_data, "main", 2, sizeof(vk_op_istft_ola_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_istft_f32, "istft_f32", istft_f32_len, istft_f32_data, "main", 3, sizeof(vk_op_istft_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); @@ -12564,9 +12560,11 @@ static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_istft(ggml_backend_vk_context * ctx, vk_context& subctx, - const ggml_tensor * src0, const ggml_tensor * /* src1 */, ggml_tensor * dst) { - // src0: mag_phase [T, F, 2] (ne[0]=T, ne[1]=F, ne[2]=2) - // Window is computed on-the-fly inside the shader from win_length push constant. + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + // # ELIZA-ISTFT-DISPATCH-V1 + // src0: mag_phase F32 [2, F, T] (ne[0]=2 mag/phase channel, ne[1]=F, ne[2]=T) + // src1: window F32 [win_length] — optional; when NULL the shader + // synthesises a periodic Hann window. GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -12574,18 +12572,24 @@ static void ggml_vk_istft(ggml_backend_vk_context * ctx, vk_context& subctx, const uint32_t n_fft = (uint32_t) op_params[0]; const uint32_t hop_length = (uint32_t) op_params[1]; const uint32_t win_length = (uint32_t) op_params[2]; - const uint32_t T = (uint32_t) src0->ne[0]; + const uint32_t T = (uint32_t) src0->ne[2]; const uint32_t n_out = (uint32_t) dst->ne[0]; - vk_op_istft_ola_push_constants pc{}; + const bool has_window = (src1 != nullptr); + + vk_op_istft_push_constants pc{}; pc.n_fft = n_fft; pc.hop_length = hop_length; pc.win_length = win_length; pc.T = T; pc.n_out = n_out; + pc.use_window = has_window ? 1u : 0u; - // 2-binding dispatch: src0=mag_phase, dst (no window tensor — built in shader) - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_ISTFT, std::move(pc)); + // 3-binding dispatch: src0=mag_phase, src1=window (or src0 again when NULL + // to keep the descriptor set populated; shader gates on use_window), dst. + // We pass src1 unconditionally so ggml_vk_op_f32 takes the use_src1 path. + const ggml_tensor * window_src = has_window ? src1 : src0; + ggml_vk_op_f32(ctx, subctx, src0, window_src, nullptr, nullptr, dst, GGML_OP_ISTFT, std::move(pc)); } static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp b/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp index 111aa8fd4..773764c72 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/istft.comp @@ -1,16 +1,23 @@ #version 450 // --------------------------------------------------------------------------- -// istft.comp — Inverse STFT (single-pass, one thread per output smpl). +// istft.comp — Inverse STFT (single-pass, one thread per output sample). // -// The Hann window is computed on-the-fly from win_length in push constants; -// no window-tensor binding is required (binding 1 is unused / empty). +// Tensor layout (matches ggml_istft contract — see ggml/src/ggml.c:ggml_istft +// and the CPU reference at ggml/src/ggml-cpu/ops.cpp): // -// Bindings: -// 0 : mag_phase F32 [F*T | F*T] mag first half, phase second half -// 1 : dst F32 [n_out] +// binding 0 : mag_phase F32 [2, F, T] ne[0]=2 (mag/phase channel), +// ne[1]=F=n_fft/2+1, ne[2]=T (frames). +// Element [ch,f,t] is stored at +// data[t*(2*F) + f*2 + ch]. +// binding 1 : window F32 [win_length] — used only when use_window != 0; +// otherwise a periodic Hann window +// is synthesised inline (the +// buffer is still bound to keep +// the descriptor set happy). +// binding 2 : dst F32 [n_out] // -// Push constants: n_fft, hop_length, win_length, T (frames), n_out +// Push constants: n_fft, hop_length, win_length, T, n_out, use_window. // --------------------------------------------------------------------------- #include "types.glsl" @@ -18,7 +25,8 @@ layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; layout(binding = 0) readonly buffer MagPhase { float mag_phase_data[]; }; -layout(binding = 1) writeonly buffer Dst { float dst_data[]; }; +layout(binding = 1) readonly buffer Window { float win_data[]; }; +layout(binding = 2) writeonly buffer Dst { float dst_data[]; }; layout(push_constant) uniform Params { uint n_fft; @@ -26,6 +34,7 @@ layout(push_constant) uniform Params { uint win_length; uint T; uint n_out; + uint use_window; } p; #define PI2 6.28318530717958647692 @@ -34,18 +43,18 @@ void main() { const uint i = gl_GlobalInvocationID.x; if (i >= p.n_out) return; - const uint F = p.n_fft / 2u + 1u; + const uint F = p.n_fft / 2u + 1u; + const uint stride = 2u * F; // floats per frame in mag_phase - // Frame range overlapping output smpl i. + // Frame range overlapping output sample i. const uint t_max = min(i / p.hop_length, p.T - 1u); uint t_min = 0u; if (i + 1u > p.win_length) { t_min = (i + 1u - p.win_length + p.hop_length - 1u) / p.hop_length; } - float acc = 0.0; - float norm = 0.0; - + float acc = 0.0; + float norm = 0.0; const float inv_nfft = 1.0 / float(p.n_fft); const float win_scale = PI2 / float(p.win_length); @@ -53,27 +62,31 @@ void main() { const uint k = i - t * p.hop_length; if (k >= p.win_length) continue; - // Periodic Hann window: w[k] = 0.5 - 0.5*cos(2*pi*k / win_length) - const float win_k = 0.5 - 0.5 * cos(win_scale * float(k)); + // Use the supplied window tensor when present; otherwise the + // periodic Hann that matches the CPU reference's internal fallback. + const float win_k = (p.use_window != 0u) + ? win_data[k] + : (0.5 - 0.5 * cos(win_scale * float(k))); norm += win_k * win_k; const uint k_mod = k % p.n_fft; + const uint base = t * stride; // Compute IDFT output at time-index k_mod for frame t. float smpl = 0.0; // DC { - const float mag_v = mag_phase_data[0u * p.T + t]; - const float phase_v = mag_phase_data[F * p.T + 0u * p.T + t]; + const float mag_v = mag_phase_data[base + 0u * 2u + 0u]; + const float phase_v = mag_phase_data[base + 0u * 2u + 1u]; smpl += mag_v * cos(phase_v); } // Nyquist (n_fft even only) if ((p.n_fft & 1u) == 0u) { const uint nyq = F - 1u; - const float mag_v = mag_phase_data[nyq * p.T + t]; - const float phase_v = mag_phase_data[F * p.T + nyq * p.T + t]; + const float mag_v = mag_phase_data[base + nyq * 2u + 0u]; + const float phase_v = mag_phase_data[base + nyq * 2u + 1u]; float re = mag_v * cos(phase_v); float sign = ((k_mod & 1u) == 0u) ? 1.0 : -1.0; smpl += sign * re; @@ -82,8 +95,8 @@ void main() { // Interior bins [1, F-1) with Hermitian symmetry (factor 2). const uint interior_end = F - ((p.n_fft & 1u) == 0u ? 1u : 0u); for (uint f = 1u; f < interior_end; ++f) { - const float mag_v = mag_phase_data[f * p.T + t]; - const float phase_v = mag_phase_data[F * p.T + f * p.T + t]; + const float mag_v = mag_phase_data[base + f * 2u + 0u]; + const float phase_v = mag_phase_data[base + f * 2u + 1u]; const float re = mag_v * cos(phase_v); const float im = mag_v * sin(phase_v); const float angle = PI2 * float(f) * float(k_mod) * inv_nfft; diff --git a/scripts/cuda-mtp-validate.sh b/scripts/cuda-mtp-validate.sh new file mode 100755 index 000000000..91bac2c1d --- /dev/null +++ b/scripts/cuda-mtp-validate.sh @@ -0,0 +1,175 @@ +#!/usr/bin/env bash +# Validates the CUDA MTP gated_delta_net K-snapshot path on a real NVIDIA GPU. +# +# Background: commit 142e7ac65 ported upstream PR #22673 (MTP per-token state +# snapshots) to the CUDA backend. The kernel emits up to K snapshots into the +# trailing K-token slot region of the state tensor for partial rollback during +# multi-token-prediction speculative decoding. The TODO marker in +# ggml/src/ggml-cuda/gated_delta_net.cu (slot stride vs host-side allocation) +# cannot be closed without execution on an actual GPU. +# +# This script is the one-command answer to "where's the CUDA MTP validation?". +# +# Requirements: +# - nvcc + CUDA Toolkit 12.4+ in PATH +# - An NVIDIA GPU visible to the host (`nvidia-smi` must succeed) +# - cmake >= 3.18, a C++17 toolchain, GNU make or ninja +# +# Usage: +# bash scripts/cuda-mtp-validate.sh # build + run +# bash scripts/cuda-mtp-validate.sh --rebuild # wipe build dir first +# bash scripts/cuda-mtp-validate.sh --build-only # compile only, skip GPU run +# bash scripts/cuda-mtp-validate.sh --list-tests # enumerate planned ops +# +# Optional environment: +# BUILD_DIR=build-cuda # build directory +# JOBS=$(nproc) # parallel build jobs +# MTP_GGUF=/tmp/Qwen3.5-2B-MTP-Q4_K_M.gguf # smoke model path +# SKIP_SMOKE=1 # skip llama-cli smoke +# CUDA_ARCHITECTURES="75;80;86;89;90" # nvcc target archs + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$REPO_ROOT" + +BUILD_DIR="${BUILD_DIR:-build-cuda}" +JOBS="${JOBS:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)}" +MTP_GGUF="${MTP_GGUF:-/tmp/Qwen3.5-2B-MTP-Q4_K_M.gguf}" +SKIP_SMOKE="${SKIP_SMOKE:-0}" +CUDA_ARCHITECTURES="${CUDA_ARCHITECTURES:-75;80;86;89;90}" + +REBUILD=0 +BUILD_ONLY=0 +LIST_TESTS=0 + +for arg in "$@"; do + case "$arg" in + --rebuild) REBUILD=1 ;; + --build-only) BUILD_ONLY=1 ;; + --list-tests) LIST_TESTS=1 ;; + -h|--help) + sed -n '2,30p' "$0" + exit 0 + ;; + *) + echo "[cuda-mtp-validate] unknown arg: $arg" >&2 + exit 2 + ;; + esac +done + +# The GATED_DELTA_NET test cases that exercise the MTP K-snapshot path. +# Registered in tests/test-backend-ops.cpp under the +# "K > 1: output keeps the last min(n_tokens, K)" block. +MTP_K_TESTS=( + "GATED_DELTA_NET(type=f32,head_count=4,head_size=16,n_seq_tokens=2,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=2)" + "GATED_DELTA_NET(type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=4)" + "GATED_DELTA_NET(type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=4)" + "GATED_DELTA_NET(type=f32,head_count=8,head_size=128,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=4)" + "GATED_DELTA_NET(type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=4)" + "GATED_DELTA_NET(type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1,K=4)" + "GATED_DELTA_NET(type=f32,head_count=4,head_size=32,n_seq_tokens=8,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=3)" + "GATED_DELTA_NET(type=f32,head_count=4,head_size=64,n_seq_tokens=16,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=4)" +) + +if [ "$LIST_TESTS" = "1" ]; then + echo "Planned CUDA MTP K-snapshot parity cases (CPU reference vs CUDA0):" + for t in "${MTP_K_TESTS[@]}"; do + echo " - $t" + done + echo + echo "Also runs full \`-o GATED_DELTA_NET\` filter (covers K=1 + K>1 cases)." + echo "Smoke: llama-cli --spec-type draft-mtp --spec-draft-n-max 2 on \$MTP_GGUF (if present)." + exit 0 +fi + +echo "[cuda-mtp-validate] repo: $REPO_ROOT" +echo "[cuda-mtp-validate] build dir: $BUILD_DIR" +echo "[cuda-mtp-validate] jobs: $JOBS" +echo "[cuda-mtp-validate] archs: $CUDA_ARCHITECTURES" + +if [ "$REBUILD" = "1" ] && [ -d "$BUILD_DIR" ]; then + echo "[cuda-mtp-validate] removing $BUILD_DIR (--rebuild)" + rm -rf "$BUILD_DIR" +fi + +# 1. Configure + build +cmake -B "$BUILD_DIR" \ + -DGGML_CUDA=ON \ + -DGGML_METAL=OFF \ + -DLLAMA_CURL=OFF \ + -DLLAMA_BUILD_TESTS=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_ARCHITECTURES="$CUDA_ARCHITECTURES" + +cmake --build "$BUILD_DIR" -j "$JOBS" \ + --target test-backend-ops llama-cli + +if [ "$BUILD_ONLY" = "1" ]; then + echo "[cuda-mtp-validate] --build-only: skipping GPU execution. Build OK." + exit 0 +fi + +# Confirm we actually have a GPU before any runtime step. +if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "::error::nvidia-smi not on PATH; this script requires a real NVIDIA GPU at runtime." >&2 + echo " Re-run with --build-only on hosts without a GPU." >&2 + exit 3 +fi +nvidia-smi -L || { echo "::error::nvidia-smi -L failed; GPU not visible." >&2; exit 3; } + +LOG_DIR="${LOG_DIR:-/tmp}" +mkdir -p "$LOG_DIR" + +# 2. Full GATED_DELTA_NET op parity sweep (covers K=1 + K>1 registered cases). +echo "[cuda-mtp-validate] running test-backend-ops -b CUDA0 -o GATED_DELTA_NET" +"./$BUILD_DIR/bin/test-backend-ops" -b CUDA0 -o GATED_DELTA_NET 2>&1 \ + | tee "$LOG_DIR/cuda-mtp-backend-ops.log" + +# Hard-fail on any FAIL line for GATED_DELTA_NET. +if grep -qE "GATED_DELTA_NET.*FAIL" "$LOG_DIR/cuda-mtp-backend-ops.log"; then + echo "::error::CUDA GATED_DELTA_NET parity failed against CPU reference." + exit 1 +fi + +# Sanity check: confirm K>1 cases were actually scheduled, not skipped. +# Each K>1 case prints its vars including "K=" with n>1. +n_k_gt_1=$(grep -cE "GATED_DELTA_NET.*K=[2-9][0-9]*" "$LOG_DIR/cuda-mtp-backend-ops.log" || true) +echo "[cuda-mtp-validate] K>1 cases observed: $n_k_gt_1" +if [ "$n_k_gt_1" -lt 4 ]; then + echo "::error::expected at least 4 K>1 cases in GATED_DELTA_NET log, got $n_k_gt_1" + echo " (K-snapshot MTP path may not be reached)" + exit 1 +fi + +# 3. End-to-end MTP smoke (optional; gated on the unsloth MTP GGUF being present). +if [ "$SKIP_SMOKE" = "1" ]; then + echo "[cuda-mtp-validate] SKIP_SMOKE=1: skipping llama-cli MTP run." +elif [ ! -f "$MTP_GGUF" ]; then + echo "[cuda-mtp-validate] SKIP smoke: $MTP_GGUF missing" + echo " (pull from huggingface.co/unsloth/Qwen3.5-2B-MTP-GGUF)" +else + echo "[cuda-mtp-validate] llama-cli MTP smoke ($MTP_GGUF)" + timeout 120 "./$BUILD_DIR/bin/llama-cli" \ + -m "$MTP_GGUF" \ + -p "The capital of France is" \ + -n 32 \ + --temp 0 \ + -c 512 \ + -t 4 \ + -ngl 99 \ + --spec-type draft-mtp \ + --spec-draft-n-max 2 \ + 2>&1 | tee "$LOG_DIR/cuda-mtp-smoke.log" + + # Smoke succeeds when llama-cli prints either a generation block or the + # speculative-decoding stats line. Don't require exact text — content + # depends on model weights. + if ! grep -qE "Generation:|generated [0-9]+ tokens|^The capital of France" "$LOG_DIR/cuda-mtp-smoke.log"; then + echo "::error::llama-cli MTP smoke produced no recognisable output" + exit 1 + fi +fi + +echo "[cuda-mtp-validate] CUDA MTP validation: PASS" diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b95b75c91..48672110c 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6862,8 +6862,11 @@ struct test_istft : public test_case { ggml_tensor * build_graph(ggml_context * ctx) override { const int64_t F = n_fft / 2 + 1; + // ggml_istft expects mag_phase with ne[0]=2 (mag/phase), ne[1]=F, + // ne[2]=T (frames) — see ggml.c:5105. ggml_new_tensor_3d takes args + // in (ne0, ne1, ne2) order. ggml_tensor * mag_phase = ggml_new_tensor_3d( - ctx, GGML_TYPE_F32, (int64_t) n_frames, F, 2); + ctx, GGML_TYPE_F32, 2, F, (int64_t) n_frames); ggml_set_name(mag_phase, "mag_phase"); ggml_tensor * window = nullptr; diff --git a/tools/kokoro/include/kokoro-predictor.h b/tools/kokoro/include/kokoro-predictor.h index 1d20e2b10..efea20d07 100644 --- a/tools/kokoro/include/kokoro-predictor.h +++ b/tools/kokoro/include/kokoro-predictor.h @@ -23,6 +23,7 @@ #include "kokoro-layers.h" #include #include +#include #include namespace eliza_kokoro { diff --git a/tools/kokoro/src/kokoro-istft.cpp b/tools/kokoro/src/kokoro-istft.cpp index d688154aa..c6c34cea2 100644 --- a/tools/kokoro/src/kokoro-istft.cpp +++ b/tools/kokoro/src/kokoro-istft.cpp @@ -14,11 +14,17 @@ namespace eliza_kokoro { namespace { +// MSVC does not define M_PI by default (it's a POSIX/GNU extension behind +// _USE_MATH_DEFINES). Declare a local constant so the Windows-MSVC builds +// compile without depending on the math.h extension. Mirrors the same +// pattern used in ggml/src/ggml-cpu/ops.cpp for the ISTFT op. +static constexpr double K_PI = 3.14159265358979323846; + // Build a periodic Hann window of length N. Matches numpy.hanning's // "symmetric=False" convention used by librosa.istft. static std::vector hann_window(int n) { std::vector w((size_t) n); - const double scale = 2.0 * M_PI / (double) n; + const double scale = 2.0 * K_PI / (double) n; for (int i = 0; i < n; ++i) { w[(size_t) i] = (float) (0.5 - 0.5 * std::cos(scale * (double) i)); } @@ -48,7 +54,7 @@ static void irdft_frame( acc += sign * re[F - 1]; } for (int f = 1; f < F - ((n_fft & 1) == 0 ? 1 : 0); ++f) { - const double angle = 2.0 * M_PI * (double) f * (double) t * inv_n; + const double angle = 2.0 * K_PI * (double) f * (double) t * inv_n; acc += 2.0 * (re[f] * std::cos(angle) - im[f] * std::sin(angle)); } out[t] = (float) (acc * inv_n); diff --git a/tools/kokoro/src/kokoro.cpp b/tools/kokoro/src/kokoro.cpp index 93c15594e..829840231 100644 --- a/tools/kokoro/src/kokoro.cpp +++ b/tools/kokoro/src/kokoro.cpp @@ -534,7 +534,9 @@ kokoro_status kokoro_synthesize( const int n_out = (n_frames - 1) * hop_length + win_length; // Build a tiny graph: mag_phase_tensor → ggml_istft → pcm_tensor. - // mag_phase_tensor shape: ne[0]=n_frames (T), ne[1]=F, ne[2]=2. + // mag_phase_tensor shape: ne[0]=2 (mag/phase), ne[1]=F, ne[2]=T. + // See ggml.h ggml_istft contract: src0 is [2, F, T] channel-first + // interleaved. Element [ch, f, t] sits at offset t*(2*F) + f*2 + ch. bool used_native_op = false; { ggml_init_params ip = { @@ -544,8 +546,8 @@ kokoro_status kokoro_synthesize( }; ggml_context * gctx = ggml_init(ip); if (gctx) { - const int64_t ne_mp[4] = { (int64_t) n_frames, (int64_t) F, 2, 1 }; - ggml_tensor * mp = ggml_new_tensor(gctx, GGML_TYPE_F32, 4, ne_mp); + ggml_tensor * mp = ggml_new_tensor_3d( + gctx, GGML_TYPE_F32, 2, (int64_t) F, (int64_t) n_frames); ggml_tensor * pcm = ggml_istft(gctx, mp, /*window=*/nullptr, n_fft, hop_length, win_length); ggml_cgraph * gf = ggml_new_graph_custom(gctx, 64, false); @@ -555,15 +557,16 @@ kokoro_status kokoro_synthesize( ggml_backend_get_default_buffer_type(model->backend)); if (alloc && ggml_gallocr_alloc_graph(alloc, gf)) { - // Pack mag/phase into the [T, F, 2] tensor. - // mag goes into channel 0 (base offset 0), phase into - // channel 1 (base offset F*T floats). - const size_t ch_stride = (size_t) F * (size_t) n_frames; - std::vector mp_data(2 * ch_stride); - for (int f = 0; f < F; ++f) { - for (int t = 0; t < n_frames; ++t) { - mp_data[ f * n_frames + t] = mag [(size_t)(f * n_frames + t)]; - mp_data[ch_stride + f * n_frames + t] = phase[(size_t)(f * n_frames + t)]; + // Pack mag/phase into the [2, F, T] tensor. + // mag is channel 0, phase is channel 1. Source arrays are + // laid out as mag/phase[f * n_frames + t]. + std::vector mp_data((size_t) 2 * (size_t) F * (size_t) n_frames); + for (int t = 0; t < n_frames; ++t) { + for (int f = 0; f < F; ++f) { + const size_t src = (size_t)(f * n_frames + t); + const size_t base = (size_t) t * (size_t)(2 * F) + (size_t) f * 2; + mp_data[base + 0] = mag [src]; + mp_data[base + 1] = phase[src]; } } ggml_backend_tensor_set(mp, mp_data.data(), 0, @@ -604,6 +607,9 @@ const kokoro_hparams * kokoro_get_hparams(const kokoro_model * model) noexcept { // trained tensors by name from the loader-owned ggml_context. Keeping // this internal-by-convention (not in kokoro.h) preserves the public // surface while giving the sibling TUs a stable handle. +// Forward-declare here so -Wmissing-declarations sees a prior declaration +// at the definition site (the matching extern lives in the sibling TUs). +ggml_context * kokoro_model_ggml_ctx(const kokoro_model * model); ggml_context * kokoro_model_ggml_ctx(const kokoro_model * model) { return model ? model->ctx : nullptr; } diff --git a/tools/omnivoice/CMakeLists.txt b/tools/omnivoice/CMakeLists.txt index 6e2da9826..8ccfed072 100644 --- a/tools/omnivoice/CMakeLists.txt +++ b/tools/omnivoice/CMakeLists.txt @@ -140,48 +140,48 @@ endif() # so the same `dlopen()`-ed handle serves text inference + speech # synthesis + ASR + VAD. # -# Built when LLAMA_BUILD_OMNIVOICE=ON (always, when the merged tree is -# enabled). The pre-W3-3 build script (`build-llama-cpp-dflash.mjs`) used -# to graft + generate this via `omnivoice-fuse/prepare.mjs`; the merged -# path produces it directly from in-fork source. +# Built when LLAMA_BUILD_OMNIVOICE=ON AND the mtmd target exists. The FFI +# wraps mtmd_* for the ASR (audio→text) pipeline, so it cannot be built +# without it. iOS XCFramework configurations (LLAMA_BUILD_TOOLS=OFF, +# LLAMA_BUILD_MTMD=OFF) skip this target — the Eliza iOS runtime loads +# llama / omnivoice without ASR until a build with mtmd is enabled. # # Library file name: `libelizainference.{so,dylib,dll}` (no version # suffix on Linux to keep dlopen() happy on the runtime side). Default # visibility is `default` (not hidden) because every `eliza_inference_*` # entry the JS loader needs is reachable through dlsym. -add_library(elizainference SHARED - ${OMNIVOICE_CORE_SOURCES} - ${OMNIVOICE_FFI_SOURCES}) -target_compile_features(elizainference PUBLIC cxx_std_17) -target_compile_definitions(elizainference - PRIVATE OMNIVOICE_BUILD ${OMNIVOICE_COMPILE_DEFINITIONS}) -target_include_directories(elizainference PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${CMAKE_CURRENT_SOURCE_DIR}/include - ${CMAKE_CURRENT_BINARY_DIR} - ${OMNIVOICE_LLAMA_PRIVATE_INCLUDE_DIRS}) -target_include_directories(elizainference PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/../mtmd - ${CMAKE_CURRENT_SOURCE_DIR}/../../include) -# Link llama + mtmd. The transitive link pulls ggml + the parent -# build's backend selection in; we never call ggml_backend_*_init() -# from here directly (patch 0003 — backend_init_auto() consumes the -# shared pair or constructs its own as needed). -target_link_libraries(elizainference PUBLIC llama) if(TARGET mtmd) - target_link_libraries(elizainference PUBLIC mtmd) -endif() -set_target_properties(elizainference PROPERTIES - OUTPUT_NAME elizainference - POSITION_INDEPENDENT_CODE ON) -# Apple re-exports llama symbols so the fused .dylib carries everything -# JS will dlopen() without a second handle. Mirrors the legacy fork-root -# graft block. -if(APPLE) - target_link_options(elizainference PRIVATE - "LINKER:-reexport_library,$") + add_library(elizainference SHARED + ${OMNIVOICE_CORE_SOURCES} + ${OMNIVOICE_FFI_SOURCES}) + target_compile_features(elizainference PUBLIC cxx_std_17) + target_compile_definitions(elizainference + PRIVATE OMNIVOICE_BUILD ${OMNIVOICE_COMPILE_DEFINITIONS}) + target_include_directories(elizainference PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_BINARY_DIR} + ${OMNIVOICE_LLAMA_PRIVATE_INCLUDE_DIRS}) + target_include_directories(elizainference PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../mtmd + ${CMAKE_CURRENT_SOURCE_DIR}/../../include) + # Link llama + mtmd. The transitive link pulls ggml + the parent + # build's backend selection in; we never call ggml_backend_*_init() + # from here directly (patch 0003 — backend_init_auto() consumes the + # shared pair or constructs its own as needed). + target_link_libraries(elizainference PUBLIC llama mtmd) + set_target_properties(elizainference PROPERTIES + OUTPUT_NAME elizainference + POSITION_INDEPENDENT_CODE ON) + # Apple re-exports llama symbols so the fused .dylib carries everything + # JS will dlopen() without a second handle. Mirrors the legacy fork-root + # graft block. + if(APPLE) + target_link_options(elizainference PRIVATE + "LINKER:-reexport_library,$") + endif() + add_dependencies(elizainference omnivoice-version) endif() -add_dependencies(elizainference omnivoice-version) # llama-server wiring — mount the OmniVoice POST /v1/audio/speech route # onto the same process that serves /v1/chat/completions + the DFlash @@ -249,7 +249,19 @@ target_compile_features(omnivoice-dac-parity PRIVATE cxx_std_17) target_include_directories(omnivoice-dac-parity PRIVATE ${CMAKE_SOURCE_DIR}/ggml/include ${CMAKE_SOURCE_DIR}/ggml/src) -target_link_libraries(omnivoice-dac-parity PRIVATE ggml ggml-cpu ggml-base) +# Link ggml-cpu / ggml-base only when those CMake targets actually exist. +# On some cross-compile configurations (Android NDK in particular) the +# ggml-cpu target may not be defined, and naming it unconditionally in +# target_link_libraries makes CMake fall back to a literal `-lggml-cpu` +# link flag against a phantom library. Guard the link the same way +# tools/kokoro/CMakeLists.txt does for kokoro_lib. +target_link_libraries(omnivoice-dac-parity PRIVATE ggml) +if(TARGET ggml-base) + target_link_libraries(omnivoice-dac-parity PRIVATE ggml-base) +endif() +if(TARGET ggml-cpu) + target_link_libraries(omnivoice-dac-parity PRIVATE ggml-cpu) +endif() if(BUILD_TESTING) add_test(NAME omnivoice-dac-parity COMMAND omnivoice-dac-parity --no-real) diff --git a/tools/omnivoice/tests/dac-parity.cpp b/tools/omnivoice/tests/dac-parity.cpp index eddabe5bf..1b38d6396 100644 --- a/tools/omnivoice/tests/dac-parity.cpp +++ b/tools/omnivoice/tests/dac-parity.cpp @@ -155,7 +155,14 @@ static void new_dac_conv_t1d_ggml( ggml_cgraph * gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, y); - ggml_graph_compute_with_ctx(ctx, gf, 4); + // ggml_graph_compute_with_ctx was removed upstream — use the ggml-backend + // CPU path. Tensors live in the user-managed ctx buffer, which the CPU + // backend can compute against directly. + ggml_backend_t backend = ggml_backend_cpu_init(); + GGML_ASSERT(backend); + ggml_backend_cpu_set_n_threads(backend, 4); + ggml_backend_graph_compute(backend, gf); + ggml_backend_free(backend); T_out_final = (int) y->ne[0]; GGML_ASSERT((int) y->ne[1] == c.OC); diff --git a/ty.toml b/ty.toml index ae3cd1c54..927b8c927 100644 --- a/ty.toml +++ b/ty.toml @@ -19,6 +19,12 @@ include = [ # type-checker can't resolve it. It runs only against an installed # kokoro env, not under CI. "./tools/kokoro/tools/**", + # conversion/* defines mixins (e.g. _Qwen35MtpMixin) whose super() + # chains resolve to attributes provided by the composed Model + # subclasses in convert_hf_to_gguf.py. ty cannot see that + # composition at the mixin level, so the super() calls and + # base-class attributes (ftype, metadata) appear unresolved. + "./conversion/**", ] [overrides.rules] diff --git a/vendor/sheredom/subprocess.h b/vendor/sheredom/subprocess.h index f6f93dfec..3e40bae04 100644 --- a/vendor/sheredom/subprocess.h +++ b/vendor/sheredom/subprocess.h @@ -1051,11 +1051,6 @@ int subprocess_terminate(struct subprocess_s *const process) { return success_terminate; #else int result; - - if (process->child <= 0) { - return -1; - } - result = kill(process->child, 9); return result; #endif