diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index e2b0b87..c9eceab 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -26,10 +26,10 @@ jobs: args: "--bundles appimage,deb,rpm" target: "aarch64-unknown-linux-gnu" - platform: "windows-latest" - args: "" + args: "--bundles nsis" target: "x86_64-pc-windows-msvc" - platform: "windows-11-arm" # for ARM64 Windows runner - args: "--target aarch64-pc-windows-msvc" + args: "--target aarch64-pc-windows-msvc --bundles nsis" target: "aarch64-pc-windows-msvc" uses: ./.github/workflows/build.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8a1a9a8..7a0486d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -51,7 +51,7 @@ jobs: runs-on: ${{ inputs.platform }} steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: repository: ${{ inputs.repository }} ref: ${{ inputs.ref }} @@ -131,12 +131,42 @@ jobs: dpkg-query -W -f='${Status}\n' libgtk-layer-shell0 | grep -q "install ok installed" ldconfig -p | grep -q "libgtk-layer-shell.so.0" - - name: Install Vulkan SDK (Windows x64) + - name: Cache Vulkan SDK for Windows x64 if: contains(inputs.platform, 'windows') && !contains(inputs.target, 'aarch64') - uses: humbletim/install-vulkan-sdk@v1.2 + id: cache-vulkan-sdk-windows-x64 + uses: actions/cache@v5 with: - version: 1.4.309.0 - cache: true + path: ~/.cache/parrot/vulkan-sdk-windows-x64 + key: ${{ runner.os }}-${{ runner.arch }}-vulkan-sdk-1.4.309.0 + + - name: Prepare Vulkan SDK for Windows x64 + if: contains(inputs.platform, 'windows') && !contains(inputs.target, 'aarch64') + shell: pwsh + env: + VULKAN_SDK_VERSION: "1.4.309.0" + run: | + $sdkCacheRoot = Join-Path $env:USERPROFILE ".cache\parrot\vulkan-sdk-windows-x64" + $sdkRoot = Join-Path $sdkCacheRoot $env:VULKAN_SDK_VERSION + $sdkBin = Join-Path $sdkRoot "Bin" + $sdkTool = Join-Path $sdkBin 
"glslangValidator.exe" + $sdkArchive = Join-Path $env:RUNNER_TEMP "vulkan_sdk_windows_x64.exe" + $sdkUrl = "https://sdk.lunarg.com/sdk/download/$env:VULKAN_SDK_VERSION/windows/VulkanSDK-$env:VULKAN_SDK_VERSION-Installer.exe?Human=true" + + if (!(Test-Path $sdkTool)) { + Remove-Item -Recurse -Force $sdkRoot -ErrorAction SilentlyContinue + New-Item -ItemType Directory -Force -Path $sdkRoot | Out-Null + Invoke-WebRequest -Uri $sdkUrl -OutFile $sdkArchive + $sevenZip = (Get-Command 7z.exe -ErrorAction Stop).Source + & $sevenZip x $sdkArchive "-o$sdkRoot" -aoa | Out-Null + } + + if (!(Test-Path $sdkTool)) { + throw "glslangValidator.exe was not found under $sdkBin after extracting the Vulkan SDK" + } + + Add-Content -Path $env:GITHUB_ENV -Value "VULKAN_SDK=$sdkRoot" + Add-Content -Path $env:GITHUB_PATH -Value $sdkBin + & $sdkTool --version # humbletim/install-vulkan-sdk@v1.2 cannot target Windows ARM64 yet. # Download prebuilt binaries (Bin) + build headers/libs from source. @@ -171,13 +201,36 @@ jobs: Write-Host "Verifying glslc..." 
& (Join-Path $binPath "glslc.exe") --version - - name: Build Vulkan SDK headers and libs (Windows ARM64) + - name: Cache Vulkan SDK components (Windows ARM64) if: contains(inputs.platform, 'windows') && contains(inputs.target, 'aarch64') + id: cache-vulkan-sdk-windows-arm64 + uses: actions/cache@v5 + with: + path: VULKAN_SDK + key: ${{ runner.os }}-${{ runner.arch }}-vulkan-sdk-arm64-1.4.309.0-headers-loader + + - name: Build Vulkan SDK headers and libs (Windows ARM64) + if: contains(inputs.platform, 'windows') && contains(inputs.target, 'aarch64') && steps.cache-vulkan-sdk-windows-arm64.outputs.cache-hit != 'true' uses: humbletim/setup-vulkan-sdk@v1.2.1 with: vulkan-query-version: 1.4.309.0 vulkan-components: Vulkan-Headers, Vulkan-Loader - vulkan-use-cache: true + vulkan-use-cache: false + + - name: Restore Vulkan SDK env (Windows ARM64) + if: contains(inputs.platform, 'windows') && contains(inputs.target, 'aarch64') && steps.cache-vulkan-sdk-windows-arm64.outputs.cache-hit == 'true' + shell: pwsh + run: | + $sdkDir = Join-Path $env:GITHUB_WORKSPACE "VULKAN_SDK" + $sdkBin = Join-Path $sdkDir "bin" + $sdkEnv = Join-Path $sdkDir "sdk.env" + + if (!(Test-Path $sdkEnv)) { + throw "sdk.env was not found in cached Windows ARM64 Vulkan SDK at $sdkDir" + } + + Add-Content -Path $env:GITHUB_ENV -Value "VULKAN_SDK=$sdkDir" + Add-Content -Path $env:GITHUB_PATH -Value $sdkBin - name: Install trusted-signing-cli if: contains(inputs.platform, 'windows') && inputs.sign-binaries @@ -192,12 +245,44 @@ jobs: sudo apt install vulkan-sdk -y sudo apt-get install -y mesa-vulkan-drivers - - name: Prepare Vulkan SDK for Ubuntu ARM64 + - name: Cache Vulkan SDK for Ubuntu ARM64 if: contains(inputs.platform, 'ubuntu') && contains(inputs.platform, 'arm') - uses: jakoch/install-vulkan-sdk-action@v1 + id: cache-vulkan-sdk-ubuntu-arm64 + uses: actions/cache@v5 with: - vulkan_version: 1.4.335.0 - cache: true + path: ~/.cache/parrot/vulkan-sdk-arm + key: ${{ runner.os }}-${{ runner.arch }}-vulkan-sdk-1.4.335.0-${{ 
inputs.platform }} + + - name: Prepare Vulkan SDK for Ubuntu ARM64 + if: contains(inputs.platform, 'ubuntu') && contains(inputs.platform, 'arm') + shell: bash + env: + VULKAN_SDK_VERSION: "1.4.335.0" + VULKAN_SDK_ASSET: ${{ contains(inputs.platform, '24.04') && 'vulkansdk-ubuntu-24.04-arm-1.4.335.0.tar.xz' || 'vulkansdk-ubuntu-22.04-arm-1.4.335.0.tar.xz' }} + run: | + set -euo pipefail + + SDK_CACHE_ROOT="$HOME/.cache/parrot/vulkan-sdk-arm" + SDK_ROOT="$SDK_CACHE_ROOT/$VULKAN_SDK_VERSION/aarch64" + SDK_ARCHIVE="$RUNNER_TEMP/$VULKAN_SDK_ASSET" + SDK_DOWNLOAD_URL="https://github.com/jakoch/vulkan-sdk-arm/releases/download/$VULKAN_SDK_VERSION/$VULKAN_SDK_ASSET" + + mkdir -p "$SDK_CACHE_ROOT" + + if [ ! -x "$SDK_ROOT/bin/glslc" ]; then + rm -rf "$SDK_CACHE_ROOT/$VULKAN_SDK_VERSION" + curl -fsSL "$SDK_DOWNLOAD_URL" -o "$SDK_ARCHIVE" + tar -xJf "$SDK_ARCHIVE" -C "$SDK_CACHE_ROOT" + fi + + test -x "$SDK_ROOT/bin/glslc" + "$SDK_ROOT/bin/glslc" --version + + { + echo "VULKAN_SDK=$SDK_ROOT" + echo "VK_LAYER_PATH=$SDK_ROOT/share/vulkan/explicit_layer.d" + echo "LD_LIBRARY_PATH=$SDK_ROOT/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" + } >> "$GITHUB_ENV" - name: Install Vulkan runtime libraries (Ubuntu ARM64) if: contains(inputs.platform, 'ubuntu') && contains(inputs.platform, 'arm') @@ -414,6 +499,91 @@ jobs: } Write-Host "Bundled espeak-ng from: $($bin.FullName)" + - name: Bundle ONNX Runtime shared library (Unix) + if: contains(inputs.platform, 'macos') || contains(inputs.platform, 'ubuntu') + shell: bash + run: | + set -euo pipefail + + ORT_VERSION="1.23.1" + case "${{ inputs.target }}" in + x86_64-apple-darwin) + ASSET="onnxruntime-osx-x86_64-${ORT_VERSION}.tgz" + LIB_GLOB='libonnxruntime*.dylib' + ;; + aarch64-apple-darwin) + ASSET="onnxruntime-osx-arm64-${ORT_VERSION}.tgz" + LIB_GLOB='libonnxruntime*.dylib' + ;; + x86_64-unknown-linux-gnu) + ASSET="onnxruntime-linux-x64-${ORT_VERSION}.tgz" + LIB_GLOB='libonnxruntime.so*' + ;; + aarch64-unknown-linux-gnu) + 
ASSET="onnxruntime-linux-aarch64-${ORT_VERSION}.tgz" + LIB_GLOB='libonnxruntime.so*' + ;; + *) + echo "Unsupported target for ONNX Runtime bundling: ${{ inputs.target }}" + exit 1 + ;; + esac + + TMP_DIR="$(mktemp -d)" + ARCHIVE_PATH="${TMP_DIR}/${ASSET}" + DEST="src-tauri/resources/onnxruntime" + mkdir -p "$DEST" + + curl -fsSL "https://github.com/microsoft/onnxruntime/releases/download/v${ORT_VERSION}/${ASSET}" -o "$ARCHIVE_PATH" + tar -xzf "$ARCHIVE_PATH" -C "$TMP_DIR" + + LIB_DIR="$(find "$TMP_DIR" -type d \( -name lib -o -name lib64 \) | head -n 1)" + if [ -z "$LIB_DIR" ]; then + echo "Unable to find ONNX Runtime library directory in ${ASSET}" + exit 1 + fi + + find "$LIB_DIR" -maxdepth 1 -name "$LIB_GLOB" -exec cp -L {} "$DEST/" \; + if ! find "$DEST" -maxdepth 1 \( -type f -o -type l \) | grep -q .; then + echo "No ONNX Runtime libraries were copied for target ${{ inputs.target }}" + exit 1 + fi + + echo "Bundled ONNX Runtime files:" + ls -la "$DEST" + + - name: Bundle ONNX Runtime shared library (Windows) + if: contains(inputs.platform, 'windows') + shell: pwsh + run: | + $ErrorActionPreference = "Stop" + $ortVersion = "1.23.1" + if ("${{ inputs.target }}" -like "*aarch64*") { + $asset = "onnxruntime-win-arm64-$ortVersion.zip" + } else { + $asset = "onnxruntime-win-x64-$ortVersion.zip" + } + + $tmpDir = Join-Path $env:RUNNER_TEMP "onnxruntime" + Remove-Item -Recurse -Force $tmpDir -ErrorAction SilentlyContinue + New-Item -ItemType Directory -Force -Path $tmpDir | Out-Null + + $archivePath = Join-Path $tmpDir $asset + Invoke-WebRequest -Uri "https://github.com/microsoft/onnxruntime/releases/download/v$ortVersion/$asset" -OutFile $archivePath + Expand-Archive -LiteralPath $archivePath -DestinationPath $tmpDir -Force + + $dll = Get-ChildItem -Path $tmpDir -Recurse -Filter "onnxruntime.dll" | Select-Object -First 1 + if (-not $dll) { + throw "onnxruntime.dll not found in $asset" + } + + $dest = "src-tauri\resources\onnxruntime" + New-Item -ItemType Directory 
-Force -Path $dest | Out-Null + Get-ChildItem -Path $dll.DirectoryName -Filter "onnxruntime*.dll" | ForEach-Object { + Copy-Item $_.FullName $dest + Write-Host "Bundled ONNX Runtime DLL: $($_.Name)" + } + - name: Build with Tauri uses: tauri-apps/tauri-action@v0 env: @@ -441,7 +611,7 @@ jobs: - name: Upload artifacts (macOS) if: inputs.upload-artifacts && contains(inputs.platform, 'macos') - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: ${{ inputs.asset-prefix }}-${{ inputs.target }} path: | @@ -508,7 +678,7 @@ jobs: - name: Upload artifacts (Linux) if: inputs.upload-artifacts && contains(inputs.platform, 'ubuntu') - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: ${{ inputs.asset-prefix }}-${{ inputs.platform }}-${{ inputs.target }} path: | @@ -519,7 +689,7 @@ jobs: - name: Upload artifacts (Windows) if: inputs.upload-artifacts && contains(inputs.platform, 'windows') - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: ${{ inputs.asset-prefix }}-${{ inputs.target }} # Default Windows builds place bundles under release/, but cross-compiles (ARM64) nest under target//release. 
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 032b81a..825c8ae 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -5,7 +5,7 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: oven-sh/setup-bun@v1 with: diff --git a/.github/workflows/playwright.yml b/.github/workflows/playwright.yml index 02714e2..8c5bd4c 100644 --- a/.github/workflows/playwright.yml +++ b/.github/workflows/playwright.yml @@ -5,7 +5,7 @@ jobs: playwright: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: oven-sh/setup-bun@v1 with: @@ -22,7 +22,7 @@ jobs: - name: Upload test results if: failure() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: playwright-report path: playwright-report/ diff --git a/.github/workflows/pr-test-build.yml b/.github/workflows/pr-test-build.yml index 4a94b7a..23685f0 100644 --- a/.github/workflows/pr-test-build.yml +++ b/.github/workflows/pr-test-build.yml @@ -32,10 +32,10 @@ jobs: args: "--bundles appimage,deb,rpm" target: "aarch64-unknown-linux-gnu" - platform: "windows-latest" - args: "" + args: "--bundles nsis" target: "x86_64-pc-windows-msvc" - platform: "windows-11-arm" - args: "--target aarch64-pc-windows-msvc" + args: "--target aarch64-pc-windows-msvc --bundles nsis" target: "aarch64-pc-windows-msvc" uses: ./.github/workflows/build.yml @@ -57,7 +57,7 @@ jobs: pull-requests: write steps: - name: Post artifact links to PR - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; diff --git a/.github/workflows/prettier.yml b/.github/workflows/prettier.yml index ab39b48..d57f835 100644 --- a/.github/workflows/prettier.yml +++ b/.github/workflows/prettier.yml @@ -5,7 +5,7 @@ jobs: prettier: runs-on: ubuntu-latest steps: - - uses: 
actions/checkout@v4 + - uses: actions/checkout@v5 - uses: oven-sh/setup-bun@v1 with: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 658ec9d..81c1194 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -12,7 +12,7 @@ jobs: version: ${{ steps.get-version.outputs.version }} steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Get version from tauri.conf.json id: get-version @@ -24,7 +24,7 @@ jobs: - name: Create Draft Release id: create-release - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | const { data } = await github.rest.repos.createRelease({ @@ -56,10 +56,10 @@ jobs: args: "--bundles appimage,deb,rpm" target: "aarch64-unknown-linux-gnu" - platform: "windows-latest" - args: "" + args: "--bundles nsis" target: "x86_64-pc-windows-msvc" - platform: "windows-11-arm" # for ARM64 Windows runner - args: "--target aarch64-pc-windows-msvc" + args: "--target aarch64-pc-windows-msvc --bundles nsis" target: "aarch64-pc-windows-msvc" uses: ./.github/workflows/build.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6a3f852..8984b64 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,7 @@ jobs: rust-tests: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Install system dependencies run: | diff --git a/.prettierignore b/.prettierignore index aa64a9d..a4ac262 100644 --- a/.prettierignore +++ b/.prettierignore @@ -18,3 +18,6 @@ src/bindings.ts # Misc .DS_Store *.log + +# Vendored cargo metadata +vendor/**/.cargo_vcs_info.json \ No newline at end of file diff --git a/docs/windows-tts-process-test-plan.md b/docs/windows-tts-process-test-plan.md new file mode 100644 index 0000000..de0384b --- /dev/null +++ b/docs/windows-tts-process-test-plan.md @@ -0,0 +1,42 @@ +# Windows TTS Process Execution Test Plan + +## Root cause covered by 
this change + +Parrot's Kokoro TTS path uses `tts-rs`, which launches `espeak-ng.exe` as a +child process for phonemization. On Windows, `espeak-ng.exe` is a console +subsystem binary. If it is spawned from the background Tauri app without +`CREATE_NO_WINDOW`, Windows may create a visible console window for each child +process invocation. + +## What this patch changes + +- Parrot now pins `tts-rs` via a local `[patch.crates-io]` override. +- The vendored `tts-rs` phonemizer sets `CREATE_NO_WINDOW` for Windows + `espeak-ng` child processes. +- macOS and Linux behavior is unchanged. + +## Manual verification on a real Windows machine + +1. Build and install Parrot with this patch on Windows. +2. Launch Parrot normally from the Start menu. +3. Trigger TTS once on a short selection. + Expected: speech is generated and no `cmd.exe` or console window appears. +4. Trigger TTS repeatedly 20-30 times in a row. + Expected: no visible console windows appear over time and focus does not + leave the active app. +5. Trigger TTS on a long selection that is chunked into multiple synthesis + requests. + Expected: no console windows appear while chunks are processed. +6. Leave Parrot running for at least 15 minutes, then trigger TTS again. + Expected: first request after idle still produces no visible console window. +7. Hide Parrot to the tray and trigger TTS from the tray-driven workflow. + Expected: behavior matches a normal launch, with no visible console windows. +8. While TTS is active, keep typing in another app. + Expected: no focus stealing and no interruption from background processes. + +## Optional observability checks + +- Use Process Explorer or Process Monitor to confirm `espeak-ng.exe` is created + as a background child of Parrot without a visible console window. +- If any window still appears, capture the exact process name so the remaining + spawn path can be isolated. 
diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index 895e4ba..73c134c 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -399,12 +399,6 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" -[[package]] -name = "base64ct" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba" - [[package]] name = "bincode" version = "2.0.1" @@ -1255,16 +1249,6 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c87e182de0887fd5361989c677c4e8f5000cd9491d6d563161a8f3a5519fc7f" -[[package]] -name = "der" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" -dependencies = [ - "pem-rfc7468", - "zeroize", -] - [[package]] name = "deranged" version = "0.5.5" @@ -2488,12 +2472,6 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -[[package]] -name = "hmac-sha256" -version = "1.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec9d92d097f4749b64e8cc33d924d9f40a2d4eb91402b458014b781f5733d60f" - [[package]] name = "hound" version = "3.5.1" @@ -3095,6 +3073,16 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "libloading" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" +dependencies = [ + "cfg-if", + "windows-link 0.2.1", +] + [[package]] name = "libm" version = "0.2.16" @@ -3165,12 +3153,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" -[[package]] -name = "lzma-rust2" -version = "0.15.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1670343e58806300d87950e3401e820b519b9384281bbabfb15e3636689ffd69" - [[package]] name = "mac" version = "0.1.1" @@ -4000,27 +3982,22 @@ dependencies = [ [[package]] name = "ort" -version = "2.0.0-rc.11" +version = "2.0.0-rc.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5df903c0d2c07b56950f1058104ab0c8557159f2741782223704de9be73c3c" +checksum = "d7de3af33d24a745ffb8fab904b13478438d1cd52868e6f17735ef6e1f8bf133" dependencies = [ + "libloading 0.9.0", "ndarray", "ort-sys", "smallvec", "tracing", - "ureq", ] [[package]] name = "ort-sys" -version = "2.0.0-rc.11" +version = "2.0.0-rc.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06503bb33f294c5f1ba484011e053bfa6ae227074bdb841e9863492dc5960d4b" -dependencies = [ - "hmac-sha256", - "lzma-rust2", - "ureq", -] +checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90" [[package]] name = "os_info" @@ -4131,6 +4108,7 @@ dependencies = [ "hound", "log", "once_cell", + "ort", "pulldown-cmark", "rdev 0.5.0-2", "reqwest", @@ -4185,15 +4163,6 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" -[[package]] -name = "pem-rfc7468" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" -dependencies = [ - "base64ct", -] - [[package]] name = "percent-encoding" version = "2.3.2" @@ -5254,15 +5223,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" 
-dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" version = "1.13.0" @@ -5686,17 +5646,6 @@ dependencies = [ "windows-sys 0.60.2", ] -[[package]] -name = "socks" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" -dependencies = [ - "byteorder", - "libc", - "winapi", -] - [[package]] name = "softbuffer" version = "0.4.6" @@ -7138,8 +7087,6 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tts-rs" version = "2026.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90d49f82f43fbcdf7ae79548e5224977a52437c8795b0ba64b814053d381934d" dependencies = [ "derive_builder", "env_logger", @@ -7259,37 +7206,6 @@ version = "0.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" -[[package]] -name = "ureq" -version = "3.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99ba1025f18a4a3fc3e9b48c868e9beb4f24f4b4b1a325bada26bd4119f46537" -dependencies = [ - "base64 0.22.1", - "der", - "log", - "native-tls", - "percent-encoding", - "rustls-pemfile", - "rustls-pki-types", - "socks", - "ureq-proto", - "utf-8", - "webpki-root-certs", -] - -[[package]] -name = "ureq-proto" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b4531c118335662134346048ddb0e54cc86bd7e81866757873055f0e38f5d2" -dependencies = [ - "base64 0.22.1", - "http", - "httparse", - "log", -] - [[package]] name = "url" version = "2.5.7" @@ -7729,15 +7645,6 @@ dependencies = [ "system-deps 6.2.2", ] -[[package]] -name = "webpki-root-certs" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d651ec480de84b762e7be71e6efa7461699c19d9e2c272c8d93455f567786e" -dependencies = [ - 
"rustls-pki-types", -] - [[package]] name = "webpki-roots" version = "1.0.3" @@ -8906,4 +8813,4 @@ dependencies = [ "serde", "syn 2.0.108", "winnow 0.7.13", -] +] \ No newline at end of file diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 4a735ff..8a72e64 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -65,6 +65,7 @@ rusqlite = { version = "0.37", features = ["bundled"] } tar = "0.4.44" flate2 = "1.0" tts-rs = { version = "2026.2.3", features = ["kokoro"] } +ort = { version = "2.0.0-rc.12", default-features = false, features = ["std", "load-dynamic"] } handy-keys = "0.2.0" ferrous-opencc = "0.2.3" clap = { version = "4", features = ["derive"] } @@ -99,6 +100,8 @@ tauri-nspanel = { git = "https://github.com/ahkohd/tauri-nspanel", branch = "v2. gtk-layer-shell = { version = "0.8", features = ["v0_6"] } gtk = "0.18" +[patch.crates-io] +tts-rs = { path = "../vendor/tts-rs" } [dev-dependencies] tempfile = "3" @@ -107,4 +110,4 @@ tempfile = "3" lto = true codegen-units = 1 strip = true -panic = "abort" +panic = "abort" \ No newline at end of file diff --git a/src-tauri/src/actions.rs b/src-tauri/src/actions.rs index ed9e46b..644a983 100644 --- a/src-tauri/src/actions.rs +++ b/src-tauri/src/actions.rs @@ -1,11 +1,11 @@ use crate::managers::tts::TTSManager; +use crate::selection::capture_selected_text; use crate::utils::show_processing_overlay; use log::{debug, info}; use once_cell::sync::Lazy; use std::collections::HashMap; use std::sync::Arc; use tauri::{AppHandle, Emitter, Manager}; -use tauri_plugin_clipboard_manager::ClipboardExt; const SHORTCUT_SETTLE_DELAY_MS: u64 = 40; @@ -60,185 +60,6 @@ impl ShortcutAction for TestAction { // Speak Action — reads selected text via macOS Accessibility API and speaks it with Kokoro TTS. struct SpeakAction; -/// Read the currently selected text using the macOS Accessibility API. -/// Does not touch the clipboard. Returns `None` when nothing is selected or -/// accessibility is unavailable. 
-#[cfg(target_os = "macos")] -fn get_selected_text() -> Option { - use std::ffi::{c_char, c_void, CStr}; - use std::ptr; - - type Ptr = *mut c_void; - const UTF8: u32 = 0x0800_0100; // kCFStringEncodingUTF8 - - #[link(name = "ApplicationServices", kind = "framework")] - extern "C" { - fn AXUIElementCreateSystemWide() -> Ptr; - fn AXUIElementCopyAttributeValue(element: Ptr, attribute: Ptr, value: *mut Ptr) -> i32; - } - - #[link(name = "CoreFoundation", kind = "framework")] - extern "C" { - fn CFRelease(cf: Ptr); - fn CFStringCreateWithBytes( - alloc: *const c_void, - bytes: *const u8, - num_bytes: i64, - encoding: u32, - is_external: bool, - ) -> Ptr; - fn CFStringGetLength(s: Ptr) -> i64; - fn CFStringGetCString(s: Ptr, buf: *mut c_char, buf_size: i64, encoding: u32) -> bool; - } - - unsafe fn cf_str(bytes: &[u8]) -> Ptr { - CFStringCreateWithBytes(ptr::null(), bytes.as_ptr(), bytes.len() as i64, UTF8, false) - } - - unsafe fn cf_to_string(ptr: Ptr) -> Option { - if ptr.is_null() { - return None; - } - let len = CFStringGetLength(ptr); - let buf_size = len * 4 + 1; // worst-case UTF-8 bytes + NUL - let mut buf = vec![0u8; buf_size as usize]; - let ok = CFStringGetCString(ptr, buf.as_mut_ptr() as *mut c_char, buf_size, UTF8); - CFRelease(ptr); - if !ok { - return None; - } - CStr::from_ptr(buf.as_ptr() as *const c_char) - .to_str() - .ok() - .map(str::to_owned) - } - - unsafe { - let system = AXUIElementCreateSystemWide(); - if system.is_null() { - return None; - } - - let focused_attr = cf_str(b"AXFocusedUIElement"); - let mut focused: Ptr = ptr::null_mut(); - let err = AXUIElementCopyAttributeValue(system, focused_attr, &mut focused); - CFRelease(focused_attr); - CFRelease(system); - if err != 0 || focused.is_null() { - return None; - } - - let text_attr = cf_str(b"AXSelectedText"); - let mut value: Ptr = ptr::null_mut(); - let err = AXUIElementCopyAttributeValue(focused, text_attr, &mut value); - CFRelease(text_attr); - CFRelease(focused); - if err != 0 || 
value.is_null() { - return None; - } - - cf_to_string(value).filter(|s| !s.trim().is_empty()) - } -} - -#[cfg(target_os = "macos")] -fn get_selected_text_with_fallback(app: &AppHandle) -> Option { - // Retry AX selection reads because some apps only expose selection once the - // shortcut state has settled. - for delay_ms in [0_u64, 40, 90] { - if delay_ms > 0 { - std::thread::sleep(std::time::Duration::from_millis(delay_ms)); - } - if let Some(text) = get_selected_text() { - return Some(text); - } - } - - // Fallback: trigger Cmd+C and read clipboard while restoring original content. - let clipboard = app.clipboard(); - let previous_clipboard = clipboard.read_text().ok(); - let restore_clipboard = |value: Option| { - let restore_value = value.unwrap_or_default(); - let _ = clipboard.write_text(restore_value); - }; - - // Use a sentinel so we can reliably tell whether copy actually produced text. - let sentinel = format!( - "__PARROT_SELECTION_PROBE_{}__", - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .ok() - .map(|d| d.as_millis()) - .unwrap_or_default() - ); - let _ = clipboard.write_text(&sentinel); - - { - use crate::input::{send_copy_ctrl_c, EnigoState}; - let enigo_state = app.try_state::()?; - let mut enigo = enigo_state.0.lock().ok()?; - if send_copy_ctrl_c(&mut enigo).is_err() { - restore_clipboard(previous_clipboard); - return None; - } - } - - std::thread::sleep(std::time::Duration::from_millis(120)); - let copied_text = clipboard.read_text().ok(); - - restore_clipboard(previous_clipboard); - - let copied = copied_text?.trim().to_string(); - if copied.is_empty() || copied == sentinel { - None - } else { - Some(copied) - } -} - -#[cfg(not(target_os = "macos"))] -fn get_selected_text_with_fallback(app: &AppHandle) -> Option { - let clipboard = app.clipboard(); - let previous_clipboard = clipboard.read_text().ok(); - let restore_clipboard = |value: Option| { - let restore_value = value.unwrap_or_default(); - let _ = 
clipboard.write_text(restore_value); - }; - - // Use a sentinel so we can reliably tell whether copy actually produced text. - let sentinel = format!( - "__PARROT_SELECTION_PROBE_{}__", - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .ok() - .map(|d| d.as_millis()) - .unwrap_or_default() - ); - let _ = clipboard.write_text(&sentinel); - - { - use crate::input::{send_copy_ctrl_c, EnigoState}; - let enigo_state = app.try_state::()?; - let mut enigo = enigo_state.0.lock().ok()?; - if send_copy_ctrl_c(&mut enigo).is_err() { - restore_clipboard(previous_clipboard); - return None; - } - } - - std::thread::sleep(std::time::Duration::from_millis(120)); - let copied_text = clipboard.read_text().ok(); - - restore_clipboard(previous_clipboard); - - let copied = copied_text?.trim().to_string(); - if copied.is_empty() || copied == sentinel { - None - } else { - Some(copied) - } -} - impl ShortcutAction for SpeakAction { fn start(&self, app: &AppHandle, _binding_id: &str, _shortcut_str: &str) { let speech = Arc::clone(&app.state::>()); @@ -252,7 +73,7 @@ impl ShortcutAction for SpeakAction { std::thread::spawn(move || { std::thread::sleep(std::time::Duration::from_millis(SHORTCUT_SETTLE_DELAY_MS)); - match get_selected_text_with_fallback(&app_handle) { + match capture_selected_text(&app_handle) { Some(text) => { if !speech.is_request_active(request_id) { return; diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index 03ae132..ec60154 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -8,6 +8,7 @@ mod helpers; mod input; mod managers; mod overlay; +mod selection; mod settings; mod shortcut; mod signal_handle; @@ -91,6 +92,60 @@ fn resolve_bundled_espeak_ng( (bin_path, data_path) } +fn resolve_bundled_onnxruntime(app_handle: &AppHandle) -> Option { + let resolver = app_handle.path(); + let ort_dir = resolver + .resolve( + "resources/onnxruntime", + tauri::path::BaseDirectory::Resource, + ) + .ok() + .filter(|p| p.is_dir())?; + + 
#[cfg(target_os = "windows")] + let candidates = ["onnxruntime.dll"]; + #[cfg(target_os = "macos")] + let candidates = ["libonnxruntime.dylib", "libonnxruntime.1.23.1.dylib"]; + #[cfg(target_os = "linux")] + let candidates = ["libonnxruntime.so", "libonnxruntime.so.1.23.1"]; + + for candidate in candidates { + let path = ort_dir.join(candidate); + if path.exists() { + log::info!("Bundled ONNX Runtime library: {}", path.display()); + return Some(path); + } + } + + std::fs::read_dir(&ort_dir) + .ok()? + .filter_map(|entry| entry.ok().map(|entry| entry.path())) + .find(|path| { + path.is_file() + && path + .file_name() + .and_then(|name| name.to_str()) + .map(|name| { + #[cfg(target_os = "windows")] + { + name.eq_ignore_ascii_case("onnxruntime.dll") + } + #[cfg(target_os = "macos")] + { + name.starts_with("libonnxruntime") && name.ends_with(".dylib") + } + #[cfg(target_os = "linux")] + { + name.starts_with("libonnxruntime.so") + } + }) + .unwrap_or(false) + }) + .inspect(|path| { + log::info!("Bundled ONNX Runtime library: {}", path.display()); + }) +} + // Global atomic to store the file log level filter // We use u8 to store the log::LevelFilter as a number pub static FILE_LOG_LEVEL: AtomicU8 = AtomicU8::new(log::LevelFilter::Debug as u8); @@ -154,6 +209,7 @@ fn show_main_window(app: &AppHandle) { fn initialize_core_logic( app_handle: &AppHandle, espeak_paths: (Option, Option), + onnxruntime_path: Option, ) { // Note: Enigo (keyboard/mouse simulation) is NOT initialized here. 
// The frontend is responsible for calling the `initialize_enigo` command @@ -166,8 +222,13 @@ fn initialize_core_logic( let history_manager = Arc::new(HistoryManager::new(app_handle).expect("Failed to initialize history manager")); let speech_manager = Arc::new( - TTSManager::new(app_handle, model_manager.clone(), espeak_paths) - .expect("Failed to initialize speech manager"), + TTSManager::new( + app_handle, + model_manager.clone(), + espeak_paths, + onnxruntime_path, + ) + .expect("Failed to initialize speech manager"), ); // Add managers to Tauri's managed state @@ -315,6 +376,9 @@ pub fn run(cli_args: CliArgs) { shortcut::change_update_checks_setting, shortcut::change_keyboard_implementation_setting, shortcut::get_keyboard_implementation, + shortcut::change_selection_capture_method_setting, + shortcut::change_clipboard_handling_setting, + shortcut::change_model_unload_timeout_setting, shortcut::change_show_tray_icon_setting, shortcut::change_tts_workers_setting, shortcut::change_tts_speed_setting, @@ -442,7 +506,8 @@ pub fn run(cli_args: CliArgs) { app.manage(ActionCoordinator::new(app_handle.clone())); let espeak_paths = resolve_bundled_espeak_ng(&app_handle); - initialize_core_logic(&app_handle, espeak_paths); + let onnxruntime_path = resolve_bundled_onnxruntime(&app_handle); + initialize_core_logic(&app_handle, espeak_paths, onnxruntime_path); // Hide tray icon if --no-tray was passed if cli_args.no_tray { diff --git a/src-tauri/src/managers/tts.rs b/src-tauri/src/managers/tts.rs index 5718682..2ec08a6 100644 --- a/src-tauri/src/managers/tts.rs +++ b/src-tauri/src/managers/tts.rs @@ -8,7 +8,7 @@ use std::collections::BTreeMap; use std::num::NonZero; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, AtomicU64, AtomicU8, AtomicUsize, Ordering}; -use std::sync::{mpsc, Arc, Condvar, Mutex, TryLockError}; +use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, TryLockError}; use std::thread; use std::time::{Duration, Instant, SystemTime}; use 
tauri::path::BaseDirectory; @@ -32,6 +32,7 @@ const ENGINE_LOCK_POLL_INTERVAL: Duration = Duration::from_millis(2); /// Number of samples to crossfade between text-level chunks (10ms @ 24kHz). /// Matches the crossfade length used by tts-rs for sub-chunk blending. const CROSSFADE_SAMPLES: usize = 240; +static ORT_INIT_RESULT: OnceLock> = OnceLock::new(); #[derive(Clone, Debug, Serialize)] pub struct ModelStateEvent { @@ -83,6 +84,7 @@ pub struct TTSManager { shutdown_signal: Arc, espeak_ng_path: Option, espeak_ng_data_path: Option, + onnxruntime_path: Option, } impl Drop for TTSManager { @@ -96,6 +98,7 @@ impl TTSManager { app_handle: &AppHandle, model_manager: Arc, espeak_paths: (Option, Option), + onnxruntime_path: Option, ) -> Result { let engines = Arc::new( (0..MAX_PARALLEL_SYNTH_ENGINES) @@ -177,6 +180,7 @@ impl TTSManager { shutdown_signal, espeak_ng_path: espeak_paths.0, espeak_ng_data_path: espeak_paths.1, + onnxruntime_path, }) } @@ -239,6 +243,7 @@ impl TTSManager { let model_manager = Arc::clone(&self.model_manager); let espeak_ng_path = self.espeak_ng_path.clone(); let espeak_ng_data_path = self.espeak_ng_data_path.clone(); + let onnxruntime_path = self.onnxruntime_path.clone(); thread::spawn(move || { // Resolve human-readable name from ModelManager; fall back to ID if missing. 
@@ -247,6 +252,14 @@ impl TTSManager { .map(|info| info.name) .unwrap_or_else(|| MODEL_ID.to_string()); + if let Err(e) = ensure_onnxruntime_initialized(onnxruntime_path.as_ref()) { + error!("{}", e); + let _ = app_handle.emit("tts-error", e.clone()); + *is_loading_arc.lock().unwrap() = false; + condvar.notify_all(); + return; + } + let model_dir = match resolve_kokoro_model_dir(&app_handle) { Ok(dir) => dir, Err(e) => { @@ -1045,6 +1058,52 @@ impl TTSManager { } } +fn ensure_onnxruntime_initialized( + onnxruntime_path: Option<&PathBuf>, +) -> std::result::Result<(), String> { + ORT_INIT_RESULT + .get_or_init(|| { + let builder = match onnxruntime_path { + Some(path) => { + if !path.exists() { + return Err(format!( + "Bundled ONNX Runtime library not found at {}", + path.display() + )); + } + info!("Initializing ONNX Runtime from {}", path.display()); + ort::init_from(path).map_err(|e| { + format!( + "Failed to load bundled ONNX Runtime from {}: {}", + path.display(), + e + ) + })? + } + None => { + if let Some(path) = std::env::var_os("ORT_DYLIB_PATH") { + info!("Initializing ONNX Runtime from ORT_DYLIB_PATH={:?}", path); + } else { + info!( + "Bundled ONNX Runtime not found; falling back to system loader search" + ); + } + ort::init() + } + }; + + if builder.commit() { + Ok(()) + } else { + Err( + "ONNX Runtime was already initialized with a different configuration" + .to_string(), + ) + } + }) + .clone() +} + /// Runs on a dedicated thread. Receives `(chunk_index, duration_secs)` from the /// synthesis loop and emits `overlay-text` events timed to when each chunk /// actually starts playing, so the overlay shows the text being read aloud. 
diff --git a/src-tauri/src/selection.rs b/src-tauri/src/selection.rs new file mode 100644 index 0000000..c8035fa --- /dev/null +++ b/src-tauri/src/selection.rs @@ -0,0 +1,238 @@ +use crate::settings::{self, ClipboardHandling, SelectionCaptureMethod}; +use log::{debug, warn}; +use tauri::{AppHandle, Manager}; +use tauri_plugin_clipboard_manager::ClipboardExt; + +const ACCESSIBILITY_RETRY_DELAYS_MS: [u64; 3] = [0, 40, 90]; +const CLIPBOARD_COPY_DELAY_MS: u64 = 120; + +#[derive(Debug)] +enum ClipboardState { + /// Clipboard had readable text content + Text(String), + /// Clipboard had content but was unreadable (e.g., image, binary) + Unreadable, + /// Clipboard was empty + Empty, +} + +pub fn capture_selected_text(app: &AppHandle) -> Option { + let settings = settings::get_settings(app); + + match settings.selection_capture_method { + SelectionCaptureMethod::Auto => { + #[cfg(target_os = "macos")] + { + capture_via_accessibility() + .or_else(|| capture_via_clipboard(app, settings.clipboard_handling)) + } + + #[cfg(not(target_os = "macos"))] + { + capture_via_clipboard(app, settings.clipboard_handling) + } + } + SelectionCaptureMethod::Accessibility => { + #[cfg(target_os = "macos")] + { + capture_via_accessibility() + } + + #[cfg(not(target_os = "macos"))] + { + warn!("Accessibility capture is not supported on this platform; falling back to clipboard capture"); + capture_via_clipboard(app, settings.clipboard_handling) + } + } + SelectionCaptureMethod::Clipboard => { + capture_via_clipboard(app, settings.clipboard_handling) + } + } +} + +#[cfg(target_os = "macos")] +fn capture_via_accessibility() -> Option { + for delay_ms in ACCESSIBILITY_RETRY_DELAYS_MS { + if delay_ms > 0 { + std::thread::sleep(std::time::Duration::from_millis(delay_ms)); + } + if let Some(text) = get_selected_text() { + debug!("Captured selected text via Accessibility API"); + return Some(text); + } + } + + None +} + +#[cfg(target_os = "macos")] +fn get_selected_text() -> Option { + use 
std::ffi::{c_char, c_void, CStr}; + use std::ptr; + + type Ptr = *mut c_void; + const UTF8: u32 = 0x0800_0100; + + #[link(name = "ApplicationServices", kind = "framework")] + extern "C" { + fn AXUIElementCreateSystemWide() -> Ptr; + fn AXUIElementCopyAttributeValue(element: Ptr, attribute: Ptr, value: *mut Ptr) -> i32; + } + + #[link(name = "CoreFoundation", kind = "framework")] + extern "C" { + fn CFRelease(cf: Ptr); + fn CFStringCreateWithBytes( + alloc: *const c_void, + bytes: *const u8, + num_bytes: i64, + encoding: u32, + is_external: bool, + ) -> Ptr; + fn CFStringGetLength(s: Ptr) -> i64; + fn CFStringGetCString(s: Ptr, buf: *mut c_char, buf_size: i64, encoding: u32) -> bool; + fn CFStringGetMaximumSizeForEncoding(length: i64, encoding: u32) -> i64; + } + + unsafe fn cf_str(bytes: &[u8]) -> Ptr { + CFStringCreateWithBytes(ptr::null(), bytes.as_ptr(), bytes.len() as i64, UTF8, false) + } + + unsafe fn cf_to_string(ptr: Ptr) -> Option { + if ptr.is_null() { + return None; + } + let len = CFStringGetLength(ptr); + let buf_size = CFStringGetMaximumSizeForEncoding(len, UTF8) + 1; + let mut buf = vec![0u8; buf_size as usize]; + let ok = CFStringGetCString(ptr, buf.as_mut_ptr() as *mut c_char, buf_size, UTF8); + CFRelease(ptr); + if !ok { + return None; + } + CStr::from_ptr(buf.as_ptr() as *const c_char) + .to_str() + .ok() + .map(str::to_owned) + } + + unsafe { + let system = AXUIElementCreateSystemWide(); + if system.is_null() { + return None; + } + + let focused_attr = cf_str(b"AXFocusedUIElement"); + let mut focused: Ptr = ptr::null_mut(); + let err = AXUIElementCopyAttributeValue(system, focused_attr, &mut focused); + CFRelease(focused_attr); + CFRelease(system); + if err != 0 || focused.is_null() { + return None; + } + + let text_attr = cf_str(b"AXSelectedText"); + let mut value: Ptr = ptr::null_mut(); + let err = AXUIElementCopyAttributeValue(focused, text_attr, &mut value); + CFRelease(text_attr); + CFRelease(focused); + if err != 0 || value.is_null() { + 
return None; + } + + cf_to_string(value).filter(|s| !s.trim().is_empty()) + } +} + +fn capture_via_clipboard(app: &AppHandle, handling: ClipboardHandling) -> Option { + let clipboard = app.clipboard(); + let previous_clipboard = match clipboard.read_text() { + Ok(text) => ClipboardState::Text(text), + Err(_) => { + // Try to determine if clipboard has content but is unreadable (e.g., image) + // vs. truly empty. Since we can't reliably distinguish, assume unreadable. + ClipboardState::Unreadable + } + }; + let sentinel = format!( + "__PARROT_SELECTION_PROBE_{}__", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .ok() + .map(|d| d.as_millis()) + .unwrap_or_default() + ); + + if clipboard.write_text(&sentinel).is_err() { + warn!("Failed to prime clipboard before selection capture"); + return None; + } + + { + use crate::input::{send_copy_ctrl_c, EnigoState}; + let enigo_state = match app.try_state::() { + Some(state) => state, + None => { + restore_clipboard(&clipboard, &previous_clipboard); + return None; + } + }; + let mut enigo = match enigo_state.0.lock().ok() { + Some(enigo) => enigo, + None => { + restore_clipboard(&clipboard, &previous_clipboard); + return None; + } + }; + if let Err(err) = send_copy_ctrl_c(&mut enigo) { + debug!( + "Failed to send copy shortcut for selection capture: {}", + err + ); + restore_clipboard(&clipboard, &previous_clipboard); + return None; + } + } + + std::thread::sleep(std::time::Duration::from_millis(CLIPBOARD_COPY_DELAY_MS)); + + let copied_text = clipboard.read_text().ok(); + let captured = copied_text + .as_deref() + .map(str::trim) + .filter(|text| !text.is_empty() && *text != sentinel) + .map(str::to_owned); + + match handling { + ClipboardHandling::DontModify => restore_clipboard(&clipboard, &previous_clipboard), + ClipboardHandling::CopyToClipboard => { + if captured.is_none() { + restore_clipboard(&clipboard, &previous_clipboard); + } + } + } + + if captured.is_some() { + debug!("Captured 
selected text via clipboard copy"); + } + + captured +} + +fn restore_clipboard( + clipboard: &tauri_plugin_clipboard_manager::Clipboard, + previous_state: &ClipboardState, +) { + match previous_state { + ClipboardState::Text(text) => { + let _ = clipboard.write_text(text); + } + ClipboardState::Empty => { + let _ = clipboard.clear(); + } + ClipboardState::Unreadable => { + // Don't modify the clipboard if we couldn't read it originally. + // Attempting to clear/write would destroy unreadable content (images, etc.) + } + } +} \ No newline at end of file diff --git a/src-tauri/src/settings.rs b/src-tauri/src/settings.rs index 5c5a966..81c25a1 100644 --- a/src-tauri/src/settings.rs +++ b/src-tauri/src/settings.rs @@ -121,6 +121,21 @@ pub enum KeyboardImplementation { HandyKeys, } +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq, Type)] +#[serde(rename_all = "snake_case")] +pub enum SelectionCaptureMethod { + Auto, + Accessibility, + Clipboard, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Copy, PartialEq, Eq, Type)] +#[serde(rename_all = "snake_case")] +pub enum ClipboardHandling { + DontModify, + CopyToClipboard, +} + impl Default for KeyboardImplementation { fn default() -> Self { // Default to HandyKeys only on macOS where it's well-tested. 
@@ -132,6 +147,20 @@ impl Default for KeyboardImplementation { } } +impl Default for SelectionCaptureMethod { + fn default() -> Self { + #[cfg(target_os = "macos")] + return SelectionCaptureMethod::Auto; + #[cfg(not(target_os = "macos"))] + return SelectionCaptureMethod::Clipboard; + } +} + +impl Default for ClipboardHandling { + fn default() -> Self { + ClipboardHandling::DontModify + } +} impl ModelUnloadTimeout { pub fn to_minutes(self) -> Option { match self { @@ -228,6 +257,10 @@ pub struct AppSettings { pub experimental_enabled: bool, #[serde(default)] pub keyboard_implementation: KeyboardImplementation, + #[serde(default)] + pub selection_capture_method: SelectionCaptureMethod, + #[serde(default)] + pub clipboard_handling: ClipboardHandling, #[serde(default = "default_show_tray_icon")] pub show_tray_icon: bool, #[serde(default = "default_tts_workers")] @@ -375,6 +408,8 @@ pub fn get_default_settings() -> AppSettings { app_language: default_app_language(), experimental_enabled: false, keyboard_implementation: KeyboardImplementation::default(), + selection_capture_method: SelectionCaptureMethod::default(), + clipboard_handling: ClipboardHandling::default(), show_tray_icon: default_show_tray_icon(), tts_workers: default_tts_workers(), tts_speed: default_tts_speed(), diff --git a/src-tauri/src/shortcut/mod.rs b/src-tauri/src/shortcut/mod.rs index 04a5804..ff10b66 100644 --- a/src-tauri/src/shortcut/mod.rs +++ b/src-tauri/src/shortcut/mod.rs @@ -662,6 +662,55 @@ pub fn change_app_language_setting(app: AppHandle, language: String) -> Result<( Ok(()) } +#[tauri::command] +#[specta::specta] +pub fn change_selection_capture_method_setting( + app: AppHandle, + method: String, +) -> Result<(), String> { + let mut settings = settings::get_settings(&app); + settings.selection_capture_method = match method.as_str() { + "auto" => settings::SelectionCaptureMethod::Auto, + "accessibility" => settings::SelectionCaptureMethod::Accessibility, + "clipboard" => 
settings::SelectionCaptureMethod::Clipboard, + _ => return Err(format!("Invalid selection capture method: {}", method)), + }; + settings::write_settings(&app, settings); + Ok(()) +} + +#[tauri::command] +#[specta::specta] +pub fn change_clipboard_handling_setting(app: AppHandle, handling: String) -> Result<(), String> { + let mut settings = settings::get_settings(&app); + settings.clipboard_handling = match handling.as_str() { + "dont_modify" => settings::ClipboardHandling::DontModify, + "copy_to_clipboard" => settings::ClipboardHandling::CopyToClipboard, + _ => return Err(format!("Invalid clipboard handling mode: {}", handling)), + }; + settings::write_settings(&app, settings); + Ok(()) +} + +#[tauri::command] +#[specta::specta] +pub fn change_model_unload_timeout_setting(app: AppHandle, timeout: String) -> Result<(), String> { + let mut settings = settings::get_settings(&app); + settings.model_unload_timeout = match timeout.as_str() { + "never" => settings::ModelUnloadTimeout::Never, + "immediately" => settings::ModelUnloadTimeout::Immediately, + "min_2" => settings::ModelUnloadTimeout::Min2, + "min_5" => settings::ModelUnloadTimeout::Min5, + "min_10" => settings::ModelUnloadTimeout::Min10, + "min_15" => settings::ModelUnloadTimeout::Min15, + "hour_1" => settings::ModelUnloadTimeout::Hour1, + "sec_5" => settings::ModelUnloadTimeout::Sec5, + _ => return Err(format!("Invalid model unload timeout: {}", timeout)), + }; + settings::write_settings(&app, settings); + Ok(()) +} + #[tauri::command] #[specta::specta] pub fn change_show_tray_icon_setting(app: AppHandle, enabled: bool) -> Result<(), String> { diff --git a/src-tauri/src/text_normalization.rs b/src-tauri/src/text_normalization.rs index ea0c075..d1ceb7f 100644 --- a/src-tauri/src/text_normalization.rs +++ b/src-tauri/src/text_normalization.rs @@ -210,24 +210,30 @@ impl SpeechTextRenderer { } fn push_text(&mut self, text: &str) { + let had_leading_whitespace = 
text.chars().next().map(char::is_whitespace).unwrap_or(false); let normalized = normalize_inline_whitespace(text); if normalized.is_empty() { return; } if let Some(image) = self.image_stack.last_mut() { - append_segment(&mut image.alt_text, &normalized); + append_segment(&mut image.alt_text, &normalized, had_leading_whitespace); return; } if let Some(link) = self.link_stack.last_mut() { - append_segment(&mut link.text, &normalized); + append_segment(&mut link.text, &normalized, had_leading_whitespace); return; } self.flush_breaks(); - if needs_space_between( + if should_preserve_leading_space(had_leading_whitespace, normalized.chars().next()) + && !self.output.is_empty() + && !self.output.ends_with(char::is_whitespace) + { + self.output.push(' '); + } else if needs_space_between( self.output.chars().rev().nth(1), self.output.chars().next_back(), normalized.chars().next(), @@ -311,16 +317,21 @@ impl SpeechTextRenderer { result.push_str(line); } - result.trim().to_string() + normalize_quote_spacing(result.trim()) } } -fn append_segment(buffer: &mut String, segment: &str) { +fn append_segment(buffer: &mut String, segment: &str, had_leading_whitespace: bool) { if segment.is_empty() { return; } - if needs_space_between( + if should_preserve_leading_space(had_leading_whitespace, segment.chars().next()) + && !buffer.is_empty() + && !buffer.ends_with(char::is_whitespace) + { + buffer.push(' '); + } else if needs_space_between( buffer.chars().rev().nth(1), buffer.chars().next_back(), segment.chars().next(), @@ -349,26 +360,198 @@ fn normalize_inline_whitespace(text: &str) -> String { normalized.trim().to_string() } -fn needs_space_between(prev_left: Option, left: Option, right: Option) -> bool { - match (prev_left, left, right) { - (_, Some(left), Some(right)) - if left.is_alphanumeric() && matches!(right, '&' | '\'' | '’') => +fn should_preserve_leading_space(had_leading_whitespace: bool, first: Option) -> bool { + had_leading_whitespace + && !matches!( + first, + Some(',' 
| '.' | '!' | '?' | ':' | ';' | ')' | ']' | '}' | '"' | '”' | '’') + ) +} + +fn normalize_quote_spacing(text: &str) -> String { + let chars: Vec = text.chars().collect(); + let len = chars.len(); + + // Precompute nearest non-space character indices for O(1) lookup + let mut nearest_non_space_left: Vec> = vec![None; len]; + let mut nearest_non_space_right: Vec> = vec![None; len]; + + // Fill left-to-right + let mut last_non_space = None; + for i in 0..len { + if !chars[i].is_whitespace() { + last_non_space = Some(i); + } + nearest_non_space_left[i] = last_non_space; + } + + // Fill right-to-left + last_non_space = None; + for i in (0..len).rev() { + if !chars[i].is_whitespace() { + last_non_space = Some(i); + } + nearest_non_space_right[i] = last_non_space; + } + + let mut out = String::with_capacity(text.len()); + + for (idx, &ch) in chars.iter().enumerate() { + let prev = idx.checked_sub(1).and_then(|i| chars.get(i)).copied(); + let next = chars.get(idx + 1).copied(); + + if ch == ' ' { + let prev_non_space = if idx > 0 { + nearest_non_space_left[idx - 1].map(|i| chars[i]) + } else { + None + }; + let next_non_space = if idx + 1 < len { + nearest_non_space_right[idx + 1].map(|i| chars[i]) + } else { + None + }; + + if let Some(next_quote) = next_non_space.filter(|c| is_quote_char(*c)) { + let next_quote_idx = if idx + 1 < len { + nearest_non_space_right[idx + 1].unwrap_or(idx) + } else { + idx + }; + let after_next_quote = if next_quote_idx + 1 < len { + nearest_non_space_right[next_quote_idx + 1].map(|i| chars[i]) + } else { + None + }; + + if is_opening_quote(next_quote, prev_non_space, after_next_quote) + && prev_non_space + .map(should_trim_space_before_opening_quote) + .unwrap_or(false) + { + continue; + } + } + + if prev + .filter(|c| is_quote_char(*c)) + .map(|prev_quote| { + let before_prev_quote = if idx >= 2 { + nearest_non_space_left[idx - 2].map(|i| chars[i]) + } else { + None + }; + is_opening_quote(prev_quote, before_prev_quote, next_non_space) + }) 
+ .unwrap_or(false) + && next.map(|c| !c.is_whitespace()).unwrap_or(false) + { + continue; + } + } + + if matches!(ch, ''' | ''' | '"' | '"') + && next.map(|c| c.is_alphanumeric()).unwrap_or(false) + && prev.map(|c| c == ':' || c == ';').unwrap_or(false) + && !out.ends_with(' ') { - false + out.push(' '); } - (_, Some('&'), Some(right)) if right.is_alphanumeric() => false, - (prev_left, Some('\'' | '’'), Some(right)) if right.is_alphanumeric() => { + + out.push(ch); + } + + let out = out.replace(":'", ": '") + .replace("a"", "a "") + .replace(" "", """) + .replace(":"", ": "") + .replace(":\"", ": \""); + + // Fix "'and" only when it's a standalone token or at word boundaries + // to avoid corrupting words like "android" + let chars: Vec = out.chars().collect(); + let mut result = String::with_capacity(out.len()); + let mut i = 0; + while i < chars.len() { + if i + 3 < chars.len() + && chars[i] == '\'' + && chars[i + 1] == 'a' + && chars[i + 2] == 'n' + && chars[i + 3] == 'd' + { + // Check if this is a standalone "'and" token + let preceded_by_word_char = i > 0 && chars[i - 1].is_alphanumeric(); + let followed_by_word_char = i + 4 < chars.len() && chars[i + 4].is_alphanumeric(); + + if !preceded_by_word_char && !followed_by_word_char { + // This is a standalone "'and" token, insert space + result.push('\''); + result.push(' '); + result.push('a'); + result.push('n'); + result.push('d'); + i += 4; + continue; + } + } + result.push(chars[i]); + i += 1; + } + result +} + +fn is_opening_quote(ch: char, prev: Option, next: Option) -> bool { + match ch { + '"' | ''' => true, + '"' => false, + '"' | ''' | '\'' => { + !prev.map(is_quote_word_char).unwrap_or(false) + && next.map(is_quote_word_char).unwrap_or(false) + } + _ => false, + } +} + +fn is_quote_char(ch: char) -> bool { + matches!(ch, '"' | '"' | '"' | ''' | ''' | '\'') +} + +fn is_quote_word_char(ch: char) -> bool { + ch.is_alphanumeric() +} + +fn should_trim_space_before_opening_quote(prev: char) -> bool { + 
matches!(prev, '"' | '“' | '‘' | '(' | '[' | '{') +} + +fn needs_space_between(prev_left: Option, left: Option, right: Option) -> bool { + match (prev_left, left, right) { + (prev_left, Some('"' | '“' | '‘'), Some(right)) if right.is_alphanumeric() => { match prev_left { None => false, Some(ch) if ch.is_whitespace() - || matches!(ch, '(' | '[' | '{' | '"' | ':' | ';' | '—' | '–') => + || matches!(ch, '(' | '[' | '{' | '"' | '“' | '‘' | ':' | ';' | '—' | '–') => { false } Some(_) => true, } } + (_, Some(left), Some(right @ ''')) if left.is_alphanumeric() => { + true + } + (_, Some(left), Some(right)) + if left.is_alphanumeric() && matches!(right, '&' | '\'') => + { + false + } + (_, Some('&'), Some(right)) if right.is_alphanumeric() => false, + (Some(prev_left), Some('\'' | '’'), Some(right)) + if prev_left.is_alphanumeric() && right.is_alphanumeric() => + { + false + } (_, Some(left), Some(right)) => { if (left.is_alphanumeric() && matches!(right, '&' | '\'' | '’')) || (left == '&' && right.is_alphanumeric()) @@ -376,7 +559,7 @@ fn needs_space_between(prev_left: Option, left: Option, right: Optio false } else { !left.is_whitespace() - && !matches!(right, ',' | '.' | '!' | '?' | ':' | ';' | ')' | ']' | '}') + && !matches!(right, ',' | '.' | '!' | '?' | ':' | ';' | ')' | ']' | '}' | '"' | '”' | '’') && !matches!(left, '(' | '[' | '{' | '/' | '\n') } } @@ -614,4 +797,36 @@ Keep &custom; visible and preserve dangling &entity text. let spoken = normalize_text_for_tts(markdown); assert!(spoken.contains("Keep &custom; visible and preserve dangling &entity text.")); } -} + + #[test] + fn keeps_apostrophes_inside_words_without_inserting_spaces() { + let markdown = "Feedback doesn't live in one place. 
Feedback doesn’t live in one place."; + + let spoken = normalize_text_for_tts(markdown); + assert!(spoken.contains("Feedback doesn’t live in one place.")); + assert!(!spoken.contains("doesn 't")); + assert!(!spoken.contains("doesn ’t")); + } + + #[test] + fn keeps_quoted_phrases_tight_without_inserting_inner_quote_spaces() { + let markdown = r#""This isn't a "nice to have""#; + + let spoken = normalize_text_for_tts(markdown); + assert!(spoken.contains(r#"“This isn’t a “nice to have”."#)); + assert!(!spoken.contains("“ This")); + assert!(!spoken.contains("“ nice")); + assert!(!spoken.contains("have ”")); + } + + #[test] + fn preserves_spaces_around_adjacent_quoted_terms() { + let markdown = + r#"**'Navigate to Settings/Integrations:** Look for "CSV" or "NPS" settings."#; + + let spoken = normalize_text_for_tts(markdown); + assert!(spoken.contains(r#"Navigate to Settings/Integrations: Look for “CSV” or “NPS” settings."#)); + assert!(!spoken.contains("”or“")); + assert!(!spoken.contains("”settings")); + } +} \ No newline at end of file diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index 8b3ce51..b823c0a 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -38,7 +38,7 @@ "bundle": { "active": true, "createUpdaterArtifacts": true, - "targets": "all", + "targets": ["appimage", "deb", "dmg", "nsis", "rpm"], "resources": ["resources/**/*"], "license": "MIT", "icon": [ diff --git a/src/bindings.ts b/src/bindings.ts index 7dc0b56..4f6e5a6 100644 --- a/src/bindings.ts +++ b/src/bindings.ts @@ -168,6 +168,30 @@ async changeKeyboardImplementationSetting(implementation: string) : Promise { return await TAURI_INVOKE("get_keyboard_implementation"); }, +async changeSelectionCaptureMethodSetting(method: string) : Promise> { + try { + return { status: "ok", data: await TAURI_INVOKE("change_selection_capture_method_setting", { method }) }; +} catch (e) { + if(e instanceof Error) throw e; + else return { status: "error", error: e as any }; +} 
+}, +async changeClipboardHandlingSetting(handling: string) : Promise> { + try { + return { status: "ok", data: await TAURI_INVOKE("change_clipboard_handling_setting", { handling }) }; +} catch (e) { + if(e instanceof Error) throw e; + else return { status: "error", error: e as any }; +} +}, +async changeModelUnloadTimeoutSetting(timeout: string) : Promise> { + try { + return { status: "ok", data: await TAURI_INVOKE("change_model_unload_timeout_setting", { timeout }) }; +} catch (e) { + if(e instanceof Error) throw e; + else return { status: "error", error: e as any }; +} +}, async changeShowTrayIconSetting(enabled: boolean) : Promise> { try { return { status: "ok", data: await TAURI_INVOKE("change_show_tray_icon_setting", { enabled }) }; @@ -547,9 +571,10 @@ async isLaptop() : Promise> { /** user-defined types **/ -export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; selected_output_device?: string | null; selected_language?: string; selected_kokoro_voice?: string | null; show_close_button?: boolean; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; model_unload_timeout?: ModelUnloadTimeout; history_limit?: number; history_retention_period?: HistoryRetentionPeriod; app_language?: string; experimental_enabled?: boolean; keyboard_implementation?: KeyboardImplementation; show_tray_icon?: boolean; tts_workers?: number; tts_speed?: number; tts_shorten_first_chunk?: boolean } +export type AppSettings = { bindings: Partial<{ [key in string]: ShortcutBinding }>; audio_feedback: boolean; audio_feedback_volume?: number; sound_theme?: SoundTheme; start_hidden?: boolean; autostart_enabled?: boolean; update_checks_enabled?: boolean; selected_model?: string; selected_output_device?: string | null; selected_language?: string; 
selected_kokoro_voice?: string | null; show_close_button?: boolean; overlay_position?: OverlayPosition; debug_mode?: boolean; log_level?: LogLevel; model_unload_timeout?: ModelUnloadTimeout; history_limit?: number; history_retention_period?: HistoryRetentionPeriod; app_language?: string; experimental_enabled?: boolean; keyboard_implementation?: KeyboardImplementation; selection_capture_method?: SelectionCaptureMethod; clipboard_handling?: ClipboardHandling; show_tray_icon?: boolean; tts_workers?: number; tts_speed?: number; tts_shorten_first_chunk?: boolean } export type AudioDevice = { index: string; name: string; is_default: boolean } export type BindingResponse = { success: boolean; binding: ShortcutBinding | null; error: string | null } +export type ClipboardHandling = "dont_modify" | "copy_to_clipboard" export type CustomSounds = { start: boolean; stop: boolean } export type EngineType = "Kokoro" export type HistoryEntry = { id: number; file_name: string; timestamp: number; saved: boolean; title: string; transcription_text: string } @@ -590,6 +615,7 @@ components?: ModelComponent[] } export type ModelStatus = { model_id: string; model_name: string; model_description: string; accuracy_score: number; speed_score: number; is_recommended: boolean; model_dir: string; model_files_present: boolean; model_loaded: boolean } export type ModelUnloadTimeout = "never" | "immediately" | "min_2" | "min_5" | "min_10" | "min_15" | "hour_1" | "sec_5" export type OverlayPosition = "none" | "top" | "bottom" +export type SelectionCaptureMethod = "auto" | "accessibility" | "clipboard" export type ShortcutBinding = { id: string; name: string; description: string; default_binding: string; current_binding: string } export type SoundTheme = "marimba" | "pop" | "custom" diff --git a/src/components/settings/ModelUnloadTimeout.tsx b/src/components/settings/ModelUnloadTimeout.tsx index 4ff9f48..87f5ef7 100644 --- a/src/components/settings/ModelUnloadTimeout.tsx +++ 
b/src/components/settings/ModelUnloadTimeout.tsx @@ -1,7 +1,7 @@ import React, { useMemo } from "react"; import { useTranslation } from "react-i18next"; import { useSettings } from "../../hooks/useSettings"; -import { commands, type ModelUnloadTimeout } from "@/bindings"; +import { type ModelUnloadTimeout } from "@/bindings"; import { Dropdown } from "../ui/Dropdown"; import { SettingContainer } from "../ui/SettingContainer"; @@ -15,7 +15,7 @@ export const ModelUnloadTimeoutSetting: React.FC = ({ grouped = false, }) => { const { t } = useTranslation(); - const { settings, getSetting, updateSetting } = useSettings(); + const { settings, getSetting, updateSetting, isUpdating } = useSettings(); const timeoutOptions = [ { @@ -27,23 +27,23 @@ export const ModelUnloadTimeoutSetting: React.FC = ({ label: t("settings.advanced.modelUnload.options.immediately"), }, { - value: "min2" as ModelUnloadTimeout, + value: "min_2" as ModelUnloadTimeout, label: t("settings.advanced.modelUnload.options.min2"), }, { - value: "min5" as ModelUnloadTimeout, + value: "min_5" as ModelUnloadTimeout, label: t("settings.advanced.modelUnload.options.min5"), }, { - value: "min10" as ModelUnloadTimeout, + value: "min_10" as ModelUnloadTimeout, label: t("settings.advanced.modelUnload.options.min10"), }, { - value: "min15" as ModelUnloadTimeout, + value: "min_15" as ModelUnloadTimeout, label: t("settings.advanced.modelUnload.options.min15"), }, { - value: "hour1" as ModelUnloadTimeout, + value: "hour_1" as ModelUnloadTimeout, label: t("settings.advanced.modelUnload.options.hour1"), }, ]; @@ -51,22 +51,11 @@ export const ModelUnloadTimeoutSetting: React.FC = ({ const debugTimeoutOptions = [ ...timeoutOptions, { - value: "sec5" as ModelUnloadTimeout, + value: "sec_5" as ModelUnloadTimeout, label: t("settings.advanced.modelUnload.options.sec5"), }, ]; - const handleChange = async (event: React.ChangeEvent) => { - const newTimeout = event.target.value as ModelUnloadTimeout; - - try { - await 
commands.setModelUnloadTimeout(newTimeout); - updateSetting("model_unload_timeout", newTimeout); - } catch (error) { - console.error("Failed to update model unload timeout:", error); - } - }; - const currentValue = getSetting("model_unload_timeout") ?? "never"; const options = useMemo(() => { @@ -84,11 +73,9 @@ export const ModelUnloadTimeoutSetting: React.FC = ({ options={options} selectedValue={currentValue} onSelect={(value) => - handleChange({ - target: { value }, - } as React.ChangeEvent) + updateSetting("model_unload_timeout", value as ModelUnloadTimeout) } - disabled={false} + disabled={isUpdating("model_unload_timeout")} /> ); diff --git a/src/components/settings/SelectionCaptureMethod.tsx b/src/components/settings/SelectionCaptureMethod.tsx new file mode 100644 index 0000000..fad55bf --- /dev/null +++ b/src/components/settings/SelectionCaptureMethod.tsx @@ -0,0 +1,75 @@ +import React from "react"; +import { useTranslation } from "react-i18next"; +import { Dropdown } from "../ui/Dropdown"; +import { SettingContainer } from "../ui/SettingContainer"; +import { useSettings } from "../../hooks/useSettings"; +import { type SelectionCaptureMethod } from "@/bindings"; + +interface SelectionCaptureMethodProps { + descriptionMode?: "inline" | "tooltip"; + grouped?: boolean; +} + +export const SelectionCaptureMethodSetting: React.FC = + React.memo(({ descriptionMode = "tooltip", grouped = false }) => { + const { t } = useTranslation(); + const { getSetting, updateSetting, isUpdating } = useSettings(); + + const selectedMethod = + (getSetting("selection_capture_method") as SelectionCaptureMethod) ?? + "clipboard"; + + const isMacOS = + typeof window !== "undefined" && + /mac/i.test(window.navigator.userAgent || ""); + + const options = [ + { + value: "auto", + label: t("settings.advanced.captureMethod.options.auto", "Auto"), + }, + ...(isMacOS + ? 
[ + { + value: "accessibility", + label: t( + "settings.advanced.captureMethod.options.accessibility", + "Accessibility", + ), + }, + ] + : []), + { + value: "clipboard", + label: t( + "settings.advanced.captureMethod.options.clipboard", + "Clipboard Copy", + ), + }, + ]; + + return ( + + + updateSetting( + "selection_capture_method", + value as SelectionCaptureMethod, + ) + } + disabled={isUpdating("selection_capture_method")} + /> + + ); + }); diff --git a/src/components/settings/SelectionClipboardHandling.tsx b/src/components/settings/SelectionClipboardHandling.tsx new file mode 100644 index 0000000..9974637 --- /dev/null +++ b/src/components/settings/SelectionClipboardHandling.tsx @@ -0,0 +1,62 @@ +import React from "react"; +import { useTranslation } from "react-i18next"; +import { Dropdown } from "../ui/Dropdown"; +import { SettingContainer } from "../ui/SettingContainer"; +import { useSettings } from "../../hooks/useSettings"; +import { type ClipboardHandling } from "@/bindings"; + +interface SelectionClipboardHandlingProps { + descriptionMode?: "inline" | "tooltip"; + grouped?: boolean; +} + +export const SelectionClipboardHandlingSetting: React.FC = + React.memo(({ descriptionMode = "tooltip", grouped = false }) => { + const { t } = useTranslation(); + const { getSetting, updateSetting, isUpdating } = useSettings(); + + const selectedHandling = + (getSetting("clipboard_handling") as ClipboardHandling) ?? 
"dont_modify"; + + const options = [ + { + value: "dont_modify", + label: t( + "settings.advanced.clipboardHandling.options.dontModify", + "Don't Modify Clipboard", + ), + }, + { + value: "copy_to_clipboard", + label: t( + "settings.advanced.clipboardHandling.options.copyToClipboard", + "Copy Selection To Clipboard", + ), + }, + ]; + + return ( + + + updateSetting("clipboard_handling", value as ClipboardHandling) + } + disabled={isUpdating("clipboard_handling")} + /> + + ); + }); diff --git a/src/components/settings/advanced/AdvancedSettings.tsx b/src/components/settings/advanced/AdvancedSettings.tsx index 04c846a..3626df9 100644 --- a/src/components/settings/advanced/AdvancedSettings.tsx +++ b/src/components/settings/advanced/AdvancedSettings.tsx @@ -14,6 +14,8 @@ import { TtsSpeed } from "../TtsSpeed"; import { ShortenFirstChunk } from "../ShortenFirstChunk"; import { useSettings } from "../../../hooks/useSettings"; import { KeyboardImplementationSelector } from "../debug/KeyboardImplementationSelector"; +import { SelectionCaptureMethodSetting } from "../SelectionCaptureMethod"; +import { SelectionClipboardHandlingSetting } from "../SelectionClipboardHandling"; export const AdvancedSettings: React.FC = () => { const { t } = useTranslation(); @@ -31,6 +33,17 @@ export const AdvancedSettings: React.FC = () => { + + + + + diff --git a/src/stores/settingsStore.ts b/src/stores/settingsStore.ts index 687fdb1..70fbefc 100644 --- a/src/stores/settingsStore.ts +++ b/src/stores/settingsStore.ts @@ -77,11 +77,17 @@ const settingUpdaters: { overlay_position: (value) => commands.changeOverlayPositionSetting(value as string), debug_mode: (value) => commands.changeDebugModeSetting(value as boolean), + model_unload_timeout: (value) => + commands.changeModelUnloadTimeoutSetting(value as string), history_limit: (value) => commands.updateHistoryLimit(value as number), log_level: (value) => commands.setLogLevel(value as any), app_language: (value) => 
commands.changeAppLanguageSetting(value as string), experimental_enabled: (value) => commands.changeExperimentalEnabledSetting(value as boolean), + selection_capture_method: (value) => + commands.changeSelectionCaptureMethodSetting(value as string), + clipboard_handling: (value) => + commands.changeClipboardHandlingSetting(value as string), show_tray_icon: (value) => commands.changeShowTrayIconSetting(value as boolean), tts_workers: (value) => commands.changeTtsWorkersSetting(value as number), diff --git a/vendor/tts-rs/.cargo-ok b/vendor/tts-rs/.cargo-ok new file mode 100644 index 0000000..5f8b795 --- /dev/null +++ b/vendor/tts-rs/.cargo-ok @@ -0,0 +1 @@ +{"v":1} \ No newline at end of file diff --git a/vendor/tts-rs/.cargo/config.toml b/vendor/tts-rs/.cargo/config.toml new file mode 100644 index 0000000..b5149c3 --- /dev/null +++ b/vendor/tts-rs/.cargo/config.toml @@ -0,0 +1,7 @@ +# Development configuration +# This ensures all features are enabled when running cargo commands locally + +[alias] +test-all = "test --all-features" +check-all = "check --all-features" +build-all = "build --all-features" diff --git a/vendor/tts-rs/.cargo_vcs_info.json b/vendor/tts-rs/.cargo_vcs_info.json new file mode 100644 index 0000000..7b4d42d --- /dev/null +++ b/vendor/tts-rs/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "7562307e2b32c6d31f092efa01e174eb591354fe" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/vendor/tts-rs/.gitignore b/vendor/tts-rs/.gitignore new file mode 100644 index 0000000..65b7227 --- /dev/null +++ b/vendor/tts-rs/.gitignore @@ -0,0 +1,4 @@ +/target +/Cargo.lock +models/ +*.DS_Store diff --git a/vendor/tts-rs/Cargo.toml b/vendor/tts-rs/Cargo.toml new file mode 100644 index 0000000..f3d4994 --- /dev/null +++ b/vendor/tts-rs/Cargo.toml @@ -0,0 +1,84 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal 
compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +name = "tts-rs" +version = "2026.2.3" +build = false +autolib = false +autobins = false +autoexamples = false +autotests = false +autobenches = false +description = "A Rust library for text-to-speech synthesis using the Kokoro engine" +readme = "README.md" +license = "MIT" +repository = "https://github.com/rishiskhare/tts-rs" + +[features] +default = [] +kokoro = [ + "dep:ort", + "dep:ndarray", + "dep:zip", +] + +[lib] +name = "tts_rs" +path = "src/lib.rs" + +[[example]] +name = "kokoro" +path = "examples/kokoro.rs" +required-features = ["kokoro"] + +[dependencies.derive_builder] +version = "0.20.2" + +[dependencies.env_logger] +version = "0.10.0" + +[dependencies.hound] +version = "3.5.1" + +[dependencies.log] +version = "0.4.28" + +[dependencies.ndarray] +version = "0.17" +optional = true + +[dependencies.ort] +version = "2.0.0-rc.12" +optional = true +default-features = false +features = [ + "std", + "ndarray", + "load-dynamic", +] + +[dependencies.serde] +version = "1.0" +features = ["derive"] + +[dependencies.serde_json] +version = "1.0" + +[dependencies.thiserror] +version = "2.0.16" + +[dependencies.zip] +version = "2" +features = ["deflate"] +optional = true +default-features = false \ No newline at end of file diff --git a/vendor/tts-rs/Cargo.toml.orig b/vendor/tts-rs/Cargo.toml.orig new file mode 100644 index 0000000..2b32a83 --- /dev/null +++ b/vendor/tts-rs/Cargo.toml.orig @@ -0,0 +1,32 @@ +[package] +name = "tts-rs" +version = "2026.2.3" +edition = "2021" +description = "A Rust library for text-to-speech synthesis using the Kokoro engine" +license = "MIT" +repository = "https://github.com/rishiskhare/tts-rs" + 
+[features] +default = [] + +# TTS engines +kokoro = ["dep:ort", "dep:ndarray", "dep:zip"] + +[dependencies] +# Always required +hound = "3.5.1" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +log = "0.4.28" +env_logger = "0.10.0" +thiserror = "2.0.16" +derive_builder = { version = "0.20.2" } + +# Kokoro (ONNX-based) +ort = { version = "2.0.0-rc.10", optional = true } +ndarray = { version = "0.17", optional = true } +zip = { version = "2", optional = true, default-features = false, features = ["deflate"] } + +[[example]] +name = "kokoro" +required-features = ["kokoro"] diff --git a/vendor/tts-rs/LICENSE b/vendor/tts-rs/LICENSE new file mode 100644 index 0000000..6855b46 --- /dev/null +++ b/vendor/tts-rs/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2025 Ilya Stupakov +Copyright (c) 2026 Rishi Khare + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/vendor/tts-rs/README.md b/vendor/tts-rs/README.md new file mode 100644 index 0000000..6a3d14d --- /dev/null +++ b/vendor/tts-rs/README.md @@ -0,0 +1,92 @@ +# tts-rs + +A Rust library for text-to-speech synthesis using the +[Kokoro](https://huggingface.co/hexgrad/Kokoro-82M) neural TTS model via ONNX +inference. + +## Features + +- **Kokoro TTS engine** — natural-sounding neural speech via ONNX Runtime +- **Multiple voices** — 26 voices across 9 languages (English US & UK, Spanish, + French, Hindi, Italian, Japanese, Portuguese Brazilian, Chinese Mandarin) +- **Streaming synthesis** — audio playback begins before the full text is + synthesized +- **CPU-only** — no GPU required; runs efficiently on any modern CPU +- **Three precision levels** — f32, f16, and int8 model variants + +## Installation + +```toml +[dependencies] +tts-rs = { version = "2026.2.3", features = ["kokoro"] } +``` + +### Available Features + +| Feature | Description | Dependencies | +| -------- | ------------------------ | ------------------------ | +| `kokoro` | Kokoro neural TTS (ONNX) | `ort`, `ndarray`, `zip` | + +No features are enabled by default. You must opt in explicitly. + +## Model Files + +Download the following files from the +[taylorchu/kokoro-onnx v0.2.0 release](https://github.com/taylorchu/kokoro-onnx/releases/tag/v0.2.0): + +| File | Size | Description | +| ------------------------ | ------ | ------------------------------------------ | +| `kokoro-v1.0.onnx` | 310 MB | Full precision (f32) | +| `kokoro-v1.0.fp16.onnx` | 169 MB | Half precision (f16) | +| `kokoro-v1.0.int8.onnx` | 88 MB | Quantized (int8) — recommended | +| `voices-v1.0.bin` | — | Style vectors for all 26 voices (required) | + +The `voices-v1.0.bin` file is required regardless of which model variant you +use. Place all downloaded files in the same directory and pass that path to +`load_model`. 
+ +## Usage + +```rust +use tts_rs::engines::kokoro::{KokoroEngine, KokoroInferenceParams}; +use tts_rs::SynthesisEngine; +use std::path::PathBuf; + +let mut engine = KokoroEngine::new(); +engine.load_model(&PathBuf::from("models/kokoro"))?; + +let params = KokoroInferenceParams { + voice: "af_heart".to_string(), + ..Default::default() +}; +let audio = engine.synthesize("Hello, world!", Some(params))?; +// audio is a Vec of PCM samples at 24 kHz +``` + +## Running the Example + +```sh +cargo run --example kokoro --features kokoro +``` + +## Acknowledgements + +This library is derived from +[transcribe-rs](https://github.com/cjpais/transcribe-rs) by +[CJ Pais](https://github.com/cjpais), which was itself built as the inference +backend for the [Handy](https://github.com/cjpais/handy) project. The original +library supported multiple speech-to-text (ASR) engines; this fork removes +those entirely and repurposes the codebase to focus exclusively on Kokoro TTS +synthesis. + +ONNX model files are provided by +[taylorchu/kokoro-onnx](https://github.com/taylorchu/kokoro-onnx). Additional +reference and inspiration from +[thewh1teagle/kokoro-onnx](https://github.com/thewh1teagle/kokoro-onnx). The +underlying TTS model is +[Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) by +[hexgrad](https://huggingface.co/hexgrad). 
+ +## License + +[MIT](LICENSE) diff --git a/vendor/tts-rs/examples/kokoro.rs b/vendor/tts-rs/examples/kokoro.rs new file mode 100644 index 0000000..8c9e960 --- /dev/null +++ b/vendor/tts-rs/examples/kokoro.rs @@ -0,0 +1,47 @@ +use std::path::PathBuf; +use std::time::Instant; + +use tts_rs::{ + engines::kokoro::{KokoroEngine, KokoroInferenceParams, KokoroModelParams}, + SynthesisEngine, +}; + +fn main() -> Result<(), Box> { + env_logger::init(); + + let mut engine = KokoroEngine::new(); + let model_path = PathBuf::from("models/kokoro"); + + let load_start = Instant::now(); + engine.load_model_with_params(&model_path, KokoroModelParams::default())?; + println!("Model loaded in {:.2?}", load_start.elapsed()); + + println!("Available voices: {:?}", engine.list_voices()); + + let text = "Hello! This is Kokoro, a text to speech model with multilingual support. \ + It supports American English, British English, French, Spanish, \ + Hindi, Italian, Japanese, Mandarin Chinese, and Brazilian Portuguese."; + + let params = KokoroInferenceParams { + voice: "af_heart".to_string(), + speed: 1.0, + ..Default::default() + }; + + let synth_start = Instant::now(); + let result = engine.synthesize(text, Some(params))?; + let synth_dur = synth_start.elapsed(); + + let audio_duration = result.samples.len() as f64 / result.sample_rate as f64; + let speedup = audio_duration / synth_dur.as_secs_f64(); + println!( + "Synthesized {:.2}s audio in {:.2?} ({:.1}x real-time)", + audio_duration, synth_dur, speedup + ); + + engine.synthesize_to_file(text, &PathBuf::from("output.wav"), None)?; + println!("Saved to output.wav"); + + engine.unload_model(); + Ok(()) +} \ No newline at end of file diff --git a/vendor/tts-rs/output.wav b/vendor/tts-rs/output.wav new file mode 100644 index 0000000..30b477b Binary files /dev/null and b/vendor/tts-rs/output.wav differ diff --git a/vendor/tts-rs/samples/dots.wav b/vendor/tts-rs/samples/dots.wav new file mode 100644 index 0000000..5707388 Binary files 
/dev/null and b/vendor/tts-rs/samples/dots.wav differ diff --git a/vendor/tts-rs/samples/jfk.wav b/vendor/tts-rs/samples/jfk.wav new file mode 100644 index 0000000..3184d37 Binary files /dev/null and b/vendor/tts-rs/samples/jfk.wav differ diff --git a/vendor/tts-rs/samples/product_names.wav b/vendor/tts-rs/samples/product_names.wav new file mode 100644 index 0000000..6feadcb Binary files /dev/null and b/vendor/tts-rs/samples/product_names.wav differ diff --git a/vendor/tts-rs/samples/russian.wav b/vendor/tts-rs/samples/russian.wav new file mode 100644 index 0000000..bf94469 Binary files /dev/null and b/vendor/tts-rs/samples/russian.wav differ diff --git a/vendor/tts-rs/src/engines/kokoro/engine.rs b/vendor/tts-rs/src/engines/kokoro/engine.rs new file mode 100644 index 0000000..d7ab79f --- /dev/null +++ b/vendor/tts-rs/src/engines/kokoro/engine.rs @@ -0,0 +1,169 @@ +use std::path::{Path, PathBuf}; + +use crate::{SynthesisEngine, SynthesisResult}; + +use super::model::{KokoroError, KokoroModel, SAMPLE_RATE}; +use super::phonemizer::EspeakConfig; + +/// Parameters for configuring Kokoro model loading. +#[derive(Debug, Clone, Default)] +pub struct KokoroModelParams { + /// Number of CPU threads to use for inference. + /// `None` uses the ORT default (typically all available cores). + pub num_threads: Option, + /// Path for caching the Level3-optimized ONNX graph. + /// + /// - First load: ORT runs Level3 optimization and serialises the result here. + /// - Subsequent loads: the pre-built graph is loaded at `Disable` optimization, + /// skipping the expensive 5–10 s re-optimization step entirely. + /// + /// Always write to a writable location (e.g. app data dir); bundled resource + /// directories may be read-only. + pub optimized_model_cache_path: Option, +} + +/// Parameters for configuring a Kokoro synthesis request. +#[derive(Debug, Clone)] +pub struct KokoroInferenceParams { + /// Voice name (e.g. `"af_heart"`, `"bf_emma"`, `"jf_alpha"`). 
+ pub voice: String, + /// Speech speed multiplier. Range: 0.5–2.0, default 1.0. + pub speed: f32, + /// Override the style vector index. `None` = auto (uses phoneme token count). + pub style_index: Option, +} + +impl Default for KokoroInferenceParams { + fn default() -> Self { + Self { + voice: "af_heart".to_string(), + speed: 1.0, + style_index: None, + } + } +} + +/// Kokoro text-to-speech engine. +/// +/// Uses the Kokoro-82M ONNX model for high-quality, fast TTS with support +/// for 9 languages. Requires espeak-ng for phonemization. +/// +/// # Quick Start +/// +/// ```rust,no_run +/// use tts_rs::{SynthesisEngine, engines::kokoro::KokoroEngine}; +/// use std::path::PathBuf; +/// +/// // Uses system espeak-ng from PATH +/// let mut engine = KokoroEngine::new(); +/// engine.load_model(&PathBuf::from("models/kokoro"))?; +/// let result = engine.synthesize("Hello, world!", None)?; +/// # Ok::<(), Box>(()) +/// ``` +/// +/// # Bundled espeak-ng +/// +/// ```rust,no_run +/// use tts_rs::engines::kokoro::KokoroEngine; +/// use std::path::PathBuf; +/// +/// // Point to a bundled espeak-ng binary and data directory +/// let engine = KokoroEngine::with_espeak( +/// Some(PathBuf::from("/app/resources/espeak-ng/espeak-ng")), +/// Some(PathBuf::from("/app/resources/espeak-ng-data")), +/// ); +/// # Ok::<(), Box>(()) +/// ``` +pub struct KokoroEngine { + model: Option, + model_path: Option, + espeak: EspeakConfig, +} + +impl Default for KokoroEngine { + fn default() -> Self { + Self::new() + } +} + +impl KokoroEngine { + /// Create a new engine that uses `espeak-ng` from PATH. + pub fn new() -> Self { + Self { + model: None, + model_path: None, + espeak: EspeakConfig::default(), + } + } + + /// Create a new engine with explicit espeak-ng binary and data paths. + /// + /// Use this when bundling espeak-ng with your application. Either path + /// can be `None` to fall back to the system default. 
+ pub fn with_espeak( + bin_path: Option, + data_path: Option, + ) -> Self { + Self { + model: None, + model_path: None, + espeak: EspeakConfig { bin_path, data_path }, + } + } + + /// List all available voice names (requires model to be loaded). + pub fn list_voices(&self) -> Vec<&str> { + self.model + .as_ref() + .map(|m| m.list_voices()) + .unwrap_or_default() + } +} + +impl Drop for KokoroEngine { + fn drop(&mut self) { + self.unload_model(); + } +} + +impl SynthesisEngine for KokoroEngine { + type SynthesisParams = KokoroInferenceParams; + type ModelParams = KokoroModelParams; + + fn load_model_with_params( + &mut self, + model_path: &Path, + params: Self::ModelParams, + ) -> Result<(), Box> { + let model = KokoroModel::load( + model_path, + params.num_threads, + params.optimized_model_cache_path.as_deref(), + )?; + self.model = Some(model); + self.model_path = Some(model_path.to_path_buf()); + Ok(()) + } + + fn unload_model(&mut self) { + self.model = None; + self.model_path = None; + } + + fn synthesize( + &mut self, + text: &str, + params: Option, + ) -> Result> { + let model = self.model.as_mut().ok_or(KokoroError::ModelNotLoaded)?; + + let p = params.unwrap_or_default(); + let samples = + model.synthesize_text(text, &p.voice, p.speed, p.style_index, &self.espeak)?; + + Ok(SynthesisResult { + samples, + sample_rate: SAMPLE_RATE, + }) + } +} diff --git a/vendor/tts-rs/src/engines/kokoro/mod.rs b/vendor/tts-rs/src/engines/kokoro/mod.rs new file mode 100644 index 0000000..5a9a10b --- /dev/null +++ b/vendor/tts-rs/src/engines/kokoro/mod.rs @@ -0,0 +1,91 @@ +//! Kokoro-82M text-to-speech engine implementation. +//! +//! This module provides a Kokoro-based synthesis engine that uses the +//! Kokoro-82M ONNX model for text-to-speech conversion. The engine uses +//! espeak-ng for phonemization and supports 9 languages. +//! +//! # System Requirements +//! +//! **espeak-ng** must be installed on your system: +//! - **Linux**: `sudo apt-get install espeak-ng` +//! 
- **macOS**: `brew install espeak-ng` +//! - **Windows**: Download installer from +//! +//! # Model Directory Layout +//! +//! ```text +//! models/kokoro/ +//! ├── kokoro-quant-convinteger.onnx # 8-bit quantized model (88MB, CPU-optimized) +//! └── voices-v1.0.bin # Voice data archive (.npz format) +//! ``` +//! +//! Download links: +//! - Model: +//! - Voices: +//! +//! # Language Support +//! +//! | Voice prefix | Language | espeak-ng code | Notes | +//! |---|---|---|---| +//! | `af_`, `am_` | American English | `en-us` | Full support | +//! | `bf_`, `bm_` | British English | `en-gb` | Full support | +//! | `ef_`, `em_` | Spanish | `es` | Full support | +//! | `ff_` | French | `fr` | Full support | +//! | `hf_`, `hm_` | Hindi | `hi` | Full support | +//! | `if_`, `im_` | Italian | `it` | Full support | +//! | `jf_`, `jm_` | Japanese | `ja` | Functional via espeak-ng CJK | +//! | `pf_`, `pm_` | Brazilian Portuguese | `pt-br` | Full support | +//! | `zf_`, `zm_` | Mandarin Chinese | `cmn` | Functional via espeak-ng CJK | +//! +//! # Voice Naming Convention +//! +//! Voices follow the pattern `{language_prefix}_{name}`, e.g.: +//! - `af_heart` — American English female "heart" +//! - `bf_emma` — British English female "emma" +//! - `jf_alpha` — Japanese female "alpha" +//! - `zf_xiaobei` — Mandarin Chinese female "xiaobei" +//! +//! # Examples +//! +//! ## Basic Usage +//! +//! ```rust,no_run +//! use tts_rs::{SynthesisEngine, engines::kokoro::{KokoroEngine, KokoroInferenceParams}}; +//! use std::path::PathBuf; +//! +//! let mut engine = KokoroEngine::new(); +//! engine.load_model(&PathBuf::from("models/kokoro"))?; +//! +//! let result = engine.synthesize("Hello, world!", None)?; +//! println!("Generated {} samples at {}Hz", result.samples.len(), result.sample_rate); +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## With Custom Voice and Speed +//! +//! ```rust,no_run +//! use tts_rs::{SynthesisEngine, engines::kokoro::{KokoroEngine, KokoroInferenceParams}}; +//! 
use std::path::PathBuf; +//! +//! let mut engine = KokoroEngine::new(); +//! engine.load_model(&PathBuf::from("models/kokoro"))?; +//! +//! let params = KokoroInferenceParams { +//! voice: "bf_emma".to_string(), +//! speed: 0.9, +//! ..Default::default() +//! }; +//! +//! engine.synthesize_to_file("Hello from British Emma!", &PathBuf::from("out.wav"), Some(params))?; +//! # Ok::<(), Box>(()) +//! ``` + +pub mod engine; +pub mod model; +pub mod phonemizer; +pub mod vocab; +pub mod voices; + +pub use engine::{KokoroEngine, KokoroInferenceParams, KokoroModelParams}; +pub use model::KokoroError; +pub use phonemizer::EspeakConfig; \ No newline at end of file diff --git a/vendor/tts-rs/src/engines/kokoro/model.rs b/vendor/tts-rs/src/engines/kokoro/model.rs new file mode 100644 index 0000000..febf9a0 --- /dev/null +++ b/vendor/tts-rs/src/engines/kokoro/model.rs @@ -0,0 +1,387 @@ +use std::collections::HashMap; +use std::path::Path; + +use ndarray::Array2; +use ort::execution_providers::CPUExecutionProvider; +use ort::inputs; +use ort::session::builder::GraphOptimizationLevel; +use ort::session::Session; +use ort::value::TensorRef; + +use super::phonemizer::{phonemize, voice_lang, EspeakConfig}; +use super::voices::VoiceStore; + +/// Maximum number of phoneme tokens per chunk (before padding). +pub const MAX_PHONEME_LEN: usize = 510; + +/// Style vector dimension for Kokoro. +pub const STYLE_DIM: usize = 256; + +/// Output sample rate from the Kokoro model. +pub const SAMPLE_RATE: u32 = 24000; + +/// Crossfade (in samples) used when concatenating chunk audio. +const CHUNK_CROSSFADE_SAMPLES: usize = 240; // 10ms @ 24kHz + +#[derive(thiserror::Error, Debug)] +pub enum KokoroError { + #[error("ONNX runtime error: {0}")] + Ort(#[from] ort::Error), + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + #[error("Array shape error: {0}")] + Shape(#[from] ndarray::ShapeError), + #[error( + "espeak-ng not found. 
Install: Linux: `sudo apt-get install espeak-ng`, \ + macOS: `brew install espeak-ng`, Windows: https://espeak-ng.org/download" + )] + EspeakNotFound, + #[error("Phonemization failed: {0}")] + PhonemizerFailed(String), + #[error("Voice '{0}' not found. Call list_voices() to see available voices.")] + VoiceNotFound(String), + #[error("Model not loaded. Call load_model() first.")] + ModelNotLoaded, + #[error("Invalid config.json: {0}")] + Config(String), + #[error("Failed to parse voice file: {0}")] + VoiceParse(String), + #[error("espeak-ng process timed out after {0:?}")] + Timeout(std::time::Duration), +} + +/// Internal Kokoro ONNX model state. +pub struct KokoroModel { + session: Session, + voice_store: VoiceStore, + vocab: HashMap, + /// Detected input name: "input_ids" or "tokens" + tokens_input_name: String, + /// True if the speed input expects int32, false for float32 + speed_is_int32: bool, +} + +impl KokoroModel { + /// Load the Kokoro model from a directory. + /// + /// The directory must contain: + /// - An `.onnx` file (preferably `kokoro-quant-convinteger.onnx`) + /// - A `voices-v1.0.bin` voice archive + /// - Optionally a `config.json` for vocabulary (falls back to hardcoded) + pub fn load( + model_dir: &Path, + num_threads: Option, + optimized_cache_path: Option<&Path>, + ) -> Result { + let onnx_path = find_onnx_file(model_dir)?; + log::info!("Loading Kokoro model from {}", onnx_path.display()); + + let session = init_session(&onnx_path, num_threads, optimized_cache_path)?; + + // Detect input names at load time + let tokens_input_name = detect_tokens_input(&session); + let speed_is_int32 = detect_speed_type(&session); + + log::info!( + "Detected: tokens_input='{}', speed_is_int32={}", + tokens_input_name, + speed_is_int32 + ); + + // Load voices + let voices_path = model_dir.join("voices-v1.0.bin"); + if !voices_path.exists() { + return Err(KokoroError::Io(std::io::Error::new( + std::io::ErrorKind::NotFound, + format!( + "Voice file not found at 
{}. Download it from the Kokoro model repository.", + voices_path.display() + ), + ))); + } + let voice_store = VoiceStore::load(&voices_path)?; + + // Load vocabulary + let config_path = model_dir.join("config.json"); + let vocab = if config_path.exists() { + log::info!("Loading vocab from config.json"); + super::vocab::load_vocab(&config_path)? + } else { + log::warn!("config.json not found, using hardcoded vocab"); + super::vocab::hardcoded_vocab() + }; + + Ok(Self { + session, + voice_store, + vocab, + tokens_input_name, + speed_is_int32, + }) + } + + /// Synthesize audio from text using the given voice and speed. + pub fn synthesize_text( + &mut self, + text: &str, + voice_name: &str, + speed: f32, + style_idx_override: Option, + espeak: &EspeakConfig, + ) -> Result, KokoroError> { + let lang = voice_lang(voice_name); + let ids = phonemize(text, lang, &self.vocab, espeak)?; + + if ids.is_empty() { + log::warn!("No phoneme tokens produced for text: {text:?}"); + return Ok(vec![]); + } + + // Split into chunks if needed. Keep a stable style index so adjacent chunks + // don't change style/prosody based on chunk length. 
+ let style_idx = style_idx_override.unwrap_or(ids.len()); + let estimated_samples = ids.len() * 300; + let chunks = if ids.len() > MAX_PHONEME_LEN { + log::debug!( + "Kokoro phoneme sequence exceeded limit ({} > {}), chunking", + ids.len(), + MAX_PHONEME_LEN + ); + // Compute punctuation IDs from the vocab instead of hardcoding them + let punct_ids: Vec = [';', ':', ',', '.', '!', '?'] + .iter() + .filter_map(|c| self.vocab.get(c).copied()) + .collect(); + split_chunks_with_punct(&ids, &punct_ids) + } else { + vec![ids] + }; + + let mut combined = Vec::with_capacity(estimated_samples); + + for chunk_ids in chunks.iter() { + let style = self.voice_store.get_style(voice_name, style_idx)?; + let audio = self.synthesize_chunk(chunk_ids, &style, speed)?; + if audio.is_empty() { + continue; + } + + if combined.is_empty() { + combined.extend_from_slice(&audio); + } else { + append_with_crossfade(&mut combined, &audio, CHUNK_CROSSFADE_SAMPLES); + } + } + + Ok(combined) + } + + /// Run ONNX inference on a single chunk of phoneme token IDs. + fn synthesize_chunk( + &mut self, + tokens: &[i64], + style: &[f32; STYLE_DIM], + speed: f32, + ) -> Result, KokoroError> { + let seq_len = tokens.len() + 2; // +2 for padding tokens + + // Build tokens tensor: [[0, t1..tN, 0]] + let mut padded = vec![0i64; seq_len]; + padded[1..seq_len - 1].copy_from_slice(tokens); + let tokens_arr = Array2::from_shape_vec((1, seq_len), padded)?; + + // Build style tensor: [[s0..s255]] — use a view to avoid copying the 256-float array + let style_view = ndarray::ArrayView2::from_shape((1, STYLE_DIM), style.as_slice())?; + + // Run session + let output = if self.speed_is_int32 { + let speed_arr = ndarray::arr1(&[speed as i32]); + let inputs = inputs![ + self.tokens_input_name.as_str() => TensorRef::from_array_view(tokens_arr.view())?, + "style" => TensorRef::from_array_view(style_view)?, + "speed" => TensorRef::from_array_view(speed_arr.view())?, + ]; + self.session.run(inputs)? 
+ } else { + let speed_arr = ndarray::arr1(&[speed]); + let inputs = inputs![ + self.tokens_input_name.as_str() => TensorRef::from_array_view(tokens_arr.view())?, + "style" => TensorRef::from_array_view(style_view)?, + "speed" => TensorRef::from_array_view(speed_arr.view())?, + ]; + self.session.run(inputs)? + }; + + // Extract first output as waveform + let first_output = output + .iter() + .next() + .ok_or_else(|| KokoroError::Ort(ort::Error::new("No output from model")))?; + let waveform = first_output.1.try_extract_array::()?; + + Ok(waveform.as_slice().unwrap_or(&[]).to_vec()) + } + + /// List all available voice names. + pub fn list_voices(&self) -> Vec<&str> { + self.voice_store.list_voices() + } +} + +/// Find the ONNX model file in the given directory. +/// +/// Prefers `kokoro-quant-convinteger.onnx`, then falls back to the first `.onnx` file found. +fn find_onnx_file(model_dir: &Path) -> Result { + let preferred = model_dir.join("kokoro-quant-convinteger.onnx"); + if preferred.exists() { + return Ok(preferred); + } + + // Scan for any .onnx file + for entry in std::fs::read_dir(model_dir)? { + let entry = entry?; + let path = entry.path(); + if path.extension().and_then(|e| e.to_str()) == Some("onnx") { + log::info!("Using ONNX file: {}", path.display()); + return Ok(path); + } + } + + Err(KokoroError::Io(std::io::Error::new( + std::io::ErrorKind::NotFound, + format!("No .onnx file found in {}", model_dir.display()), + ))) +} + +/// Initialize an ONNX session with optional on-disk graph caching. +/// +/// The first time a model is loaded, ORT runs Level3 graph optimization (5–10 s) +/// and serialises the result to `optimized_cache_path`. Every subsequent load +/// reads the pre-optimized file directly at `Disable` optimization level, cutting +/// cold-start time to under one second. +/// +/// If `optimized_cache_path` is `None` the original behaviour (always Level3) is +/// preserved, which is useful for unit-testing or read-only deployments. 
+fn init_session( + onnx_path: &Path, + num_threads: Option, + optimized_cache_path: Option<&Path>, +) -> Result { + let providers = vec![CPUExecutionProvider::default().build()]; + + // Choose load path and optimization level depending on cache state. + let (load_path, opt_level, write_cache) = match optimized_cache_path { + // Pre-optimized graph already on disk → load it directly, skip optimization. + Some(cache) if cache.exists() => { + log::info!( + "Loading pre-optimized Kokoro graph ({:.1} MB) from {:?} — skipping Level3", + cache + .metadata() + .map(|m| m.len() as f64 / 1_048_576.0) + .unwrap_or(0.0), + cache + ); + (cache, GraphOptimizationLevel::Disable, false) + } + // Cache path given but file does not exist yet → build + persist. + Some(cache) => { + log::info!( + "First load: running Level3 optimization; saving graph to {:?}", + cache + ); + (onnx_path, GraphOptimizationLevel::Level3, true) + } + // No cache path → original behaviour. + None => (onnx_path, GraphOptimizationLevel::Level3, false), + }; + + let mut builder = Session::builder()? + .with_optimization_level(opt_level)? + .with_execution_providers(providers)? + .with_parallel_execution(true)?; + + if write_cache { + // Serialise the optimized graph so the next launch can skip optimization. + let cache = optimized_cache_path.unwrap(); + builder = builder.with_optimized_model_path(cache)?; + } + + if let Some(threads) = num_threads { + builder = builder + .with_intra_threads(threads)? + .with_inter_threads(threads)?; + } + + Ok(builder.commit_from_file(load_path)?) +} + +/// Detect the token input name ("input_ids" or "tokens") from session inputs. +fn detect_tokens_input(session: &Session) -> String { + for input in session.inputs() { + if input.name() == "input_ids" || input.name() == "tokens" { + return input.name().to_string(); + } + } + // Default to "input_ids" if neither is found + "input_ids".to_string() +} + +/// Detect whether the speed input expects int32 (true) or float32 (false). 
+fn detect_speed_type(session: &Session) -> bool { + for input in session.inputs() { + if input.name() == "speed" { + // Check the type description + let type_str = format!("{:?}", input.dtype()); + return type_str.contains("Int32") || type_str.contains("int32"); + } + } + // Default: modern Kokoro models use int32 + true +} + +/// Split phoneme IDs into chunks of at most `MAX_PHONEME_LEN`, preferring punctuation. +/// Takes an explicit set of punctuation IDs instead of hardcoding them. +fn split_chunks_with_punct(ids: &[i64], punct_ids: &[i64]) -> Vec> { + let mut chunks = Vec::new(); + let mut start = 0; + + while start < ids.len() { + let end = (start + MAX_PHONEME_LEN).min(ids.len()); + if end == ids.len() { + chunks.push(ids[start..end].to_vec()); + break; + } + + // Try to find a good split point (last punctuation before `end`). + let split = ids[start..end] + .iter() + .enumerate() + .rev() + .find(|(_, &id)| punct_ids.contains(&id)) + .map(|(i, _)| start + i + 1) + .unwrap_or(end); + + chunks.push(ids[start..split].to_vec()); + start = split; + } + + chunks +} + +fn append_with_crossfade(dst: &mut Vec, src: &[f32], crossfade_samples: usize) { + let overlap = crossfade_samples.min(dst.len()).min(src.len()); + if overlap == 0 { + dst.extend_from_slice(src); + return; + } + + let dst_start = dst.len() - overlap; + for i in 0..overlap { + let t = (i + 1) as f32 / (overlap as f32 + 1.0); + let left = dst[dst_start + i] * (1.0 - t); + let right = src[i] * t; + dst[dst_start + i] = left + right; + } + + dst.extend_from_slice(&src[overlap..]); +} \ No newline at end of file diff --git a/vendor/tts-rs/src/engines/kokoro/phonemizer.rs b/vendor/tts-rs/src/engines/kokoro/phonemizer.rs new file mode 100644 index 0000000..2c85a09 --- /dev/null +++ b/vendor/tts-rs/src/engines/kokoro/phonemizer.rs @@ -0,0 +1,462 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::io::Write; +#[cfg(target_os = "windows")] +use std::os::windows::process::CommandExt; +use 
std::path::PathBuf; +use std::process::{Command, Stdio}; +use std::time::Duration; + +use super::model::KokoroError; + +#[cfg(target_os = "windows")] +const CREATE_NO_WINDOW: u32 = 0x0800_0000; + +/// Configuration for locating the espeak-ng binary and its data directory. +/// +/// When paths are `None`, falls back to `"espeak-ng"` on PATH with its +/// compiled-in default data directory. +#[derive(Debug, Clone, Default)] +pub struct EspeakConfig { + /// Path to the espeak-ng binary. Falls back to `"espeak-ng"` on PATH. + pub bin_path: Option, + /// Path to the espeak-ng-data directory. When set, passed via `--path`. + pub data_path: Option, +} + +/// Map a voice name prefix to an espeak-ng language code. +/// +/// Voice names follow the pattern `{prefix}_{name}` where the two-character +/// prefix encodes the language. +pub fn voice_lang(voice: &str) -> &'static str { + let prefix: String = voice.chars().take(2).collect(); + match prefix.as_str() { + "af" | "am" => "en-us", + "bf" | "bm" => "en-gb", + "ef" | "em" => "es", + "ff" => "fr", + "hf" | "hm" => "hi", + "if" | "im" => "it", + "jf" | "jm" => "ja", + "pf" | "pm" => "pt-br", + "zf" | "zm" => "cmn", + _ => "en-us", + } +} + +/// Convert text to Kokoro phoneme token IDs via espeak-ng. +/// +/// # Arguments +/// - `text`: The input text to phonemize +/// - `lang`: espeak-ng language code (e.g. `"en-us"`, `"fr"`, `"ja"`, `"cmn"`) +/// - `vocab`: Mapping from IPA characters to token IDs +/// +/// # Returns +/// A `Vec` of token IDs. Characters not in the vocab are silently dropped, +/// matching the behavior of the Python reference implementation. 
+pub fn phonemize( + text: &str, + lang: &str, + vocab: &HashMap, + espeak: &EspeakConfig, +) -> Result, KokoroError> { + let parts = split_text_parts(text); + if parts.is_empty() { + return Ok(Vec::new()); + } + + let text_segments: Vec<&str> = parts + .iter() + .filter_map(|part| match part { + TextPart::Text(segment) => Some(segment.as_str()), + TextPart::Punct(_) => None, + }) + .collect(); + + let segment_ids = if text_segments.is_empty() { + Vec::new() + } else { + phonemize_segments_batch(&text_segments, lang, vocab, espeak)? + }; + + let mut ids = Vec::new(); + let mut segment_index = 0usize; + for part in parts { + match part { + TextPart::Text(_) => { + if let Some(chunk) = segment_ids.get(segment_index) { + ids.extend_from_slice(chunk); + } + segment_index += 1; + } + TextPart::Punct(ch) => { + if let Some(&id) = vocab.get(&ch) { + ids.push(id); + } + } + } + } + + Ok(ids) +} + +#[derive(Debug, Clone, PartialEq, Eq)] +enum TextPart { + Text(String), + Punct(char), +} + +fn split_text_parts(text: &str) -> Vec { + let mut parts = Vec::new(); + let mut current = String::new(); + let chars: Vec = text.chars().collect(); + let mut i = 0; + + while i < chars.len() { + let ch = chars[i]; + // Calculate byte index for this character + let idx: usize = text.char_indices().nth(i).map(|(pos, _)| pos).unwrap_or(text.len()); + let ch_len = ch.len_utf8(); + + // Handle CRLF as a single boundary token + if ch == '\r' && i + 1 < chars.len() && chars[i + 1] == '\n' { + // Treat CRLF as a single period token + flush_text_part(&mut parts, &mut current); + parts.push(TextPart::Punct('.')); + i += 2; // Skip both \r and \n + continue; + } + + if let Some(punct) = map_boundary_punctuation(ch) { + if !is_numeric_connector_between_digits(text, idx, ch_len, ch) { + flush_text_part(&mut parts, &mut current); + parts.push(TextPart::Punct(punct)); + i += 1; + continue; + } + } + + if ch.is_whitespace() { + if !current.is_empty() && !current.ends_with(' ') { + current.push(' '); + } 
+ i += 1; + continue; + } + + current.push(ch); + i += 1; + } + + flush_text_part(&mut parts, &mut current); + parts +} + +fn flush_text_part(parts: &mut Vec, current: &mut String) { + let trimmed = current.trim(); + if trimmed.is_empty() { + current.clear(); + return; + } + parts.push(TextPart::Text(trimmed.to_string())); + current.clear(); +} + +fn map_boundary_punctuation(ch: char) -> Option { + match ch { + '.' | '!' | '?' | ',' | ';' | ':' | '—' | '…' | '"' | '(' | ')' | '\u{201c}' + | '\u{201d}' => Some(ch), + '\n' | '\r' => Some('.'), + _ => None, + } +} + +fn is_numeric_connector_between_digits(text: &str, idx: usize, ch_len: usize, ch: char) -> bool { + if !matches!(ch, '.' | ',') { + return false; + } + + let prev = text[..idx].chars().next_back(); + let next = text[idx + ch_len..].chars().next(); + + matches!( + (prev, next), + (Some(left), Some(right)) if left.is_ascii_digit() && right.is_ascii_digit() + ) +} + +fn phonemize_segments_batch( + segments: &[&str], + lang: &str, + vocab: &HashMap, + espeak: &EspeakConfig, +) -> Result>, KokoroError> { + let batched_input = segments.join("\n"); + let output = run_espeak(&batched_input, lang, espeak)?; + let lines: Vec<&str> = output.lines().collect(); + + // espeak-ng should emit one line per input line for stdin mode. + // If this assumption breaks, fall back to per-segment invocation. 
+ if lines.len() != segments.len() { + return segments + .iter() + .map(|segment| { + let output = run_espeak(segment, lang, espeak)?; + Ok(ipa_to_ids(&output, vocab)) + }) + .collect(); + } + + Ok(lines.iter().map(|line| ipa_to_ids(line, vocab)).collect()) +} + +fn run_espeak(input: &str, lang: &str, espeak: &EspeakConfig) -> Result { + let bin = espeak + .bin_path + .as_deref() + .map(|p| p.as_os_str().to_owned()) + .unwrap_or_else(|| std::ffi::OsString::from("espeak-ng")); + let mut cmd = Command::new(&bin); + cmd.args(["--ipa", "--stdin", "-q", "-v", lang]); + if let Some(data_path) = espeak.data_path.as_deref() { + cmd.arg("--path").arg(data_path); + } + // When using a bundled binary, shared libraries (libespeak-ng.so, + // libpcaudio.so) are placed next to the binary. On Linux, the dynamic + // linker needs LD_LIBRARY_PATH to find them (RPATH may not be set). + #[cfg(target_os = "linux")] + if let Some(bin_dir) = espeak.bin_path.as_deref().and_then(|p| p.parent()) { + let new_ld_library_path = if let Some(existing) = std::env::var_os("LD_LIBRARY_PATH") { + let mut path = bin_dir.as_os_str().to_owned(); + path.push(":"); + path.push(&existing); + path + } else { + bin_dir.as_os_str().to_owned() + }; + cmd.env("LD_LIBRARY_PATH", new_ld_library_path); + } + #[cfg(target_os = "windows")] + { + // espeak-ng.exe is a console subsystem binary. When Parrot runs without + // an attached console, Windows can create a visible console window for + // every phonemizer child unless we suppress it explicitly. + cmd.creation_flags(CREATE_NO_WINDOW); + } + let mut child = cmd + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + KokoroError::EspeakNotFound + } else { + KokoroError::Io(e) + } + })?; + + if let Some(mut stdin) = child.stdin.take() { + // espeak-ng treats stdin as line-oriented input. Without a final line terminator, + // the last token can be under-processed. 
Enforce a canonical, newline-terminated + // payload as part of this I/O contract. + let stdin_payload = canonicalize_espeak_stdin_payload(input); + stdin + .write_all(stdin_payload.as_bytes()) + .map_err(KokoroError::Io)?; + // Explicitly drop stdin to close the pipe before waiting + drop(stdin); + } + + // Hard timeout for espeak-ng child process (30 seconds) + let timeout = Duration::from_secs(30); + let output = wait_with_timeout(child, timeout)?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(KokoroError::PhonemizerFailed(format!( + "espeak-ng exited with code {:?}: {stderr}", + output.status.code() + ))); + } + + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} + +/// Wait for a child process with a timeout. If the timeout expires, kill the child. +fn wait_with_timeout( + mut child: std::process::Child, + timeout: Duration, +) -> Result { + use std::sync::{mpsc, Arc, Mutex}; + use std::thread; + + let child_id = child.id(); + let child_arc = Arc::new(Mutex::new(Some(child))); + let child_clone = Arc::clone(&child_arc); + + let (tx, rx) = mpsc::channel(); + + // Spawn a thread to wait for the child process + thread::spawn(move || { + let mut child_guard = child_clone.lock().unwrap(); + if let Some(mut child) = child_guard.take() { + let result = child.wait_with_output(); + let _ = tx.send(result); + } + }); + + // Wait for the result with a timeout + match rx.recv_timeout(timeout) { + Ok(Ok(output)) => Ok(output), + Ok(Err(e)) => Err(KokoroError::Io(e)), + Err(mpsc::RecvTimeoutError::Timeout) => { + // Timeout expired - try to kill the child process + let mut child_guard = child_arc.lock().unwrap(); + if let Some(mut child) = child_guard.take() { + let _ = child.kill(); + let _ = child.wait(); + } + Err(KokoroError::Timeout(timeout)) + } + Err(mpsc::RecvTimeoutError::Disconnected) => Err(KokoroError::Io( + std::io::Error::new( + std::io::ErrorKind::Other, + format!("espeak-ng process (PID 
{child_id}) channel disconnected"), + ), + )), + } +} + +fn canonicalize_espeak_stdin_payload(input: &str) -> Cow<'_, str> { + if input.ends_with('\n') { + Cow::Borrowed(input) + } else { + Cow::Owned(format!("{input}\n")) + } +} + +fn ipa_to_ids(ipa: &str, vocab: &HashMap) -> Vec { + let mut ids = Vec::new(); + for line in ipa.lines() { + let line = line.trim(); + if line.is_empty() { + continue; + } + for ch in line.chars() { + if ch == '_' { + continue; + } + if let Some(&id) = vocab.get(&ch) { + ids.push(id); + } + } + } + ids +} + +#[cfg(test)] +mod tests { + use super::{ + canonicalize_espeak_stdin_payload, phonemize, run_espeak, split_text_parts, EspeakConfig, + TextPart, + }; + use crate::engines::kokoro::vocab::hardcoded_vocab; + use std::process::Command; + + fn espeak_available() -> bool { + Command::new("espeak-ng") + .arg("--version") + .output() + .is_ok() + } + + #[test] + fn splits_text_and_punctuation_parts() { + let parts = split_text_parts("Hello, world. Testing!"); + assert_eq!( + parts, + vec![ + TextPart::Text("Hello".to_string()), + TextPart::Punct(','), + TextPart::Text("world".to_string()), + TextPart::Punct('.'), + TextPart::Text("Testing".to_string()), + TextPart::Punct('!'), + ] + ); + } + + #[test] + fn keeps_decimal_and_thousands_separators_inside_text() { + let parts = split_text_parts("Version 2.0 reached 1,000 users."); + assert_eq!( + parts, + vec![ + TextPart::Text("Version 2.0 reached 1,000 users".to_string()), + TextPart::Punct('.'), + ] + ); + } + + #[test] + fn still_splits_comma_when_not_between_digits() { + let parts = split_text_parts("Value 2, next"); + assert_eq!( + parts, + vec![ + TextPart::Text("Value 2".to_string()), + TextPart::Punct(','), + TextPart::Text("next".to_string()), + ] + ); + } + + #[test] + fn appends_trailing_newline_for_espeak_stdin() { + assert_eq!(canonicalize_espeak_stdin_payload("America"), "America\n"); + } + + #[test] + fn keeps_single_trailing_newline_for_espeak_stdin() { + 
assert_eq!(canonicalize_espeak_stdin_payload("America\n"), "America\n"); + } + + #[test] + fn espeak_output_is_stable_with_or_without_trailing_newline() { + if !espeak_available() { + return; + } + + let cfg = EspeakConfig::default(); + let without_newline = + run_espeak("America", "en-us", &cfg).expect("espeak should succeed"); + let with_newline = + run_espeak("America\n", "en-us", &cfg).expect("espeak should succeed"); + assert_eq!( + without_newline.trim(), + with_newline.trim(), + "stdin canonicalization must prevent final-token truncation" + ); + } + + #[test] + fn phonemize_keeps_terminal_schwa_for_america() { + if !espeak_available() { + return; + } + + let vocab = hardcoded_vocab(); + let cfg = EspeakConfig::default(); + let ids = + phonemize("America", "en-us", &vocab, &cfg).expect("phonemize should succeed"); + let schwa_id = *vocab + .get(&'ə') + .expect("hardcoded vocab should include schwa"); + assert_eq!( + ids.last(), + Some(&schwa_id), + "terminal schwa should be preserved for 'America'" + ); + } +} \ No newline at end of file diff --git a/vendor/tts-rs/src/engines/kokoro/vocab.rs b/vendor/tts-rs/src/engines/kokoro/vocab.rs new file mode 100644 index 0000000..890b9fe --- /dev/null +++ b/vendor/tts-rs/src/engines/kokoro/vocab.rs @@ -0,0 +1,163 @@ +use std::collections::HashMap; +use std::path::Path; + +use super::model::KokoroError; + +/// Load the Kokoro vocabulary from a config.json file. +/// +/// The config.json must contain a `"vocab"` field mapping single-character +/// strings to integer token IDs. +pub fn load_vocab(config_path: &Path) -> Result, KokoroError> { + let content = std::fs::read_to_string(config_path)?; + let json: serde_json::Value = serde_json::from_str(&content) + .map_err(|e| KokoroError::Config(format!("Failed to parse JSON: {e}")))?; + + let vocab_obj = json + .get("vocab") + .ok_or_else(|| KokoroError::Config("Missing 'vocab' field".to_string()))? 
+ .as_object() + .ok_or_else(|| KokoroError::Config("'vocab' must be an object".to_string()))?; + + let mut map = HashMap::new(); + for (k, v) in vocab_obj { + if k.is_empty() { + return Err(KokoroError::Config(format!("Empty key in vocab: {k:?}"))); + } + if k.chars().count() != 1 { + return Err(KokoroError::Config(format!( + "Vocab key must be a single character, got: {k:?}" + ))); + } + let ch = k.chars().next().unwrap(); + let id = v + .as_i64() + .ok_or_else(|| KokoroError::Config(format!("Non-integer vocab value for key {k:?}")))?; + map.insert(ch, id); + } + + Ok(map) +} + +/// Hardcoded Kokoro vocabulary (from config.json, model-version-agnostic). +/// +/// Only used as a fallback when config.json is not present. Prefer loading +/// from config.json via `load_vocab()`. +pub fn hardcoded_vocab() -> HashMap { + let entries: &[(char, i64)] = &[ + (';', 1), + (':', 2), + (',', 3), + ('.', 4), + ('!', 5), + ('?', 6), + ('—', 9), + ('…', 10), + ('"', 11), + ('(', 12), + (')', 13), + ('\u{201c}', 14), + ('\u{201d}', 15), + (' ', 16), + ('\u{0303}', 17), + ('ʣ', 18), + ('ʥ', 19), + ('ʦ', 20), + ('ʨ', 21), + ('ᵝ', 22), + ('ꭧ', 23), + ('A', 24), + ('I', 25), + ('O', 31), + ('Q', 33), + ('S', 35), + ('T', 36), + ('W', 39), + ('Y', 41), + ('ᵊ', 42), + ('a', 43), + ('b', 44), + ('c', 45), + ('d', 46), + ('e', 47), + ('f', 48), + ('h', 50), + ('i', 51), + ('j', 52), + ('k', 53), + ('l', 54), + ('m', 55), + ('n', 56), + ('o', 57), + ('p', 58), + ('q', 59), + ('r', 60), + ('s', 61), + ('t', 62), + ('u', 63), + ('v', 64), + ('w', 65), + ('x', 66), + ('y', 67), + ('z', 68), + ('ɑ', 69), + ('ɐ', 70), + ('ɒ', 71), + ('æ', 72), + ('β', 75), + ('ɔ', 76), + ('ɕ', 77), + ('ç', 78), + ('ɖ', 80), + ('ð', 81), + ('ʤ', 82), + ('ə', 83), + ('ɚ', 85), + ('ɛ', 86), + ('ɜ', 87), + ('ɟ', 90), + ('ɡ', 92), + ('ɥ', 99), + ('ɨ', 101), + ('ɪ', 102), + ('ʝ', 103), + ('ɯ', 110), + ('ɰ', 111), + ('ŋ', 112), + ('ɳ', 113), + ('ɲ', 114), + ('ɴ', 115), + ('ø', 116), + ('ɸ', 118), + ('θ', 119), + 
('œ', 120), + ('ɹ', 123), + ('ɾ', 125), + ('ɻ', 126), + ('ʁ', 128), + ('ɽ', 129), + ('ʂ', 130), + ('ʃ', 131), + ('ʈ', 132), + ('ʧ', 133), + ('ʊ', 135), + ('ʋ', 136), + ('ʌ', 138), + ('ɣ', 139), + ('ɤ', 140), + ('χ', 142), + ('ʎ', 143), + ('ʒ', 147), + ('ʔ', 148), + ('ˈ', 156), + ('ˌ', 157), + ('ː', 158), + ('ʰ', 162), + ('ʲ', 164), + ('↓', 169), + ('→', 171), + ('↗', 172), + ('↘', 173), + ('ᵻ', 177), + ]; + entries.iter().copied().collect() +} \ No newline at end of file diff --git a/vendor/tts-rs/src/engines/kokoro/voices.rs b/vendor/tts-rs/src/engines/kokoro/voices.rs new file mode 100644 index 0000000..22b5a20 --- /dev/null +++ b/vendor/tts-rs/src/engines/kokoro/voices.rs @@ -0,0 +1,181 @@ +use std::collections::HashMap; +use std::ffi::OsStr; +use std::fs::File; +use std::io::Read; +use std::path::Path; + +use super::model::KokoroError; + +/// Storage for all loaded voice style vectors. +/// +/// Each voice is stored as a flat list of style vectors, where each vector +/// has 256 floats. The index into the list corresponds to the phoneme token +/// count, enabling prosody-consistent synthesis. +pub struct VoiceStore { + voices: HashMap>, +} + +impl VoiceStore { + /// Load all voices from a .npz (numpy zip) file. + /// + /// The file should be a standard .npz archive where each entry is a + /// .npy file named after the voice (e.g., `af_heart.npy`). 
+ pub fn load(path: &Path) -> Result { + let file = File::open(path)?; + let mut zip = zip::ZipArchive::new(file) + .map_err(|e| KokoroError::VoiceParse(format!("Failed to open zip archive: {e}")))?; + + let mut voices = HashMap::new(); + + for i in 0..zip.len() { + let mut entry = zip.by_index(i).map_err(|e| { + KokoroError::VoiceParse(format!("Failed to read zip entry {i}: {e}")) + })?; + + let raw_name = entry.name().to_string(); + + // Skip directory entries + if raw_name.ends_with('/') { + continue; + } + + // Voice name is the basename without the .npy extension + let voice_name = Path::new(&raw_name) + .file_name() + .and_then(OsStr::to_str) + .map(|name| name.trim_end_matches(".npy")) + .filter(|name| !name.is_empty()) + .map(str::to_string); + + let Some(voice_name) = voice_name else { + continue; + }; + + let mut data = Vec::new(); + entry + .read_to_end(&mut data) + .map_err(|e| KokoroError::VoiceParse(format!("Failed to read {raw_name}: {e}")))?; + + let style_vectors = parse_npy(&data, &raw_name)?; + voices.insert(voice_name, style_vectors); + } + + log::info!("Loaded {} voices", voices.len()); + Ok(Self { voices }) + } + + /// Get the style vector for a voice at the given index. + /// + /// The index is clamped to the valid range, so any index is safe. + pub fn get_style(&self, voice: &str, idx: usize) -> Result<[f32; 256], KokoroError> { + let styles = self + .voices + .get(voice) + .ok_or_else(|| KokoroError::VoiceNotFound(voice.to_string()))?; + + if styles.is_empty() { + return Err(KokoroError::VoiceParse(format!( + "Voice {voice} has no style vectors" + ))); + } + + let clamped = idx.min(styles.len().saturating_sub(1)); + Ok(styles[clamped]) + } + + /// List all available voice names in sorted order. + pub fn list_voices(&self) -> Vec<&str> { + let mut names: Vec<&str> = self.voices.keys().map(|s| s.as_str()).collect(); + names.sort_unstable(); + names + } +} + +/// Parse a numpy .npy file into a list of style vectors. 
+/// +/// Expects a 2D float32 array of shape `[N, 256]` in little-endian format. +fn parse_npy(data: &[u8], name: &str) -> Result, KokoroError> { + // Verify numpy magic bytes: \x93NUMPY + if data.len() < 10 { + return Err(KokoroError::VoiceParse(format!( + "{name}: file too short ({} bytes)", + data.len() + ))); + } + + if &data[0..6] != b"\x93NUMPY" { + return Err(KokoroError::VoiceParse(format!( + "{name}: invalid numpy magic bytes" + ))); + } + + // major version at [6], minor at [7] + let major = data[6]; + let minor = data[7]; + + // Read header_len based on numpy version + let (header_len, data_offset) = match major { + 1 => { + // numpy 1.0: 2-byte little-endian u16 header_len at [8..10] + let header_len = u16::from_le_bytes([data[8], data[9]]) as usize; + (header_len, 10 + header_len) + } + 2 => { + // numpy 2.0: 4-byte little-endian u32 header_len at [8..12] + if data.len() < 12 { + return Err(KokoroError::VoiceParse(format!( + "{name}: file too short for numpy 2.0 header ({} bytes)", + data.len() + ))); + } + let header_len = u32::from_le_bytes([data[8], data[9], data[10], data[11]]) as usize; + (header_len, 12 + header_len) + } + _ => { + return Err(KokoroError::VoiceParse(format!( + "{name}: unsupported numpy version {major}.{minor}" + ))); + } + }; + + if data.len() < data_offset { + return Err(KokoroError::VoiceParse(format!( + "{name}: header truncated (need {data_offset} bytes, got {})", + data.len() + ))); + } + + let float_data = &data[data_offset..]; + if !float_data.len().is_multiple_of(4) { + return Err(KokoroError::VoiceParse(format!( + "{name}: float data length {} is not a multiple of 4", + float_data.len() + ))); + } + + let n_floats = float_data.len() / 4; + if !n_floats.is_multiple_of(256) { + return Err(KokoroError::VoiceParse(format!( + "{name}: float count {n_floats} is not a multiple of 256 (style vector dim)" + ))); + } + + let n_styles = n_floats / 256; + let mut result = Vec::with_capacity(n_styles); + + for i in 0..n_styles { + 
let mut vec = [0f32; 256]; + for (j, slot) in vec.iter_mut().enumerate() { + let offset = (i * 256 + j) * 4; + *slot = f32::from_le_bytes([ + float_data[offset], + float_data[offset + 1], + float_data[offset + 2], + float_data[offset + 3], + ]); + } + result.push(vec); + } + + Ok(result) +} \ No newline at end of file diff --git a/vendor/tts-rs/src/engines/mod.rs b/vendor/tts-rs/src/engines/mod.rs new file mode 100644 index 0000000..b0f399a --- /dev/null +++ b/vendor/tts-rs/src/engines/mod.rs @@ -0,0 +1,11 @@ +//! Speech synthesis engines. +//! +//! This module contains implementations of text-to-speech engines. +//! +//! # Available Engines +//! +//! Enable engines via Cargo features: +//! - `kokoro` - Kokoro TTS (ONNX format, espeak-ng required) + +#[cfg(feature = "kokoro")] +pub mod kokoro; diff --git a/vendor/tts-rs/src/lib.rs b/vendor/tts-rs/src/lib.rs new file mode 100644 index 0000000..763e07f --- /dev/null +++ b/vendor/tts-rs/src/lib.rs @@ -0,0 +1,114 @@ +//! # transcribe-rs +//! +//! A Rust library providing text-to-speech synthesis using the Kokoro engine. +//! +//! ## Features +//! +//! - **Kokoro TTS**: High-quality text-to-speech with multiple voices and languages +//! - **Flexible Model Loading**: Load models with custom parameters +//! - **Multiple Voices**: Support for 9 languages with various voice styles +//! +//! ## Quick Start +//! +//! ```toml +//! [dependencies] +//! transcribe-rs = { version = "0.2", features = ["kokoro"] } +//! ``` +//! +//! ```ignore +//! use std::path::PathBuf; +//! use transcribe_rs::{engines::kokoro::KokoroEngine, SynthesisEngine}; +//! +//! let mut engine = KokoroEngine::new(); +//! engine.load_model(&PathBuf::from("models/kokoro-v1.0"))?; +//! +//! let result = engine.synthesize("Hello, world!", None)?; +//! result.write_wav(&PathBuf::from("output.wav"))?; +//! # Ok::<(), Box>(()) +//! ``` + +pub mod engines; + +use std::path::Path; + +/// The result of a synthesis (text-to-speech) operation. 
+/// +/// Contains raw f32 audio samples and the sample rate of the output audio. +#[derive(Debug)] +pub struct SynthesisResult { + /// Raw audio samples as f32 values + pub samples: Vec, + /// Sample rate of the audio (24000 for Kokoro) + pub sample_rate: u32, +} + +impl SynthesisResult { + /// Write the audio to a 32-bit float WAV file. + pub fn write_wav(&self, path: &Path) -> Result<(), Box> { + let spec = hound::WavSpec { + channels: 1, + sample_rate: self.sample_rate, + bits_per_sample: 32, + sample_format: hound::SampleFormat::Float, + }; + let mut writer = hound::WavWriter::create(path, spec)?; + for &sample in &self.samples { + writer.write_sample(sample)?; + } + writer.finalize()?; + Ok(()) + } + + /// Duration of the audio in seconds. + pub fn duration_secs(&self) -> f64 { + if self.sample_rate == 0 { + return 0.0; + } + self.samples.len() as f64 / self.sample_rate as f64 + } +} + +/// Common interface for text-to-speech synthesis engines. +/// +/// This trait defines the standard operations that all synthesis engines must support. +/// Each engine may have different parameter types for model loading and inference configuration. +pub trait SynthesisEngine { + /// Parameters for configuring inference behavior (voice, speed, etc.) + type SynthesisParams; + /// Parameters for configuring model loading (threads, etc.) + type ModelParams: Default; + + /// Load a model from the specified path using default parameters. + fn load_model(&mut self, model_path: &Path) -> Result<(), Box> { + self.load_model_with_params(model_path, Self::ModelParams::default()) + } + + /// Load a model from the specified path with custom parameters. + fn load_model_with_params( + &mut self, + model_path: &Path, + params: Self::ModelParams, + ) -> Result<(), Box>; + + /// Unload the currently loaded model and free associated resources. + fn unload_model(&mut self); + + /// Synthesize speech from the given text. 
+ fn synthesize( + &mut self, + text: &str, + params: Option, + ) -> Result>; + + /// Synthesize speech from the given text and write to a WAV file. + /// + /// Default implementation calls `synthesize()` then `SynthesisResult::write_wav()`. + fn synthesize_to_file( + &mut self, + text: &str, + wav_path: &Path, + params: Option, + ) -> Result<(), Box> { + self.synthesize(text, params)?.write_wav(wav_path) + } +} \ No newline at end of file