diff --git a/.gitignore b/.gitignore index fb6236d..d5a631f 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,25 @@ __pycache__/ # Generated mel reference tensors (regenerate with scripts/gen_reference.py) *.mel.npy + +# Jupyter checkpoints +.ipynb_checkpoints/ + +# Training venvs +training/**/.venv/ + +# Training data +training/smart-turn-zh/data/wav/*.wav +training/smart-turn-zh/data/**/*.wav +training/smart-turn-zh/data/*.jsonl +training/smart-turn-zh/data/vad_probs/ +training/smart-turn-zh/data/asr_results/ +training/smart-turn-zh/data/example/ +training/smart-turn-zh/data/grouped/ + +# Training references +training/smart-turn-zh/refs/*.pdf + +# Viewer +training/smart-turn-zh/viewer/node_modules/ +training/smart-turn-zh/viewer/dist/ \ No newline at end of file diff --git a/docs/plan-accuracy.md b/docs/01-plan-accuracy.md similarity index 100% rename from docs/plan-accuracy.md rename to docs/01-plan-accuracy.md diff --git a/docs/plan-backends.md b/docs/02-plan-backends.md similarity index 100% rename from docs/plan-backends.md rename to docs/02-plan-backends.md diff --git a/docs/plan-turn-controller.md b/docs/03-plan-turn-controller.md similarity index 100% rename from docs/plan-turn-controller.md rename to docs/03-plan-turn-controller.md diff --git a/training/smart-turn-zh/Makefile b/training/smart-turn-zh/Makefile new file mode 100644 index 0000000..b1302a9 --- /dev/null +++ b/training/smart-turn-zh/Makefile @@ -0,0 +1,36 @@ +VENV := .venv +PYTHON := $(VENV)/bin/python +PIP := $(VENV)/bin/pip + +.PHONY: help venv install notebook viewer clean + +help: + @echo "Available targets:" + @echo " venv Create virtualenv and install dependencies" + @echo " install Re-install dependencies into existing venv" + @echo " notebook Launch Jupyter notebook server" + @echo " viewer Launch audio viewer dev server" + @echo " clean Remove virtualenv" + +venv: $(VENV)/bin/activate + +$(VENV)/bin/activate: requirements.txt + python3 -m venv $(VENV) + $(PIP) install --upgrade 
pip + $(PIP) install -r requirements.txt + touch $(VENV)/bin/activate + +install: venv + $(PIP) install -r requirements.txt + +notebook: venv + $(VENV)/bin/jupyter lab notebooks/ + +viewer: viewer/node_modules + cd viewer && npm run dev + +viewer/node_modules: viewer/package.json + cd viewer && npm install + +clean: + rm -rf $(VENV) diff --git a/training/smart-turn-zh/README.md b/training/smart-turn-zh/README.md new file mode 100644 index 0000000..5641fa1 --- /dev/null +++ b/training/smart-turn-zh/README.md @@ -0,0 +1,20 @@ +# Smart Turn — Mandarin (smart-turn-zh) + +Mandarin-Chinese variant of the Smart Turn detector: fine-tuning the upstream +[Pipecat Smart Turn](../pipecat-smart-turn/) architecture (Whisper-Tiny encoder ++ binary classification head) on Chinese conversational audio. + +## Layout + +- [`plan-data.md`](plan-data.md) — dataset construction plan (under revision) +- [`research/`](research/) — surveys and open questions feeding the plans + - [`01-datasets.md`](research/01-datasets.md) — OpenSLR + HuggingFace dataset survey +- `data/` — data pipeline scripts +- `notebooks/` — Jupyter exploration +- `train/` — training scripts (later) + +## Status + +Design phase. The data pipeline plan in `plan-data.md` is the original +LLM-rewriting + TTS approach; we are revising toward real conversational +corpora — see `research/01-datasets.md` for the source-corpus analysis. diff --git a/training/smart-turn-zh/data/wav/.gitkeep b/training/smart-turn-zh/data/wav/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/training/smart-turn-zh/docs/01-vad-comparison.md b/training/smart-turn-zh/docs/01-vad-comparison.md new file mode 100644 index 0000000..a8fe3c5 --- /dev/null +++ b/training/smart-turn-zh/docs/01-vad-comparison.md @@ -0,0 +1,76 @@ +# VAD Model Comparison for Chinese Filler Detection + +## Use Case + +Detect speech regions in Chinese podcast audio, including soft fillers +(呃/嗯/啊) that ASR typically skips.
The goal: **VAD active ∧ ASR silent = +filler candidates** (PodcastFillers, Zhu et al. 2022). + +Key requirements: +- Catches soft/quiet speech (fillers are often quiet) +- Configurable activation threshold (paper found 0.1 critical) +- Fine temporal resolution (10ms ideal) +- Runs on MacBook Pro (Apple Silicon) + +## Comparison + +| | FSMN-VAD | Silero VAD | WebRTC VAD | pyannote.audio | TEN VAD | +|---|---|---|---|---|---| +| **Source** | Alibaba / FunASR | Silero | Google | pyannote | TEN Framework | +| **Params** | 0.4M | ~70KB (quantized) | <1MB compiled | ~68M | lightweight ONNX | +| **Resolution** | 200ms chunks | 10ms+ configurable | 10/20/30ms fixed | 10ms | 10–16ms | +| **Threshold** | Configurable (speech_noise_thres) | Probability output, fully tunable | Aggressiveness 0–3 (coarse) | Probability output, fully tunable | Configurable, default 0.5 | +| **Chinese** | Native (5000h Mandarin) | General (6000+ langs, no zh-specific) | Language-agnostic | Trained on AISHELL + AliMeeting | General | +| **Soft fillers** | Good, but 200ms chunks may blur boundaries | Catches at low threshold (~0.1–0.3) | May miss quiet fillers | Best accuracy on soft boundaries | Less documented | +| **Mac perf** | CPU, fast for 0.4M | ~40μs/chunk on M2 Max | Very fast CPU | Slow on CPU (68M params) | arm64 native + ONNX | +| **MPS/GPU** | Yes (via FunASR) | MLX native | N/A (CPU only) | MPS supported | ONNX only | +| **License** | Model-specific (check HF) | MIT | BSD | MIT | Apache 2.0 | + +## Analysis + +### FSMN-VAD +- **Pro**: Already in our stack (FunASR). Chinese-native, production-proven. +- **Con**: 200ms chunk size is coarser than ideal. Fine boundary detection + for short fillers (150–400ms) may lose precision. + +### Silero VAD +- **Pro**: Tiny, fast, 10ms resolution, MIT license. Threshold tunable to + ~0.1 for soft speech. MLX native on Apple Silicon. +- **Con**: Not Chinese-optimized. Lightweight design means less sophisticated + boundary detection. 
+ +### WebRTC VAD +- **Pro**: Mature, fast, 10ms native resolution. +- **Con**: No probability output — binary decisions with coarse aggressiveness + levels (0–3). Hard to tune for soft fillers. No fine-grained threshold. + +### pyannote.audio (segmentation-3.0) +- **Pro**: Best accuracy. 10ms resolution. Trained on Chinese datasets + (AISHELL, AliMeeting). Best at catching soft speech boundaries. +- **Con**: 68M params — slow on CPU. Overkill if we only need binary VAD. + +### TEN VAD +- **Pro**: 10ms resolution, superior precision vs WebRTC and Silero. Apache 2.0. +- **Con**: Newer (2024–2025), fewer production deployments. Less documented + for Chinese/soft speech. + +## Recommendation + +**Silero VAD** as primary choice: +- 10ms resolution matches what the paper uses +- Threshold tunable to 0.1 (critical finding from the paper) +- Tiny and fast on MacBook +- MIT license +- Good enough for candidate generation — the classifier stage handles precision + +**FSMN-VAD** as comparison baseline since it's already in our pipeline. + +If accuracy on soft boundaries proves insufficient, upgrade to **pyannote.audio**. + +## References + +- [PodcastFillers paper](../refs/2203.pdf) — Section 2.2: VAD threshold 0.1, candidates 150ms–2s +- [Silero VAD](https://github.com/snakers4/silero-vad) +- [FSMN-VAD](https://huggingface.co/funasr/fsmn-vad) +- [pyannote.audio](https://github.com/pyannote/pyannote-audio) +- [TEN VAD](https://github.com/TEN-framework/ten-vad) diff --git a/training/smart-turn-zh/docs/02-data-structures.md b/training/smart-turn-zh/docs/02-data-structures.md new file mode 100644 index 0000000..247e1de --- /dev/null +++ b/training/smart-turn-zh/docs/02-data-structures.md @@ -0,0 +1,114 @@ +# Data Structures + +Artifacts produced by the notebook pipeline (`01-asr-transcribe`, `02-vad`) and their schemas. 
+ +## Directory Layout + +``` +data/ +├── wav/ # Source audio +│ ├── R8001_M8004_MS801.wav # 8-ch, 16 kHz, PCM-16 +│ └── R8003_M8001_MS801.wav +├── asr_results/ # Per-file ASR transcriptions +│ ├── R8001_M8004_MS801.json +│ └── R8003_M8001_MS801.json +└── vad_probs/ # Per-frame speech probabilities + ├── R8001_M8004_MS801.npy + └── R8003_M8001_MS801.npy +``` + +## Source Audio (`data/wav/*.wav`) + +| Property | Value | +|-------------|--------------------------------| +| Format | RIFF WAVE, PCM 16-bit | +| Channels | 8 (per-speaker headset mics) | +| Sample rate | 16 kHz | +| Source | AliMeeting (SLR-119) meetings | + +## ASR Results (`data/asr_results/*.json`) + +**Format**: JSON — one file per WAV, named `{wav_stem}.json`. +**Producer**: `notebooks/01-asr-transcribe.ipynb` (Paraformer-zh + FSMN-VAD + ct-punc). + +### File Schema + +Each file contains a JSON array of record objects: + +```jsonc +[ + { + "text": "全文转写结果...", // full transcription (punctuated) + "sentences": [ /* see below */ ], + "timestamp": [ /* see below */ ] + } +] +``` + +### `sentences` Array Element + +Each element is one sentence/chunk segmented by the ASR model. + +```jsonc +{ + "text": "啊,", // punctuated text + "raw_text": "啊", // text without punctuation + "start": 7130, // start time (ms) + "end": 7370, // end time (ms) + "timestamp": [[7130, 7370]] // per-word [start_ms, end_ms] pairs +} +``` + +| Field | Type | Unit | Description | +|-------------|------------------|------|------------------------------------------| +| `text` | string | — | Sentence with restored punctuation | +| `raw_text` | string | — | Same sentence, no punctuation | +| `start` | int | ms | Sentence start time | +| `end` | int | ms | Sentence end time | +| `timestamp` | array of [int, int] | ms | Per-word start/end pairs (10 ms frames) | + +### Top-level `timestamp` Array + +Flat array of all per-word `[start_ms, end_ms]` pairs across the entire file (same data as the union of per-sentence timestamps). 
+ +## VAD Probabilities (`data/vad_probs/*.npy`) + +**Format**: NumPy `.npy`, 1-D `float32` array. +**Producer**: `notebooks/02-vad.ipynb` (Silero VAD). + +| Property | Value | +|-------------------|-------------------------------| +| Shape | `(num_frames,)` | +| Dtype | `float32` | +| Frame size | 512 samples = **32 ms** @ 16 kHz | +| Value range | `[0.0, 1.0]` — P(speech) | +| Index → time | `frame[i]` → `i * 32 ms` | + +### Loading + +```python +import numpy as np +probs = np.load("data/vad_probs/R8001_M8004_MS801.npy") +# probs[i] = speech probability at time i * 32 ms +``` + +### File Details + +| File | Frames | Duration | +|---------------------------|---------|------------| +| `R8001_M8004_MS801.npy` | 49,183 | ~1,573.9 s | +| `R8003_M8001_MS801.npy` | 64,625 | ~2,068.0 s | + +## Cross-referencing ASR and VAD + +ASR timestamps are in **milliseconds**; VAD frames are **32 ms** each. + +```python +# Convert ASR ms timestamp to VAD frame index +vad_frame = asr_start_ms // 32 + +# Convert VAD frame index to ms +time_ms = vad_frame * 32 +``` + +This alignment is used in the next pipeline step (filler candidate extraction): regions where VAD is active (`prob > threshold`) but ASR produces no recognized words. diff --git a/training/smart-turn-zh/docs/03-audio-viewer.md b/training/smart-turn-zh/docs/03-audio-viewer.md new file mode 100644 index 0000000..93a1be6 --- /dev/null +++ b/training/smart-turn-zh/docs/03-audio-viewer.md @@ -0,0 +1,238 @@ +# Audio Viewer — Plan + +A browser-based tool for inspecting audio alongside VAD probabilities and ASR +transcriptions. Designed for 1-hour recordings at interactive frame rates. + +## Goal + +Accelerate data review for the training pipeline. Load a WAV file with its +matching VAD and ASR outputs, visualize everything on a synchronized timeline, +and search/navigate ASR text to jump to specific moments. + +## Requirements + +### Functional + +1. 
**File upload** — drag-and-drop or file picker for three inputs: + - `.wav` audio (any channel count, 16 kHz, PCM-16) + - `.npy` VAD probabilities (float32, 32 ms frames) + - `.json` ASR result (sentence-level timestamps) + +2. **Waveform display** — amplitude over time. Supports any channel count + (1, 2, 4, 8, ...). Individual channel selection or a merged "all channels" + view that overlays every channel on a single waveform. + +3. **Spectrogram display** — time-frequency view of the selected channel. + +4. **VAD probability curve** — stacked below the waveform, showing P(speech) + per 32 ms frame. Two configurable threshold lines: **entry** (e.g. 0.3) and + **exit** (e.g. 0.1). Hysteresis logic: speech region starts when probs rise + above entry, ends when they drop below exit. Active speech regions are + filled with one color; inactive regions with another. + +5. **ASR transcript panel** — scrollable sentence list with timestamps. Clicking + a sentence seeks the playback cursor and centers the viewport. + +6. **Text search** — search box filters ASR sentences by keyword. Highlights + matching segments on the timeline. Previous/Next buttons (+ keyboard + shortcuts) cycle through results. + +7. **Audio playback** — play/pause, click-to-seek on waveform, playback cursor + that scrolls the viewport. + +### Non-functional + +- Handle **1-hour audio** (57.6 M samples per channel) without lag. +- Smooth zoom from full-file overview down to individual samples. +- All processing client-side — no server required. Open `index.html` and go. + +## Data Inputs + +All three formats are documented in `02-data-structures.md`. 
Key details +relevant to the viewer: + +| Input | Size (1 hr) | Notes | +|-------|-------------|-------| +| WAV (N-ch, 16 kHz, 16-bit) | varies | Decode via Web Audio API; select channel or merge all | +| VAD .npy (float32) | ~450 KB | ~112 K frames; parse npy header + raw ArrayBuffer | +| ASR .json | ~50-200 KB | Array of records with `sentences[].{start,end,text}` | + +### Parsing `.npy` in the browser + +NumPy v1.0 `.npy` format: 6-byte magic + 2-byte version + 2-byte header length ++ ASCII header (dtype, shape, order) + raw data. For 1-D float32 arrays this is +trivial — read the header to get length, then wrap the remaining bytes as a +`Float32Array`. + +## Architecture + +``` +viewer/ +├── index.html +├── src/ +│ ├── main.tsx # React bootstrap +│ ├── App.tsx # root component, file loading, keyboard shortcuts +│ ├── style.css # dark theme, responsive layout +│ ├── lib/ +│ │ ├── audio.ts # WAV decode, channel extraction, LOD pyramid +│ │ ├── waveform.ts # Canvas waveform renderer +│ │ ├── vad.ts # .npy parser + VAD curve renderer +│ │ ├── asr.ts # ASR JSON loader + search + overlay drawing +│ │ ├── timeline.ts # shared time axis, zoom/pan state, cursor sync +│ │ ├── spectrogram.ts # tile-cached spectrogram renderer + colormap +│ │ └── fft-worker.ts # Web Worker for STFT computation +│ ├── components/ +│ │ ├── Toolbar.tsx # file controls, channel selector, playback +│ │ ├── WaveformTrack.tsx # waveform canvas + ASR label overlays +│ │ ├── VADTrack.tsx # VAD canvas + threshold inputs +│ │ ├── SpectrogramTrack.tsx # spectrogram canvas (FFT tiles) +│ │ ├── ASRPanel.tsx # search bar + transcript list +│ │ └── Minimap.tsx # overview bar with viewport indicator +│ └── hooks/ +│ ├── usePlayback.ts # Web Audio API playback, gain, auto-scroll +│ └── useCanvasInteraction.ts # wheel zoom, drag pan, click-to-seek +├── package.json +├── tsconfig.json +└── vite.config.ts +``` + +**Stack**: React 19 + TypeScript + Vite. Canvas 2D for all visualizations. 
+React manages component state and layout; rendering libraries (`waveform.ts`, +`vad.ts`, `asr.ts`) draw directly to canvas refs. + +## Performance Strategy + +### Waveform — level-of-detail (LOD) decimation + +Raw samples per channel for 1 hr: **57.6 M**. A screen is ~2000 px wide. +Drawing all samples is pointless and slow. + +**Approach**: pre-compute a min/max mipmap pyramid on load. + +``` +Level 0: raw samples (57.6 M points) +Level 1: min/max per 64 samples (~900 K points) +Level 2: min/max per 256 (~225 K points) +Level 3: min/max per 1024 (~56 K points) +Level 4: min/max per 4096 (~14 K points) +... +``` + +At render time, pick the level where each pixel covers ~1 bucket. Draw a filled +shape between min and max per pixel column. This is O(screen_width) per frame +regardless of zoom level. + +Build the pyramid in a **Web Worker** so the UI stays responsive during load. + +### Spectrogram — viewport-only FFT + +Computing a full STFT for 1 hour is expensive (~3.6 M frames at hop=16). +Instead: + +1. Compute FFT **only for the visible time range** plus a small margin. +2. Cache computed tiles (e.g. 10-second chunks) in a tile map. +3. On pan/zoom, render cached tiles and compute missing ones in a Worker. +4. FFT size: 512 (32 ms at 16 kHz) — matches VAD frame size, gives 256 freq + bins up to 8 kHz. Fast enough for real-time viewport updates. + +Render to an off-screen canvas, then `drawImage` to the visible canvas — +avoids per-pixel DOM work. + +### VAD curve + +~112 K points for 1 hour. Apply the same LOD approach as waveform (average +instead of min/max). At full zoom-out the curve is already only ~2 K points +per screen width — nearly free. + +### ASR overlay + +Sentence count is small (a few hundred). No special optimization needed. +On search, build a Set of matching sentence indices, then highlight their +time spans on the timeline canvas. + +### General + +- **Typed arrays everywhere** — `Float32Array` / `Int16Array` for audio and + VAD data. 
No JS array copies. +- **`requestAnimationFrame`** for synchronized redraws — batch all dirty + canvases into one frame. +- **Debounced zoom/pan** — pointer events update viewport state; rendering + reads state on rAF. +- **Channel switching** — re-slice from the decoded `AudioBuffer` (kept in + memory), rebuild LOD pyramid. ~1 s for 1-hour file. "All" mode averages + channels into a single waveform, or overlays them with per-channel colors. + +## UI Layout + +``` +┌──────────────────────────────────────────────────────┐ +│ [Drop zone / file pickers] [Channel ▾ All] [▶ ⏸] │ +├──────────────────────────────────────────────────────┤ +│ Waveform ░░░▓▓▓▓▓▓░░░░▓▓▓▓░░░░░░ │ ← Canvas +│ VAD probs ─────╱╲──╱╲╲──────╱╲───────────── │ ← Canvas (stacked) +│ Spectrogram ▒▒▒▓▓▓▓▒▒▒▒▓▓▓▓▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ │ ← Canvas +├────────────────────────────┬─────────────────────────┤ +│ [🔍 Search ASR...] │ Time axis / minimap │ +│ ─────────────────────────│──────────────────────────│ +│ 00:01.23 啊,我觉得... │ │ +│ 00:05.67 对对对,就是 │ │ +│ 00:12.34 然后我们... │ (overview bar showing │ +│ ... │ full file with viewport │ +│ [◀ Prev] [Next ▶] │ indicator) │ +└────────────────────────────┴─────────────────────────┘ +``` + +- **Top row**: file upload area, channel selector, playback controls. +- **Middle**: three vertically stacked canvases sharing a time axis. Scroll + wheel zooms, click-drag pans. All canvases pan/zoom in sync. +- **Bottom left**: ASR transcript panel with search. Clicking a sentence + scrolls the timeline. Search highlights appear on the waveform/VAD canvases. +- **Bottom right**: minimap showing the full file duration. The blue rectangle + indicates the current viewport; drag to navigate. 
+ +### Keyboard shortcuts + +| Key | Action | +|-----|--------| +| Space | Play / Pause | +| F | Search (focus search box) | +| Enter / Shift+Enter | Next / Previous search result | +| +/- | Zoom in / out | +| Left / Right | Pan | +| 0 | Merged (all channels) view | +| 1-9 | Select channel | + +## Implementation Phases + +### Phase 1 — Scaffold + waveform ✅ + +- [x] Vite project setup, HTML shell, file drop zone. +- [x] WAV decode via `AudioContext.decodeAudioData`. +- [x] LOD pyramid builder (main thread, 4× decimation levels). +- [x] Canvas waveform renderer with pan/zoom. +- [x] Channel selector (individual + "All" merged view). +- [x] Audio playback with seek. + +### Phase 2 — VAD + ASR ✅ + +- [x] `.npy` parser (typed array). +- [x] VAD probability canvas, synced to waveform timeline. +- [x] Entry/exit threshold inputs with hysteresis logic. +- [x] ASR JSON loader, transcript panel. +- [x] Click sentence to seek. + +### Phase 3 — Search + polish ✅ + +- [x] ASR text search with highlight on timeline. +- [x] Prev/Next navigation. +- [x] Minimap overview bar. +- [x] Keyboard shortcuts. +- [x] Responsive layout, loading indicators, error handling. +- [x] Character-level karaoke highlighting during playback. +- [x] Drag-and-drop file loading. + +### Phase 4 — Spectrogram ✅ + +- [x] FFT Web Worker with tile caching (10 s tiles, FFT 512 / hop 256). +- [x] Spectrogram canvas, synced timeline, frequency axis labels. +- [x] Color map (magma). diff --git a/training/smart-turn-zh/notebooks/01-asr-transcribe.ipynb b/training/smart-turn-zh/notebooks/01-asr-transcribe.ipynb new file mode 100644 index 0000000..2ec9166 --- /dev/null +++ b/training/smart-turn-zh/notebooks/01-asr-transcribe.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ASR Transcription with Paraformer-zh\n", + "\n", + "Load WAV files from a folder, run Paraformer-zh (via FunASR) with FSMN-VAD,\n", + "and save sentence- and word-level ASR results as one JSON file per WAV."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install funasr modelscope soundfile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "# -- Configure these paths --\n", + "WAV_DIR = Path(\"../data/wav\") # folder containing .wav files\n", + "OUTPUT_DIR = Path(\"../data/asr_results\")\n", + "\n", + "WAV_DIR.mkdir(parents=True, exist_ok=True)\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from funasr import AutoModel\n", + "\n", + "model = AutoModel(\n", + " model=\"paraformer-zh\", # ASR — best Chinese accuracy at 220M params\n", + " vad_model=\"fsmn-vad\", # VAD with timestamps\n", + " punc_model=\"ct-punc\", # punctuation restoration\n", + " device=\"mps\", # MPS for Mac\n", + ")\n", + "print(\"Model loaded.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wav_files = sorted(WAV_DIR.glob(\"*.wav\"))\n", + "print(f\"Found {len(wav_files)} WAV files in {WAV_DIR}\")\n", + "for f in wav_files[:5]:\n", + " print(f\" {f.name}\")\n", + "if len(wav_files) > 5:\n", + " print(f\" ... 
and {len(wav_files) - 5} more\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "for i, wav_path in enumerate(wav_files):\n", + " res = model.generate(\n", + " input=str(wav_path),\n", + " return_raw_text=True,\n", + " sentence_timestamp=True,\n", + " )\n", + "\n", + " # Save one JSON per WAV file\n", + " out_path = OUTPUT_DIR / f\"{wav_path.stem}.json\"\n", + " records = []\n", + " for item in res:\n", + " records.append({\n", + " \"text\": item.get(\"text\", \"\"),\n", + " \"sentences\": item.get(\"sentence_info\", []),\n", + " \"timestamp\": item.get(\"timestamp\", []),\n", + " })\n", + " with open(out_path, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(records, f, ensure_ascii=False, indent=2)\n", + "\n", + " if (i + 1) % 10 == 0 or (i + 1) == len(wav_files):\n", + " print(f\"[{i + 1}/{len(wav_files)}] {wav_path.name}\")\n", + "\n", + "print(f\"\\nDone. Results saved to {OUTPUT_DIR}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Speed\n", + "\n", + "With CPU\n", + "\n", + "> rtf_avg: 0.069, time_speech: 2068.000, time_escape: 143.4924.00s/it]\n", + "\n", + "With MPS\n", + "\n", + "> rtf_avg: 0.009, time_speech: 2068.000, time_escape: 19.374:9.86s/it]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preview results\n", + "for json_path in sorted(OUTPUT_DIR.glob(\"*.json\"))[:5]:\n", + " r = json.loads(json_path.read_text(encoding=\"utf-8\"))\n", + " print(f\"--- {json_path.stem} ---\")\n", + " for item in r:\n", + " print(f\" text: {item['text']}\")\n", + " if item[\"sentences\"]:\n", + " print(f\" sentences: {item['sentences'][:3]}\")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/training/smart-turn-zh/notebooks/02-vad.ipynb b/training/smart-turn-zh/notebooks/02-vad.ipynb new file mode 100644 index 0000000..b3753f5 --- /dev/null +++ b/training/smart-turn-zh/notebooks/02-vad.ipynb @@ -0,0 +1,185 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# VAD Detection with Silero VAD\n", + "\n", + "Run Silero VAD on WAV files and save raw per-frame speech probabilities.\n", + "Each frame is 32ms (512 samples at 16kHz). Save raw probs for visualization\n", + "so we can pick threshold/filtering parameters later." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import numpy as np\n", + "\n", + "WAV_DIR = Path(\"../data/wav\")\n", + "OUTPUT_DIR = Path(\"../data/vad_probs\")\n", + "\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "SAMPLE_RATE = 16000\n", + "WINDOW_SIZE = 512 # 32ms per frame at 16kHz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "model, utils = torch.hub.load(\n", + " repo_or_dir=\"snakers4/silero-vad\",\n", + " model=\"silero_vad\",\n", + " trust_repo=True,\n", + ")\n", + "\n", + "(get_speech_timestamps, _, read_audio, _, _) = utils\n", + "\n", + "# Move to MPS if available\n", + "device = torch.device(\"mps\" if torch.backends.mps.is_available() else \"cpu\")\n", + "model = model.to(device)\n", + "print(f\"Silero VAD loaded on {device}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wav_files = sorted(WAV_DIR.glob(\"*.wav\"))\n", + "print(f\"Found {len(wav_files)} WAV files in {WAV_DIR}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "\n", + "for wav_path in tqdm(wav_files, desc=\"Files\"):\n", + " wav = read_audio(str(wav_path), sampling_rate=SAMPLE_RATE).to(device)\n", + "\n", + " # Extract per-frame speech probabilities\n", + " chunks = range(0, len(wav), WINDOW_SIZE)\n", + " probs = []\n", + " for start in tqdm(chunks, desc=wav_path.stem, leave=False):\n", + " chunk = wav[start : start + WINDOW_SIZE]\n", + " if len(chunk) < WINDOW_SIZE:\n", + " chunk = torch.nn.functional.pad(chunk, (0, WINDOW_SIZE - len(chunk)))\n", + " prob = model(chunk, SAMPLE_RATE).item()\n", + " probs.append(prob)\n", + " model.reset_states()\n", + "\n", + " # Save as .npy — one float32 per 32ms 
frame\n", + " out_path = OUTPUT_DIR / f\"{wav_path.stem}.npy\"\n", + " np.save(out_path, np.array(probs, dtype=np.float32))\n", + "\n", + "print(f\"\\nDone. Saved {len(wav_files)} prob arrays to {OUTPUT_DIR}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Preview: plot speech probabilities\n", + "import matplotlib.pyplot as plt\n", + "\n", + "npy_files = sorted(OUTPUT_DIR.glob(\"*.npy\"))\n", + "probs = np.load(npy_files[0])\n", + "time_s = np.arange(len(probs)) * WINDOW_SIZE / SAMPLE_RATE\n", + "\n", + "# Overview (full file, downsampled for readability)\n", + "fig, axes = plt.subplots(3, 1, figsize=(16, 8), sharex=False)\n", + "\n", + "# 1) Full file overview — use fill to show density\n", + "axes[0].fill_between(time_s, probs, alpha=0.6)\n", + "axes[0].set_title(f\"{npy_files[0].stem} — full ({time_s[-1]:.0f}s)\")\n", + "axes[0].set_ylim(0, 1)\n", + "axes[0].axhline(y=0.5, color=\"r\", linestyle=\"--\", alpha=0.5, label=\"0.5\")\n", + "axes[0].axhline(y=0.1, color=\"orange\", linestyle=\"--\", alpha=0.5, label=\"0.1\")\n", + "axes[0].legend(loc=\"upper right\")\n", + "\n", + "# 2) First 60s zoomed in\n", + "mask = time_s <= 60\n", + "axes[1].fill_between(time_s[mask], probs[mask], alpha=0.6)\n", + "axes[1].plot(time_s[mask], probs[mask], linewidth=0.3, color=\"C0\")\n", + "axes[1].set_title(\"First 60s\")\n", + "axes[1].set_ylim(0, 1)\n", + "axes[1].axhline(y=0.5, color=\"r\", linestyle=\"--\", alpha=0.5)\n", + "axes[1].axhline(y=0.1, color=\"orange\", linestyle=\"--\", alpha=0.5)\n", + "\n", + "# 3) Histogram of probabilities\n", + "axes[2].hist(probs, bins=100, edgecolor=\"none\", alpha=0.7)\n", + "axes[2].set_xlabel(\"Speech probability\")\n", + "axes[2].set_ylabel(\"Frame count\")\n", + "axes[2].set_title(\"Probability distribution\")\n", + "axes[2].axvline(x=0.5, color=\"r\", linestyle=\"--\", alpha=0.5, label=\"0.5\")\n", + "axes[2].axvline(x=0.1, color=\"orange\", linestyle=\"--\", 
alpha=0.5, label=\"0.1\")\n", + "axes[2].legend()\n", + "\n", + "for ax in axes[:2]:\n", + " ax.set_ylabel(\"P(speech)\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/training/smart-turn-zh/notebooks/03-group-for-viewer.ipynb b/training/smart-turn-zh/notebooks/03-group-for-viewer.ipynb new file mode 100644 index 0000000..ec1e857 --- /dev/null +++ b/training/smart-turn-zh/notebooks/03-group-for-viewer.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Group Files for Viewer\n", + "\n", + "For each WAV in `data/wav/` that has a matching VAD `.npy` and ASR `.json`,\n", + "move all three into `data/grouped/{stem}/` so they can be opened together in the viewer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil\n", + "from pathlib import Path\n", + "\n", + "WAV_DIR = Path(\"../data/wav\")\n", + "VAD_DIR = Path(\"../data/vad_probs\")\n", + "ASR_DIR = Path(\"../data/asr_results\")\n", + "OUT_DIR = Path(\"../data/grouped\")\n", + "\n", + "OUT_DIR.mkdir(parents=True, exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Ready to group: 0\n" + ] + } + ], + "source": [ + "# Preview — show what will be moved vs skipped\n", + "to_group = []\n", + "skipped = []\n", + "\n", + "for wav_path in sorted(WAV_DIR.glob(\"*.wav\")):\n", + " stem = wav_path.stem\n", + " vad_path = VAD_DIR / f\"{stem}.npy\"\n", + " asr_path = ASR_DIR / f\"{stem}.json\"\n", + "\n", + " if not vad_path.exists() or not asr_path.exists():\n", + " missing = []\n", + " if not vad_path.exists(): missing.append(\"VAD\")\n", + " if not asr_path.exists(): missing.append(\"ASR\")\n", + " skipped.append((stem, missing))\n", + " else:\n", + " to_group.append((wav_path, vad_path, asr_path))\n", + " print(f\" ✓ {stem}\")\n", + "\n", + "print(f\"\\nReady to group: {len(to_group)}\")\n", + "if skipped:\n", + " print(f\"Skipped: {len(skipped)}\")\n", + " for stem, missing in skipped:\n", + " print(f\" ✗ {stem} — missing {', '.join(missing)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Moved: 0\n" + ] + } + ], + "source": [ + "# Move files\n", + "for wav_path, vad_path, asr_path in to_group:\n", + " dest = OUT_DIR / wav_path.stem\n", + " dest.mkdir(exist_ok=True)\n", + "\n", + " shutil.move(str(wav_path), dest / wav_path.name)\n", + " shutil.move(str(vad_path), dest / vad_path.name)\n", + " shutil.move(str(asr_path), dest / asr_path.name)\n", + "\n", + 
" print(f\" {wav_path.stem}/\")\n", + "\n", + "print(f\"\\nMoved: {len(to_group)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "R8001_M8004_N_SPK8013/ ['R8001_M8004_N_SPK8013.json', 'R8001_M8004_N_SPK8013.npy', 'R8001_M8004_N_SPK8013.wav']\n", + "R8001_M8004_N_SPK8014/ ['R8001_M8004_N_SPK8014.json', 'R8001_M8004_N_SPK8014.npy', 'R8001_M8004_N_SPK8014.wav']\n", + "R8001_M8004_N_SPK8015/ ['R8001_M8004_N_SPK8015.json', 'R8001_M8004_N_SPK8015.npy', 'R8001_M8004_N_SPK8015.wav']\n", + "R8001_M8004_N_SPK8016/ ['R8001_M8004_N_SPK8016.json', 'R8001_M8004_N_SPK8016.npy', 'R8001_M8004_N_SPK8016.wav']\n" + ] + } + ], + "source": [ + "# Verify\n", + "for d in sorted(OUT_DIR.iterdir()):\n", + " if d.is_dir():\n", + " files = sorted(f.name for f in d.iterdir())\n", + " print(f\"{d.name}/ {files}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.14.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/training/smart-turn-zh/notebooks/README.md b/training/smart-turn-zh/notebooks/README.md new file mode 100644 index 0000000..3f6f126 --- /dev/null +++ b/training/smart-turn-zh/notebooks/README.md @@ -0,0 +1,26 @@ +# Notebooks — Filler Detection Pipeline + +Following the PodcastFillers approach (Zhu et al., 2022): +filler candidates = regions where **VAD is active** but **ASR produces no words**. 
+ +## Pipeline + +``` +WAV files + → 01 ASR transcription (Paraformer-zh, word-level timestamps) + → 02 VAD detection (FSMN-VAD, speech region timestamps) + → 03 Filler candidates (VAD ∧ ¬ASR, duration filter 150ms–2s) + → 04 ... (labeling / classification TBD) +``` + +## Notebooks + +| # | Notebook | Input | Output | Description | +|---|----------|-------|--------|-------------| +| 01 | `01-asr-transcribe` | `data/wav/*.wav` | `data/asr_results/*.json` | Paraformer-zh ASR with word timestamps | +| 02 | `02-vad` | `data/wav/*.wav` | `data/vad_probs/*.npy` | FSMN-VAD speech region detection | +| 03 | TBD | ASR + VAD results | `data/filler_candidates.jsonl` | Compute VAD ∧ ¬ASR gaps | + +## Reference + +- `refs/2203.pdf` — PodcastFillers: Filler Word Detection and Classification (Zhu et al., 2022) diff --git a/training/smart-turn-zh/plan-data.md b/training/smart-turn-zh/plan-data.md new file mode 100644 index 0000000..4433b98 --- /dev/null +++ b/training/smart-turn-zh/plan-data.md @@ -0,0 +1,342 @@ +# Training Data Pipeline — LCCC-based Turn Detection Dataset + +## Goal + +Generate a labeled audio dataset for turn detection (complete vs. incomplete) +from Chinese conversational text, using the following pipeline: + + LCCC conversations + → split by conversation (train/eval/test) + → LLM generates complete + incomplete text variants + → TTS synthesizes speech audio (WAV) + → labeled audio dataset + +## 1. Source Dataset + +**LCCC-base** (`thu-coai/lccc`, base split — ~500K strictly filtered dialogues) + +Each sample is a multi-turn dialogue: `["你好", "你好呀", "最近怎么样", ...]` + +Why LCCC-base over LCCC-large: +- Strictly filtered — less noise, fewer broken sentences +- 500K dialogues is already more than enough (each produces multiple samples) +- Cleaner text → better TTS output + +## 2. 
Data Split Strategy (No Leakage) + +**Split at the conversation level, not the utterance level.** + + ┌─────────────────────────────────┐ + │ LCCC conversations │ + │ (each = multi-turn dialogue) │ + └──────────┬──────────────────────┘ + │ deterministic hash split + ▼ + ┌───────────────────────┐ + │ train 80% (~400K) │ + │ eval 10% (~ 50K) │ + │ test 10% (~ 50K) │ + └───────────────────────┘ + │ + ▼ per-utterance expansion + each utterance → 1 complete + 1 incomplete sample + +**Why conversation-level split:** +- Utterances in the same conversation share topic, vocabulary, and speaker style +- Splitting at utterance level would leak conversational context across splits +- A model could memorize topic patterns rather than learning turn boundaries + +**Implementation:** Hash the full conversation text (SHA-256) → take the digest modulo 100 to +assign split deterministically. This is reproducible without storing split IDs. + +```python +import hashlib + +def assign_split(conversation: list[str]) -> str: + key = "\n".join(conversation).encode() + h = int(hashlib.sha256(key).hexdigest(), 16) % 100 + if h < 80: + return "train" + elif h < 90: + return "eval" + else: + return "test" +``` + +## 3. Text Generation (LLM) + +For each utterance in a conversation, generate two variants with TTS instructions: + +| Label | Fields | Example | +|--------------|--------------------|--------------------------------------------------| +| **complete** | text + tts_instruct | `"我明天要去北京出差。"` + `"Calm declarative tone, falling intonation at the end."` | +| incomplete | text + tts_instruct | `"我明天要去"` + `"Hesitant, trailing off mid-sentence, rising intonation."` | + +### What the LLM does + +The LLM takes each utterance and produces: +1. **Complete version** — a self-contained sentence. Minor rewording is OK to + make it sound natural when spoken aloud (remove internet slang, add proper + sentence-final particles like 啊/吧/呢). Plus a TTS instruct describing + the speaking style for a completed turn. +2. 
**Incomplete version** — must simulate how real incomplete turns sound in + conversation. Plus a TTS instruct describing the speaking style. + +### Incomplete turn patterns + +The incomplete version should randomly follow one of these real-world patterns: + +| Pattern | Example | Description | +|----------------------|--------------------------------|------------------------------------------| +| **Mid-clause cut** | `"我觉得这个..."` | Stops between words/clauses | +| **Filler trailing** | `"嗯...那个...就是说..."` | Thinking aloud with filler words | +| **Compound break** | `"我先去买菜,然后..."` | Finishes one clause but clearly has more | +| **Self-correction** | `"我想...不对,我是说..."` | Restarts mid-thought | +| **Interrupted** | `"你能不能帮我把那个——"` | Cut off abruptly by another speaker | + +Filler words (嗯, 啊, 那个, 就是) are a natural part of incomplete turns and +should be included in the generated text, not treated as a separate category. + +TTS instructs must be in **English** (Qwen3-TTS instruct language). +Text content must be in **Chinese** (target language). + +### Model choice + +**OpenRouter API** — use any strong Chinese-capable model without local GPU. + +- Access to Qwen, DeepSeek, Gemini, Claude, etc. via a single API +- No local GPU needed for this step — offload compute to the cloud +- Pay per token, swap models easily to compare quality +- Recommended starting model: `qwen/qwen3-235b-a22b` (best Chinese quality) + +### Prompt design + +System prompt: + +> You are a speech data labelling assistant. Given a Chinese conversational +> utterance, produce two versions: +> +> 1. **complete**: A natural, spoken-style complete sentence in Chinese, plus +> a short English TTS instruction describing how a speaker would say a +> finished thought (e.g. tone, intonation, emotion). +> 2. **incomplete**: A realistically incomplete version of the sentence in +> Chinese. 
Randomly pick one of these patterns: +> - Mid-clause cut: stop between words/clauses ("我觉得这个...") +> - Filler trailing: think aloud with fillers ("嗯...那个...就是说...") +> - Compound break: finish one clause but have more ("我先去买菜,然后...") +> - Self-correction: restart mid-thought ("我想...不对,我是说...") +> - Interrupted: cut off abruptly ("你能不能帮我把那个——") +> Plus a short English TTS instruction matching the chosen pattern. +> +> Output JSON only, no explanation. The `text` fields must be Chinese. +> The `tts_instruct` fields must be English. + +User input: `我明天要去北京出差` + +Expected output: + +```json +{ + "complete": { + "text": "我明天要去北京出差。", + "tts_instruct": "Calm and steady, declarative tone with falling intonation at the end." + }, + "incomplete": { + "text": "我明天要去", + "tts_instruct": "Hesitant, trailing off mid-sentence as if thinking about what to say next." + } +} +``` + +### Batch processing + +Use OpenRouter with async HTTP for throughput: + +```python +import openai + +client = openai.AsyncOpenAI( + base_url="https://openrouter.ai/api/v1", + api_key=os.environ["OPENROUTER_API_KEY"], +) + +response = await client.chat.completions.create( + model="qwen/qwen3-235b-a22b", + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": utterance}, + ], + response_format={"type": "json_object"}, + temperature=0.7, + max_tokens=128, +) +``` + +### Filtering + +Post-generation quality checks: +- Parse JSON; discard malformed outputs (must have complete/incomplete with text + tts_instruct) +- Discard if `incomplete.text` is longer than `complete.text` +- Discard if `incomplete.text` is identical to `complete.text` +- Discard if any `text` is empty or too short (< 2 chars) +- Discard if `text` contains non-speech characters (URLs, hashtags, emojis) +- Discard if `tts_instruct` is empty or not in English + +## 4. 
Speech Synthesis (TTS) + +### Model choice + +**Qwen3-TTS VoiceDesign** (`Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign`) + +- Apache 2.0 license +- 1.7B params, ~8 GB VRAM (bfloat16) +- **Voice design from natural language** — describe any voice persona in text + (age, gender, tone, emotion, speaking style) and the model creates it +- No preset speakers or reference audio needed — unlimited speaker diversity +- Runs on T4 (16 GB VRAM) + +### Voice design for speaker diversity + turn labels + +VoiceDesign takes a `voice_description` that defines the speaker persona, plus +an `instruct` that controls per-utterance prosody. This gives us two levers: + +1. **`voice_description`** — defines who is speaking (generated once per speaker) +2. **`instruct`** — defines how they speak this particular utterance (per sample) + +#### Speaker personas + +Pre-generate a pool of ~20 voice descriptions covering diverse demographics: + +```python +VOICE_POOL = [ + "年轻女性,声音清亮活泼,语速偏快", + "中年男性,声音低沉稳重,语速适中", + "年轻男性,声音明朗有活力,略带磁性", + "中年女性,声音温柔沉稳,语调平和", + "老年男性,声音沙哑浑厚,语速较慢", + # ... more variations +] +``` + +Randomly assign a voice to each sample; track voice ID in metadata. + +#### Prosody instruct per sample + +The `tts_instruct` is **not hardcoded** — it is generated by the LLM in step 2 +alongside the text, tailored to each specific utterance. This produces more +natural and varied prosody than a single fixed instruction per label. + +The combination of voice_description + per-sample instruct makes synthesized +audio more realistic: diverse speakers with prosody that matches the content. 
+ +### Audio format + +- **Sample rate:** 16 kHz (matches Whisper input, resample from TTS native rate) +- **Channels:** mono +- **Bit depth:** 16-bit PCM +- **Format:** WAV +- **Max duration:** 8 seconds (matching existing training data spec) +- Trim trailing silence to ~200 ms (matching existing data convention) + +### Batch processing + +```python +from qwen_tts import Qwen3TTSModel +import torch +import soundfile as sf + +model = Qwen3TTSModel.from_pretrained( + "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign", + device_map="cuda:0", + dtype=torch.bfloat16, +) + +voice = "年轻女性,声音清亮活泼,语速偏快" + +# Complete sample — instruct from LLM +wavs, sr = model.generate_voice_design( + text=sample["complete"]["text"], + language="Chinese", + voice_description=voice, + instruct=sample["complete"]["tts_instruct"], # e.g. "Calm and steady, falling intonation." +) +sf.write("complete.wav", wavs[0], sr) + +# Incomplete sample — instruct from LLM +wavs, sr = model.generate_voice_design( + text=sample["incomplete"]["text"], + language="Chinese", + voice_description=voice, + instruct=sample["incomplete"]["tts_instruct"], # e.g. "Hesitant, trailing off mid-sentence." +) +sf.write("incomplete.wav", wavs[0], sr) +``` + +## 5. Output Format + +### Directory structure + +Follows the existing raw data convention from upstream: + + output/ + ├── train/ + │ ├── zh/ + │ │ ├── complete-nofiller/ + │ │ │ ├── {uuid}.wav + │ │ │ └── ... + │ │ └── incomplete-nofiller/ + │ │ ├── {uuid}.wav + │ │ └── ... + ├── eval/ + │ └── zh/ + │ ├── complete-nofiller/ + │ └── incomplete-nofiller/ + └── test/ + └── zh/ + ├── complete-nofiller/ + └── incomplete-nofiller/ + +### Metadata + +Each split gets a `metadata.jsonl` with one line per sample: + +```json +{"uuid": "...", "text": "...", "tts_instruct": "...", "label": "complete", "voice_id": "V03", "source_conv_hash": "a3f2..."} +``` + +## 6. 
Pipeline Steps (Execution Order) + +| Step | Script | Input | Output | Compute | +|------|----------------------|------------------------|------------------------|---------| +| 1 | `01_split_lccc.py` | LCCC-base from HF | `splits/{train,eval,test}.jsonl` | CPU | +| 2 | `02_generate_text.py`| `splits/*.jsonl` | `text/{split}.jsonl` with complete/incomplete | OpenRouter API | +| 3 | `03_synthesize.py` | `text/*.jsonl` | `audio/{split}/zh/{label}/*.wav` | GPU (Qwen3-TTS) | +| 4 | `04_build_dataset.py`| `audio/` tree | HuggingFace Dataset (Arrow) | CPU | + +Each step is idempotent — it checks for existing outputs and skips them, +so you can resume after failures. + +## 7. Scale Estimate + +Starting conservative, can scale up later: + +| Parameter | Value | Notes | +|-------------------|-------------|-----------------------------| +| Conversations | 10K (of 500K) | Start small, validate quality | +| Utterances/conv | ~4 avg | ~40K utterances | +| Samples | ~80K | 40K × 2 (complete + incomplete) | +| Audio duration | ~3s avg | ~67 hours total | +| Storage | ~8 GB | 16-bit WAV at 16 kHz | + +For comparison, upstream Smart Turn v3.2 uses 270K samples (~41 GB). +We can scale to full 500K conversations (~4M samples) once quality is validated. + +## 8. Decisions + +- **Fillers** — not a separate category. Filler words (嗯, 啊, 那个) are a + natural part of incomplete turns and are included in the generated text. +- **Background noise** — no augmentation in this pipeline. Keep audio clean; + augmentation is handled during training. +- **Voice pool size** — 20 voice descriptions for the VoiceDesign speaker pool. +- **OpenRouter model** — start with `qwen/qwen3-235b-a22b`, iterate as needed. +- **TTS instruct guardrails** — iterate on quality; no upfront constraints. 
diff --git a/training/smart-turn-zh/requirements.txt b/training/smart-turn-zh/requirements.txt new file mode 100644 index 0000000..c8ae105 --- /dev/null +++ b/training/smart-turn-zh/requirements.txt @@ -0,0 +1,10 @@ +torch +torchaudio +torchcodec +funasr +modelscope +soundfile +numpy +matplotlib +tqdm +jupyterlab diff --git a/training/smart-turn-zh/research/01-datasets.md b/training/smart-turn-zh/research/01-datasets.md new file mode 100644 index 0000000..feaffe7 --- /dev/null +++ b/training/smart-turn-zh/research/01-datasets.md @@ -0,0 +1,84 @@ +# Datasets — Mandarin Conversational Audio for Turn Detection + +Survey of candidate corpora for training a Mandarin Smart Turn detector. +Sources reviewed: [OpenSLR](http://www.openslr.org/resources.php) (full list, +SLR1–SLR162) and HuggingFace. + +## What we need + +Spontaneous conversational audio with **per-speaker turn boundaries**: + +- Real conversation — readers don't yield turns naturally; read-speech corpora + cannot supervise turn detection as a primary signal. +- Turn timestamps, or per-speaker channels we can VAD ourselves. +- Permissive license — we want commercial-friendly model weights. + +## Recommended basket (Mandarin, commercial-OK) + +About 338 h of pre-annotated Mandarin conversation across three corpora. + +| SLR | Name | Hours | Per-speaker channels | License | Why | +|---|---|---|---|---|---| +| [119](http://www.openslr.org/119/) | AliMeeting | 118 | yes (headset) | CC BY-SA 4.0 | 2–4 spk meetings, clean per-speaker tracks, diarization ready. Best fit. | +| [159](http://www.openslr.org/159/) | AISHELL-5 | 100 | partial (training-set near-field) | CC BY-SA 4.0 | In-car free conversation, 2–4 spk. Closest to voice-agent setting. | +| [111](http://www.openslr.org/111/) | AISHELL-4 | 120 | no (8-ch array only) | CC BY-SA 4.0 | 4–8 spk meetings, more overlap; needs careful filtering. 
| + +For scale once the base pipeline works: + +| SLR | Name | Hours | License | Catch | +|---|---|---|---|---| +| [121](http://www.openslr.org/121/) | WenetSpeech | 10,000+ | CC BY 4.0 | Multi-domain (podcasts, interviews, audiobooks). Not pre-segmented — needs diarization (e.g. pyannote-audio). | + +## Reserve / research-only + +| SLR | Name | Hours | License | Why on hold | +|---|---|---|---|---| +| [123](http://www.openslr.org/123/) | MAGICDATA Conversational | 180 | **CC BY-NC-ND 4.0** | Non-commercial only. Use only if scope stays research. | +| [155](http://www.openslr.org/155/) | SBCSAE | 20 | **CC BY-ND 3.0** | No-derivatives blocks publishing processed clips. Small. American English. | + +## Optional English coverage (multilingual model) + +| SLR | Name | Hours | License | Notes | +|---|---|---|---|---| +| [16](http://www.openslr.org/16/) | AMI Corpus | ~100 | CC BY 4.0 | Classic meeting corpus, headset + array. | +| [150](http://www.openslr.org/150/) | CHiME-6 | ~50 | CC BY-SA 4.0 | Dinner-party recordings, JSON annotations. | + +## Not useful for the primary turn signal + +**Read speech** (no natural turn yielding): SLR18 THCHS-30, SLR33 AISHELL-1, +SLR38 Free ST, SLR47 Primewords, SLR62 aidatatang_200zh, SLR68 MAGICDATA Read +755h, SLR93 AISHELL-3, SLR138 SHALCAS22A. Could be reused as text/voice +sources for synthetic-truncation augmentation if we go that route. 
+ +**Other excluded** (full sweep): software mirrors (SLR2/3/4/9/11/15/23/48/50/56), +impulse-response and noise databases (SLR13/17/20/26/28), wake-word and +hotword (SLR85/87/120), speaker verification (SLR82 CN-Celeb, SLR156), +pronunciation dictionaries (SLR8/14/21/29/34), text-only corpora (SLR55/153), +emotional read-speech TTS (SLR88/110/115/136/161), scripture (SLR129/132), +handwriting (SLR84), nonverbal vocalizations (SLR99), test fixtures +(SLR1/81), forensics (SLR162), whistled language (SLR137), single-speaker TTS +data for low-resource languages (most of the "Crowdsourced high-quality" +series, Thorsten Müller, Sinhala TTS, etc.), and endangered-language +documentation (SLR89/92/107/124/133/147/148/158/149). + +## License notes + +**CC BY-SA 4.0 share-alike** applies to *derived datasets* — if we publish +processed clips, the clip dataset itself must be CC BY-SA. Trained model +weights are generally not treated as derivatives of training data in most +jurisdictions, so a permissive weight release is usually fine, but worth +confirming with whoever handles licensing if we plan to ship weights. + +**CC BY-NC-ND** (SLR123) blocks both commercial use and derivative datasets — +incompatible with any pipeline that publishes processed clips. + +## Open questions + +- WenetSpeech subsets: which of its domains (podcast / interview / meeting / + audiobook / vlog) are spontaneous enough to be worth diarizing? Need to + inspect the metadata before committing GPU time. +- Whether to include SLR123 (RAMC, 180h) under a research-only carve-out — + adds ~50% more Mandarin conversation hours (180 h on top of ~338 h) but adds license-tracking + burden across artifacts. +- Multilingual scope: is English (AMI + CHiME-6) in scope for v1, or + Mandarin-only? 
diff --git a/training/smart-turn-zh/viewer/README.md b/training/smart-turn-zh/viewer/README.md new file mode 100644 index 0000000..2ad4db7 --- /dev/null +++ b/training/smart-turn-zh/viewer/README.md @@ -0,0 +1,56 @@ +# Audio Viewer + +Browser-based tool for reviewing audio alongside VAD probabilities and ASR +transcriptions. Built to support the Smart Turn training data pipeline — lets +you visually inspect and cross-reference the outputs of the ASR and VAD +notebooks before feeding data into downstream labelling and training steps. + +## What it does + +- **Waveform display** with LOD decimation — handles 1-hour, multi-channel + recordings smoothly. Switch between individual channels or a merged "All" view. +- **VAD probability curve** with dual-threshold hysteresis (entry/exit), + color-coded active/inactive speech regions. +- **ASR transcript panel** with timestamped sentence list. Click any sentence + to jump to that moment. +- **Text search** across ASR results with timeline highlighting and + Previous/Next navigation. +- **Audio playback** with click-to-seek, keyboard shortcuts, and auto-scroll. +- **Minimap** overview bar for quick navigation across long recordings. + +All processing is client-side — drop your files in and go, no server needed. + +## Quick start + +```bash +npm install +npm run dev +``` + +Then open the URL shown in the terminal. 
Drop in your files: + +| File type | Example | Source | +|-----------|---------|--------| +| `.wav` | `R8001_M8004_MS801.wav` | `data/wav/` | +| `.npy` | `R8001_M8004_MS801.npy` | `data/vad_probs/` (from `02-vad.ipynb`) | +| `.json` | `R8001_M8004_MS801.json` | `data/asr_results/` (from `01-asr-transcribe.ipynb`) | + +## Keyboard shortcuts + +| Key | Action | +|-----|--------| +| Space | Play / Pause | +| F | Focus search box | +| Enter / Shift+Enter | Next / Previous search result | +| +/- | Zoom in / out | +| Left / Right | Pan | +| 0 | Merged (all channels) view | +| 1-9 | Select channel | + +## Further reading + +- [`docs/02-data-structures.md`](../docs/02-data-structures.md) — schemas for + the WAV, ASR, and VAD data formats this viewer consumes. +- [`docs/03-audio-viewer.md`](../docs/03-audio-viewer.md) — full design plan, + architecture decisions, and performance strategy. +- [`notebooks/`](../notebooks/) — the pipeline notebooks that produce the data. diff --git a/training/smart-turn-zh/viewer/index.html b/training/smart-turn-zh/viewer/index.html new file mode 100644 index 0000000..806e3de --- /dev/null +++ b/training/smart-turn-zh/viewer/index.html @@ -0,0 +1,20 @@ + + + + + + Audio Viewer + + + + + +
+ + + diff --git a/training/smart-turn-zh/viewer/package-lock.json b/training/smart-turn-zh/viewer/package-lock.json new file mode 100644 index 0000000..18cfa88 --- /dev/null +++ b/training/smart-turn-zh/viewer/package-lock.json @@ -0,0 +1,1820 @@ +{ + "name": "audio-viewer", + "version": "0.2.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "audio-viewer", + "version": "0.2.0", + "dependencies": { + "react": "^19.0.0", + "react-dom": "^19.0.0" + }, + "devDependencies": { + "@types/react": "^19.0.0", + "@types/react-dom": "^19.0.0", + "@vitejs/plugin-react": "^4.3.0", + "typescript": "^5.7.0", + "vite": "^6.0.0" + } + }, + "node_modules/@babel/code-frame": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.0.tgz", + "integrity": "sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-validator-identifier": "^7.28.5", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/compat-data": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.29.0.tgz", + "integrity": "sha512-T1NCJqT/j9+cn8fvkt7jtwbLBfLC/1y1c7NtCeXFRgzGTsafi68MRv8yzkYSapBnFA6L3U2VSc02ciDzoAJhJg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/core": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.29.0.tgz", + "integrity": "sha512-CGOfOJqWjg2qW/Mb6zNsDm+u5vFQ8DxXfbM09z69p5Z6+mE1ikP2jUXw+j42Pf1XTYED2Rni5f95npYeuwMDQA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.29.0", + "@babel/generator": "^7.29.0", + "@babel/helper-compilation-targets": "^7.28.6", + "@babel/helper-module-transforms": "^7.28.6", + "@babel/helpers": "^7.28.6", + "@babel/parser": "^7.29.0", + 
"@babel/template": "^7.28.6", + "@babel/traverse": "^7.29.0", + "@babel/types": "^7.29.0", + "@jridgewell/remapping": "^2.3.5", + "convert-source-map": "^2.0.0", + "debug": "^4.1.0", + "gensync": "^1.0.0-beta.2", + "json5": "^2.2.3", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/babel" + } + }, + "node_modules/@babel/generator": { + "version": "7.29.1", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.29.1.tgz", + "integrity": "sha512-qsaF+9Qcm2Qv8SRIMMscAvG4O3lJ0F1GuMo5HR/Bp02LopNgnZBC/EkbevHFeGs4ls/oPz9v+Bsmzbkbe+0dUw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.29.0", + "@babel/types": "^7.29.0", + "@jridgewell/gen-mapping": "^0.3.12", + "@jridgewell/trace-mapping": "^0.3.28", + "jsesc": "^3.0.2" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-compilation-targets": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.28.6.tgz", + "integrity": "sha512-JYtls3hqi15fcx5GaSNL7SCTJ2MNmjrkHXg4FSpOA/grxK8KwyZ5bubHsCq8FXCkua6xhuaaBit+3b7+VZRfcA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/compat-data": "^7.28.6", + "@babel/helper-validator-option": "^7.27.1", + "browserslist": "^4.24.0", + "lru-cache": "^5.1.1", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-globals": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@babel/helper-globals/-/helper-globals-7.28.0.tgz", + "integrity": "sha512-+W6cISkXFa1jXsDEdYA8HeevQT/FULhxzR99pxphltZcVaugps53THCeiWA8SguxxpSp3gKPiuYfSWopkLQ4hw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-imports": { + "version": "7.28.6", + "resolved": 
"https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.28.6.tgz", + "integrity": "sha512-l5XkZK7r7wa9LucGw9LwZyyCUscb4x37JWTPz7swwFE/0FMQAGpiWUZn8u9DzkSBWEcK25jmvubfpw2dnAMdbw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/traverse": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-transforms": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.28.6.tgz", + "integrity": "sha512-67oXFAYr2cDLDVGLXTEABjdBJZ6drElUSI7WKp70NrpyISso3plG9SAGEF6y7zbha/wOzUByWWTJvEDVNIUGcA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-module-imports": "^7.28.6", + "@babel/helper-validator-identifier": "^7.28.5", + "@babel/traverse": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-plugin-utils": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.28.6.tgz", + "integrity": "sha512-S9gzZ/bz83GRysI7gAD4wPT/AI3uCnY+9xn+Mx/KPs2JwHJIz1W8PZkg2cqyt3RNOBM8ejcXhV6y8Og7ly/Dug==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": 
"sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-option": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.27.1.tgz", + "integrity": "sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helpers": { + "version": "7.29.2", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.29.2.tgz", + "integrity": "sha512-HoGuUs4sCZNezVEKdVcwqmZN8GoHirLUcLaYVNBK2J0DadGtdcqgr3BCbvH8+XUo4NGjNl3VOtSjEKNzqfFgKw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/template": "^7.28.6", + "@babel/types": "^7.29.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.29.2", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.2.tgz", + "integrity": "sha512-4GgRzy/+fsBa72/RZVJmGKPmZu9Byn8o4MoLpmNe1m8ZfYnz5emHLQz3U4gLud6Zwl0RZIcgiLD7Uq7ySFuDLA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.29.0" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-self": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-self/-/plugin-transform-react-jsx-self-7.27.1.tgz", + "integrity": "sha512-6UzkCs+ejGdZ5mFFC/OCUrv028ab2fp1znZmCZjAOBKiBK2jXD1O+BPSfX8X2qjJ75fZBMSnQn3Rq2mrBJK2mw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-source": { + 
"version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-source/-/plugin-transform-react-jsx-source-7.27.1.tgz", + "integrity": "sha512-zbwoTsBruTeKB9hSq73ha66iFeJHuaFkUbwvqElnygoNbj/jHRsSeokowZFN3CZ64IvEqcmmkVe89OPXc7ldAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/template": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz", + "integrity": "sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.28.6", + "@babel/parser": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/traverse": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.29.0.tgz", + "integrity": "sha512-4HPiQr0X7+waHfyXPZpWPfWL/J7dcN1mx9gL6WdQVMbPnF3+ZhSMs8tCxN7oHddJE9fhNE7+lxdnlyemKfJRuA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.29.0", + "@babel/generator": "^7.29.0", + "@babel/helper-globals": "^7.28.0", + "@babel/parser": "^7.29.0", + "@babel/template": "^7.28.6", + "@babel/types": "^7.29.0", + "debug": "^4.3.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/types": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.29.0.tgz", + "integrity": "sha512-LwdZHpScM4Qz8Xw2iKSzS+cfglZzJGvofQICy7W7v4caru4EaAmyUuO6BGrbyQ2mYV11W0U8j5mBhd14dd3B0A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": 
"0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.25.12.tgz", + "integrity": "sha512-Hhmwd6CInZ3dwpuGTF8fJG6yoWmsToE+vYgD4nytZVxcu1ulHpUQRAB1UJ8+N1Am3Mz4+xOByoQoSZf4D+CpkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.25.12.tgz", + "integrity": "sha512-VJ+sKvNA/GE7Ccacc9Cha7bpS8nyzVv0jdVgwNDaR4gDMC/2TTRc33Ip8qrNYUcpkOHUT5OZ0bUcNNVZQ9RLlg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.25.12.tgz", + "integrity": "sha512-6AAmLG7zwD1Z159jCKPvAxZd4y/VTO0VkprYy+3N2FtJ8+BQWFXU+OxARIwA46c5tdD9SsKGZ/1ocqBS/gAKHg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.25.12.tgz", + "integrity": "sha512-5jbb+2hhDHx5phYR2By8GTWEzn6I9UqR11Kwf22iKbNpYrsmRB18aX/9ivc5cabcUiAT/wM+YIZ6SG9QO6a8kg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.25.12.tgz", + "integrity": "sha512-N3zl+lxHCifgIlcMUP5016ESkeQjLj/959RxxNYIthIg+CQHInujFuXeWbWMgnTo4cp5XVHqFPmpyu9J65C1Yg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + 
"engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.25.12.tgz", + "integrity": "sha512-HQ9ka4Kx21qHXwtlTUVbKJOAnmG1ipXhdWTmNXiPzPfWKpXqASVcWdnf2bnL73wgjNrFXAa3yYvBSd9pzfEIpA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.25.12.tgz", + "integrity": "sha512-gA0Bx759+7Jve03K1S0vkOu5Lg/85dou3EseOGUes8flVOGxbhDDh/iZaoek11Y8mtyKPGF3vP8XhnkDEAmzeg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.25.12.tgz", + "integrity": "sha512-TGbO26Yw2xsHzxtbVFGEXBFH0FRAP7gtcPE7P5yP7wGy7cXK2oO7RyOhL5NLiqTlBh47XhmIUXuGciXEqYFfBQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.25.12.tgz", + "integrity": "sha512-lPDGyC1JPDou8kGcywY0YILzWlhhnRjdof3UlcoqYmS9El818LLfJJc3PXXgZHrHCAKs/Z2SeZtDJr5MrkxtOw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.25.12.tgz", + "integrity": "sha512-8bwX7a8FghIgrupcxb4aUmYDLp8pX06rGh5HqDT7bB+8Rdells6mHvrFHHW2JAOPZUbnjUpKTLg6ECyzvas2AQ==", + "cpu": [ + "arm64" + ], 
+ "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.25.12.tgz", + "integrity": "sha512-0y9KrdVnbMM2/vG8KfU0byhUN+EFCny9+8g202gYqSSVMonbsCfLjUO+rCci7pM0WBEtz+oK/PIwHkzxkyharA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.25.12.tgz", + "integrity": "sha512-h///Lr5a9rib/v1GGqXVGzjL4TMvVTv+s1DPoxQdz7l/AYv6LDSxdIwzxkrPW438oUXiDtwM10o9PmwS/6Z0Ng==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.25.12.tgz", + "integrity": "sha512-iyRrM1Pzy9GFMDLsXn1iHUm18nhKnNMWscjmp4+hpafcZjrr2WbT//d20xaGljXDBYHqRcl8HnxbX6uaA/eGVw==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.25.12.tgz", + "integrity": "sha512-9meM/lRXxMi5PSUqEXRCtVjEZBGwB7P/D4yT8UG/mwIdze2aV4Vo6U5gD3+RsoHXKkHCfSxZKzmDssVlRj1QQA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.25.12.tgz", + "integrity": 
"sha512-Zr7KR4hgKUpWAwb1f3o5ygT04MzqVrGEGXGLnj15YQDJErYu/BGg+wmFlIDOdJp0PmB0lLvxFIOXZgFRrdjR0w==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.25.12.tgz", + "integrity": "sha512-MsKncOcgTNvdtiISc/jZs/Zf8d0cl/t3gYWX8J9ubBnVOwlk65UIEEvgBORTiljloIWnBzLs4qhzPkJcitIzIg==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.25.12.tgz", + "integrity": "sha512-uqZMTLr/zR/ed4jIGnwSLkaHmPjOjJvnm6TVVitAa08SLS9Z0VM8wIRx7gWbJB5/J54YuIMInDquWyYvQLZkgw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.25.12.tgz", + "integrity": "sha512-xXwcTq4GhRM7J9A8Gv5boanHhRa/Q9KLVmcyXHCTaM4wKfIpWkdXiMog/KsnxzJ0A1+nD+zoecuzqPmCRyBGjg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.25.12.tgz", + "integrity": "sha512-Ld5pTlzPy3YwGec4OuHh1aCVCRvOXdH8DgRjfDy/oumVovmuSzWfnSJg+VtakB9Cm0gxNO9BzWkj6mtO1FMXkQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.25.12", + "resolved": 
"https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.25.12.tgz", + "integrity": "sha512-fF96T6KsBo/pkQI950FARU9apGNTSlZGsv1jZBAlcLL1MLjLNIWPBkj5NlSz8aAzYKg+eNqknrUJ24QBybeR5A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.25.12.tgz", + "integrity": "sha512-MZyXUkZHjQxUvzK7rN8DJ3SRmrVrke8ZyRusHlP+kuwqTcfWLyqMOE3sScPPyeIXN/mDJIfGXvcMqCgYKekoQw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.25.12.tgz", + "integrity": "sha512-rm0YWsqUSRrjncSXGA7Zv78Nbnw4XL6/dzr20cyrQf7ZmRcsovpcRBdhD43Nuk3y7XIoW2OxMVvwuRvk9XdASg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.25.12.tgz", + "integrity": "sha512-3wGSCDyuTHQUzt0nV7bocDy72r2lI33QL3gkDNGkod22EsYl04sMf0qLb8luNKTOmgF/eDEDP5BFNwoBKH441w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.25.12.tgz", + "integrity": "sha512-rMmLrur64A7+DKlnSuwqUdRKyd3UE7oPJZmnljqEptesKM8wx9J8gx5u0+9Pq0fQQW8vqeKebwNXdfOyP+8Bsg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": 
{ + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.25.12.tgz", + "integrity": "sha512-HkqnmmBoCbCwxUKKNPBixiWDGCpQGVsrQfJoVGYLPT41XWF8lHuE5N6WhVia2n4o5QK5M4tYr21827fNhi4byQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.25.12.tgz", + "integrity": "sha512-alJC0uCZpTFrSL0CCDjcgleBXPnCrEAhTBILpeAp7M/OFgoqtAetfBzX0xM00MUsVVPpVjlPuMbREqnZCXaTnA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.13", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", + "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.0", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/remapping": { + "version": "2.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz", + "integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true, + "license": "MIT", + 
"engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "dev": true, + "license": "MIT" + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.31", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", + "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.27", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.27.tgz", + "integrity": "sha512-+d0F4MKMCbeVUJwG96uQ4SgAznZNSq93I3V+9NHA4OpvqG8mRCpGdKmK8l/dl02h2CCDHwW2FqilnTyDcAnqjA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.60.2.tgz", + "integrity": "sha512-dnlp69efPPg6Uaw2dVqzWRfAWRnYVb1XJ8CyyhIbZeaq4CA5/mLeZ1IEt9QqQxmbdvagjLIm2ZL8BxXv5lH4Yw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.60.2.tgz", + "integrity": "sha512-OqZTwDRDchGRHHm/hwLOL7uVPB9aUvI0am/eQuWMNyFHf5PSEQmyEeYYheA0EPPKUO/l0uigCp+iaTjoLjVoHg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": 
"4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.60.2.tgz", + "integrity": "sha512-UwRE7CGpvSVEQS8gUMBe1uADWjNnVgP3Iusyda1nSRwNDCsRjnGc7w6El6WLQsXmZTbLZx9cecegumcitNfpmA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.60.2.tgz", + "integrity": "sha512-gjEtURKLCC5VXm1I+2i1u9OhxFsKAQJKTVB8WvDAHF+oZlq0GTVFOlTlO1q3AlCTE/DF32c16ESvfgqR7343/g==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.60.2.tgz", + "integrity": "sha512-Bcl6CYDeAgE70cqZaMojOi/eK63h5Me97ZqAQoh77VPjMysA/4ORQBRGo3rRy45x4MzVlU9uZxs8Uwy7ZaKnBw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.60.2.tgz", + "integrity": "sha512-LU+TPda3mAE2QB0/Hp5VyeKJivpC6+tlOXd1VMoXV/YFMvk/MNk5iXeBfB4MQGRWyOYVJ01625vjkr0Az98OJQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.60.2.tgz", + "integrity": "sha512-2QxQrM+KQ7DAW4o22j+XZ6RKdxjLD7BOWTP0Bv0tmjdyhXSsr2Ul1oJDQqh9Zf5qOwTuTc7Ek83mOFaKnodPjg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + 
"node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.60.2.tgz", + "integrity": "sha512-TbziEu2DVsTEOPif2mKWkMeDMLoYjx95oESa9fkQQK7r/Orta0gnkcDpzwufEcAO2BLBsD7mZkXGFqEdMRRwfw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.60.2.tgz", + "integrity": "sha512-bO/rVDiDUuM2YfuCUwZ1t1cP+/yqjqz+Xf2VtkdppefuOFS2OSeAfgafaHNkFn0t02hEyXngZkxtGqXcXwO8Rg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.60.2.tgz", + "integrity": "sha512-hr26p7e93Rl0Za+JwW7EAnwAvKkehh12BU1Llm9Ykiibg4uIr2rbpxG9WCf56GuvidlTG9KiiQT/TXT1yAWxTA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.60.2.tgz", + "integrity": "sha512-pOjB/uSIyDt+ow3k/RcLvUAOGpysT2phDn7TTUB3n75SlIgZzM6NKAqlErPhoFU+npgY3/n+2HYIQVbF70P9/A==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-musl": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.60.2.tgz", + "integrity": "sha512-2/w+q8jszv9Ww1c+6uJT3OwqhdmGP2/4T17cu8WuwyUuuaCDDJ2ojdyYwZzCxx0GcsZBhzi3HmH+J5pZNXnd+Q==", + "cpu": [ + "loong64" + ], + 
"dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.60.2.tgz", + "integrity": "sha512-11+aL5vKheYgczxtPVVRhdptAM2H7fcDR5Gw4/bTcteuZBlH4oP9f5s9zYO9aGZvoGeBpqXI/9TZZihZ609wKw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-musl": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.60.2.tgz", + "integrity": "sha512-i16fokAGK46IVZuV8LIIwMdtqhin9hfYkCh8pf8iC3QU3LpwL+1FSFGej+O7l3E/AoknL6Dclh2oTdnRMpTzFQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.60.2.tgz", + "integrity": "sha512-49FkKS6RGQoriDSK/6E2GkAsAuU5kETFCh7pG4yD/ylj9rKhTmO3elsnmBvRD4PgJPds5W2PkhC82aVwmUcJ7A==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.60.2.tgz", + "integrity": "sha512-mjYNkHPfGpUR00DuM1ZZIgs64Hpf4bWcz9Z41+4Q+pgDx73UwWdAYyf6EG/lRFldmdHHzgrYyge5akFUW0D3mQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.60.2.tgz", + "integrity": 
"sha512-ALyvJz965BQk8E9Al/JDKKDLH2kfKFLTGMlgkAbbYtZuJt9LU8DW3ZoDMCtQpXAltZxwBHevXz5u+gf0yA0YoA==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.60.2.tgz", + "integrity": "sha512-UQjrkIdWrKI626Du8lCQ6MJp/6V1LAo2bOK9OTu4mSn8GGXIkPXk/Vsp4bLHCd9Z9Iz2OTEaokUE90VweJgIYQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.60.2.tgz", + "integrity": "sha512-bTsRGj6VlSdn/XD4CGyzMnzaBs9bsRxy79eTqTCBsA8TMIEky7qg48aPkvJvFe1HyzQ5oMZdg7AnVlWQSKLTnw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openbsd-x64": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.60.2.tgz", + "integrity": "sha512-6d4Z3534xitaA1FcMWP7mQPq5zGwBmGbhphh2DwaA1aNIXUu3KTOfwrWpbwI4/Gr0uANo7NTtaykFyO2hPuFLg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.60.2.tgz", + "integrity": "sha512-NetAg5iO2uN7eB8zE5qrZ3CSil+7IJt4WDFLcC75Ymywq1VZVD6qJ6EvNLjZ3rEm6gB7XW5JdT60c6MN35Z85Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.60.2", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.60.2.tgz", + "integrity": "sha512-NCYhOotpgWZ5kdxCZsv6Iudx0wX8980Q/oW4pNFNihpBKsDbEA1zpkfxJGC0yugsUuyDZ7gL37dbzwhR0VI7pQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.60.2.tgz", + "integrity": "sha512-RXsaOqXxfoUBQoOgvmmijVxJnW2IGB0eoMO7F8FAjaj0UTywUO/luSqimWBJn04WNgUkeNhh7fs7pESXajWmkg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.60.2.tgz", + "integrity": "sha512-qdAzEULD+/hzObedtmV6iBpdL5TIbKVztGiK7O3/KYSf+HIzU257+MX1EXJcyIiDbMAqmbwaufcYPvyRryeZtA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.60.2.tgz", + "integrity": "sha512-Nd/SgG27WoA9e+/TdK74KnHz852TLa94ovOYySo/yMPuTmpckK/jIF2jSwS3g7ELSKXK13/cVdmg1Z/DaCWKxA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@types/babel__core": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", + "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.20.7", + "@babel/types": "^7.20.7", + "@types/babel__generator": "*", + "@types/babel__template": "*", + 
"@types/babel__traverse": "*" + } + }, + "node_modules/@types/babel__generator": { + "version": "7.27.0", + "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz", + "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__template": { + "version": "7.4.4", + "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", + "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__traverse": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", + "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.2" + } + }, + "node_modules/@types/estree": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/react": { + "version": "19.2.14", + "resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.14.tgz", + "integrity": "sha512-ilcTH/UniCkMdtexkoCN0bI7pMcJDvmQFPvuPvmEaYA/NSfFTAgdUSLAoVjaRJm7+6PvcM+q1zYOwS4wTYMF9w==", + "dev": true, + "license": "MIT", + "dependencies": { + "csstype": "^3.2.2" + } + }, + "node_modules/@types/react-dom": { + "version": "19.2.3", + "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-19.2.3.tgz", + "integrity": 
"sha512-jp2L/eY6fn+KgVVQAOqYItbF0VY/YApe5Mz2F0aykSO8gx31bYCZyvSeYxCHKvzHG5eZjc+zyaS5BrBWya2+kQ==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "@types/react": "^19.2.0" + } + }, + "node_modules/@vitejs/plugin-react": { + "version": "4.7.0", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.7.0.tgz", + "integrity": "sha512-gUu9hwfWvvEDBBmgtAowQCojwZmJ5mcLn3aufeCsitijs3+f2NsrPtlAWIR6OPiqljl96GVCUbLe0HyqIpVaoA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/core": "^7.28.0", + "@babel/plugin-transform-react-jsx-self": "^7.27.1", + "@babel/plugin-transform-react-jsx-source": "^7.27.1", + "@rolldown/pluginutils": "1.0.0-beta.27", + "@types/babel__core": "^7.20.5", + "react-refresh": "^0.17.0" + }, + "engines": { + "node": "^14.18.0 || >=16.0.0" + }, + "peerDependencies": { + "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0" + } + }, + "node_modules/baseline-browser-mapping": { + "version": "2.10.20", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.10.20.tgz", + "integrity": "sha512-1AaXxEPfXT+GvTBJFuy4yXVHWJBXa4OdbIebGN/wX5DlsIkU0+wzGnd2lOzokSk51d5LUmqjgBLRLlypLUqInQ==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "baseline-browser-mapping": "dist/cli.cjs" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/browserslist": { + "version": "4.28.2", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.2.tgz", + "integrity": "sha512-48xSriZYYg+8qXna9kwqjIVzuQxi+KYWp2+5nCYnYKPTr0LvD89Jqk2Or5ogxz0NUMfIjhh2lIUX/LyX9B4oIg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "baseline-browser-mapping": "^2.10.12", + "caniuse-lite": 
"^1.0.30001782", + "electron-to-chromium": "^1.5.328", + "node-releases": "^2.0.36", + "update-browserslist-db": "^1.2.3" + }, + "bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/caniuse-lite": { + "version": "1.0.30001788", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001788.tgz", + "integrity": "sha512-6q8HFp+lOQtcf7wBK+uEenxymVWkGKkjFpCvw5W25cmMwEDU45p1xQFBQv8JDlMMry7eNxyBaR+qxgmTUZkIRQ==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "CC-BY-4.0" + }, + "node_modules/convert-source-map": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", + "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", + "dev": true, + "license": "MIT" + }, + "node_modules/csstype": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz", + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/electron-to-chromium": { + "version": "1.5.340", + "resolved": 
"https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.340.tgz", + "integrity": "sha512-908qahOGocRMinT2nM3ajCEM99H4iPdv84eagPP3FfZy/1ZGeOy2CZYzjhms81ckOPCXPlW7LkY4XpxD8r1DrA==", + "dev": true, + "license": "ISC" + }, + "node_modules/esbuild": { + "version": "0.25.12", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.25.12.tgz", + "integrity": "sha512-bbPBYYrtZbkt6Os6FiTLCTFxvq4tt3JKall1vRwshA3fdVztsLAatFaZobhkBC8/BrPetoa0oksYoKXoG4ryJg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.25.12", + "@esbuild/android-arm": "0.25.12", + "@esbuild/android-arm64": "0.25.12", + "@esbuild/android-x64": "0.25.12", + "@esbuild/darwin-arm64": "0.25.12", + "@esbuild/darwin-x64": "0.25.12", + "@esbuild/freebsd-arm64": "0.25.12", + "@esbuild/freebsd-x64": "0.25.12", + "@esbuild/linux-arm": "0.25.12", + "@esbuild/linux-arm64": "0.25.12", + "@esbuild/linux-ia32": "0.25.12", + "@esbuild/linux-loong64": "0.25.12", + "@esbuild/linux-mips64el": "0.25.12", + "@esbuild/linux-ppc64": "0.25.12", + "@esbuild/linux-riscv64": "0.25.12", + "@esbuild/linux-s390x": "0.25.12", + "@esbuild/linux-x64": "0.25.12", + "@esbuild/netbsd-arm64": "0.25.12", + "@esbuild/netbsd-x64": "0.25.12", + "@esbuild/openbsd-arm64": "0.25.12", + "@esbuild/openbsd-x64": "0.25.12", + "@esbuild/openharmony-arm64": "0.25.12", + "@esbuild/sunos-x64": "0.25.12", + "@esbuild/win32-arm64": "0.25.12", + "@esbuild/win32-ia32": "0.25.12", + "@esbuild/win32-x64": "0.25.12" + } + }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/fdir": { + "version": "6.5.0", + 
"resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + }, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/gensync": { + "version": "1.0.0-beta.2", + "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", + "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/jsesc": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", + "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==", + "dev": true, + "license": "MIT", + "bin": { + "jsesc": "bin/jsesc" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/json5": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz", + "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==", + "dev": true, 
+ "license": "MIT", + "bin": { + "json5": "lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "license": "ISC", + "dependencies": { + "yallist": "^3.0.2" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/node-releases": { + "version": "2.0.37", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.37.tgz", + "integrity": "sha512-1h5gKZCF+pO/o3Iqt5Jp7wc9rH3eJJ0+nh/CIoiRwjRxde/hAHyLPXYN4V3CqKAbiZPSeJFSWHmJsbkicta0Eg==", + "dev": true, + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "dev": true, + "license": "ISC" + }, + "node_modules/picomatch": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-4.0.4.tgz", + "integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==", + "dev": 
true, + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/postcss": { + "version": "8.5.10", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.10.tgz", + "integrity": "sha512-pMMHxBOZKFU6HgAZ4eyGnwXF/EvPGGqUr0MnZ5+99485wwW41kW91A4LOGxSHhgugZmSChL5AlElNdwlNgcnLQ==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/react": { + "version": "19.2.5", + "resolved": "https://registry.npmjs.org/react/-/react-19.2.5.tgz", + "integrity": "sha512-llUJLzz1zTUBrskt2pwZgLq59AemifIftw4aB7JxOqf1HY2FDaGDxgwpAPVzHU1kdWabH7FauP4i1oEeer2WCA==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-dom": { + "version": "19.2.5", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.5.tgz", + "integrity": "sha512-J5bAZz+DXMMwW/wV3xzKke59Af6CHY7G4uYLN1OvBcKEsWOs4pQExj86BBKamxl/Ik5bx9whOrvBlSDfWzgSag==", + "license": "MIT", + "dependencies": { + "scheduler": "^0.27.0" + }, + "peerDependencies": { + "react": "^19.2.5" + } + }, + "node_modules/react-refresh": { + "version": "0.17.0", + "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.17.0.tgz", + "integrity": "sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/rollup": { + "version": "4.60.2", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.2.tgz", + "integrity": 
"sha512-J9qZyW++QK/09NyN/zeO0dG/1GdGfyp9lV8ajHnRVLfo/uFsbji5mHnDgn/qYdUHyCkM2N+8VyspgZclfAh0eQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.8" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.60.2", + "@rollup/rollup-android-arm64": "4.60.2", + "@rollup/rollup-darwin-arm64": "4.60.2", + "@rollup/rollup-darwin-x64": "4.60.2", + "@rollup/rollup-freebsd-arm64": "4.60.2", + "@rollup/rollup-freebsd-x64": "4.60.2", + "@rollup/rollup-linux-arm-gnueabihf": "4.60.2", + "@rollup/rollup-linux-arm-musleabihf": "4.60.2", + "@rollup/rollup-linux-arm64-gnu": "4.60.2", + "@rollup/rollup-linux-arm64-musl": "4.60.2", + "@rollup/rollup-linux-loong64-gnu": "4.60.2", + "@rollup/rollup-linux-loong64-musl": "4.60.2", + "@rollup/rollup-linux-ppc64-gnu": "4.60.2", + "@rollup/rollup-linux-ppc64-musl": "4.60.2", + "@rollup/rollup-linux-riscv64-gnu": "4.60.2", + "@rollup/rollup-linux-riscv64-musl": "4.60.2", + "@rollup/rollup-linux-s390x-gnu": "4.60.2", + "@rollup/rollup-linux-x64-gnu": "4.60.2", + "@rollup/rollup-linux-x64-musl": "4.60.2", + "@rollup/rollup-openbsd-x64": "4.60.2", + "@rollup/rollup-openharmony-arm64": "4.60.2", + "@rollup/rollup-win32-arm64-msvc": "4.60.2", + "@rollup/rollup-win32-ia32-msvc": "4.60.2", + "@rollup/rollup-win32-x64-gnu": "4.60.2", + "@rollup/rollup-win32-x64-msvc": "4.60.2", + "fsevents": "~2.3.2" + } + }, + "node_modules/scheduler": { + "version": "0.27.0", + "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.27.0.tgz", + "integrity": "sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q==", + "license": "MIT" + }, + "node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": 
"sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/tinyglobby": { + "version": "0.2.16", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.16.tgz", + "integrity": "sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg==", + "dev": true, + "license": "MIT", + "dependencies": { + "fdir": "^6.5.0", + "picomatch": "^4.0.4" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/SuperchupuDev" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/update-browserslist-db": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz", + "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { 
+ "escalade": "^3.2.0", + "picocolors": "^1.1.1" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/vite": { + "version": "6.4.2", + "resolved": "https://registry.npmjs.org/vite/-/vite-6.4.2.tgz", + "integrity": "sha512-2N/55r4JDJ4gdrCvGgINMy+HH3iRpNIz8K6SFwVsA+JbQScLiC+clmAxBgwiSPgcG9U15QmvqCGWzMbqda5zGQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.25.0", + "fdir": "^6.4.4", + "picomatch": "^4.0.2", + "postcss": "^8.5.3", + "rollup": "^4.34.9", + "tinyglobby": "^0.2.13" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || ^20.0.0 || >=22.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0", + "jiti": ">=1.21.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.16.0", + "tsx": "^4.8.1", + "yaml": "^2.4.2" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "jiti": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + }, + "tsx": { + "optional": true + }, + "yaml": { + "optional": true + } + } + }, + "node_modules/yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", + "dev": true, + "license": "ISC" + } + } +} diff --git a/training/smart-turn-zh/viewer/package.json b/training/smart-turn-zh/viewer/package.json new file mode 100644 index 0000000..abca164 
--- /dev/null +++ b/training/smart-turn-zh/viewer/package.json @@ -0,0 +1,22 @@ +{ + "name": "audio-viewer", + "private": true, + "version": "0.2.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "tsc && vite build", + "preview": "vite preview" + }, + "dependencies": { + "react": "^19.0.0", + "react-dom": "^19.0.0" + }, + "devDependencies": { + "@types/react": "^19.0.0", + "@types/react-dom": "^19.0.0", + "@vitejs/plugin-react": "^4.3.0", + "typescript": "^5.7.0", + "vite": "^6.0.0" + } +} diff --git a/training/smart-turn-zh/viewer/src/App.tsx b/training/smart-turn-zh/viewer/src/App.tsx new file mode 100644 index 0000000..e5d87d8 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/App.tsx @@ -0,0 +1,485 @@ +import { useRef, useState, useCallback, useEffect } from 'react'; +import { Timeline } from './lib/timeline'; +import { AudioStore } from './lib/audio'; +import type { WaveformScale } from './lib/waveform'; +import { + type Sentence, + parseSentences, searchSentences, zoomToSentence, +} from './lib/asr'; +import { parseNpy, findVADBlocks, nextVADBlock, prevVADBlock } from './lib/vad'; +import { Toolbar } from './components/Toolbar'; +import { WaveformTrack } from './components/WaveformTrack'; +import { VADTrack } from './components/VADTrack'; +import { SpectrogramTrack } from './components/SpectrogramTrack'; +import { ResizeHandle } from './components/ResizeHandle'; +import { Minimap } from './components/Minimap'; +import { ASRPanel } from './components/ASRPanel'; +import { ShortcutsDialog } from './components/ShortcutsDialog'; +import { usePlayback } from './hooks/usePlayback'; + +export function App() { + // Stable singleton instances + const [tl] = useState(() => new Timeline()); + const [audio] = useState(() => new AudioStore()); + + // File loading status + const [wavStatus, setWavStatus] = useState(''); + const [vadStatus, setVadStatus] = useState(''); + const [asrStatus, setAsrStatus] = useState(''); + + // Audio state + const 
[channelCount, setChannelCount] = useState(0); + const [channel, setChannel] = useState(-1); + const [scale, setScale] = useState('dB'); + const [gainValue, setGainValue] = useState(10.0); + + // VAD state + const [vadProbs, setVadProbs] = useState(null); + const [vadEntry, setVadEntry] = useState(0.3); + const [vadExit, setVadExit] = useState(0.1); + + // ASR state + const [sentences, setSentences] = useState([]); + const [searchQuery, setSearchQuery] = useState(''); + const [searchResults, setSearchResults] = useState([]); + const [searchIdx, setSearchIdx] = useState(-1); + + // Drop overlay + const [showDrop, setShowDrop] = useState(false); + + // Shortcuts dialog + const [showShortcuts, setShowShortcuts] = useState(false); + + // ---- Layout persistence & resize handlers ---- + + const LAYOUT_KEY = 'viewer-layout'; + + const [resizeTrack] = useState(() => { + const ids = ['waveform-track', 'vad-track', 'spectrogram-track']; + const MIN = 30; + + const save = () => { + const els = ids.map((id) => document.getElementById(id)); + const panel = document.getElementById('asr-panel'); + if (els.some((e) => !e) || !panel) return; + const layout = { + trackFlex: els.map((e) => e!.offsetHeight), + asrWidth: panel.offsetWidth, + }; + localStorage.setItem(LAYOUT_KEY, JSON.stringify(layout)); + }; + + const resize = (aboveIdx: number, belowIdx: number) => (delta: number) => { + const els = ids.map((id) => document.getElementById(id)); + if (els.some((e) => !e)) return; + const heights = els.map((e) => e!.offsetHeight); + + heights[aboveIdx] += delta; + heights[belowIdx] -= delta; + if (heights[aboveIdx] < MIN || heights[belowIdx] < MIN) return; + + els.forEach((e, i) => { e!.style.flex = `${heights[i]} 0 0px`; }); + save(); + }; + + return { + wfVad: resize(0, 1), + vadSpec: resize(1, 2), + asr: (delta: number) => { + const panel = document.getElementById('asr-panel'); + if (!panel) return; + panel.style.width = `${Math.max(200, Math.min(600, panel.offsetWidth - 
delta))}px`; + save(); + }, + }; + }); + + // Restore saved layout on mount + useEffect(() => { + try { + const raw = localStorage.getItem(LAYOUT_KEY); + if (!raw) return; + const layout = JSON.parse(raw) as { trackFlex?: number[]; asrWidth?: number }; + const ids = ['waveform-track', 'vad-track', 'spectrogram-track']; + if (layout.trackFlex?.length === 3) { + ids.forEach((id, i) => { + const el = document.getElementById(id); + if (el) el.style.flex = `${layout.trackFlex![i]} 0 0px`; + }); + } + if (layout.asrWidth) { + const panel = document.getElementById('asr-panel'); + if (panel) panel.style.width = `${layout.asrWidth}px`; + } + } catch { /* ignore corrupt data */ } + }, []); + + // Playback + const { playing, play, playSegment, stop, setGain, invalidateBuffer } = usePlayback(tl, audio); + + // Refs for stable callbacks that need current values + const playingRef = useRef(playing); + const channelRef = useRef(channel); + const playRef = useRef(play); + const playSegmentRef = useRef(playSegment); + const stopRef = useRef(stop); + playingRef.current = playing; + channelRef.current = channel; + playRef.current = play; + playSegmentRef.current = playSegment; + stopRef.current = stop; + + // Sentences ref for search callbacks + const sentencesRef = useRef(sentences); + sentencesRef.current = sentences; + + // Search state refs for next/prev + const searchResultsRef = useRef(searchResults); + const searchIdxRef = useRef(searchIdx); + searchResultsRef.current = searchResults; + searchIdxRef.current = searchIdx; + + // VAD refs for keyboard nav + const vadProbsRef = useRef(vadProbs); + const vadEntryRef = useRef(vadEntry); + const vadExitRef = useRef(vadExit); + vadProbsRef.current = vadProbs; + vadEntryRef.current = vadEntry; + vadExitRef.current = vadExit; + + // ---- Handlers ---- + + const handleSeek = useCallback((t: number) => { + tl.setCursor(t); + if (playingRef.current) { + stopRef.current(); + playRef.current(t, channelRef.current); + } + }, [tl]); + + const 
handlePlaySegment = useCallback((start: number, end: number) => { + tl.clearLoop(); + tl.setCursor(start); + playSegmentRef.current(start, end, channelRef.current); + }, [tl]); + + const handlePlayToggle = useCallback(() => { + if (playingRef.current) { + stopRef.current(); + } else if (tl.hasLoop) { + tl.setCursor(tl.loopStart); + playSegmentRef.current(tl.loopStart, tl.loopEnd, channelRef.current); + } else { + playRef.current(tl.cursor, channelRef.current); + } + }, [tl]); + + const handleChannelChange = useCallback((ch: number) => { + setChannel(ch); + invalidateBuffer(); + }, [invalidateBuffer]); + + const handleGainChange = useCallback((v: number) => { + setGainValue(v); + setGain(v); + }, [setGain]); + + // ---- File loading ---- + + const loadWav = useCallback(async (f: File) => { + setWavStatus('...'); + try { + await audio.load(f); + tl.sampleRate = audio.sampleRate; + tl.setDuration(audio.duration); + setChannelCount(audio.channelCount); + setChannel(-1); + invalidateBuffer(); + setWavStatus('\u2713'); + } catch (e) { + setWavStatus('\u2717'); + console.error(e); + } + }, [audio, tl, invalidateBuffer]); + + const loadVad = useCallback(async (f: File) => { + setVadStatus('...'); + try { + const buf = await f.arrayBuffer(); + setVadProbs(parseNpy(buf)); + setVadStatus('\u2713'); + } catch (e) { + setVadStatus('\u2717'); + console.error(e); + } + }, []); + + const loadAsr = useCallback(async (f: File) => { + setAsrStatus('...'); + try { + const json = JSON.parse(await f.text()); + const sents = parseSentences(Array.isArray(json) ? 
json : [json]); + setSentences(sents); + setSearchQuery(''); + setSearchResults([]); + setSearchIdx(-1); + setAsrStatus('\u2713'); + } catch (e) { + setAsrStatus('\u2717'); + console.error(e); + } + }, []); + + const routeFile = useCallback((f: File) => { + if (f.name.endsWith('.wav')) loadWav(f); + else if (f.name.endsWith('.npy')) loadVad(f); + else if (f.name.endsWith('.json')) loadAsr(f); + }, [loadWav, loadVad, loadAsr]); + + const handleFilesSelected = useCallback((files: File[]) => { + for (const f of files) routeFile(f); + }, [routeFile]); + + // ---- Search ---- + + const handleSearch = useCallback((q: string) => { + setSearchQuery(q); + const sents = sentencesRef.current; + const results = searchSentences(sents, q); + setSearchResults(results); + if (results.length) { + setSearchIdx(0); + zoomToSentence(tl, sents, results[0]); + } else { + setSearchIdx(-1); + } + }, [tl]); + + const handleSearchNext = useCallback(() => { + const results = searchResultsRef.current; + if (!results.length) return; + const next = (searchIdxRef.current + 1) % results.length; + setSearchIdx(next); + zoomToSentence(tl, sentencesRef.current, results[next]); + }, [tl]); + + const handleSearchPrev = useCallback(() => { + const results = searchResultsRef.current; + if (!results.length) return; + const prev = (searchIdxRef.current - 1 + results.length) % results.length; + setSearchIdx(prev); + zoomToSentence(tl, sentencesRef.current, results[prev]); + }, [tl]); + + // ---- Drag & Drop ---- + + useEffect(() => { + const onDragOver = (e: DragEvent) => { e.preventDefault(); setShowDrop(true); }; + const onDrop = (e: DragEvent) => { + e.preventDefault(); + setShowDrop(false); + for (const f of e.dataTransfer?.files ?? 
[]) routeFile(f); + }; + document.addEventListener('dragover', onDragOver); + document.addEventListener('drop', onDrop); + return () => { + document.removeEventListener('dragover', onDragOver); + document.removeEventListener('drop', onDrop); + }; + }, [routeFile]); + + // ---- Keyboard shortcuts ---- + + useEffect(() => { + const onKeyDown = (e: KeyboardEvent) => { + const searchInput = document.getElementById('search-input'); + + // Let search input handle its own keys (except Escape) + if (e.target === searchInput && e.key !== 'Escape') return; + + switch (e.key) { + case ' ': + e.preventDefault(); + handlePlayToggle(); + break; + case 'f': + e.preventDefault(); + searchInput?.focus(); + break; + case 'Escape': + tl.clearLoop(); + (document.activeElement as HTMLElement)?.blur(); + break; + case '=': case '+': + tl.zoom(0.67, 0.5); + break; + case '-': + tl.zoom(1.5, 0.5); + break; + case 'ArrowLeft': + tl.pan(-(tl.viewEnd - tl.viewStart) * 0.1); + break; + case 'ArrowRight': + tl.pan((tl.viewEnd - tl.viewStart) * 0.1); + break; + case '[': { + const probs = vadProbsRef.current; + if (probs) { + const blocks = findVADBlocks(probs, 0.032, vadEntryRef.current, vadExitRef.current); + const b = prevVADBlock(blocks, tl.cursor); + if (b) { + handleSeek(b.start); + if (b.start < tl.viewStart || b.start > tl.viewEnd) { + const span = tl.viewEnd - tl.viewStart; + tl.setView(b.start - span * 0.2, b.start - span * 0.2 + span); + } + } + } + break; + } + case ']': { + const probs = vadProbsRef.current; + if (probs) { + const blocks = findVADBlocks(probs, 0.032, vadEntryRef.current, vadExitRef.current); + const b = nextVADBlock(blocks, tl.cursor); + if (b) { + handleSeek(b.start); + if (b.start < tl.viewStart || b.start > tl.viewEnd) { + const span = tl.viewEnd - tl.viewStart; + tl.setView(b.start - span * 0.2, b.start - span * 0.2 + span); + } + } + } + break; + } + case '{': { + const sents = sentencesRef.current; + const cursorMs = tl.cursor * 1000; + for (let i = 
sents.length - 1; i >= 0; i--) { + if (sents[i].start < cursorMs - 1) { + const t = sents[i].start / 1000; + handleSeek(t); + if (t < tl.viewStart || t > tl.viewEnd) { + const span = tl.viewEnd - tl.viewStart; + tl.setView(t - span * 0.2, t - span * 0.2 + span); + } + break; + } + } + break; + } + case '}': { + const sents = sentencesRef.current; + const cursorMs = tl.cursor * 1000; + for (let i = 0; i < sents.length; i++) { + if (sents[i].start > cursorMs + 1) { + const t = sents[i].start / 1000; + handleSeek(t); + if (t < tl.viewStart || t > tl.viewEnd) { + const span = tl.viewEnd - tl.viewStart; + tl.setView(t - span * 0.2, t - span * 0.2 + span); + } + break; + } + } + break; + } + case '?': + setShowShortcuts(v => !v); + break; + case '0': + handleChannelChange(-1); + break; + default: + if (e.key >= '1' && e.key <= '9') { + const ch = +e.key - 1; + if (ch < audio.channelCount) handleChannelChange(ch); + } + } + }; + document.addEventListener('keydown', onKeyDown); + return () => document.removeEventListener('keydown', onKeyDown); + }, [tl, audio, handlePlayToggle, handleChannelChange, handleSeek]); + + // ---- Render ---- + + return ( +
+ 0} + onPlayToggle={handlePlayToggle} + gainValue={gainValue} + onGainChange={handleGainChange} + onShowShortcuts={() => setShowShortcuts(true)} + /> + +
+
+
+ + + + + +
+ + +
+ + + + +
+ + + + {showShortcuts && setShowShortcuts(false)} />} +
+ ); +} diff --git a/training/smart-turn-zh/viewer/src/components/ASRPanel.tsx b/training/smart-turn-zh/viewer/src/components/ASRPanel.tsx new file mode 100644 index 0000000..805143f --- /dev/null +++ b/training/smart-turn-zh/viewer/src/components/ASRPanel.tsx @@ -0,0 +1,223 @@ +import { useRef, useEffect, useCallback, memo } from 'react'; +import type { Timeline } from '../lib/timeline'; +import { + type Sentence, + matchHlSet, fmtTime, fmtMs, +} from '../lib/asr'; + +interface ASRPanelProps { + timeline: Timeline; + sentences: Sentence[]; + searchQuery: string; + searchResults: number[]; + searchResultIdx: number; + onSearchChange: (q: string) => void; + onNext: () => void; + onPrev: () => void; + onSeek: (time: number) => void; + onPlaySegment: (start: number, end: number) => void; + playing: boolean; +} + +export const ASRPanel = memo(function ASRPanel({ + timeline, sentences, searchQuery, searchResults, searchResultIdx, + onSearchChange, onNext, onPrev, onSeek, onPlaySegment, playing, +}: ASRPanelProps) { + const listRef = useRef(null); + const activeCharRef = useRef(null); + const activeRangeRef = useRef<[number, number]>([-1, -1]); + const activeSentRef = useRef(null); + + // Keep sentences in ref for stable callbacks + const sentencesRef = useRef(sentences); + sentencesRef.current = sentences; + + const clearHighlight = useCallback(() => { + if (activeCharRef.current) { + activeCharRef.current.classList.remove('char-active'); + activeCharRef.current = null; + activeRangeRef.current = [-1, -1]; + } + }, []); + + /** Find the sentence at timeMs, or the nearest one. 
*/ + const findSentenceAt = useCallback((timeMs: number): number => { + const sents = sentencesRef.current; + if (!sents.length) return -1; + + // Exact hit + for (let i = 0; i < sents.length; i++) { + if (timeMs >= sents[i].start && timeMs <= sents[i].end) return i; + } + + // Nearest by distance to start or end + let best = 0; + let bestDist = Infinity; + for (let i = 0; i < sents.length; i++) { + const d = Math.min(Math.abs(timeMs - sents[i].start), Math.abs(timeMs - sents[i].end)); + if (d < bestDist) { bestDist = d; best = i; } + } + return best; + }, []); + + const highlightAt = useCallback((timeMs: number) => { + const sents = sentencesRef.current; + const list = listRef.current; + if (!list) return; + + // ---- Sentence-level highlight ---- + const si = findSentenceAt(timeMs); + const sentEl = si >= 0 ? list.querySelector(`[data-idx="${si}"]`) as HTMLElement | null : null; + + if (sentEl !== activeSentRef.current) { + activeSentRef.current?.classList.remove('active'); + if (sentEl) { + sentEl.classList.add('active'); + sentEl.scrollIntoView({ block: 'nearest', behavior: 'auto' }); + } + activeSentRef.current = sentEl; + } + + // ---- Character-level karaoke highlight ---- + if ( + activeCharRef.current && + timeMs >= activeRangeRef.current[0] && + timeMs < activeRangeRef.current[1] + ) { + return; + } + clearHighlight(); + + if (si < 0) return; + const s = sents[si]; + let charIdx = -1; + for (let ci = 0; ci < s.chars.length; ci++) { + const c = s.chars[ci]; + if (c.start >= 0 && timeMs >= c.start && timeMs < c.end) { charIdx = ci; break; } + } + if (charIdx < 0 || !sentEl) return; + + const txtEl = sentEl.querySelector('.txt'); + if (!txtEl) return; + const el = txtEl.children[charIdx] as HTMLElement | undefined; + if (!el) return; + + el.classList.add('char-active'); + activeCharRef.current = el; + activeRangeRef.current = [s.chars[charIdx].start, s.chars[charIdx].end]; + }, [clearHighlight, findSentenceAt]); + + // Subscribe to timeline for highlighting 
+ useEffect(() => { + return timeline.onUpdate(() => highlightAt(timeline.cursor * 1000)); + }, [timeline, highlightAt]); + + // Re-apply after DOM rebuild + useEffect(() => { + activeSentRef.current = null; + clearHighlight(); + highlightAt(timeline.cursor * 1000); + }, [sentences, searchQuery, searchResults, searchResultIdx, clearHighlight, highlightAt, timeline]); + + // Scroll current search result into view + const currentSent = searchResultIdx >= 0 ? searchResults[searchResultIdx] : -1; + useEffect(() => { + if (currentSent >= 0) { + const el = listRef.current?.querySelector(`[data-idx="${currentSent}"]`); + el?.scrollIntoView({ block: 'center', behavior: 'smooth' }); + } + }, [currentSent]); + + // Handle click on a sentence or character + const handleClick = useCallback((e: React.MouseEvent) => { + const el = (e.target as HTMLElement).closest('.sentence') as HTMLElement | null; + if (!el) return; + const sentIdx = +el.dataset.idx!; + + const charEl = (e.target as HTMLElement).closest('.char') as HTMLElement | null; + const t = charEl ? +charEl.dataset.cs! / 1000 : +el.dataset.start! / 1000; + + // Pan to show the sentence without changing zoom level + const span = timeline.viewEnd - timeline.viewStart; + if (t < timeline.viewStart || t > timeline.viewEnd) { + timeline.setView(t - span * 0.1, t - span * 0.1 + span); + } + onSeek(t); + }, [timeline, onSeek]); + + const matchSet = new Set(searchResults); + const searchCount = searchQuery + ? (searchResults.length ? `${searchResultIdx + 1}/${searchResults.length}` : 'No results') + : ''; + + return ( +
+ +
+ {sentences.map((s, i) => { + const cls = i === currentSent + ? 'sentence current' + : matchSet.has(i) ? 'sentence match' : 'sentence'; + const hlSet = searchQuery && matchSet.has(i) + ? matchHlSet(s.text, searchQuery) : null; + + return ( +
+ { e.stopPropagation(); onPlaySegment(s.start / 1000, s.end / 1000); }} + title={`Play ${fmtTime(s.start / 1000)} \u2013 ${fmtTime(s.end / 1000)}`} + >{fmtTime(s.start / 1000)} + + {s.chars.map((c, ci) => { + const hl = hlSet?.has(ci) ? ' search-hl' : ''; + if (c.start >= 0) { + return ( + + {c.char} + + ); + } + return {c.char}; + })} + +
+ ); + })} +
+
+ ); +}); diff --git a/training/smart-turn-zh/viewer/src/components/Minimap.tsx b/training/smart-turn-zh/viewer/src/components/Minimap.tsx new file mode 100644 index 0000000..6195794 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/components/Minimap.tsx @@ -0,0 +1,78 @@ +import { useRef, useEffect, useCallback, memo } from 'react'; +import type { Timeline } from '../lib/timeline'; +import type { AudioStore } from '../lib/audio'; + +interface MinimapProps { + timeline: Timeline; + audio: AudioStore; + channel: number; +} + +export const Minimap = memo(function Minimap({ timeline, audio, channel }: MinimapProps) { + const canvasRef = useRef(null); + const channelRef = useRef(channel); + channelRef.current = channel; + + const draw = useCallback(() => { + const canvas = canvasRef.current; + if (!canvas || !audio.raw) return; + + const dpr = devicePixelRatio; + const rect = canvas.getBoundingClientRect(); + canvas.width = rect.width * dpr; + canvas.height = rect.height * dpr; + const ctx = canvas.getContext('2d')!; + const w = rect.width, h = rect.height; + ctx.setTransform(dpr, 0, 0, dpr, 0, 0); + ctx.clearRect(0, 0, w, h); + + // Coarsest LOD waveform overview + const ch = channelRef.current; + const levels = audio.getLOD(ch); + if (levels.length) { + const lv = levels[levels.length - 1]; + const mid = h / 2, amp = mid; + ctx.fillStyle = '#1e3a4a'; + ctx.beginPath(); + for (let x = 0; x < w; x++) { + const b = Math.min(Math.floor((x / w) * lv.max.length), lv.max.length - 1); + x === 0 ? 
ctx.moveTo(x, mid - lv.max[b] * amp) : ctx.lineTo(x, mid - lv.max[b] * amp); + } + for (let x = w - 1; x >= 0; x--) { + const b = Math.min(Math.floor((x / w) * lv.min.length), lv.min.length - 1); + ctx.lineTo(x, mid - lv.min[b] * amp); + } + ctx.closePath(); + ctx.fill(); + } + + // Viewport indicator + const x1 = (timeline.viewStart / timeline.duration) * w; + const x2 = (timeline.viewEnd / timeline.duration) * w; + ctx.fillStyle = 'rgba(79,195,247,0.15)'; + ctx.fillRect(x1, 0, x2 - x1, h); + ctx.strokeStyle = '#4fc3f7'; + ctx.lineWidth = 1; + ctx.strokeRect(x1 + 0.5, 0.5, x2 - x1 - 1, h - 1); + }, [timeline, audio]); + + // Subscribe to timeline updates + useEffect(() => { + return timeline.onUpdate(draw); + }, [timeline, draw]); + + // Redraw on channel change + useEffect(draw, [channel, draw]); + + const handleClick = useCallback((e: React.MouseEvent) => { + const canvas = canvasRef.current; + if (!canvas) return; + const rect = canvas.getBoundingClientRect(); + const frac = (e.clientX - rect.left) / rect.width; + const t = frac * timeline.duration; + const span = timeline.viewEnd - timeline.viewStart; + timeline.setView(t - span / 2, t + span / 2); + }, [timeline]); + + return ; +}); diff --git a/training/smart-turn-zh/viewer/src/components/ResizeHandle.tsx b/training/smart-turn-zh/viewer/src/components/ResizeHandle.tsx new file mode 100644 index 0000000..c6276b2 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/components/ResizeHandle.tsx @@ -0,0 +1,41 @@ +import { memo } from 'react'; + +interface ResizeHandleProps { + direction: 'row' | 'col'; + onDrag: (delta: number) => void; +} + +export const ResizeHandle = memo(function ResizeHandle({ + direction, + onDrag, +}: ResizeHandleProps) { + const onMouseDown = (e: React.MouseEvent) => { + e.preventDefault(); + let last = direction === 'col' ? e.clientX : e.clientY; + + const move = (ev: MouseEvent) => { + const cur = direction === 'col' ? 
ev.clientX : ev.clientY; + onDrag(cur - last); + last = cur; + }; + const up = () => { + window.removeEventListener('mousemove', move); + window.removeEventListener('mouseup', up); + document.body.style.removeProperty('cursor'); + document.body.style.removeProperty('user-select'); + }; + + document.body.style.cursor = + direction === 'col' ? 'col-resize' : 'row-resize'; + document.body.style.userSelect = 'none'; + window.addEventListener('mousemove', move); + window.addEventListener('mouseup', up); + }; + + return ( +
+ ); +}); diff --git a/training/smart-turn-zh/viewer/src/components/ShortcutsDialog.tsx b/training/smart-turn-zh/viewer/src/components/ShortcutsDialog.tsx new file mode 100644 index 0000000..4429d86 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/components/ShortcutsDialog.tsx @@ -0,0 +1,36 @@ +import { memo } from 'react'; + +const SHORTCUTS = [ + { key: 'Space', desc: 'Play / pause (or replay loop)' }, + { key: 'F', desc: 'Focus search' }, + { key: 'Escape', desc: 'Clear loop / blur input' }, + { key: '+ / −', desc: 'Zoom in / out' }, + { key: '← / →', desc: 'Pan left / right' }, + { key: '[ / ]', desc: 'Previous / next VAD block' }, + { key: '{ / }', desc: 'Previous / next ASR sentence' }, + { key: 'Shift + drag', desc: 'Select loop range' }, + { key: '0', desc: 'All channels' }, + { key: '1–9', desc: 'Select channel' }, + { key: '?', desc: 'Toggle this dialog' }, +]; + +export const ShortcutsDialog = memo(function ShortcutsDialog({ onClose }: { onClose: () => void }) { + return ( +
+
e.stopPropagation()}> +
+ Keyboard Shortcuts + +
+
+ {SHORTCUTS.map(s => ( +
+ {s.key} + {s.desc} +
+ ))} +
+
+
+ ); +}); diff --git a/training/smart-turn-zh/viewer/src/components/SpectrogramTrack.tsx b/training/smart-turn-zh/viewer/src/components/SpectrogramTrack.tsx new file mode 100644 index 0000000..735978e --- /dev/null +++ b/training/smart-turn-zh/viewer/src/components/SpectrogramTrack.tsx @@ -0,0 +1,80 @@ +import { useRef, useEffect, useCallback, useState, memo } from 'react'; +import type { Timeline } from '../lib/timeline'; +import type { AudioStore } from '../lib/audio'; +import { SpectrogramRenderer, type FreqScale } from '../lib/spectrogram'; +import { useCanvasInteraction } from '../hooks/useCanvasInteraction'; + +interface SpectrogramTrackProps { + timeline: Timeline; + audio: AudioStore; + channel: number; + onSeek: (t: number) => void; +} + +export const SpectrogramTrack = memo(function SpectrogramTrack({ + timeline, audio, channel, onSeek, +}: SpectrogramTrackProps) { + const canvasRef = useRef(null); + const rendererRef = useRef(null); + const [freqScale, setFreqScale] = useState('mel'); + + const channelRef = useRef(channel); + const freqScaleRef = useRef(freqScale); + channelRef.current = channel; + freqScaleRef.current = freqScale; + + const draw = useCallback(() => { + const r = rendererRef.current; + if (!r) return; + r.channel = channelRef.current; + r.freqScale = freqScaleRef.current; + r.render(); + }, []); + + useEffect(() => { + const canvas = canvasRef.current; + if (!canvas) return; + + const r = new SpectrogramRenderer(canvas, timeline, audio); + rendererRef.current = r; + r.onTileReady = draw; + r.resize(); + draw(); + + const unsub = timeline.onUpdate(draw); + const ro = new ResizeObserver(() => { r.resize(); draw(); }); + ro.observe(canvas.parentElement!); + + return () => { + unsub(); + ro.disconnect(); + r.dispose(); + rendererRef.current = null; + }; + }, [timeline, audio, draw]); + + useEffect(draw, [channel, freqScale, draw]); + + useCanvasInteraction(canvasRef, timeline, onSeek); + + return ( +
+ Spectrogram +
+ + +
+ +
+ ); +}); diff --git a/training/smart-turn-zh/viewer/src/components/Toolbar.tsx b/training/smart-turn-zh/viewer/src/components/Toolbar.tsx new file mode 100644 index 0000000..ed77967 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/components/Toolbar.tsx @@ -0,0 +1,165 @@ +import { useRef, useEffect, useCallback, useState, memo } from 'react'; +import type { Timeline } from '../lib/timeline'; +import { fmtTime } from '../lib/asr'; + +/** Zoom presets: `span` is the visible window in seconds and must match `label`; span 0 selects the whole timeline. */ +const ZOOM_PRESETS = [ + { label: '1s', span: 1 }, + { label: '5s', span: 5 }, + { label: '10s', span: 10 }, + { label: '20s', span: 20 }, + { label: '1m', span: 60 }, + { label: '5m', span: 300 }, + { label: '20m', span: 1200 }, + { label: 'Full', span: 0 }, +]; + +interface ToolbarProps { + timeline: Timeline; + onFilesSelected: (files: File[]) => void; + wavStatus: string; + vadStatus: string; + asrStatus: string; + channelCount: number; + channel: number; + onChannelChange: (ch: number) => void; + playing: boolean; + canPlay: boolean; + onPlayToggle: () => void; + gainValue: number; + onGainChange: (v: number) => void; + onShowShortcuts: () => void; +} + +export const Toolbar = memo(function Toolbar({ + timeline, onFilesSelected, + wavStatus, vadStatus, asrStatus, + channelCount, channel, onChannelChange, + playing, canPlay, onPlayToggle, + gainValue, onGainChange, + onShowShortcuts, +}: ToolbarProps) { + const timeRef = useRef(null); + const spanRef = useRef(null); + const loopRef = useRef(null); + const [activeZoom, setActiveZoom] = useState(null); + + // Update time display imperatively (60fps during playback) + useEffect(() => { + const update = () => { + if (timeRef.current) { + timeRef.current.textContent = `${fmtTime(timeline.cursor)} / ${fmtTime(timeline.duration)}`; + } + if (spanRef.current) { + const s = timeline.viewEnd - timeline.viewStart; + spanRef.current.textContent = s < 1 ? `${(s * 1000) | 0}ms` + : s < 60 ? `${s.toFixed(1)}s` + : s < 3600 ? 
`${(s / 60).toFixed(1)}m` + : `${(s / 3600).toFixed(1)}h`; + } + if (loopRef.current) { + if (timeline.hasLoop) { + const d = timeline.loopEnd - timeline.loopStart; + loopRef.current.textContent = `loop ${d < 1 ? `${(d * 1000) | 0}ms` : `${d.toFixed(1)}s`}`; + loopRef.current.hidden = false; + } else { + loopRef.current.hidden = true; + } + } + }; + update(); + return timeline.onUpdate(update); + }, [timeline]); + + const handleFileInput = useCallback((e: React.ChangeEvent) => { + const files = Array.from(e.target.files ?? []); + if (files.length) onFilesSelected(files); + }, [onFilesSelected]); + + const handleZoom = useCallback((span: number, idx: number) => { + if (span === 0) { + timeline.setView(0, timeline.duration); + } else { + const center = (timeline.viewStart + timeline.viewEnd) / 2; + timeline.setView(center - span / 2, center + span / 2); + } + setActiveZoom(idx); + }, [timeline]); + + return ( +
+
+ + {wavStatus} + {vadStatus} + {asrStatus} +
+
+ + + + {ZOOM_PRESETS.map((z, i) => ( + + ))} + + view +
+
+ + +
+
+ ); +}); diff --git a/training/smart-turn-zh/viewer/src/components/VADTrack.tsx b/training/smart-turn-zh/viewer/src/components/VADTrack.tsx new file mode 100644 index 0000000..913f4af --- /dev/null +++ b/training/smart-turn-zh/viewer/src/components/VADTrack.tsx @@ -0,0 +1,126 @@ +import { useRef, useEffect, useCallback, useMemo, memo } from 'react'; +import type { Timeline } from '../lib/timeline'; +import { VADRenderer, findVADBlocks, nextVADBlock, prevVADBlock } from '../lib/vad'; +import { useCanvasInteraction } from '../hooks/useCanvasInteraction'; + +interface VADTrackProps { + timeline: Timeline; + vadProbs: Float32Array | null; + entryThreshold: number; + exitThreshold: number; + onEntryChange: (v: number) => void; + onExitChange: (v: number) => void; + onSeek: (t: number) => void; +} + +export const VADTrack = memo(function VADTrack({ + timeline, vadProbs, entryThreshold, exitThreshold, + onEntryChange, onExitChange, onSeek, +}: VADTrackProps) { + const canvasRef = useRef(null); + const rendererRef = useRef(null); + + const entryRef = useRef(entryThreshold); + const exitRef = useRef(exitThreshold); + entryRef.current = entryThreshold; + exitRef.current = exitThreshold; + + const draw = useCallback(() => { + const r = rendererRef.current; + if (!r) return; + r.entryThreshold = entryRef.current; + r.exitThreshold = exitRef.current; + r.render(); + }, []); + + // Initialize renderer + useEffect(() => { + const canvas = canvasRef.current; + if (!canvas) return; + + const r = new VADRenderer(canvas, timeline); + rendererRef.current = r; + r.resize(); + + const unsub = timeline.onUpdate(draw); + const ro = new ResizeObserver(() => { r.resize(); draw(); }); + ro.observe(canvas.parentElement!); + + return () => { unsub(); ro.disconnect(); rendererRef.current = null; }; + }, [timeline, draw]); + + // Load VAD data when probs change + useEffect(() => { + if (vadProbs && rendererRef.current) { + rendererRef.current.probs = vadProbs; + draw(); + } + }, [vadProbs, 
draw]); + + // Redraw on threshold changes + useEffect(draw, [entryThreshold, exitThreshold, draw]); + + useCanvasInteraction(canvasRef, timeline, onSeek); + + const blocks = useMemo( + () => vadProbs ? findVADBlocks(vadProbs, 0.032, entryThreshold, exitThreshold) : [], + [vadProbs, entryThreshold, exitThreshold], + ); + + const handlePrev = useCallback(() => { + const b = prevVADBlock(blocks, timeline.cursor); + if (b) { + onSeek(b.start); + if (b.start < timeline.viewStart || b.start > timeline.viewEnd) { + const span = timeline.viewEnd - timeline.viewStart; + timeline.setView(b.start - span * 0.2, b.start - span * 0.2 + span); + } + } + }, [blocks, timeline, onSeek]); + + const handleNext = useCallback(() => { + const b = nextVADBlock(blocks, timeline.cursor); + if (b) { + onSeek(b.start); + if (b.start < timeline.viewStart || b.start > timeline.viewEnd) { + const span = timeline.viewEnd - timeline.viewStart; + timeline.setView(b.start - span * 0.2, b.start - span * 0.2 + span); + } + } + }, [blocks, timeline, onSeek]); + + return ( +
+ VAD + +
+
+ + +
+ + +
+
+ ); +}); diff --git a/training/smart-turn-zh/viewer/src/components/WaveformTrack.tsx b/training/smart-turn-zh/viewer/src/components/WaveformTrack.tsx new file mode 100644 index 0000000..a90b6fc --- /dev/null +++ b/training/smart-turn-zh/viewer/src/components/WaveformTrack.tsx @@ -0,0 +1,111 @@ +import { useRef, useEffect, useCallback, memo } from 'react'; +import type { Timeline } from '../lib/timeline'; +import type { AudioStore } from '../lib/audio'; +import { WaveformRenderer, type WaveformScale } from '../lib/waveform'; +import { drawASROverlay, drawASRLabels, type Sentence } from '../lib/asr'; +import { useCanvasInteraction } from '../hooks/useCanvasInteraction'; + +interface WaveformTrackProps { + timeline: Timeline; + audio: AudioStore; + channel: number; + scale: WaveformScale; + onScaleChange: (s: WaveformScale) => void; + sentences: Sentence[]; + searchResults: number[]; + searchResultIdx: number; + onSeek: (t: number) => void; +} + +export const WaveformTrack = memo(function WaveformTrack({ + timeline, audio, channel, scale, onScaleChange, + sentences, searchResults, searchResultIdx, onSeek, +}: WaveformTrackProps) { + const canvasRef = useRef(null); + const wfRef = useRef(null); + + // Store frequently-changing data in refs for the stable draw callback + const channelRef = useRef(channel); + const scaleRef = useRef(scale); + const sentencesRef = useRef(sentences); + const resultsRef = useRef(searchResults); + const resultIdxRef = useRef(searchResultIdx); + channelRef.current = channel; + scaleRef.current = scale; + sentencesRef.current = sentences; + resultsRef.current = searchResults; + resultIdxRef.current = searchResultIdx; + + // Stable draw function — reads everything from refs + const draw = useCallback(() => { + const wf = wfRef.current; + const canvas = canvasRef.current; + if (!wf || !canvas) return; + + wf.channel = channelRef.current; + wf.scale = scaleRef.current; + wf.render(); + + // ASR overlays on top of waveform + const sents = 
sentencesRef.current; + if (sents.length) { + const dpr = devicePixelRatio; + const w = canvas.width / dpr; + const h = canvas.height / dpr; + const ctx = canvas.getContext('2d')!; + ctx.setTransform(dpr, 0, 0, dpr, 0, 0); + const results = resultsRef.current; + if (results.length) { + drawASROverlay(ctx, w, h, timeline, sents, results, resultIdxRef.current); + } + drawASRLabels(ctx, w, h, timeline, sents); + } + }, [timeline]); + + // Initialize renderer, subscribe to timeline, observe resize + useEffect(() => { + const canvas = canvasRef.current; + if (!canvas) return; + + const wf = new WaveformRenderer(canvas, timeline, audio); + wfRef.current = wf; + wf.resize(); + draw(); + + const unsub = timeline.onUpdate(draw); + const ro = new ResizeObserver(() => { wf.resize(); draw(); }); + ro.observe(canvas.parentElement!); + + return () => { + unsub(); + ro.disconnect(); + wfRef.current = null; + }; + }, [timeline, audio, draw]); + + // Redraw when display props change + useEffect(draw, [channel, scale, sentences, searchResults, searchResultIdx, draw]); + + useCanvasInteraction(canvasRef, timeline, onSeek); + + return ( +
+ Waveform +
+ + +
+ +
+ ); +}); diff --git a/training/smart-turn-zh/viewer/src/hooks/useCanvasInteraction.ts b/training/smart-turn-zh/viewer/src/hooks/useCanvasInteraction.ts new file mode 100644 index 0000000..90d0eb3 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/hooks/useCanvasInteraction.ts @@ -0,0 +1,93 @@ +import { useEffect, useRef, type RefObject } from 'react'; +import type { Timeline } from '../lib/timeline'; + +/** + * Adds wheel zoom, drag pan, and click-to-seek to a canvas element. + */ +export function useCanvasInteraction( + canvasRef: RefObject, + timeline: Timeline, + onSeek?: (time: number) => void, +) { + const onSeekRef = useRef(onSeek); + onSeekRef.current = onSeek; + + useEffect(() => { + const canvas = canvasRef.current; + if (!canvas) return; + + const onWheel = (e: WheelEvent) => { + e.preventDefault(); + if (e.metaKey || e.ctrlKey) { + const rect = canvas.getBoundingClientRect(); + const frac = (e.clientX - rect.left) / rect.width; + timeline.zoom(e.deltaY > 0 ? 1.08 : 0.93, frac); + } else { + const span = timeline.viewEnd - timeline.viewStart; + timeline.pan((e.deltaY > 0 ? 
0.02 : -0.02) * span); + } + }; + + let drag = false, startX = 0, startVS = 0, moved = false; + let selecting = false, selectAnchor = 0; + + const onMouseDown = (e: MouseEvent) => { + if (e.shiftKey) { + // Shift+drag: loop range selection + selecting = true; + const rect = canvas.getBoundingClientRect(); + selectAnchor = timeline.xToTime(e.clientX - rect.left, rect.width); + timeline.setLoop(selectAnchor, selectAnchor); + } else { + drag = true; moved = false; startX = e.clientX; startVS = timeline.viewStart; + } + }; + const onMouseMove = (e: MouseEvent) => { + if (selecting) { + const rect = canvas.getBoundingClientRect(); + const t = timeline.xToTime(e.clientX - rect.left, rect.width); + timeline.setLoop(selectAnchor, t); + return; + } + if (!drag) return; + const dx = e.clientX - startX; + if (Math.abs(dx) > 3) moved = true; + if (!moved) return; + const rect = canvas.getBoundingClientRect(); + const span = timeline.viewEnd - timeline.viewStart; + const dt = -(dx / rect.width) * span; + timeline.setView(startVS + dt, startVS + dt + span); + }; + const onMouseUp = (e: MouseEvent) => { + if (selecting) { + selecting = false; + const rect = canvas.getBoundingClientRect(); + const t = timeline.xToTime(e.clientX - rect.left, rect.width); + timeline.setLoop(selectAnchor, t); + // Discard tiny accidental selections + if (timeline.loopEnd - timeline.loopStart < 0.05) timeline.clearLoop(); + return; + } + if (!drag) return; + drag = false; + if (!moved) { + const rect = canvas.getBoundingClientRect(); + const t = timeline.xToTime(e.clientX - rect.left, rect.width); + timeline.clearLoop(); + onSeekRef.current?.(t); + } + }; + + canvas.addEventListener('wheel', onWheel, { passive: false }); + canvas.addEventListener('mousedown', onMouseDown); + window.addEventListener('mousemove', onMouseMove); + window.addEventListener('mouseup', onMouseUp); + + return () => { + canvas.removeEventListener('wheel', onWheel); + canvas.removeEventListener('mousedown', onMouseDown); + 
window.removeEventListener('mousemove', onMouseMove); + window.removeEventListener('mouseup', onMouseUp); + }; + }, [canvasRef, timeline]); +} diff --git a/training/smart-turn-zh/viewer/src/hooks/usePlayback.ts b/training/smart-turn-zh/viewer/src/hooks/usePlayback.ts new file mode 100644 index 0000000..2317a67 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/hooks/usePlayback.ts @@ -0,0 +1,130 @@ +import { useState, useCallback, useRef, useEffect } from 'react'; +import type { Timeline } from '../lib/timeline'; +import type { AudioStore } from '../lib/audio'; + +export function usePlayback(timeline: Timeline, audio: AudioStore) { + const [playing, setPlaying] = useState(false); + + const actxRef = useRef(null); + const srcRef = useRef(null); + const gainNodeRef = useRef(null); + const playBufRef = useRef(null); + const playChannelRef = useRef(-1); + const playT0Ref = useRef(0); + const playOffRef = useRef(0); + const gainValueRef = useRef(10.0); + const playingRef = useRef(false); + const rafIdRef = useRef(0); + const segmentEndRef = useRef(0); + + const stop = useCallback(() => { + playingRef.current = false; + const hadSegment = segmentEndRef.current > 0; + const segStart = playOffRef.current; + segmentEndRef.current = 0; + cancelAnimationFrame(rafIdRef.current); + + if (srcRef.current) { + srcRef.current.onended = null; + try { srcRef.current.stop(); } catch { /* already stopped */ } + srcRef.current = null; + } + if (actxRef.current) { + const elapsed = actxRef.current.currentTime - playT0Ref.current; + timeline.cursor = Math.min(playOffRef.current + elapsed, timeline.duration); + actxRef.current.close(); + actxRef.current = null; + } + // Reset cursor to loop start when a loop segment ends + if (hadSegment && timeline.hasLoop) { + timeline.cursor = segStart; + } + gainNodeRef.current = null; + setPlaying(false); + timeline.flush(); + }, [timeline]); + + const stopRef = useRef(stop); + stopRef.current = stop; + + const play = useCallback((offset: number, 
channel: number) => { + if (!audio.raw) return; + + // Stop any existing playback + if (playingRef.current) stopRef.current(); + + // Ensure buffer for channel + if (!playBufRef.current || playChannelRef.current !== channel) { + playBufRef.current = audio.createAudioBuffer(channel); + playChannelRef.current = channel; + } + const buf = playBufRef.current; + if (!buf) return; + + const actx = new AudioContext({ sampleRate: audio.sampleRate }); + const gainNode = actx.createGain(); + gainNode.gain.value = gainValueRef.current; + gainNode.connect(actx.destination); + const src = actx.createBufferSource(); + src.buffer = buf; + src.connect(gainNode); + src.start(0, offset); + src.onended = () => stopRef.current(); + + actxRef.current = actx; + srcRef.current = src; + gainNodeRef.current = gainNode; + playT0Ref.current = actx.currentTime; + playOffRef.current = offset; + playingRef.current = true; + setPlaying(true); + + /* Per-frame tick: derive the cursor from the audio clock and stop at the end of the file or of the armed segment. */ + const loop = () => { + if (!playingRef.current) return; + const a = actxRef.current; + if (!a) return; + const t = playOffRef.current + (a.currentTime - playT0Ref.current); + const endAt = segmentEndRef.current; + if (t >= timeline.duration || (endAt > 0 && t >= endAt)) { stopRef.current(); return; } + timeline.cursor = t; + // Auto-scroll when cursor exits viewport + if (t > timeline.viewEnd) { + const span = timeline.viewEnd - timeline.viewStart; + timeline.viewStart = t; + timeline.viewEnd = t + span; + } + timeline.flush(); + rafIdRef.current = requestAnimationFrame(loop); + }; + rafIdRef.current = requestAnimationFrame(loop); + }, [audio, timeline]); + + /* Play from `start` and stop automatically once the cursor reaches `end` (both in seconds on the timeline). */ + const playSegment = useCallback((start: number, end: number, channel: number) => { + /* stop() resets segmentEndRef to 0, so any running playback must be stopped BEFORE the bound is armed; otherwise play() would stop it afterwards and the segment would run to the end of the file. */ + if (playingRef.current) stopRef.current(); + segmentEndRef.current = end; + play(start, channel); + }, [play]); + + /* Remember the gain for future playbacks and apply it immediately to the live gain node, if any. */ + const setGain = useCallback((value: number) => { + gainValueRef.current = value; + if (gainNodeRef.current) gainNodeRef.current.gain.value = value; + }, []); + + /* Drop the cached playback buffer so the next play() rebuilds it from the audio store. */ + const invalidateBuffer = useCallback(() => { + playBufRef.current = null; + }, []); 
+ + // Cleanup on unmount + useEffect(() => { + return () => { + playingRef.current = false; + cancelAnimationFrame(rafIdRef.current); + if (srcRef.current) { + srcRef.current.onended = null; + try { srcRef.current.stop(); } catch { /* already stopped */ } + } + actxRef.current?.close(); + }; + }, []); + + return { playing, play, playSegment, stop, setGain, invalidateBuffer }; +} diff --git a/training/smart-turn-zh/viewer/src/lib/asr.ts b/training/smart-turn-zh/viewer/src/lib/asr.ts new file mode 100644 index 0000000..3f63d4d --- /dev/null +++ b/training/smart-turn-zh/viewer/src/lib/asr.ts @@ -0,0 +1,290 @@ +import type { Timeline } from './timeline'; + +export interface CharTiming { + char: string; + start: number; // ms, -1 for punctuation without timestamp + end: number; +} + +export interface Sentence { + text: string; + start: number; // ms + end: number; // ms + chars: CharTiming[]; +} + +interface RawSentence { + text: string; start: number; end: number; + raw_text?: string; timestamp?: number[][]; +} + +// ---- Data parsing ---- + +export function parseSentences(json: unknown[]): Sentence[] { + const sentences: Sentence[] = []; + for (const rec of json as Array<{ sentences?: RawSentence[] }>) { + if (rec.sentences) { + for (const s of rec.sentences) { + const chars = buildChars(s.text ?? '', s.raw_text ?? '', s.timestamp ?? []); + sentences.push({ text: s.text ?? 
'', start: s.start, end: s.end, chars }); + } + } + } + return sentences; +} + +/** Return the indices of the sentences whose text contains `query` (case-insensitive); an empty query yields no results. */ +export function searchSentences(sentences: Sentence[], query: string): number[] { + if (!query) return []; + const lower = query.toLowerCase(); + const results: number[] = []; + for (let i = 0; i < sentences.length; i++) { + if (sentences[i].text.toLowerCase().includes(lower)) results.push(i); + } + return results; +} + +/** Frame sentence `sentIdx` in the view, padded by 15% of its duration on each side, and move the cursor to its start. An index with no sentence is ignored. */ +export function zoomToSentence(tl: Timeline, sentences: Sentence[], sentIdx: number): void { + const s = sentences[sentIdx]; + if (!s) return; + /* Sentence times are stored in milliseconds; the timeline is driven in seconds. */ + const startSec = s.start / 1000; + const endSec = s.end / 1000; + const dur = endSec - startSec; + const pad = dur * 0.15; + tl.setView(startSec - pad, endSec + pad); + tl.setCursor(startSec); +} + +// ---- Canvas drawing ---- + +/** Shade the visible time range of every sentence listed in `results`; the entry at `currentResultIdx` (if >= 0) gets its own fill colour. `w` and `h` are the drawing width and height passed by the caller. */ +export function drawASROverlay( + ctx: CanvasRenderingContext2D, w: number, h: number, + tl: Timeline, sentences: Sentence[], + results: number[], currentResultIdx: number, +): void { + if (!results.length) return; + const cur = currentResultIdx >= 0 ? results[currentResultIdx] : -1; + for (const idx of results) { + const s = sentences[idx]; + const x1 = tl.timeToX(s.start / 1000, w); + const x2 = tl.timeToX(s.end / 1000, w); + /* Clip the highlight to the canvas and skip sentences that fall entirely outside it. */ + const left = Math.max(0, x1), right = Math.min(w, x2); + if (right <= left) continue; + ctx.fillStyle = idx === cur ? 
'rgba(255,152,0,0.3)' : 'rgba(255,235,59,0.15)'; + ctx.fillRect(left, 0, right - left, h); + } +} + +export function drawASRLabels( + ctx: CanvasRenderingContext2D, w: number, h: number, + tl: Timeline, sentences: Sentence[], +): void { + if (!sentences.length) return; + const vStart = tl.viewStart; + const vEnd = tl.viewEnd; + const vSpan = vEnd - vStart; + + const fontSize = 11; + ctx.font = `${fontSize}px monospace`; + ctx.textBaseline = 'top'; + const rowH = fontSize + 6; + const labelY = h - rowH; + + for (let si = 0; si < sentences.length; si++) { + const s = sentences[si]; + const startSec = s.start / 1000; + const endSec = s.end / 1000; + if (endSec < vStart || startSec > vEnd) continue; + + const rawX1 = ((startSec - vStart) / vSpan) * w; + const rawX2 = ((endSec - vStart) / vSpan) * w; + const x1 = Math.max(0, rawX1); + const x2 = Math.min(w, rawX2); + const regionW = x2 - x1; + + // Alternating sentence background + const sentColor = si % 2 === 0 + ? 'rgba(79, 195, 247, 0.06)' + : 'rgba(255, 183, 77, 0.06)'; + ctx.fillStyle = sentColor; + ctx.fillRect(x1, 0, regionW, labelY); + + // Per-character shading + gap highlighting when zoomed in + const perChar = s.chars.length > 0 && regionW / s.text.length > 10; + if (perChar) { + let ci = 0; + let prevEndMs = s.start; + for (const c of s.chars) { + if (c.start < 0) continue; + const cEndMs = c.end >= 0 ? c.end : c.start; + + if (c.start > prevEndMs) { + const gx1 = Math.max(0, ((prevEndMs / 1000 - vStart) / vSpan) * w); + const gx2 = Math.min(w, ((c.start / 1000 - vStart) / vSpan) * w); + if (gx2 > gx1) { + ctx.fillStyle = 'rgba(244, 67, 54, 0.12)'; + ctx.fillRect(gx1, 0, gx2 - gx1, labelY); + } + } + + const cx1 = ((c.start / 1000 - vStart) / vSpan) * w; + const cx2 = ((cEndMs / 1000 - vStart) / vSpan) * w; + const cl = Math.max(0, cx1), cr = Math.min(w, cx2); + if (cr > cl) { + ctx.fillStyle = ci % 2 === 0 + ? 
'rgba(79, 195, 247, 0.08)' + : 'rgba(255, 183, 77, 0.08)'; + ctx.fillRect(cl, 0, cr - cl, labelY); + } + prevEndMs = cEndMs; + ci++; + } + if (prevEndMs < s.end) { + const gx1 = Math.max(0, ((prevEndMs / 1000 - vStart) / vSpan) * w); + const gx2 = Math.min(w, ((endSec - vStart) / vSpan) * w); + if (gx2 > gx1) { + ctx.fillStyle = 'rgba(244, 67, 54, 0.12)'; + ctx.fillRect(gx1, 0, gx2 - gx1, labelY); + } + } + } + + // Sentence boundary lines + ctx.setLineDash([3, 3]); + ctx.lineWidth = 1; + ctx.strokeStyle = 'rgba(255, 235, 59, 0.4)'; + if (rawX1 >= 0 && rawX1 <= w) { + ctx.beginPath(); + ctx.moveTo(rawX1, 0); + ctx.lineTo(rawX1, h); + ctx.stroke(); + } + if (rawX2 >= 0 && rawX2 <= w) { + ctx.beginPath(); + ctx.moveTo(rawX2, 0); + ctx.lineTo(rawX2, h); + ctx.stroke(); + } + ctx.setLineDash([]); + + // Per-character boundary lines + if (perChar) { + ctx.strokeStyle = 'rgba(255, 255, 255, 0.12)'; + ctx.lineWidth = 0.5; + for (const c of s.chars) { + if (c.start < 0) continue; + const cx = ((c.start / 1000 - vStart) / vSpan) * w; + if (cx > x1 + 1 && cx <= x2) { + ctx.beginPath(); + ctx.moveTo(cx, 0); + ctx.lineTo(cx, h); + ctx.stroke(); + } + } + } + + if (regionW < 4) continue; + + // Text label bar at bottom + if (perChar) { + let ci = 0; + let prevEnd = s.start; + for (const c of s.chars) { + if (c.start < 0) continue; + const cEnd = c.end >= 0 ? c.end : c.start; + if (c.start > prevEnd) { + const gx1 = Math.max(x1, ((prevEnd / 1000 - vStart) / vSpan) * w); + const gx2 = Math.min(x2, ((c.start / 1000 - vStart) / vSpan) * w); + if (gx2 > gx1) { + ctx.fillStyle = 'rgba(244, 67, 54, 0.35)'; + ctx.fillRect(gx1, labelY, gx2 - gx1, rowH); + } + } + const cx1c = Math.max(x1, ((c.start / 1000 - vStart) / vSpan) * w); + const cx2c = Math.min(x2, ((cEnd / 1000 - vStart) / vSpan) * w); + if (cx2c > cx1c) { + ctx.fillStyle = ci % 2 === 0 + ? 
'rgba(30, 60, 80, 0.85)' + : 'rgba(50, 40, 30, 0.85)'; + ctx.fillRect(cx1c, labelY, cx2c - cx1c, rowH); + } + prevEnd = cEnd; + ci++; + } + if (prevEnd < s.end) { + const gx1 = Math.max(x1, ((prevEnd / 1000 - vStart) / vSpan) * w); + const gx2 = Math.min(x2, ((endSec - vStart) / vSpan) * w); + if (gx2 > gx1) { + ctx.fillStyle = 'rgba(244, 67, 54, 0.35)'; + ctx.fillRect(gx1, labelY, gx2 - gx1, rowH); + } + } + } else { + ctx.fillStyle = 'rgba(0, 0, 0, 0.45)'; + ctx.fillRect(x1, labelY, regionW, rowH); + } + + ctx.save(); + ctx.beginPath(); + ctx.rect(x1 + 1, labelY, regionW - 2, rowH); + ctx.clip(); + + ctx.fillStyle = '#ddd'; + if (perChar) { + let lastEnd = s.start; + for (const c of s.chars) { + const cStart = c.start >= 0 ? c.start : lastEnd; + const cx = ((cStart / 1000 - vStart) / vSpan) * w; + ctx.fillText(c.char, cx + 1, labelY + 3); + if (c.end >= 0) lastEnd = c.end; + } + } else { + ctx.fillText(s.text, x1 + 2, labelY + 3); + } + + ctx.restore(); + } +} + +// ---- Utilities ---- + +/** Return the set of character indices of `text` covered by any case-insensitive occurrence of `query` (overlapping occurrences included). An empty query matches nothing. */ +export function matchHlSet(text: string, query: string): Set { + const set = new Set(); + const lower = text.toLowerCase(); + const q = query.toLowerCase(); + /* indexOf('') never returns -1 (it returns the clamped start position), so without this guard the loop below would never terminate. */ + if (!q) return set; + let pos = 0; + for (;;) { + const idx = lower.indexOf(q, pos); + if (idx === -1) break; + for (let i = idx; i < idx + q.length; i++) set.add(i); + pos = idx + 1; + } + return set; +} + +/** Format seconds as `MM:SS.s`. */ +export function fmtTime(sec: number): string { + /* Round to whole tenths BEFORE splitting off minutes so that e.g. 59.96 s carries into `01:00.0` instead of printing `00:60.0`. */ + const tenths = Math.round(sec * 10); + const m = Math.floor(tenths / 600); + const s = (tenths % 600) / 10; + return `${String(m).padStart(2, '0')}:${s.toFixed(1).padStart(4, '0')}`; +} + +/** Format milliseconds as `MM:SS.ss`. */ +export function fmtMs(ms: number): string { + /* Same carry handling as fmtTime, at centisecond resolution. */ + const cs = Math.round(ms / 10); + const m = Math.floor(cs / 6000); + const s = (cs % 6000) / 100; + return `${String(m).padStart(2, '0')}:${s.toFixed(2).padStart(5, '0')}`; +} + +function buildChars(text: string, rawText: string, timestamps: number[][]): CharTiming[] { + const chars: CharTiming[] = []; + let ri = 0; + for (let i = 0; i < text.length; i++) { + if (ri < rawText.length && text[i] === rawText[ri]) { + const ts = 
timestamps[ri]; + chars.push({ char: text[i], start: ts?.[0] ?? -1, end: ts?.[1] ?? -1 }); + ri++; + } else { + chars.push({ char: text[i], start: -1, end: -1 }); + } + } + return chars; +} diff --git a/training/smart-turn-zh/viewer/src/lib/audio.ts b/training/smart-turn-zh/viewer/src/lib/audio.ts new file mode 100644 index 0000000..60edb51 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/lib/audio.ts @@ -0,0 +1,168 @@ +export interface LODLevel { + min: Float32Array; + max: Float32Array; + bucketSize: number; +} + +interface WavInfo { + channels: number; + sampleRate: number; + bitsPerSample: number; + dataOffset: number; + frames: number; +} + +export class AudioStore { + info: WavInfo | null = null; + raw: Int16Array | null = null; + channelCount = 0; + sampleRate = 16000; + duration = 0; + totalFrames = 0; + + private lodCache = new Map(); + + async load(file: File) { + const buffer = await file.arrayBuffer(); + this.info = parseWavHeader(buffer); + this.raw = new Int16Array( + buffer, + this.info.dataOffset, + this.info.frames * this.info.channels, + ); + this.channelCount = this.info.channels; + this.sampleRate = this.info.sampleRate; + this.totalFrames = this.info.frames; + this.duration = this.totalFrames / this.sampleRate; + this.lodCache.clear(); + } + + /** Read a single sample as float [-1, 1]. channel=-1 for merged average. 
*/ + sample(frame: number, channel: number): number { + /* Out-of-range frames (and a store with nothing loaded) read as silence. */ + if (!this.raw || frame < 0 || frame >= this.totalFrames) return 0; + const nch = this.info!.channels; + /* Samples are interleaved int16: frame * nch + c. Dividing by 32768 (2^15) maps them to [-1, 1). */ + if (channel === -1) { + let sum = 0; + for (let c = 0; c < nch; c++) sum += this.raw[frame * nch + c]; + return sum / nch / 32768; + } + return this.raw[frame * nch + channel] / 32768; + } + + /** Return the min/max level-of-detail data for `channel` (-1 = merged), building it on first use and caching it per channel. */ + getLOD(channel: number): LODLevel[] { + const cached = this.lodCache.get(channel); + if (cached) return cached; + const levels = this.buildLOD(channel); + this.lodCache.set(channel, levels); + return levels; + } + + /** Create a mono AudioBuffer for playback of one channel (or merged). */ + createAudioBuffer(channel: number): AudioBuffer | null { + if (!this.raw || !this.info) return null; + /* The OfflineAudioContext is used only as a factory for the buffer; nothing is rendered through it. */ + const ctx = new OfflineAudioContext(1, this.totalFrames, this.sampleRate); + const buf = ctx.createBuffer(1, this.totalFrames, this.sampleRate); + const out = buf.getChannelData(0); + const raw = this.raw; + const nch = this.info.channels; + + if (channel === -1) { + /* Merged playback: average all channels of each frame. */ + for (let i = 0; i < this.totalFrames; i++) { + let sum = 0; + for (let c = 0; c < nch; c++) sum += raw[i * nch + c]; + out[i] = sum / nch / 32768; + } + } else { + for (let i = 0; i < this.totalFrames; i++) { + out[i] = raw[i * nch + channel] / 32768; + } + } + return buf; + } + + /** Build the LOD levels for `channel` (-1 = merged): level 0 stores the min and max sample of every 256-frame bucket. */ + private buildLOD(channel: number): LODLevel[] { + if (!this.raw || !this.info) return []; + const raw = this.raw; + const frames = this.totalFrames; + const nch = this.info.channels; + const levels: LODLevel[] = []; + + const B0 = 256; + const n0 = Math.ceil(frames / B0); + const min0 = new Float32Array(n0); + const max0 = new Float32Array(n0); + + for (let b = 0; b < n0; b++) { + const start = b * B0; + /* The last bucket may be shorter than B0. */ + const end = Math.min(start + B0, frames); + let lo = Infinity, hi = -Infinity; + + if (channel === -1) { + for (let i = start; i < end; i++) { + let sum = 0; + for (let c = 0; c < nch; c++) sum += raw[i * nch + c]; + const v = sum / nch / 32768; + if (v < lo) lo = v; + if (v > hi) hi = v; + } 
+ } else { + for (let i = start; i < end; i++) { + const v = raw[i * nch + channel] / 32768; + if (v < lo) lo = v; + if (v > hi) hi = v; + } + } + min0[b] = lo; + max0[b] = hi; + } + levels.push({ min: min0, max: max0, bucketSize: B0 }); + + while (levels[levels.length - 1].min.length > 512) { + const prev = levels[levels.length - 1]; + const F = 4; + const count = Math.ceil(prev.min.length / F); + const mn = new Float32Array(count); + const mx = new Float32Array(count); + for (let b = 0; b < count; b++) { + const s = b * F; + const e = Math.min(s + F, prev.min.length); + let lo = Infinity, hi = -Infinity; + for (let j = s; j < e; j++) { + if (prev.min[j] < lo) lo = prev.min[j]; + if (prev.max[j] > hi) hi = prev.max[j]; + } + mn[b] = lo; + mx[b] = hi; + } + levels.push({ min: mn, max: mx, bucketSize: prev.bucketSize * F }); + } + + return levels; + } +} + +/** + * Walk the chunk list of a WAV buffer and describe its sample data. + * + * The format fields are read from the `fmt ` chunk wherever it sits, and the + * frame count is limited to the bytes actually present in the buffer. + * + * @param buffer Entire file contents. + * @returns Channel count, sample rate, bit depth, byte offset of the first sample and number of whole frames. + * @throws Error if no `data` chunk is found, or if it is not preceded by a usable `fmt ` chunk. + */ +function parseWavHeader(buffer: ArrayBuffer): WavInfo { + const view = new DataView(buffer); + let channels = 0; + let sampleRate = 0; + let bitsPerSample = 0; + + /* Chunks start after the 12-byte file header; each has a 4-byte id and a little-endian uint32 size. */ + let offset = 12; + while (offset < buffer.byteLength - 8) { + const id = String.fromCharCode( + view.getUint8(offset), + view.getUint8(offset + 1), + view.getUint8(offset + 2), + view.getUint8(offset + 3), + ); + const size = view.getUint32(offset + 4, true); + const body = offset + 8; + if (id === 'fmt ' && body + 16 <= buffer.byteLength) { + /* With `fmt ` as the first chunk these are the offsets 22, 24 and 34 that used to be hard-coded; reading them relative to the chunk also handles files that put other chunks first. */ + channels = view.getUint16(body + 2, true); + sampleRate = view.getUint32(body + 4, true); + bitsPerSample = view.getUint16(body + 14, true); + } else if (id === 'data') { + const frameBytes = channels * (bitsPerSample / 8); + if (!(frameBytes > 0)) { + throw new Error('WAV data chunk is not preceded by a valid fmt chunk'); + } + /* Streamed or truncated files can declare more data than the buffer holds; trusting `size` would make the typed-array view over the samples throw. */ + const available = Math.min(size, buffer.byteLength - body); + /* Only whole frames are usable. */ + const frames = Math.floor(available / frameBytes); + return { channels, sampleRate, bitsPerSample, dataOffset: body, frames }; + } + offset += 8 + size; + /* Chunks are padded to even offsets. */ + if (offset % 2 !== 0) offset++; + } + throw new Error('No data chunk found in WAV file'); +} diff --git a/training/smart-turn-zh/viewer/src/lib/fft-worker.ts b/training/smart-turn-zh/viewer/src/lib/fft-worker.ts new file mode 100644 index 0000000..d0f2a32 --- /dev/null +++ b/training/smart-turn-zh/viewer/src/lib/fft-worker.ts @@ -0,0 +1,96 @@ +/** Web Worker — computes STFT for a single spectrogram tile. 
/**
 * In-place radix-2 Cooley–Tukey FFT (forward transform, unnormalised).
 *
 * On return `re`/`im` hold the real and imaginary parts of the spectrum.
 *
 * @param re Real parts of the input; overwritten with the output.
 * @param im Imaginary parts of the input; overwritten with the output.
 * @throws RangeError if the two arrays differ in length or the length is not
 *   a power of two (the radix-2 butterflies would otherwise silently produce
 *   a meaningless result).
 */
function fft(re: Float64Array, im: Float64Array) {
  const n = re.length;

  if (im.length !== n) {
    throw new RangeError('fft: re and im must have the same length');
  }
  // n & (n - 1) clears the lowest set bit; the result is 0 only for powers
  // of two (and for n === 0, which is a harmless no-op below).
  if ((n & (n - 1)) !== 0) {
    throw new RangeError('fft: length must be a power of two');
  }

  // Bit-reversal permutation
  for (let i = 1, j = 0; i < n; i++) {
    let bit = n >> 1;
    while (j & bit) {
      j ^= bit;
      bit >>= 1;
    }
    j ^= bit;
    if (i < j) {
      let t = re[i];
      re[i] = re[j];
      re[j] = t;
      t = im[i];
      im[i] = im[j];
      im[j] = t;
    }
  }

  // Butterfly stages: combine sub-transforms of size len/2 into size len.
  for (let len = 2; len <= n; len <<= 1) {
    const half = len >> 1;
    const ang = (-2 * Math.PI) / len;
    const wR = Math.cos(ang);
    const wI = Math.sin(ang);
    for (let i = 0; i < n; i += len) {
      // (cR, cI) is the running twiddle factor w^j, advanced by one complex
      // multiplication per butterfly.
      let cR = 1,
        cI = 0;
      for (let j = 0; j < half; j++) {
        const a = i + j,
          b = a + half;
        const tR = cR * re[b] - cI * im[b];
        const tI = cR * im[b] + cI * re[b];
        re[b] = re[a] - tR;
        im[b] = im[a] - tI;
        re[a] += tR;
        im[a] += tI;
        const nR = cR * wR - cI * wI;
        cI = cR * wI + cI * wR;
        cR = nR;
      }
    }
  }
}
/**
 * Match the canvas backing-store size to its on-screen CSS size multiplied
 * by the device pixel ratio.
 */
resize() {
  const { width, height } = this.canvas.getBoundingClientRect();
  const scale = devicePixelRatio;
  this.canvas.width = width * scale;
  this.canvas.height = height * scale;
}
/**
 * Copy frames [start, end) of the loaded audio into a new mono Float32Array,
 * scaling each sample by 1/32768.
 *
 * When `this.channel` is -1 every frame is the mean of all channels;
 * otherwise only the selected channel is read.
 */
private extractSamples(start: number, end: number): Float32Array {
  const pcm = this.audio.raw!;
  const stride = this.audio.channelCount; // interleaved samples per frame
  const selected = this.channel;
  const count = end - start;
  const mono = new Float32Array(count);

  if (selected !== -1) {
    for (let n = 0; n < count; n++) {
      mono[n] = pcm[(start + n) * stride + selected] / 32768;
    }
    return mono;
  }

  for (let n = 0; n < count; n++) {
    let acc = 0;
    for (let c = 0; c < stride; c++) acc += pcm[(start + n) * stride + c];
    mono[n] = acc / stride / 32768;
  }
  return mono;
}
/**
 * Turn a frames × freqBins grid of dB magnitudes into an RGBA image.
 *
 * Each output row is mapped to a (possibly fractional) FFT bin — identity for
 * 'linear', mel-spaced for 'mel' — and the two neighbouring bins are linearly
 * interpolated. The dB value is normalised against dbMin/dbMax, clamped to
 * [0, 1] and looked up in CMAP. Rows are flipped so low frequencies end up at
 * the bottom of the image.
 */
private buildImageData(
  magnitudes: Float32Array, frames: number, freqBins: number, scale: FreqScale,
): ImageData {
  const image = new ImageData(frames, freqBins);
  const rgba = image.data;
  const floorDb = this.dbMin;
  const spanDb = this.dbMax - floorDb;
  const nyquist = this.audio.sampleRate / 2;
  const topBin = freqBins - 1;
  const useMel = scale === 'mel';
  const melTop = useMel ? hzToMel(nyquist) : 0;

  for (let row = 0; row < freqBins; row++) {
    // Fractional source bin for this output row.
    const bin = useMel
      ? (melToHz((row / topBin) * melTop) / nyquist) * topBin
      : row;
    const lower = Math.floor(bin);
    const upper = Math.min(lower + 1, topBin);
    const weight = bin - lower;

    // Byte offset of the first pixel of the (vertically flipped) output row.
    let dst = (topBin - row) * frames * 4;

    for (let f = 0, src = 0; f < frames; f++, src += freqBins, dst += 4) {
      const db =
        magnitudes[src + lower] * (1 - weight) + magnitudes[src + upper] * weight;
      const level = Math.max(0, Math.min(1, (db - floorDb) / spanDb));
      const c = Math.round(level * 255) * 3;
      rgba[dst] = CMAP[c];
      rgba[dst + 1] = CMAP[c + 1];
      rgba[dst + 2] = CMAP[c + 2];
      rgba[dst + 3] = 255;
    }
  }

  return image;
}
/**
 * Shared view state for all tracks: total duration, the visible time window,
 * the playback cursor and an optional loop range. All times are in seconds.
 *
 * Every mutating method notifies the listeners registered via `onUpdate`.
 */
export class Timeline {
  duration = 0;
  viewStart = 0;
  viewEnd = 1;
  cursor = 0;
  sampleRate = 16000;

  /** Loop range (seconds). Both 0 = no loop. */
  loopStart = 0;
  loopEnd = 0;

  private listeners: Array<() => void> = [];

  /** Set the loop range; the two bounds may be given in either order. */
  setLoop(start: number, end: number) {
    this.loopStart = Math.min(start, end);
    this.loopEnd = Math.max(start, end);
    this.emit();
  }

  /** Remove the loop range. */
  clearLoop() {
    this.loopStart = 0;
    this.loopEnd = 0;
    this.emit();
  }

  /** True when a non-empty loop range is set. */
  get hasLoop() {
    return this.loopEnd > this.loopStart;
  }

  /** Set the total duration and reset the view to show all of it. */
  setDuration(d: number) {
    this.duration = d;
    this.viewStart = 0;
    this.viewEnd = d;
    this.emit();
  }

  /**
   * Set the visible window. The span is widened to at least 5 ms around its
   * midpoint, then the window is shifted (not shrunk) to fit inside
   * [0, duration]; if it is longer than the duration it is clipped.
   */
  setView(start: number, end: number) {
    const minSpan = 0.005;
    let s = start, e = end;
    if (e - s < minSpan) {
      const mid = (s + e) / 2;
      s = mid - minSpan / 2;
      e = mid + minSpan / 2;
    }
    if (s < 0) { e -= s; s = 0; }
    if (e > this.duration) { s -= e - this.duration; e = this.duration; }
    if (s < 0) s = 0;
    this.viewStart = s;
    this.viewEnd = e;
    this.emit();
  }

  /** Move the cursor, clamped to [0, duration]. */
  setCursor(t: number) {
    this.cursor = Math.max(0, Math.min(this.duration, t));
    this.emit();
  }

  /**
   * Scale the visible span by `factor` while keeping the time located at
   * `anchorFrac` (0 = left edge, 1 = right edge) at the same screen position.
   */
  zoom(factor: number, anchorFrac: number) {
    const span = this.viewEnd - this.viewStart;
    const anchor = this.viewStart + span * anchorFrac;
    const newSpan = span * factor;
    this.setView(anchor - newSpan * anchorFrac, anchor - newSpan * anchorFrac + newSpan);
  }

  /** Shift the visible window by `deltaSec` seconds. */
  pan(deltaSec: number) {
    this.setView(this.viewStart + deltaSec, this.viewEnd + deltaSec);
  }

  /** Convert a time to an x coordinate for a track `width` units wide. */
  timeToX(t: number, width: number): number {
    return ((t - this.viewStart) / (this.viewEnd - this.viewStart)) * width;
  }

  /** Inverse of `timeToX`. */
  xToTime(x: number, width: number): number {
    return this.viewStart + (x / width) * (this.viewEnd - this.viewStart);
  }

  /** Fire listeners without changing state. Used by the playback loop. */
  flush() {
    this.emit();
  }

  /**
   * Register a change listener.
   * @returns A function that removes the listener again.
   */
  onUpdate(fn: () => void): () => void {
    this.listeners.push(fn);
    return () => {
      const idx = this.listeners.indexOf(fn);
      if (idx >= 0) this.listeners.splice(idx, 1);
    };
  }

  private emit() {
    // Iterate over a snapshot: a listener that unsubscribes (itself or
    // another) while being notified splices the live array, which would
    // otherwise make the iteration skip the following listener.
    for (const fn of [...this.listeners]) fn();
  }
}
private canvas: HTMLCanvasElement, + private tl: Timeline, + ) { + this.ctx = canvas.getContext('2d')!; + } + + load(buffer: ArrayBuffer) { + this.probs = parseNpy(buffer); + } + + resize() { + const dpr = devicePixelRatio; + const rect = this.canvas.getBoundingClientRect(); + this.canvas.width = rect.width * dpr; + this.canvas.height = rect.height * dpr; + } + + render() { + if (!this.probs) return; + const { ctx, canvas, tl, probs, frameSec } = this; + const dpr = devicePixelRatio; + const w = canvas.width / dpr; + const h = canvas.height / dpr; + + ctx.setTransform(dpr, 0, 0, dpr, 0, 0); + ctx.clearRect(0, 0, w, h); + + const vStart = tl.viewStart; + const vSpan = tl.viewEnd - vStart; + const startFrame = Math.max(0, Math.floor(vStart / frameSec) - 1); + const endFrame = Math.min(probs.length, Math.ceil(tl.viewEnd / frameSec) + 1); + + let active = false; + for (let i = 0; i < startFrame; i++) { + if (!active && probs[i] >= this.entryThreshold) active = true; + else if (active && probs[i] <= this.exitThreshold) active = false; + } + + let regStart = startFrame; + let regActive = active; + + const fillRegion = (from: number, to: number, isActive: boolean) => { + const x1 = ((from * frameSec - vStart) / vSpan) * w; + const x2 = ((to * frameSec - vStart) / vSpan) * w; + const left = Math.max(0, x1), right = Math.min(w, x2); + if (right <= left) return; + ctx.fillStyle = isActive ? 
/**
 * Decode a NumPy `.npy` file containing floating-point data into a
 * Float32Array (elements in file order; the array shape is ignored).
 *
 * Supported dtypes are little-endian float32 (`<f4`) and float64 (`<f8`,
 * converted element-wise). For aligned float32 data the result is a view
 * onto `buf`; otherwise it is a copy.
 *
 * @throws Error if the magic string is missing, the header is truncated, or
 *   the dtype is not one of the supported ones.
 */
export function parseNpy(buf: ArrayBuffer): Float32Array {
  const bytes = new Uint8Array(buf);

  // Every .npy file starts with the six bytes 0x93 'N' 'U' 'M' 'P' 'Y'.
  const magic = [0x93, 0x4e, 0x55, 0x4d, 0x50, 0x59];
  if (bytes.length < 10 || magic.some((b, i) => bytes[i] !== b)) {
    throw new Error('Not a .npy file (bad magic)');
  }

  const view = new DataView(buf);
  const major = bytes[6];
  let headerStart: number, headerLen: number;
  if (major <= 1) {
    // Format 1.x: 2-byte little-endian header length.
    headerLen = view.getUint16(8, true);
    headerStart = 10;
  } else {
    // Format 2.x and later: 4-byte little-endian header length.
    if (bytes.length < 12) throw new Error('Truncated .npy header');
    headerLen = view.getUint32(8, true);
    headerStart = 12;
  }
  const dataOffset = headerStart + headerLen;
  if (dataOffset > bytes.length) throw new Error('Truncated .npy header');

  // The header is a Python dict literal; pull out the dtype descriptor so
  // that non-float32 data is not silently reinterpreted as float32.
  let header = '';
  for (let i = headerStart; i < dataOffset; i++) header += String.fromCharCode(bytes[i]);
  const match = /'descr'\s*:\s*'([^']*)'/.exec(header);
  // No descr found: keep the historical behaviour and assume float32.
  const descr = match ? match[1] : '<f4';

  const remaining = bytes.length - dataOffset;
  switch (descr) {
    case '<f4': {
      const count = Math.floor(remaining / 4);
      // A typed-array view must start on a multiple of its element size;
      // copy the payload when the header length breaks that alignment.
      return dataOffset % 4 === 0
        ? new Float32Array(buf, dataOffset, count)
        : new Float32Array(buf.slice(dataOffset, dataOffset + count * 4));
    }
    case '<f8': {
      const count = Math.floor(remaining / 8);
      const f64 = new Float64Array(buf.slice(dataOffset, dataOffset + count * 8));
      return new Float32Array(f64); // element-wise narrowing conversion
    }
    default:
      throw new Error(`Unsupported .npy dtype '${descr}' (expected '<f4' or '<f8')`);
  }
}
/**
 * Map a sample value to a canvas y coordinate around the centre line `mid`,
 * with `amp` pixels representing full scale.
 *
 * In 'linear' scale the value is used directly. In 'dB' scale the magnitude
 * is converted to decibels, mapped so that -60 dB (or quieter) lands on the
 * centre line and 0 dB at full deflection, and the sign of `v` selects the
 * side.
 */
private mapY(v: number, mid: number, amp: number): number {
  if (this.scale !== 'dB') return mid - v * amp;

  const magnitude = Math.abs(v);
  const db = magnitude > 1e-6 ? 20 * Math.log10(magnitude) : -60;
  const norm = Math.max(0, (db + 60) / 60);
  const direction = v < 0 ? -1 : 1;
  return mid - direction * norm * amp;
}
/**
 * Draw the waveform as a filled min/max envelope computed directly from the
 * samples: for every pixel column the lowest and highest sample in that
 * column's sample range are found, then the polygon "upper edge left→right,
 * lower edge right→left" is filled.
 *
 * @param w  Track width in CSS pixels.
 * @param h  Track height in CSS pixels.
 * @param s0 First visible sample index.
 * @param s1 Sample index just past the visible range.
 */
private drawDirect(w: number, h: number, s0: number, s1: number) {
  const { ctx, audio, channel } = this;
  const mid = h / 2;
  const amp = mid * 0.95;
  const span = s1 - s0;

  // Pass 1: per-column extrema.
  const lows = new Float32Array(w);
  const highs = new Float32Array(w);
  for (let col = 0; col < w; col++) {
    const from = s0 + Math.floor((col / w) * span);
    const to = s0 + Math.floor(((col + 1) / w) * span);
    let lo = Infinity;
    let hi = -Infinity;
    for (let s = from; s < to; s++) {
      const v = audio.sample(s, channel);
      if (v < lo) lo = v;
      if (v > hi) hi = v;
    }
    // A column that covers no samples keeps the ±Infinity sentinels; draw it
    // on the centre line instead.
    lows[col] = isFinite(lo) ? lo : 0;
    highs[col] = isFinite(hi) ? hi : 0;
  }

  // Pass 2: trace and fill the envelope.
  ctx.fillStyle = '#4fc3f7';
  ctx.beginPath();
  for (let col = 0; col < w; col++) {
    const y = this.mapY(highs[col], mid, amp);
    if (col === 0) ctx.moveTo(col, y);
    else ctx.lineTo(col, y);
  }
  for (let col = w - 1; col >= 0; col--) {
    ctx.lineTo(col, this.mapY(lows[col], mid, amp));
  }
  ctx.closePath();
  ctx.fill();
}
import { createRoot } from 'react-dom/client';
import { App } from './App';
import './style.css';

// Mount the viewer into the #root element. `render` must be given the
// element to show; calling it with no argument renders nothing and leaves
// the imported `App` component unused.
createRoot(document.getElementById('root')!).render(<App />);
overflow: hidden; + height: 100vh; + -webkit-font-smoothing: antialiased; +} + +#app { + display: flex; + flex-direction: column; + height: 100vh; +} + +/* ---- Toolbar ---- */ + +#toolbar { + display: flex; + align-items: center; + justify-content: space-between; + padding: 6px 14px; + background: var(--bg-surface); + border-bottom: 1px solid var(--border); + gap: 12px; + flex-shrink: 0; +} + +#file-controls { + display: flex; + gap: 6px; +} + +.file-btn { + display: inline-flex; + align-items: center; + gap: 4px; + padding: 5px 10px; + background: var(--bg-elevated); + border: 1px solid var(--border); + border-radius: var(--radius-md); + cursor: pointer; + font-size: 12px; + color: var(--text-muted); + user-select: none; + transition: all var(--transition); +} + +.file-btn:hover { + background: var(--border); + color: var(--text); + border-color: var(--border-focus); +} + +.status { + color: var(--text-faint); + font-size: 11px; + min-width: 12px; + text-align: center; +} + +#playback-controls { + display: flex; + align-items: center; + gap: 8px; +} + +select, +button, +input[type='number'] { + background: var(--bg-elevated); + color: var(--text); + border: 1px solid var(--border); + border-radius: var(--radius-md); + padding: 5px 10px; + font-size: 12px; + cursor: pointer; + transition: all var(--transition); + outline: none; +} + +select:hover, +button:hover:not(:disabled), +input[type='number']:hover { + border-color: var(--border-focus); +} + +select:focus, +input[type='number']:focus { + border-color: var(--accent); + box-shadow: 0 0 0 2px var(--accent-muted); +} + +button:hover:not(:disabled) { + background: var(--border); + color: #fff; +} + +button:active:not(:disabled) { + transform: scale(0.97); +} + +button:disabled { + opacity: 0.35; + cursor: not-allowed; +} + +#zoom-controls { + display: flex; + align-items: center; + gap: 2px; +} + +.zoom-sep { + width: 1px; + height: 16px; + background: var(--border); + margin: 0 4px; +} + +.zoom-btn { + padding: 
4px 8px !important; + font-size: 11px !important; + min-width: 0 !important; + border-radius: var(--radius-sm) !important; +} + +.zoom-btn.active { + background: var(--accent-muted) !important; + color: var(--accent); + border-color: var(--accent) !important; +} + +#gain-label { + display: flex; + align-items: center; + gap: 4px; + font-size: 11px; + color: var(--text-muted); +} + +#gain-slider { + width: 60px; + cursor: pointer; +} + +#gain-value { + font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace; + font-size: 11px; + min-width: 32px; + color: var(--text-muted); +} + +#time-display { + font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace; + font-size: 13px; + color: var(--text-muted); + min-width: 120px; +} + +#view-span { + font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace; + font-size: 11px; + color: var(--text-faint); + min-width: 40px; +} + +#loop-span { + font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace; + font-size: 11px; + color: rgb(255, 152, 0); + margin-left: 8px; +} + +/* ---- Main area (tracks + ASR side panel) ---- */ + +#main-area { + flex: 1; + display: flex; + min-height: 0; +} + +#tracks-column { + flex: 1; + display: flex; + flex-direction: column; + min-width: 0; +} + +/* ---- Tracks ---- */ + +#tracks { + flex: 1; + display: flex; + flex-direction: column; + min-height: 0; + background: var(--bg-base); +} + +.track { + position: relative; + flex: 1; + min-height: 0; + overflow: hidden; +} + +.track-label { + position: absolute; + top: 6px; + left: 10px; + font-size: 10px; + font-weight: 500; + letter-spacing: 0.04em; + text-transform: uppercase; + color: var(--text-faint); + z-index: 1; + pointer-events: none; +} + +.track canvas { + width: 100%; + height: 100%; + display: block; + cursor: crosshair; +} + +#vad-track { + flex: 0.5; +} + +#spectrogram-track { + flex: 1; +} + +.scale-toggle { + position: absolute; + top: 4px; + right: 8px; + display: flex; + gap: 2px; + z-index: 1; +} + 
+.scale-btn { /* dimmed to opacity 0.4 by default; :hover raises it to 0.7 and .active to 1 */ + padding: 3px 8px !important; /* NOTE(review): the !important flags presumably override a shared button rule outside this chunk - confirm */ + font-size: 10px !important; + border-radius: var(--radius-sm) !important; + opacity: 0.4; + transition: all var(--transition); +} + +.scale-btn:hover { + opacity: 0.7; +} + +.scale-btn.active { + opacity: 1; + background: var(--accent-muted) !important; + color: var(--accent); + border-color: var(--accent) !important; +} + +#vad-thresholds { /* absolutely positioned 4px from the top and 8px from the right of its containing block */ + position: absolute; + top: 4px; + right: 8px; + display: flex; + align-items: center; + gap: 8px; + font-size: 10px; + z-index: 1; +} + +.vad-nav { + display: flex; + gap: 2px; +} + +.vad-nav-btn { + padding: 2px 6px !important; + font-size: 10px !important; + min-width: 0 !important; /* NOTE(review): presumably cancels a min-width set by a generic button rule - confirm */ + border-radius: var(--radius-sm) !important; + line-height: 1; +} + +#vad-thresholds label { + display: flex; + align-items: center; + gap: 4px; + color: var(--text-muted); +} + +#vad-thresholds input[type='number'] { + width: 50px; + padding: 2px 4px; + font-size: 10px; +} + +/* ---- Minimap: full-width, 30px-tall canvas strip with a pointer cursor ---- */ + +#minimap-canvas { + width: 100%; + height: 30px; + display: block; + cursor: pointer; + background: var(--bg-base); + border-top: 1px solid var(--border-subtle); + flex-shrink: 0; +} + +/* ---- ASR Panel: fixed 320px-wide flex column; search and transcript rules follow ---- */ + +#asr-panel { + display: flex; + flex-direction: column; + width: 320px; + flex-shrink: 0; + background: var(--bg-surface); +} + +#search-bar { + display: flex; + align-items: center; + flex-wrap: wrap; + gap: 6px; + padding: 8px 12px; + background: var(--bg-surface); + border-bottom: 1px solid var(--border); + flex-shrink: 0; +} + +#search-input { + flex: 1; + background: var(--bg-input); + color: var(--text); + border: 1px solid var(--border); + border-radius: var(--radius-md); + padding: 6px 10px; + font-size: 12px; + outline: none; /* focus indication is supplied by the #search-input:focus rule (accent border and 2px box-shadow ring) */ + transition: all var(--transition); +} + +#search-input::placeholder { + color: var(--text-faint); +} + +#search-input:focus { + border-color: var(--accent); + box-shadow: 0 0 0 2px var(--accent-muted); +} + +#search-count { + font-size: 11px; + color: 
var(--text-faint); + font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace; + min-width: 50px; + text-align: center; +} + +#transcript-list { /* flex: 1 takes the remaining space of the flex parent; overflow-y: auto scrolls vertically on overflow */ + flex: 1; + overflow-y: auto; + padding: 4px 0; +} + +.sentence { + padding: 6px 12px; + cursor: pointer; + display: flex; + align-items: baseline; + gap: 8px; + line-height: 1.6; + transition: background var(--transition); + border-left: 2px solid transparent; /* reserves the 2px bar so .active, .match and .current only recolour it */ +} + +.sentence:hover { + background: var(--accent-muted); +} + +.sentence.active { + background: var(--accent-muted); + border-left-color: var(--accent); +} + +.sentence.match { /* same specificity as .sentence.active but declared later, so .match and .current win when the classes are combined */ + background: var(--highlight-match); + border-left-color: var(--warning); +} + +.sentence.current { + background: var(--highlight-current); + border-left-color: var(--warning); +} + +.sentence .time { + color: var(--text-faint); + font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace; + font-size: 11px; + flex-shrink: 0; + min-width: 55px; + padding: 2px 6px; + position: relative; /* positioning context for the absolutely positioned ::after overlay */ + cursor: pointer; + border-radius: 4px; + transition: background 0.15s; + text-align: center; +} + +.sentence .time:hover { + background: var(--accent-muted); +} + +.sentence .time::after { /* overlay covering the timestamp (inset: 0); invisible (opacity 0) until .time is hovered */ + content: '\25B6'; /* U+25B6 BLACK RIGHT-POINTING TRIANGLE */ + position: absolute; + inset: 0; + display: flex; + align-items: center; + justify-content: center; + font-size: 14px; + color: var(--accent); + background: var(--accent-muted); + border-radius: 4px; + opacity: 0; + transform: scale(0.7); + transition: opacity 0.15s, transform 0.15s; + pointer-events: none; /* the overlay itself is never a pointer target */ +} + +.sentence .time:hover::after { + opacity: 1; + transform: scale(1); +} + +.sentence .time:active::after { + transform: scale(0.85); +} + +.sentence .txt { + word-break: break-all; + color: var(--text); +} + +.sentence .txt .char { + cursor: pointer; + border-radius: 2px; + transition: background 0.1s; +} + +.sentence .txt .char:hover { + background: var(--accent-muted); +} + +.sentence .txt .char-active { + background: rgba(251, 191, 36, 0.5) !important; /* !important because .sentence .txt .char:hover is more specific and would otherwise win on a hovered element carrying both classes */ + color: #fff; + 
border-radius: 2px; +} + +.sentence .txt .search-hl { + background: rgba(251, 191, 36, 0.3); + border-radius: 2px; +} + +/* ---- Resize handles: .resize-row is a 4px-tall row-resize bar, .resize-col a 4px-wide col-resize bar ---- */ + +.resize-handle { + flex-shrink: 0; + background: var(--border-subtle); + z-index: 2; + transition: background var(--transition); +} + +.resize-handle:hover { + background: var(--accent); +} + +.resize-handle:active { + background: var(--accent-hover); +} + +.resize-row { + height: 4px; + cursor: row-resize; +} + +.resize-col { + width: 4px; + cursor: col-resize; +} + +/* ---- Drop overlay: fixed, viewport-covering layer (inset: 0 with an 8px margin) that ignores pointer events ---- */ + +#drop-overlay { + position: fixed; + inset: 0; + background: rgba(91, 156, 246, 0.1); + backdrop-filter: blur(4px); + display: flex; + align-items: center; + justify-content: center; + font-size: 20px; + font-weight: 500; + color: var(--accent); + z-index: 100; + pointer-events: none; + border: 2px dashed var(--accent); + margin: 8px; + border-radius: var(--radius-lg); +} + +#drop-overlay[hidden] { /* explicit rule so the hidden attribute still hides the element despite display: flex on #drop-overlay */ + display: none; +} + +/* ---- Scrollbar: ::-webkit-scrollbar pseudo-elements are non-standard (WebKit/Blink only) ---- */ + +::-webkit-scrollbar { + width: 6px; +} + +::-webkit-scrollbar-track { + background: transparent; +} + +::-webkit-scrollbar-thumb { + background: var(--border); + border-radius: 3px; +} + +::-webkit-scrollbar-thumb:hover { + background: var(--text-faint); +} + +/* ---- Shortcuts dialog: dimmed full-viewport backdrop that centres the dialog box ---- */ + +.shortcuts-overlay { + position: fixed; + inset: 0; + background: rgba(0, 0, 0, 0.5); + display: flex; + align-items: center; + justify-content: center; + z-index: 200; /* higher than the z-index: 100 used by #drop-overlay */ +} + +.shortcuts-dialog { + background: var(--bg-surface); + border: 1px solid var(--border); + border-radius: var(--radius-lg); + box-shadow: var(--shadow-md); + min-width: 320px; + max-width: 400px; +} + +.shortcuts-header { + display: flex; + align-items: center; + justify-content: space-between; + padding: 12px 16px; + border-bottom: 1px solid var(--border); + font-size: 13px; + font-weight: 600; + color: var(--text); +} + +.shortcuts-close { + padding: 2px 8px !important; + font-size: 16px !important; + 
min-width: 0 !important; + border-radius: var(--radius-sm) !important; + line-height: 1; +} + +.shortcuts-body { + padding: 8px 16px 12px; +} + +.shortcut-row { + display: flex; + align-items: center; + justify-content: space-between; + padding: 5px 0; + font-size: 12px; + color: var(--text-muted); +} + +.shortcut-row kbd { /* bordered monospace chip, at least 50px wide with centred text */ + background: var(--bg-elevated); + border: 1px solid var(--border); + border-radius: var(--radius-sm); + padding: 2px 8px; + font-family: 'SF Mono', 'Fira Code', 'Cascadia Code', monospace; + font-size: 11px; + color: var(--text); + min-width: 50px; + text-align: center; +} diff --git a/training/smart-turn-zh/viewer/tsconfig.json b/training/smart-turn-zh/viewer/tsconfig.json new file mode 100644 index 0000000..d2cad1e --- /dev/null +++ b/training/smart-turn-zh/viewer/tsconfig.json @@ -0,0 +1,16 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "bundler", + "jsx": "react-jsx", + "strict": true, + "isolatedModules": true, + "esModuleInterop": true, + "lib": ["ES2022", "DOM", "DOM.Iterable"], + "outDir": "dist", + "sourceMap": true, + "skipLibCheck": true + }, + "include": ["src"] +} diff --git a/training/smart-turn-zh/viewer/vite.config.ts b/training/smart-turn-zh/viewer/vite.config.ts new file mode 100644 index 0000000..000221e --- /dev/null +++ b/training/smart-turn-zh/viewer/vite.config.ts @@ -0,0 +1,7 @@ +import { defineConfig } from 'vite'; +import react from '@vitejs/plugin-react'; + +export default defineConfig({ + plugins: [react()], + build: { outDir: 'dist' }, /* 'dist' matches the viewer/dist/ entry added to .gitignore in this diff */ +});