forked from ServeurpersoCom/omnivoice.cpp
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquantize.sh
More file actions
executable file
·37 lines (33 loc) · 1.37 KB
/
quantize.sh
File metadata and controls
executable file
·37 lines (33 loc) · 1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash
# Derive lighter GGUFs from the F32 source-of-truth produced by convert.py.
# omnivoice-base-F32.gguf -> BF16, Q8_0, Q4_K_M
# omnivoice-tokenizer-F32.gguf -> BF16, Q8_0, Q4_K_M
#
# Three variants cover the useful precision range: BF16 for max precision
# on CUDA, Q8_0 as the balanced default, Q4_K_M as the smallest variant
# that still sounds correct. Q5_K_M / Q6_K were tested and dropped: their
# size sits between Q4_K_M and Q8_0 with negligible perceptual gain.
#
# Quantization policy is centralized in should_quantize (tools/quantize.cpp):
# RVQ codebooks (quantizer.quantizers.*) and the fc / fc2 linear projections
# wrapping them stay at F32 in every variant. Nearest-neighbor lookup is
# sensitive to per-row quantization noise; even BF16 mantissa truncation
# drifts codes enough to break voice cloning. Conv weights stay at source
# dtype and are cast to F16 at load time by gf_load_conv_f16 (ARM im2col
# strict). Same policy as acestep.cpp, which keeps VAE-critical paths intact.
# Abort on the first failing command and on any use of an unset variable.
set -o errexit -o nounset

# Quantizer binary, resolved relative to the directory the script is run from.
Q='./build/quantize'
# Derive one quantized GGUF from an F32 source file.
# Globals:   Q (read) - path to the quantize binary
# Arguments: $1 - source path; the first "-F32.gguf" in it is replaced by
#                 "-<type>.gguf" to form the output path
#            $2 - target type name, passed through to the binary unchanged
# Outputs:   "[Skip] <out>" on stdout when the output file already exists
# Returns:   0 on success or skip, otherwise the failing command's exit status
quantize() {
    local src="$1" type="$2"
    local out="${src/-F32.gguf/-${type}.gguf}"
    local tmp="${out}.tmp"
    local rc=0

    if [ -f "$out" ]; then
        echo "[Skip] $out"
        return 0
    fi

    # Write to a scratch path and rename only on success. Writing straight to
    # $out would leave a truncated file behind if the tool fails or is
    # interrupted, and the existence check above would then skip it forever.
    "$Q" "$src" "$tmp" "$type" || rc=$?
    if [ "$rc" -ne 0 ]; then
        rm -f -- "$tmp"
        return "$rc"
    fi
    mv -f -- "$tmp" "$out"
}
# Build every variant for each F32 model, finishing one model before the next.
# Order per model: BF16, then Q8_0, then Q4_K_M.
for model in models/omnivoice-base-F32.gguf models/omnivoice-tokenizer-F32.gguf; do
    for variant in BF16 Q8_0 Q4_K_M; do
        quantize "$model" "$variant"
    done
done