From 239ff32abc3db8b73bc0dd8ec78c99a733e7793e Mon Sep 17 00:00:00 2001 From: Michael Sitarzewski Date: Thu, 19 Mar 2026 14:19:35 -0500 Subject: [PATCH 1/2] Fix portability: resolve model paths at runtime, add missing setup scripts, fix vocab format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace hardcoded /Users/danielwoods paths with runtime $HOME resolution that auto-discovers the HuggingFace snapshot directory - Add generate_expert_index.py: scans safetensors headers to produce expert_index.json, which repack_experts.py requires but had no generator - Add export_vocab.py: generates vocab.bin in the simple decode format that infer.m's load_vocab() expects. The existing export_tokenizer.py produces a BPE format (magic "BPET") which load_vocab() misreads as num_entries=1.1B, causing an OOM kill on startup - Skip mmap for 120GB expert layer files to avoid OOM kills on systems with memory pressure; pread() fallback works fine Tested on M5 Max (128GB) — 15.66 tok/s with 2-bit experts. Co-Authored-By: Claude Opus 4.6 (1M context) --- generate_expert_index.py | 171 ++++++++++++++++++++++++++++++++ metal_infer/export_tokenizer.py | 9 +- metal_infer/export_vocab.py | 72 ++++++++++++++ metal_infer/infer.m | 45 +++++++-- metal_infer/main.m | 30 +++++- repack_experts.py | 2 +- 6 files changed, 311 insertions(+), 18 deletions(-) create mode 100644 generate_expert_index.py create mode 100644 metal_infer/export_vocab.py diff --git a/generate_expert_index.py b/generate_expert_index.py new file mode 100644 index 0000000..79e9800 --- /dev/null +++ b/generate_expert_index.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +generate_expert_index.py — Generate expert_index.json from Qwen3.5-397B-A17B safetensors. + +Scans safetensors file headers to build a mapping of (layer, component) -> (file, offset, stride). +This index is required by repack_experts.py. + +Usage: + python generate_expert_index.py [--model PATH] [--output expert_index.json] +""" + +import argparse +import json +import os +import re +import struct +import sys +from collections import defaultdict +from pathlib import Path + + +# Expected component sizes per expert (bytes) +COMPONENT_SIZES = { + "gate_proj.weight": 2097152, # [1024, 512] uint32 + "gate_proj.scales": 131072, # [1024, 64] uint16 + "gate_proj.biases": 131072, # [1024, 64] uint16 + "up_proj.weight": 2097152, # [1024, 512] uint32 + "up_proj.scales": 131072, # [1024, 64] uint16 + "up_proj.biases": 131072, # [1024, 64] uint16 + "down_proj.weight": 2097152, # [4096, 128] uint32 + "down_proj.scales": 131072, # [4096, 16] uint16 + "down_proj.biases": 131072, # [4096, 16] uint16 +} + +NUM_EXPERTS = 512 +NUM_LAYERS = 60 + +# Pattern: language_model.model.layers.{L}.mlp.switch_mlp.{component} +EXPERT_PATTERN = re.compile( + r'^language_model\.model\.layers\.(\d+)\.mlp\.switch_mlp\.((?:gate|up|down)_proj\.(?:weight|scales|biases))$' +) + + +def parse_safetensors_header(filepath): + """Parse a safetensors file header. Returns (header_dict, data_start_offset).""" + with open(filepath, 'rb') as f: + header_len = struct.unpack(' filename + expert_tensors = {} # (layer_idx, component) -> (tensor_name, filename) + for name, filename in weight_map.items(): + m = EXPERT_PATTERN.match(name) + if m: + layer_idx = int(m.group(1)) + component = m.group(2) + expert_tensors[(layer_idx, component)] = (name, filename) + + print(f"Model: {model_path}") + print(f"Found {len(expert_tensors)} expert tensors") + print(f"Expected: {NUM_LAYERS * len(COMPONENT_SIZES)} = {NUM_LAYERS} layers x {len(COMPONENT_SIZES)} components") + + if len(expert_tensors) != NUM_LAYERS * len(COMPONENT_SIZES): + print("WARNING: tensor count mismatch", file=sys.stderr) + + # Parse safetensors headers for all needed files + needed_files = set(fn for _, fn in expert_tensors.values()) + print(f"\nParsing {len(needed_files)} safetensors file headers...") + + header_cache = {} + for filename in sorted(needed_files): + filepath = model_path / filename + header_cache[filename] = parse_safetensors_header(str(filepath)) + print(f" {filename}: header parsed") + + # Build expert_reads index + expert_reads = defaultdict(dict) + + for (layer_idx, component), (tensor_name, filename) in sorted(expert_tensors.items()): + header, data_start = header_cache[filename] + + if tensor_name not in header: + # Skip __metadata__ key + if tensor_name == '__metadata__': + continue + print(f"WARNING: {tensor_name} not in {filename} header", file=sys.stderr) + continue + + meta = header[tensor_name] + tensor_offset = meta['data_offsets'][0] + tensor_size = meta['data_offsets'][1] - meta['data_offsets'][0] + + # The tensor contains all 512 experts contiguously + # expert_size = total_tensor_size / num_experts + expert_size = tensor_size // NUM_EXPERTS + + expected_size = COMPONENT_SIZES.get(component) + if expected_size and expert_size != expected_size: + print(f"WARNING: {tensor_name}: computed expert_size={expert_size}, " + f"expected={expected_size}", file=sys.stderr) + + # abs_offset = data section start + tensor's offset within data section + abs_offset = data_start + tensor_offset + + # expert_stride = expert_size (experts are packed contiguously) + expert_stride = expert_size + + expert_reads[str(layer_idx)][component] = { + "file": filename, + "abs_offset": abs_offset, + "expert_stride": expert_stride, + "expert_size": expert_size, + } + + # Verify completeness + complete = True + for layer_idx in range(NUM_LAYERS): + layer_key = str(layer_idx) + if layer_key not in expert_reads: + print(f"ERROR: layer {layer_idx} missing entirely", file=sys.stderr) + complete = False + continue + for comp in COMPONENT_SIZES: + if comp not in expert_reads[layer_key]: + print(f"ERROR: layer {layer_idx} missing {comp}", file=sys.stderr) + complete = False + + if not complete: + print("\nERROR: Index is incomplete", file=sys.stderr) + sys.exit(1) + + # Write output + output = { + "model_path": str(model_path), + "expert_reads": dict(expert_reads), + } + + with open(args.output, 'w') as f: + json.dump(output, f, indent=2) + + print(f"\nWrote {args.output}") + print(f" {len(expert_reads)} layers, {len(COMPONENT_SIZES)} components each") + print(f" Total: {len(expert_reads) * len(COMPONENT_SIZES)} entries") + + +if __name__ == '__main__': + main() diff --git a/metal_infer/export_tokenizer.py b/metal_infer/export_tokenizer.py index 674ed32..cd8cc84 100644 --- a/metal_infer/export_tokenizer.py +++ b/metal_infer/export_tokenizer.py @@ -18,14 +18,17 @@ For each entry: uint32 token_id, uint16 str_len, char[str_len] """ import json +import os import struct import sys def main(): + import glob + default_tok = glob.glob(os.path.expanduser( + '~/.cache/huggingface/hub/models--mlx-community--Qwen3.5-397B-A17B-4bit' + '/snapshots/*/tokenizer.json')) tok_path = sys.argv[1] if len(sys.argv) > 1 else ( - '/Users/danielwoods/.cache/huggingface/hub/' - 'models--mlx-community--Qwen3.5-397B-A17B-4bit/' - 'snapshots/39159bd8aa74f5c8446d2b2dc584f62bb51cb0d3/tokenizer.json' + default_tok[0] if default_tok else 'tokenizer.json' ) out_path = sys.argv[2] if len(sys.argv) > 2 else 'tokenizer.bin' diff --git a/metal_infer/export_vocab.py b/metal_infer/export_vocab.py new file mode 100644 index 0000000..adfa65d --- /dev/null +++ b/metal_infer/export_vocab.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +"""Export vocab.bin in the simple format expected by infer.m's load_vocab(). + +Format: + [num_entries: uint32] [max_id: uint32] + For each entry (0..max_id): [byte_len: uint16] [utf8_bytes: byte_len] + +Usage: + python export_vocab.py [output.bin] +""" +import json +import struct +import sys + +def main(): + tok_path = sys.argv[1] + out_path = sys.argv[2] if len(sys.argv) > 2 else 'vocab.bin' + + with open(tok_path, 'r', encoding='utf-8') as f: + t = json.load(f) + + vocab = t['model']['vocab'] # str -> int + added = t.get('added_tokens', []) # list of {id, content, ...} + + # Merge added tokens into vocab + for tok in added: + vocab[tok['content']] = tok['id'] + + max_id = max(vocab.values()) + num_entries = max_id + 1 + + # BPE byte-level encoding uses Unicode chars for bytes: + # Ġ (U+0120) = space, Ċ (U+010A) = newline, etc. + # Build the reverse mapping to decode these back to real bytes. + bs = list(range(ord("!"), ord("~")+1)) + list(range(ord("¡"), ord("¬")+1)) + list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(256): + if b not in bs: + bs.append(b) + cs.append(256 + n) + n += 1 + bpe_decode_map = {chr(c): bytes([b]) for b, c in zip(bs, cs)} + + def decode_bpe_token(s): + """Convert BPE token string to actual bytes.""" + try: + return b''.join(bpe_decode_map.get(ch, ch.encode('utf-8')) for ch in s) + except Exception: + return s.encode('utf-8') + + # Build id -> string mapping with BPE decoding + id_to_str = {} + for s, tid in vocab.items(): + id_to_str[tid] = decode_bpe_token(s) + + with open(out_path, 'wb') as f: + f.write(struct.pack(' #include #include +#include #include #include #include @@ -123,7 +124,8 @@ #define THINK_START_TOKEN 248068 // #define THINK_END_TOKEN 248069 // -#define MODEL_PATH_DEFAULT "/Users/danielwoods/.cache/huggingface/hub/models--mlx-community--Qwen3.5-397B-A17B-4bit/snapshots/39159bd8aa74f5c8446d2b2dc584f62bb51cb0d3" +// MODEL_PATH_DEFAULT is resolved at runtime via get_default_model_path() below +#define MODEL_PATH_DEFAULT NULL // ============================================================================ // Timing helper @@ -6283,9 +6285,35 @@ static void print_usage(const char *prog) { printf(" --help This message\n"); } +// Resolve default model path at runtime using $HOME +static const char *get_default_model_path(void) { + static char path[1024]; + const char *home = getenv("HOME"); + if (!home) home = "/tmp"; + // Find the first snapshot directory + char snapshots_dir[1024]; + snprintf(snapshots_dir, sizeof(snapshots_dir), + "%s/.cache/huggingface/hub/models--mlx-community--Qwen3.5-397B-A17B-4bit/snapshots", home); + DIR *d = opendir(snapshots_dir); + if (d) { + struct dirent *entry; + while ((entry = readdir(d)) != NULL) { + if (entry->d_name[0] != '.') { + snprintf(path, sizeof(path), "%s/%s", snapshots_dir, entry->d_name); + closedir(d); + return path; + } + } + closedir(d); + } + snprintf(path, sizeof(path), + "%s/.cache/huggingface/hub/models--mlx-community--Qwen3.5-397B-A17B-4bit/snapshots/39159bd8aa74f5c8446d2b2dc584f62bb51cb0d3", home); + return path; +} + int main(int argc, char **argv) { @autoreleasepool { - const char *model_path = MODEL_PATH_DEFAULT; + const char *model_path = get_default_model_path(); const char *weights_path = NULL; const char *manifest_path = NULL; const char *vocab_path = NULL; @@ -6517,15 +6545,10 @@ int main(int argc, char **argv) { fcntl(layer_fds[i], F_RDAHEAD, 1); struct stat st; if (fstat(layer_fds[i], &st) == 0 && st.st_size > 0) { - layer_mmaps[i] = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, layer_fds[i], 0); - if (layer_mmaps[i] != MAP_FAILED) { - layer_mmap_sizes[i] = st.st_size; - // No madvise: kernel default is best. - // MADV_RANDOM disables readahead (tested: hurts). - // MADV_SEQUENTIAL doesn't reduce I/O fragmentation (tested: no effect). - // The kernel fragments 3.9MB preads into ~5.7 disk ops regardless - // of hints — this is inherent to the page cache's physical page layout. - } + // Skip mmap for expert files — 120GB of mmap reservations + // can trigger OOM kills on systems with memory pressure. + // The engine falls back to pread() which works fine. + layer_mmap_sizes[i] = st.st_size; } } } diff --git a/metal_infer/main.m b/metal_infer/main.m index 2254134..5323d36 100644 --- a/metal_infer/main.m +++ b/metal_infer/main.m @@ -40,6 +40,7 @@ #include #include #include +#include // ============================================================================ // Constants matching the Qwen3.5-397B packed expert layout @@ -77,8 +78,8 @@ #define EXPERT_SIZE 7077888 // Total bytes per expert -// Default model path -#define MODEL_PATH "/Users/danielwoods/.cache/huggingface/hub/models--mlx-community--Qwen3.5-397B-A17B-4bit/snapshots/39159bd8aa74f5c8446d2b2dc584f62bb51cb0d3" +// Default model path — resolved at runtime, can be overridden with --model +#define MODEL_PATH NULL // ============================================================================ // Timing helper @@ -1504,7 +1505,30 @@ int main(int argc, char **argv) { int num_active_experts = 4; // --k flag int do_verify = 0; int use_fast = 0; - const char *model_path = MODEL_PATH; + // Resolve default model path at runtime using $HOME + static char default_model_path[1024]; + const char *home = getenv("HOME"); + if (!home) home = "/tmp"; + { + char snapshots_dir[1024]; + snprintf(snapshots_dir, sizeof(snapshots_dir), + "%s/.cache/huggingface/hub/models--mlx-community--Qwen3.5-397B-A17B-4bit/snapshots", home); + DIR *d = opendir(snapshots_dir); + if (d) { + struct dirent *entry; + while ((entry = readdir(d)) != NULL) { + if (entry->d_name[0] != '.') { + snprintf(default_model_path, sizeof(default_model_path), "%s/%s", snapshots_dir, entry->d_name); + break; + } + } + closedir(d); + } else { + snprintf(default_model_path, sizeof(default_model_path), + "%s/.cache/huggingface/hub/models--mlx-community--Qwen3.5-397B-A17B-4bit/snapshots/39159bd8aa74f5c8446d2b2dc584f62bb51cb0d3", home); + } + } + const char *model_path = default_model_path; static struct option long_options[] = { {"layer", required_argument, 0, 'l'}, diff --git a/repack_experts.py b/repack_experts.py index 77eafec..3e1cc9b 100644 --- a/repack_experts.py +++ b/repack_experts.py @@ -213,7 +213,7 @@ def write_layout(output_dir): def main(): parser = argparse.ArgumentParser(description="Repack expert weights into contiguous per-layer binary files") - parser.add_argument('--index', default='/Users/danielwoods/Workspace/ane-research/expert_index.json', + parser.add_argument('--index', default='expert_index.json', help='Path to expert_index.json') parser.add_argument('--layers', default=None, help='Layer spec: "all", "0-4", "0,5,10" (default: all)') From d2c93398374c0b061542da69496c9b58a221acbf Mon Sep 17 00:00:00 2001 From: Michael Sitarzewski Date: Sat, 21 Mar 2026 06:40:28 -0500 Subject: [PATCH 2/2] Move inline `import os` to top-level imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review feedback from @0xClandestine — the import was used (for os.path.getsize) but placed inline at the bottom of main(). Co-Authored-By: Claude Opus 4.6 (1M context) --- metal_infer/export_vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metal_infer/export_vocab.py b/metal_infer/export_vocab.py index adfa65d..c0ba448 100644 --- a/metal_infer/export_vocab.py +++ b/metal_infer/export_vocab.py @@ -9,6 +9,7 @@ python export_vocab.py [output.bin] """ import json +import os import struct import sys @@ -64,7 +65,6 @@ def decode_bpe_token(s): if b: f.write(b) - import os sz = os.path.getsize(out_path) print(f"Exported vocab.bin: {num_entries} entries (max_id={max_id}), {sz / 1024 / 1024:.1f} MB")