-
Notifications
You must be signed in to change notification settings - Fork 229
Fix portability: runtime paths, missing setup scripts, vocab format bug #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
msitarzewski
wants to merge
2
commits into
danveloper:main
Choose a base branch
from
msitarzewski:fix/portable-setup-and-vocab
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+311
−18
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,171 @@ | ||
| #!/usr/bin/env python3 | ||
| """ | ||
| generate_expert_index.py — Generate expert_index.json from Qwen3.5-397B-A17B safetensors. | ||
|
|
||
| Scans safetensors file headers to build a mapping of (layer, component) -> (file, offset, stride). | ||
| This index is required by repack_experts.py. | ||
|
|
||
| Usage: | ||
| python generate_expert_index.py [--model PATH] [--output expert_index.json] | ||
| """ | ||
|
|
||
| import argparse | ||
| import json | ||
| import os | ||
| import re | ||
| import struct | ||
| import sys | ||
| from collections import defaultdict | ||
| from pathlib import Path | ||
|
|
||
|
|
||
# Per-expert byte sizes for each quantized MoE component.
# Projection weights are packed uint32 words; scales/biases are uint16.
COMPONENT_SIZES = {
    "gate_proj.weight": 2097152,  # [1024, 512] uint32
    "gate_proj.scales": 131072,   # [1024, 64] uint16
    "gate_proj.biases": 131072,   # [1024, 64] uint16
    "up_proj.weight": 2097152,    # [1024, 512] uint32
    "up_proj.scales": 131072,     # [1024, 64] uint16
    "up_proj.biases": 131072,     # [1024, 64] uint16
    "down_proj.weight": 2097152,  # [4096, 128] uint32
    "down_proj.scales": 131072,   # [4096, 16] uint16
    "down_proj.biases": 131072,   # [4096, 16] uint16
}

# Model topology: experts per MoE layer, and number of transformer layers.
NUM_EXPERTS = 512
NUM_LAYERS = 60

# Matches tensor names of the form:
#   language_model.model.layers.{L}.mlp.switch_mlp.{component}
# Capture group 1 = layer index, group 2 = component name.
EXPERT_PATTERN = re.compile(
    r'^language_model\.model\.layers\.(\d+)\.mlp\.switch_mlp\.((?:gate|up|down)_proj\.(?:weight|scales|biases))$'
)
|
|
||
|
|
||
def parse_safetensors_header(filepath):
    """Read the JSON header of a safetensors file.

    The safetensors layout is: an 8-byte little-endian uint64 header size,
    the JSON header itself, then the raw tensor data section.

    Returns:
        (header_dict, data_start_offset): the parsed header and the absolute
        byte offset where the tensor data section begins.
    """
    with open(filepath, 'rb') as fh:
        (header_size,) = struct.unpack('<Q', fh.read(8))
        header_bytes = fh.read(header_size)
    return json.loads(header_bytes), 8 + header_size
|
|
||
|
|
||
def main():
    """CLI entry point: scan a model directory and write expert_index.json.

    Reads model.safetensors.index.json to locate every per-layer expert
    tensor, parses each referenced safetensors shard header once, and emits
    a JSON index mapping
    layer -> component -> {file, abs_offset, expert_stride, expert_size}.

    Exits with status 1 if the weight index file is missing or the resulting
    expert index does not cover every (layer, component) pair.
    """
    parser = argparse.ArgumentParser(description='Generate expert_index.json from safetensors')
    parser.add_argument('--model', type=str, required=True,
                        help='Path to model directory (containing safetensors files)')
    parser.add_argument('--output', type=str, default='expert_index.json',
                        help='Output path for expert_index.json')
    args = parser.parse_args()

    model_path = Path(args.model)

    # The HF-style weight index maps tensor name -> shard filename.
    index_path = model_path / 'model.safetensors.index.json'
    if not index_path.exists():
        print(f"ERROR: {index_path} not found", file=sys.stderr)
        sys.exit(1)

    with open(index_path) as f:
        idx = json.load(f)

    weight_map = idx['weight_map']

    # Collect all expert tensors, keyed by (layer, component).
    expert_tensors = {}  # (layer_idx, component) -> (tensor_name, filename)
    for name, filename in weight_map.items():
        m = EXPERT_PATTERN.match(name)
        if m:
            layer_idx = int(m.group(1))
            component = m.group(2)
            expert_tensors[(layer_idx, component)] = (name, filename)

    print(f"Model: {model_path}")
    print(f"Found {len(expert_tensors)} expert tensors")
    print(f"Expected: {NUM_LAYERS * len(COMPONENT_SIZES)} = {NUM_LAYERS} layers x {len(COMPONENT_SIZES)} components")

    if len(expert_tensors) != NUM_LAYERS * len(COMPONENT_SIZES):
        print("WARNING: tensor count mismatch", file=sys.stderr)

    # Parse each needed shard header exactly once.
    needed_files = {fn for _, fn in expert_tensors.values()}
    print(f"\nParsing {len(needed_files)} safetensors file headers...")

    header_cache = {}
    for filename in sorted(needed_files):
        filepath = model_path / filename
        header_cache[filename] = parse_safetensors_header(str(filepath))
        # BUGFIX: previously printed a literal "(unknown)" placeholder
        # instead of the shard filename.
        print(f"  {filename}: header parsed")

    # Build the expert_reads index: layer -> component -> read descriptor.
    expert_reads = defaultdict(dict)

    for (layer_idx, component), (tensor_name, filename) in sorted(expert_tensors.items()):
        header, data_start = header_cache[filename]

        # Note: tensor_name originates from EXPERT_PATTERN matches, so it
        # can never be '__metadata__'; the old dead check was removed.
        if tensor_name not in header:
            # BUGFIX: warning previously said "(unknown)" instead of the
            # shard filename.
            print(f"WARNING: {tensor_name} not in {filename} header", file=sys.stderr)
            continue

        meta = header[tensor_name]
        tensor_offset = meta['data_offsets'][0]
        tensor_size = meta['data_offsets'][1] - meta['data_offsets'][0]

        # Each tensor packs all NUM_EXPERTS experts contiguously, so the
        # per-expert slice size is the total size divided by the count.
        expert_size = tensor_size // NUM_EXPERTS

        expected_size = COMPONENT_SIZES.get(component)
        if expected_size and expert_size != expected_size:
            print(f"WARNING: {tensor_name}: computed expert_size={expert_size}, "
                  f"expected={expected_size}", file=sys.stderr)

        # Absolute file offset = header prefix length + offset of this
        # tensor within the data section.
        abs_offset = data_start + tensor_offset

        expert_reads[str(layer_idx)][component] = {
            "file": filename,
            "abs_offset": abs_offset,
            # Experts are packed contiguously, so stride == size.
            "expert_stride": expert_size,
            "expert_size": expert_size,
        }

    # Verify every layer has every component before writing anything out.
    complete = True
    for layer_idx in range(NUM_LAYERS):
        layer_key = str(layer_idx)
        if layer_key not in expert_reads:
            print(f"ERROR: layer {layer_idx} missing entirely", file=sys.stderr)
            complete = False
            continue
        for comp in COMPONENT_SIZES:
            if comp not in expert_reads[layer_key]:
                print(f"ERROR: layer {layer_idx} missing {comp}", file=sys.stderr)
                complete = False

    if not complete:
        print("\nERROR: Index is incomplete", file=sys.stderr)
        sys.exit(1)

    # Write the final index.
    output = {
        "model_path": str(model_path),
        "expert_reads": dict(expert_reads),
    }

    with open(args.output, 'w') as f:
        json.dump(output, f, indent=2)

    print(f"\nWrote {args.output}")
    print(f"  {len(expert_reads)} layers, {len(COMPONENT_SIZES)} components each")
    print(f"  Total: {len(expert_reads) * len(COMPONENT_SIZES)} entries")


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| #!/usr/bin/env python3 | ||
| """Export vocab.bin in the simple format expected by infer.m's load_vocab(). | ||
|
|
||
| Format: | ||
| [num_entries: uint32] [max_id: uint32] | ||
| For each entry (0..max_id): [byte_len: uint16] [utf8_bytes: byte_len] | ||
|
|
||
| Usage: | ||
| python export_vocab.py <tokenizer.json> [output.bin] | ||
| """ | ||
| import json | ||
| import os | ||
| import struct | ||
| import sys | ||
|
|
||
def main():
    """Convert a HuggingFace tokenizer.json into the flat vocab.bin format.

    Output layout (all little-endian):
        [num_entries: uint32] [max_id: uint32]
        then for each id in 0..max_id: [byte_len: uint16] [utf8_bytes]

    Token ids absent from the tokenizer are written as zero-length entries.
    Exits with status 1 when no tokenizer path is supplied.
    """
    # BUGFIX: a missing argument previously crashed with a bare IndexError;
    # print a usage message instead.
    if len(sys.argv) < 2:
        print("Usage: export_vocab.py <tokenizer.json> [output.bin]", file=sys.stderr)
        sys.exit(1)
    tok_path = sys.argv[1]
    out_path = sys.argv[2] if len(sys.argv) > 2 else 'vocab.bin'

    with open(tok_path, 'r', encoding='utf-8') as f:
        t = json.load(f)

    vocab = t['model']['vocab']        # str -> int
    added = t.get('added_tokens', [])  # list of {id, content, ...}

    # Added tokens (specials etc.) extend/override the base vocab.
    for tok in added:
        vocab[tok['content']] = tok['id']

    max_id = max(vocab.values())
    num_entries = max_id + 1

    # GPT-2-style byte-level BPE represents raw bytes as printable Unicode
    # chars: Ġ (U+0120) = space, Ċ (U+010A) = newline, etc.
    # Build the reverse mapping to decode these back to real bytes.
    bs = list(range(ord("!"), ord("~")+1)) + list(range(ord("¡"), ord("¬")+1)) + list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    printable = set(bs)  # O(1) membership instead of O(n) list scans below
    n = 0
    for b in range(256):
        if b not in printable:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    bpe_decode_map = {chr(c): bytes([b]) for b, c in zip(bs, cs)}

    def decode_bpe_token(s):
        """Convert a BPE token string back to the raw bytes it encodes."""
        try:
            return b''.join(bpe_decode_map.get(ch, ch.encode('utf-8')) for ch in s)
        except Exception:
            # Best effort: fall back to plain UTF-8 for unexpected content.
            return s.encode('utf-8')

    # Build id -> bytes mapping with BPE decoding applied.
    id_to_str = {}
    for s, tid in vocab.items():
        id_to_str[tid] = decode_bpe_token(s)

    with open(out_path, 'wb') as f:
        f.write(struct.pack('<I', num_entries))
        f.write(struct.pack('<I', max_id))

        for i in range(num_entries):
            b = id_to_str.get(i, b'')
            f.write(struct.pack('<H', len(b)))
            if b:
                f.write(b)

    sz = os.path.getsize(out_path)
    # BUGFIX: report the actual output path, not a hardcoded "vocab.bin".
    print(f"Exported {out_path}: {num_entries} entries (max_id={max_id}), {sz / 1024 / 1024:.1f} MB")


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can remove this, since this not used anywhere