-
Notifications
You must be signed in to change notification settings - Fork 44
Expand file tree
/
Copy pathcli.py
More file actions
289 lines (235 loc) · 9.68 KB
/
Copy pathcli.py
File metadata and controls
289 lines (235 loc) · 9.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
"""
quantcpp CLI — chat with a local LLM in your terminal.

Ollama-style commands:
    quantcpp pull MODEL        Download a model from HuggingFace
    quantcpp list              List cached and available models
    quantcpp run MODEL [Q]     Chat with a model (auto-pulls if needed)
    quantcpp serve MODEL       Start an OpenAI-compatible HTTP server

Backwards-compatible shortcuts (no subcommand):
    quantcpp                   Auto-download Llama-3.2-1B and start a chat
    quantcpp "What is X?"      One-shot question with the default model
    quantcpp --model NAME      Use a specific model
"""
import sys
import os
import json
# Short, Ollama-style names a user may type, mapped to the canonical
# _MODEL_REGISTRY key. Keys are lowercase because _resolve_name() lowercases
# its input before the lookup. Each model has a bare alias and a "name:size"
# alias; cmd_list() displays the one containing ":".
MODEL_ALIASES = {
    # SmolLM2
    "smollm2": "SmolLM2-135M",
    "smollm2:135m": "SmolLM2-135M",
    # Qwen 3.5
    "qwen3.5": "Qwen3.5-0.8B",
    "qwen3.5:0.8b": "Qwen3.5-0.8B",
    # Llama 3.2
    "llama3.2": "Llama-3.2-1B",
    "llama3.2:1b": "Llama-3.2-1B",
}
def _resolve_name(name):
    """Map user input onto a registry key or an existing local model file.

    ``None`` is passed through unchanged. A string that names an existing
    path and ends in ``.gguf`` is returned as given. Anything else is looked
    up in ``MODEL_ALIASES`` after lowercasing; input that is not an alias is
    returned unchanged (original casing preserved).
    """
    if name is None:
        return None
    points_at_local_model = os.path.exists(name) and name.endswith(".gguf")
    return name if points_at_local_model else MODEL_ALIASES.get(name.lower(), name)
def _registry():
    """Return the ``(_MODEL_REGISTRY, _CACHE_DIR)`` pair from ``quantcpp``.

    The import is done inside the function, so ``quantcpp`` is only loaded
    when a caller actually asks for the registry.
    """
    from quantcpp import _MODEL_REGISTRY as model_registry, _CACHE_DIR as cache_dir
    return model_registry, cache_dir
def cmd_pull(args):
    """Handle ``quantcpp pull``: fetch a model into the local cache.

    ``args.model`` may be an alias, a registry key, or the path of an
    existing ``.gguf`` file.

    Returns 0 on success (including the "already local" case) and 1 when the
    model is unknown or the download fails. Everything except the
    "already local" notice is written to stderr.
    """
    import quantcpp

    target = _resolve_name(args.model)

    # An existing .gguf file on disk needs no download.
    if os.path.exists(target) and target.endswith(".gguf"):
        print(f"already local: {target}")
        return 0

    if target not in quantcpp._MODEL_REGISTRY:
        # Show both the canonical names and the short aliases so the user can
        # see every spelling that would have been accepted.
        registry_names = ", ".join(sorted(quantcpp._MODEL_REGISTRY.keys()))
        alias_names = ", ".join(sorted(MODEL_ALIASES.keys()))
        print(f"unknown model: {args.model!r}", file=sys.stderr)
        print(f" registry: {registry_names}", file=sys.stderr)
        print(f" aliases: {alias_names}", file=sys.stderr)
        return 1

    print(f"pulling {target}...", file=sys.stderr)
    try:
        local_path = quantcpp.download(target)
        megabytes = os.path.getsize(local_path) / (1024 * 1024)
        print(f"\u2713 {target} \u2192 {local_path} ({megabytes:.0f} MB)", file=sys.stderr)
    except Exception as exc:
        # CLI boundary: report any failure as a message plus a non-zero status.
        print(f"download failed: {exc}", file=sys.stderr)
        return 1
    return 0
def cmd_list(args):
    """Handle ``quantcpp list``: show every registry model and its cache state.

    A model is reported as "cached" when its file exists in the cache
    directory (size taken from the file) and as "remote" otherwise (size
    taken from the registry's approximate value). With ``args.json_output``
    the rows are printed as a JSON array; otherwise as a text table.

    Always returns 0.
    """
    registry, cache_dir = _registry()
    bytes_per_mb = 1024 * 1024

    entries = []
    for model_name, (_repo, filename, approx_mb) in sorted(registry.items()):
        local_file = cache_dir / filename
        if local_file.exists():
            size_mb = local_file.stat().st_size / bytes_per_mb
            status, shown_path = "cached", str(local_file)
        else:
            # NOTE(review): for uncached models the value that ends up in the
            # JSON "path" field is an approximate-size string, not a path.
            size_mb = approx_mb
            status, shown_path = "remote", f"~{approx_mb} MB"

        # Prefer the "name:size" spelling of the alias for display.
        alias = ""
        for candidate, canonical in MODEL_ALIASES.items():
            if canonical == model_name and ":" in candidate:
                alias = candidate
                break

        entries.append((status, model_name, alias, size_mb, shown_path))

    if args.json_output:
        payload = [
            {"status": s, "name": n, "alias": a, "size_mb": round(sz, 1), "path": p}
            for (s, n, a, sz, p) in entries
        ]
        print(json.dumps(payload, indent=2))
        return 0

    print(f"\n Models cache: {cache_dir}\n")
    print(f" {'STATUS':<8} {'NAME':<16} {'ALIAS':<14} {'SIZE':>8}")
    # Rule line: one run of dashes per column, same widths as the header.
    print(" " + " ".join("-" * width for width in (8, 16, 14, 8)))
    for status, model_name, alias, size_mb, _ in entries:
        size_text = f"{size_mb:.0f} MB"
        print(f" {status:<8} {model_name:<16} {alias:<14} {size_text:>8}")
    print()
    return 0
def _resolve_to_path(name_or_path):
    """Resolve an alias, registry name, or file path to a local ``.gguf`` path.

    Resolution order:
      1. an existing path ending in ``.gguf`` is returned as given;
      2. a registry model whose file is already in the cache directory
         returns the cached file's path;
      3. any other registry model is downloaded first.

    Args:
        name_or_path: Alias, canonical registry key, or filesystem path.

    Returns:
        str: Path of the model file.

    Raises:
        ValueError: If ``name_or_path`` is ``None``, or is neither an existing
            ``.gguf`` file nor a known registry model.
        Exception: Whatever ``quantcpp.download`` raises is propagated.
    """
    # Reject None up front: otherwise os.path.exists(None) raises TypeError,
    # which cmd_run() would misreport as "pull failed".
    if name_or_path is None:
        raise ValueError("no model specified")

    import quantcpp

    name = _resolve_name(name_or_path)
    if os.path.exists(name) and name.endswith(".gguf"):
        return name

    registry = quantcpp._MODEL_REGISTRY
    if name not in registry:
        avail = ", ".join(sorted(registry.keys()))
        raise ValueError(
            f"unknown model: {name_or_path!r}. Available: {avail}"
        )

    # Registry values are 3-tuples; only the file name is needed here.
    _, filename, _ = registry[name]
    cached = quantcpp._CACHE_DIR / filename
    if cached.exists():
        return str(cached)

    print(f"model not cached \u2014 pulling {name}...", file=sys.stderr)
    # download()'s return type is not visible from this file; coerce to str so
    # every branch of this function returns the same type.
    return str(quantcpp.download(name))
def cmd_run(args):
    """Handle ``quantcpp run``: chat with a model, pulling it first if needed.

    With a non-empty ``args.prompt`` the answer to that single prompt is
    streamed to stdout. Otherwise an interactive loop reads questions from
    stdin until Ctrl+C or end-of-input.

    Args:
        args: Namespace providing ``model``, ``prompt``, ``max_tokens``,
            ``temperature`` and ``threads``.

    Returns:
        0 after a completed one-shot answer or interactive session, 1 when
        the model cannot be resolved or downloaded.
    """
    try:
        model_path = _resolve_to_path(args.model)
    except ValueError as e:
        # Unknown model name: the message already lists what is available.
        print(str(e), file=sys.stderr)
        return 1
    except Exception as e:
        print(f"pull failed: {e}", file=sys.stderr)
        return 1

    from quantcpp import Model

    print(f"loading {os.path.basename(model_path)}...", file=sys.stderr)
    m = Model(model_path, max_tokens=args.max_tokens, temperature=args.temperature,
              n_threads=args.threads)
    # try/finally so the model is closed even when generation raises
    # (e.g. Ctrl+C during a one-shot answer).
    try:
        if args.prompt:
            # argparse delivers the prompt as a list of words; a plain string
            # is accepted as well.
            question = " ".join(args.prompt) if isinstance(args.prompt, list) else args.prompt
            for tok in m.generate(question):
                print(tok, end="", flush=True)
            print()
        else:
            print("quantcpp \u2014 type your message, Ctrl+C to exit", file=sys.stderr)
            try:
                while True:
                    question = input("\nYou: ")
                    if not question.strip():
                        continue
                    print("AI: ", end="", flush=True)
                    for tok in m.generate(question):
                        print(tok, end="", flush=True)
                    print()
            except (KeyboardInterrupt, EOFError):
                print("\nBye!", file=sys.stderr)
    finally:
        m.close()
    return 0
def cmd_serve(args):
    """Handle ``quantcpp serve``: hand over to the ``quant-server`` binary.

    The binary is looked up on ``PATH`` first, then in ``./build`` and
    ``./build_metal`` relative to the current directory. On success
    ``os.execvp`` replaces the current process, so this function does not
    return in that case.

    Args:
        args: Namespace providing ``model``, ``port`` and ``threads``.

    Returns:
        1 if the model cannot be resolved or the binary fails to start,
        2 if no ``quant-server`` binary is found.
    """
    import shutil

    try:
        model_path = _resolve_to_path(args.model)
    except Exception as e:
        print(f"error: {e}", file=sys.stderr)
        return 1

    binary = shutil.which("quant-server")
    if not binary:
        # Look in common build dirs relative to repo
        for guess in ("./build/quant-server", "./build_metal/quant-server"):
            if os.path.isfile(guess) and os.access(guess, os.X_OK):
                binary = guess
                break
    if not binary:
        print("quant-server binary not found.", file=sys.stderr)
        print(" Build with: cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build",
              file=sys.stderr)
        print(" Or install via your package manager.", file=sys.stderr)
        return 2

    # str(): the resolved path's exact type is not guaranteed by this file.
    cmd = [binary, str(model_path), "-p", str(args.port), "-j", str(args.threads)]
    print(f"quant serve {os.path.basename(model_path)} on :{args.port}", file=sys.stderr)

    # exec* replaces the process without flushing Python's file buffers, so
    # flush explicitly or pending output can be lost.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvp(cmd[0], cmd)
    except OSError as e:
        # Reached only when the exec itself fails (e.g. bad binary format).
        print(f"failed to start {binary}: {e}", file=sys.stderr)
        return 1
def cmd_chat_default(args):
    """Run the no-subcommand mode: chat with the default model.

    Fills in whatever ``cmd_run`` expects but the namespace lacks: the model
    falls back to Llama-3.2-1B, missing ``threads`` / ``max_tokens`` /
    ``temperature`` attributes get 4 / 256 / 0.7, and an empty prompt is
    normalised to ``None``. Returns ``cmd_run``'s result.
    """
    if not args.model:
        args.model = "Llama-3.2-1B"

    fallbacks = (("threads", 4), ("max_tokens", 256), ("temperature", 0.7))
    for attr, fallback in fallbacks:
        # Keep an existing value; only a missing attribute gets the fallback.
        setattr(args, attr, getattr(args, attr, fallback))

    if not args.prompt:
        args.prompt = None
    return cmd_run(args)
def main(argv=None):
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    The first argument selects the mode: ``pull``, ``list``, ``run`` or
    ``serve`` choose a subcommand, anything else (including no arguments)
    selects the backwards-compatible default chat.

    Each mode has its own standalone parser. Registering subparsers next to a
    top-level ``prompt`` positional on one parser does not work: argparse
    matches the first positional against the subcommand names, so a bare
    question is rejected as an invalid choice, and the top-level ``prompt``
    action runs after the subcommand and overwrites its prompt with an empty
    list.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        The selected handler's return value, intended as the exit status.
    """
    import argparse

    argv = sys.argv[1:] if argv is None else list(argv)

    # pull
    p_pull = argparse.ArgumentParser(
        prog="quantcpp pull", description="Download a model from HuggingFace")
    p_pull.add_argument("model", help="Model name or alias (e.g. llama3.2:1b)")

    # list
    p_list = argparse.ArgumentParser(
        prog="quantcpp list", description="List cached and available models")
    p_list.add_argument("--json", dest="json_output", action="store_true")

    # run
    p_run = argparse.ArgumentParser(
        prog="quantcpp run", description="Chat with a model (auto-pulls if needed)")
    p_run.add_argument("model", help="Model name, alias, or .gguf path")
    p_run.add_argument("prompt", nargs="*", default=None, help="Optional prompt")
    p_run.add_argument("-j", "--threads", type=int, default=4)
    p_run.add_argument("-n", "--max-tokens", type=int, default=256)
    p_run.add_argument("-t", "--temperature", type=float, default=0.7)

    # serve
    p_serve = argparse.ArgumentParser(
        prog="quantcpp serve", description="Start OpenAI-compatible HTTP server")
    p_serve.add_argument("model", help="Model name, alias, or .gguf path")
    p_serve.add_argument("-p", "--port", type=int, default=8080)
    p_serve.add_argument("-j", "--threads", type=int, default=4)

    command_parsers = {"pull": p_pull, "list": p_list, "run": p_run, "serve": p_serve}

    # Top-level parser: serves `quantcpp --help` and the default chat mode.
    parser = argparse.ArgumentParser(
        prog="quantcpp",
        usage=(
            "quantcpp {pull,list,run,serve} ...\n"
            "       quantcpp [-h] [--model MODEL] [--max-tokens MAX_TOKENS]\n"
            "                [--temperature TEMPERATURE] [--threads THREADS]\n"
            "                [prompt ...]"
        ),
        description="Chat with a local LLM. No API key, no GPU, no server.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
commands:
pull MODEL Download a model (e.g. llama3.2:1b)
list List cached and available models
run MODEL [PROMPT] Chat with a model (auto-pulls if needed)
serve MODEL Start OpenAI-compatible HTTP server
examples:
quantcpp pull llama3.2:1b
quantcpp list
quantcpp run llama3.2:1b
quantcpp run llama3.2:1b "What is gravity?"
quantcpp serve llama3.2:1b --port 8080
backwards-compat (no subcommand):
quantcpp # default chat with Llama-3.2-1B
quantcpp "What is gravity?" # one-shot
quantcpp --model SmolLM2-135M # different model
""",
    )
    parser.add_argument("prompt", nargs="*", default=None,
                        help="(default mode) question to ask")
    parser.add_argument("--model", "-m", default=None,
                        help="(default mode) model name or .gguf path")
    parser.add_argument("--max-tokens", "-n", type=int, default=256)
    parser.add_argument("--temperature", "-t", type=float, default=0.7)
    parser.add_argument("--threads", "-j", type=int, default=4)

    command = argv[0] if argv and argv[0] in command_parsers else None

    if command is None:
        # No subcommand → backwards-compat default chat
        args = parser.parse_args(argv)
        args.command = None
        return cmd_chat_default(args)

    args = command_parsers[command].parse_args(argv[1:])
    args.command = command
    if command == "pull":
        return cmd_pull(args)
    if command == "list":
        return cmd_list(args)
    if command == "run":
        return cmd_run(args)
    return cmd_serve(args)
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())