Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
*.dSYM/
/misc/
.*.swp
.DS_Store
24 changes: 18 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@ METAL_SRCS := $(wildcard metal/*.metal)

ifeq ($(UNAME_S),Darwin)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
CORE_OBJS = ds4.o ds4_metal.o
NATIVE_CORE_OBJS = ds4_native.o
CORE_OBJS = ds4.o ds4_metal.o ds4_rpc.o
NATIVE_CORE_OBJS = ds4_native.o ds4_rpc.o
else
CFLAGS += -DDS4_NO_METAL
CORE_OBJS = ds4.o
NATIVE_CORE_OBJS = ds4_native.o
CORE_OBJS = ds4.o ds4_rpc.o
NATIVE_CORE_OBJS = ds4_native.o ds4_rpc.o
METAL_LDLIBS := $(LDLIBS)
endif

.PHONY: all clean test

all: ds4 ds4-server
all: ds4 ds4-server ds4-rpc-worker

ifeq ($(UNAME_S),Darwin)
ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
Expand All @@ -29,6 +29,9 @@ ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
ds4-server: ds4_server.o rax.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_server.o rax.o $(CORE_OBJS) $(METAL_LDLIBS)

ds4-rpc-worker: ds4_rpc_worker.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_rpc_worker.o $(CORE_OBJS) $(METAL_LDLIBS)

ds4_native: ds4_cli_native.o linenoise.o $(NATIVE_CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_cli_native.o linenoise.o $(NATIVE_CORE_OBJS) $(NATIVE_LDLIBS)
else
Expand All @@ -38,6 +41,9 @@ ds4: ds4_cli.o linenoise.o $(CORE_OBJS)
ds4-server: ds4_server.o rax.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)

ds4-rpc-worker: ds4_rpc_worker.o $(CORE_OBJS)
$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)

ds4_native: ds4_cli_native.o linenoise.o $(NATIVE_CORE_OBJS)
$(CC) $(CFLAGS) -o $@ ds4_cli_native.o linenoise.o $(NATIVE_CORE_OBJS) $(LDLIBS)
endif
Expand All @@ -51,6 +57,12 @@ ds4_cli.o: ds4_cli.c ds4.h linenoise.h
ds4_server.o: ds4_server.c ds4.h rax.h
$(CC) $(CFLAGS) -c -o $@ ds4_server.c

ds4_rpc.o: ds4_rpc.c ds4_rpc.h ds4.h
$(CC) $(CFLAGS) -c -o $@ ds4_rpc.c

ds4_rpc_worker.o: ds4_rpc_worker.c ds4_rpc.h ds4.h
$(CC) $(CFLAGS) -c -o $@ ds4_rpc_worker.c

ds4_test.o: tests/ds4_test.c ds4_server.c ds4.h rax.h
$(CC) $(CFLAGS) -Wno-unused-function -c -o $@ tests/ds4_test.c

Expand All @@ -76,4 +88,4 @@ test: ds4_test
./ds4_test

clean:
rm -f ds4 ds4-server ds4_native ds4_server_test ds4_test *.o
rm -f ds4 ds4-server ds4-rpc-worker ds4_native ds4_server_test ds4_test *.o
187 changes: 187 additions & 0 deletions bench-ds4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""Measure ds4-server prefill and decode rates.

Streams the OpenAI-compatible chat completion API to separate prefill
(time-to-first-token) from decode (steady-state generation), so the same
methodology covers single-host Q2, single-host Q4 (on a >=256 GB machine),
and pipeline-parallel Q4 over RPC. Whatever ds4-server is running, this
measures what it actually produces over the wire -- no special hooks.

Usage:
python3 bench-ds4.py [--url URL] [--long PATH] [--max-tokens N]
                         [--thinking] [--runs N]

Examples:
# Default: localhost:8000, short prompt only, thinking disabled.
python3 bench-ds4.py

# With the long-context test the README uses (11709 tokens):
python3 bench-ds4.py --long tests/long_context_security_prompt.txt

# Average over 3 runs (cold first run is reported separately):
python3 bench-ds4.py --runs 3 --long tests/long_context_security_prompt.txt

The same prompt is reused across runs in --runs mode so the rendered-prefix
cache absorbs setup overhead; the first run is reported as "cold" and the
remaining runs are averaged as "warm".
"""

import argparse
import json
import sys
import time
import urllib.request


def stream_request(url: str, prompt: str, max_tokens: int, thinking_enabled: bool):
    """One streamed chat completion. Returns timing + token counts.

    Streams the SSE response so time-to-first-token (prefill) can be
    separated from steady-state generation (decode). Reasoning tokens are
    approximated as one per streamed reasoning_content delta.
    """
    request_body = {
        "model": "deepseek-v4-flash",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0,
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    if not thinking_enabled:
        request_body["thinking"] = {"type": "disabled"}

    request = urllib.request.Request(
        url,
        data=json.dumps(request_body).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    started = time.perf_counter()
    first_tok = None   # wall time of the first streamed delta (end of prefill)
    last_tok = None    # wall time of the most recent streamed delta
    n_prompt = 0
    n_completion = 0
    n_reasoning = 0

    with urllib.request.urlopen(request, timeout=3600) as response:
        for chunk in response:
            text = chunk.decode("utf-8", errors="replace").rstrip()
            # Server-sent events: only "data: ..." lines carry payloads.
            if not text.startswith("data: "):
                continue
            data = text[6:]
            if data == "[DONE]":
                continue
            try:
                event = json.loads(data)
            except json.JSONDecodeError:
                continue

            choice_list = event.get("choices") or []
            if choice_list:
                delta = choice_list[0].get("delta") or {}
                has_content = delta.get("content") is not None
                has_reasoning = delta.get("reasoning_content") is not None
                if has_content or has_reasoning:
                    stamp = time.perf_counter()
                    if first_tok is None:
                        first_tok = stamp
                    last_tok = stamp
                    if has_reasoning:
                        n_reasoning += 1

            # With include_usage, the final event reports authoritative counts.
            usage = event.get("usage")
            if usage:
                n_prompt = usage.get("prompt_tokens", n_prompt)
                n_completion = usage.get("completion_tokens", n_completion)

    finished = time.perf_counter()
    return {
        "t_total": finished - started,
        "t_prefill": None if first_tok is None else first_tok - started,
        "t_decode": (last_tok - first_tok) if (first_tok is not None and last_tok is not None) else None,
        "prompt_tokens": n_prompt,
        "completion_tokens": n_completion,
        "reasoning_tokens": n_reasoning,
    }


def fmt_rate(tokens, seconds):
    """Format a throughput as '<rate> t/s', or 'n/a' when it is undefined."""
    well_defined = seconds is not None and seconds > 0 and tokens > 0
    return f"{tokens / seconds:.1f} t/s" if well_defined else "n/a"


def print_run(label, r):
    """Render one run's token counts and timing breakdown to stdout.

    `r` is the dict returned by stream_request; timing fields may be None
    when no streamed token was observed.
    """
    # Optional reasoning-token annotation appended to the completion line.
    reasoning_suffix = (f" ({r['reasoning_tokens']} reasoning)"
                        if r['reasoning_tokens'] else "")
    print(f"[{label}]")
    print(f" prompt : {r['prompt_tokens']} tokens")
    print(f" completion : {r['completion_tokens']} tokens" + reasoning_suffix)

    prefill = r["t_prefill"]
    if prefill is None:
        print(" prefill : n/a (no streamed token observed)")
    else:
        print(f" prefill : {prefill:6.2f} s "
              f"({fmt_rate(r['prompt_tokens'], prefill)})")

    decode = r["t_decode"]
    if decode is None or r["completion_tokens"] <= 1:
        print(" decode : n/a")
    else:
        # Decode rate excludes the first token (which is part of prefill latency).
        print(f" decode : {decode:6.2f} s "
              f"({fmt_rate(r['completion_tokens'] - 1, decode)})")

    print(f" wall total : {r['t_total']:6.2f} s")
    print()


def main():
    """CLI entry point: benchmark each prompt case and summarize the runs.

    Builds the case list (short prompt, plus an optional long-context file),
    runs each case --runs times against the server, prints per-run stats,
    and for multi-run mode reports a warm average that excludes the first
    (cold) run.
    """
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--url", default="http://127.0.0.1:8000/v1/chat/completions")
    parser.add_argument("--short", default=(
        "Scrivi una breve storia su un gatto che impara a programmare in C. "
        "Mantieni la storia entro tre paragrafi."
    ), help="Short-prompt text (default: small Italian story prompt).")
    parser.add_argument("--long", help="Path to a long-context prompt file.")
    parser.add_argument("--max-tokens", type=int, default=256)
    parser.add_argument("--thinking", action="store_true",
                        help="Leave thinking mode enabled (default: disabled for "
                             "predictable decode rate).")
    parser.add_argument("--runs", type=int, default=1,
                        help="Repeat each prompt N times; first is reported cold, "
                             "remaining are averaged warm.")
    args = parser.parse_args()

    # Read the long-context prompt up front so a bad path fails immediately.
    long_prompt = None
    if args.long:
        with open(args.long, "r", encoding="utf-8") as f:
            long_prompt = f.read()

    cases = [("short", args.short)]
    if long_prompt is not None:
        cases.append((f"long ({len(long_prompt)} chars)", long_prompt))

    for case_label, prompt in cases:
        results = []
        for run_idx in range(args.runs):
            try:
                result = stream_request(args.url, prompt, args.max_tokens, args.thinking)
            except Exception as exc:
                print(f"[{case_label} run {run_idx+1}] failed: {exc}")
                sys.exit(1)
            results.append(result)
            # The first run is labelled "cold" only when later runs exist to compare.
            tag = "cold" if run_idx == 0 and args.runs > 1 else f"run {run_idx+1}"
            print_run(f"{case_label} {tag}", result)

        if args.runs > 1:
            warm = results[1:]
            warm_prefill = [r["t_prefill"] for r in warm if r["t_prefill"]]
            # Pairs of (decode tokens, decode seconds); first token belongs to prefill.
            warm_decode = [(r["completion_tokens"] - 1, r["t_decode"]) for r in warm
                           if r["t_decode"] and r["completion_tokens"] > 1]
            if warm_prefill and results[0]["prompt_tokens"]:
                avg_prefill = sum(warm_prefill) / len(warm_prefill)
                print(f"[{case_label} warm avg over {len(warm)} run(s)]")
                print(f" prefill : {avg_prefill:6.2f} s "
                      f"({fmt_rate(results[0]['prompt_tokens'], avg_prefill)})")
                if warm_decode:
                    tot_tokens = sum(t for t, _ in warm_decode)
                    tot_seconds = sum(s for _, s in warm_decode)
                    print(f" decode : {tot_seconds:6.2f} s "
                          f"({fmt_rate(tot_tokens, tot_seconds)})")
                print()


if __name__ == "__main__":
    main()
Binary file added ds4-rpc-worker
Binary file not shown.
Loading