Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
227 changes: 227 additions & 0 deletions README.md

Large diffs are not rendered by default.

713 changes: 425 additions & 288 deletions ds4.c

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions ds4.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ typedef enum {
DS4_BACKEND_CPU,
} ds4_backend;

/*
 * Metal Tensor ("mpp") routing policy, selected on the command line with
 * -mt/--mt (legacy alias --mpp) and carried in ds4_engine_options.mpp_mode.
 */
typedef enum {
    DS4_MPP_AUTO = 0, /* "auto": zero value, so a zero-initialised options struct selects it */
    DS4_MPP_ON   = 1, /* "on":   described by the CLI help as a route diagnostic that may change output */
    DS4_MPP_OFF  = 2, /* "off" */
} ds4_mpp_mode;

typedef enum {
DS4_THINK_NONE,
DS4_THINK_HIGH,
Expand Down Expand Up @@ -67,6 +73,7 @@ typedef struct {
float directional_steering_ffn;
bool warm_weights;
bool quality;
ds4_mpp_mode mpp_mode;
} ds4_engine_options;

typedef void (*ds4_token_emit_fn)(void *ud, int token);
Expand All @@ -91,7 +98,9 @@ typedef struct {
int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt);
void ds4_engine_close(ds4_engine *e);
void ds4_engine_summary(ds4_engine *e);
int ds4_engine_vocab_size(ds4_engine *e);
const char *ds4_backend_name(ds4_backend backend);
const char *ds4_mpp_mode_name(ds4_mpp_mode mode);
bool ds4_think_mode_enabled(ds4_think_mode mode);
const char *ds4_think_mode_name(ds4_think_mode mode);
const char *ds4_think_max_prefix(void);
Expand Down Expand Up @@ -168,6 +177,7 @@ int ds4_session_argmax_excluding(ds4_session *s, int excluded_id);
int ds4_session_sample(ds4_session *s, float temperature, int top_k, float top_p, float min_p, uint64_t *rng);
int ds4_session_top_logprobs(ds4_session *s, ds4_token_score *out, int k);
int ds4_session_token_logprob(ds4_session *s, int token, ds4_token_score *out);
int ds4_session_copy_logits(ds4_session *s, float *out, int cap);
int ds4_session_eval(ds4_session *s, int token, char *err, size_t errlen);
int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token,
int max_tokens, int eos_token,
Expand Down
106 changes: 105 additions & 1 deletion ds4_cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ typedef struct {
float top_p;
uint64_t seed;
bool dump_tokens;
const char *dump_logits_path;
const char *dump_logprobs_path;
int dump_logprobs_top_k;
const char *imatrix_dataset_path;
Expand Down Expand Up @@ -102,7 +103,10 @@ static void usage(FILE *fp) {
" -t, --threads N\n"
" CPU helper threads for host-side or reference work.\n"
" --quality\n"
" Prefer exact kernels where faster approximate paths exist; MTP uses strict verification.\n"
" Prefer exact kernels where faster approximate paths exist; disables Metal Tensor routes; MTP uses strict verification.\n"
" -mt MODE, --mt MODE\n"
" Metal Tensor policy: auto, on, or off. Default: auto. Auto enables validated safe routes; 'on' is a route diagnostic and may change output.\n"
" Legacy alias: --mpp MODE.\n"
" --dir-steering-file FILE\n"
" Load one f32 direction vector per layer for directional steering.\n"
" --dir-steering-ffn F\n"
Expand Down Expand Up @@ -153,6 +157,8 @@ static void usage(FILE *fp) {
" Load the model and print a summary only.\n"
" --dump-tokens\n"
" Tokenize -p/--prompt-file exactly as written, then exit without inference.\n"
" --dump-logits FILE\n"
" Write full next-token logits as JSON after prompt prefill, then exit.\n"
" --dump-logprobs FILE\n"
" Write greedy continuation top-logprobs as JSON without printing text.\n"
" --logprobs-top-k N\n"
Expand Down Expand Up @@ -240,6 +246,15 @@ static ds4_backend default_backend(void) {
#endif
}

/*
 * Map a command-line Metal Tensor mode string ("auto", "on", "off") to its
 * ds4_mpp_mode value. Matching is exact and case-sensitive.
 *
 * Does not return on unrecognised input: prints the offending value and the
 * accepted set to stderr, then terminates the process with exit status 2.
 */
static ds4_mpp_mode parse_mpp_mode(const char *s) {
    static const struct {
        const char *text;
        ds4_mpp_mode mode;
    } known_modes[] = {
        { "auto", DS4_MPP_AUTO },
        { "on",   DS4_MPP_ON   },
        { "off",  DS4_MPP_OFF  },
    };
    const size_t n_known = sizeof(known_modes) / sizeof(known_modes[0]);

    for (size_t k = 0; k < n_known; k++) {
        if (strcmp(s, known_modes[k].text) == 0) {
            return known_modes[k].mode;
        }
    }

    fprintf(stderr, "ds4: invalid Metal Tensor mode: %s\n", s);
    fprintf(stderr, "ds4: valid Metal Tensor modes are: auto, on, off\n");
    exit(2);
}

static void log_context_memory(ds4_backend backend, int ctx_size) {
ds4_context_memory m = ds4_context_memory_estimate(backend, ctx_size);
fprintf(stderr,
Expand Down Expand Up @@ -629,6 +644,86 @@ static void json_write_token(FILE *fp, ds4_engine *engine, int token) {
free(text);
}

/*
 * Implements --dump-logits: prefill `prompt` in a fresh session, copy the
 * session's logits, and write them to cfg->gen.dump_logits_path as a JSON
 * object holding run metadata, the argmax token, and the full logits array.
 *
 * Non-finite values are written as JSON null, because JSON has no encoding
 * for inf/nan. This applies to "argmax_logit" as well as to every entry of
 * "logits".
 *
 * Returns 0 on success and 1 on any failure; every failure path prints a
 * diagnostic to stderr. The session, the logits buffer and the output stream
 * are released on all paths.
 */
static int run_logits_dump(ds4_engine *engine, const cli_config *cfg, const ds4_tokens *prompt) {
    ds4_session *session = NULL;
    float *logits = NULL;
    FILE *fp = NULL;
    int vocab = 0;
    int argmax = -1;
    int rc = 1;

    if (ds4_session_create(&session, engine, cfg->gen.ctx_size) != 0) {
        fprintf(stderr, "ds4: --dump-logits requires a graph session backend\n");
        return 1;
    }

    char err[160];
    cli_prefill_progress progress = {
        .base_tokens = 0,
        .input_tokens = prompt->len,
        .use_color = ds4_log_is_tty(stderr),
    };
    ds4_session_set_progress(session, cli_prefill_progress_cb, &progress);
    const int sync_rc = ds4_session_sync(session, prompt, err, sizeof(err));
    /* Detach the callback before `progress` can go out of scope. */
    ds4_session_set_progress(session, NULL, NULL);
    if (sync_rc != 0) {
        fprintf(stderr, "ds4: prompt processing failed: %s\n", err);
        goto done;
    }

    vocab = ds4_engine_vocab_size(engine);
    if (vocab <= 0) {
        fprintf(stderr, "ds4: engine reported an invalid vocabulary size: %d\n", vocab);
        goto done;
    }

    logits = malloc((size_t)vocab * sizeof(logits[0]));
    if (!logits) {
        fprintf(stderr, "ds4: out of memory allocating %d logits\n", vocab);
        goto done;
    }
    if (ds4_session_copy_logits(session, logits, vocab) != vocab) {
        fprintf(stderr, "ds4: failed to copy session logits\n");
        goto done;
    }

    /* Validate the index before it is used to subscript `logits`, and before
     * the output file is created so a bad value leaves no partial file. */
    argmax = ds4_session_argmax(session);
    if (argmax < 0 || argmax >= vocab) {
        fprintf(stderr, "ds4: session argmax %d is outside the vocabulary (size %d)\n", argmax, vocab);
        goto done;
    }

    fp = fopen(cfg->gen.dump_logits_path, "wb");
    if (!fp) {
        fprintf(stderr, "ds4: failed to open --dump-logits file: %s\n", cfg->gen.dump_logits_path);
        goto done;
    }

    fprintf(fp, "{\n \"source\":\"ds4\",\n \"model\":");
    json_write_string(fp, cfg->engine.model_path, strlen(cfg->engine.model_path));
    fprintf(fp,
            ",\n \"backend\":\"%s\",\n \"mt\":\"%s\",\n \"quant_bits\":%d,\n"
            " \"prompt_tokens\":%d,\n \"ctx\":%d,\n \"vocab\":%d,\n",
            ds4_backend_name(cfg->engine.backend),
            ds4_mpp_mode_name(cfg->engine.mpp_mode),
            ds4_engine_routed_quant_bits(engine),
            prompt->len,
            cfg->gen.ctx_size,
            vocab);
    fputs(" \"argmax_token\":", fp);
    json_write_token(fp, engine, argmax);
    fputs(",\n \"argmax_logit\":", fp);
    if (isfinite(logits[argmax])) {
        fprintf(fp, "%.9g", logits[argmax]);
    } else {
        fputs("null", fp);
    }
    fputs(",\n \"logits\":[", fp);
    for (int i = 0; i < vocab; i++) {
        if (i) fputc(',', fp);
        if ((i % 8) == 0) fputs("\n ", fp);
        if (isfinite(logits[i])) {
            fprintf(fp, "%.9g", logits[i]);
        } else {
            fputs("null", fp);
        }
    }
    fputs("\n ]\n}\n", fp);

    /* A failed write sets the stream error indicator but does not necessarily
     * make fclose() fail, so check both. */
    const int write_failed = ferror(fp) != 0;
    const int close_failed = fclose(fp) != 0;
    fp = NULL;
    if (write_failed) {
        fprintf(stderr, "ds4: failed to write --dump-logits file: %s\n", cfg->gen.dump_logits_path);
        goto done;
    }
    if (close_failed) {
        fprintf(stderr, "ds4: failed to close --dump-logits file: %s\n", cfg->gen.dump_logits_path);
        goto done;
    }

    rc = 0;

done:
    if (fp) fclose(fp);
    free(logits);
    ds4_session_free(session);
    return rc;
}

static int run_logprob_dump(ds4_engine *engine, const cli_config *cfg, const ds4_tokens *prompt) {
ds4_session *session = NULL;
if (ds4_session_create(&session, engine, cfg->gen.ctx_size) != 0) {
Expand Down Expand Up @@ -730,6 +825,11 @@ static int run_generation(ds4_engine *engine, const cli_config *cfg) {
ds4_tokens_free(&prompt);
return rc;
}
if (cfg->gen.dump_logits_path) {
rc = run_logits_dump(engine, cfg, &prompt);
ds4_tokens_free(&prompt);
return rc;
}
if (cfg->gen.dump_logprobs_path) {
rc = run_logprob_dump(engine, cfg, &prompt);
ds4_tokens_free(&prompt);
Expand Down Expand Up @@ -1244,6 +1344,8 @@ static cli_config parse_options(int argc, char **argv) {
c.gen.seed = parse_u64(need_arg(&i, argc, argv, arg), arg);
} else if (!strcmp(arg, "--quality")) {
c.engine.quality = true;
} else if (!strcmp(arg, "-mt") || !strcmp(arg, "--mt") || !strcmp(arg, "--mpp")) {
c.engine.mpp_mode = parse_mpp_mode(need_arg(&i, argc, argv, arg));
} else if (!strcmp(arg, "--dir-steering-file")) {
c.engine.directional_steering_file = need_arg(&i, argc, argv, arg);
} else if (!strcmp(arg, "--dir-steering-ffn")) {
Expand All @@ -1264,6 +1366,8 @@ static cli_config parse_options(int argc, char **argv) {
c.engine.backend = DS4_BACKEND_CUDA;
} else if (!strcmp(arg, "--dump-tokens")) {
c.gen.dump_tokens = true;
} else if (!strcmp(arg, "--dump-logits")) {
c.gen.dump_logits_path = need_arg(&i, argc, argv, arg);
} else if (!strcmp(arg, "--dump-logprobs")) {
c.gen.dump_logprobs_path = need_arg(&i, argc, argv, arg);
} else if (!strcmp(arg, "--logprobs-top-k")) {
Expand Down
16 changes: 16 additions & 0 deletions ds4_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <stdbool.h>
#include <stdint.h>

#include "ds4.h"

/* =========================================================================
* GPU Tensor and Command Lifetime.
* =========================================================================
Expand Down Expand Up @@ -41,6 +43,9 @@ int ds4_gpu_set_model_map_range(const void *model_map, uint64_t model_size, uint
int ds4_gpu_cache_model_range(const void *model_map, uint64_t model_size, uint64_t offset, uint64_t bytes, const char *label);
int ds4_gpu_cache_q8_f16_range(const void *model_map, uint64_t model_size, uint64_t offset, uint64_t bytes, uint64_t in_dim, uint64_t out_dim, const char *label);
void ds4_gpu_set_quality(bool quality);
void ds4_gpu_set_mpp_mode(ds4_mpp_mode mode);
void ds4_gpu_set_mpp_compare_context(const char *module, uint32_t layer_index, uint32_t pos0);
void ds4_gpu_clear_mpp_compare_context(void);
void ds4_gpu_print_memory_report(const char *label);

/* =========================================================================
Expand Down Expand Up @@ -139,6 +144,16 @@ int ds4_gpu_matmul_q8_0_tensor(
const ds4_gpu_tensor *x,
uint64_t n_tok);

/*
 * "mpp" variant of the Q8_0 matmul entry point declared just above.
 *
 * NOTE(review): only the declaration is visible here. The name and the
 * trailing parameters (x, n_tok) match ds4_gpu_matmul_q8_0_tensor, so this is
 * presumably the same operation dispatched through the Metal Tensor route --
 * confirm against the implementation, including the meaning of the int
 * return value and whether it honours ds4_gpu_set_mpp_mode().
 */
int ds4_gpu_matmul_q8_0_mpp_tensor(
ds4_gpu_tensor *out,
const void *model_map,
uint64_t model_size,
uint64_t weight_offset,
uint64_t in_dim,
uint64_t out_dim,
const ds4_gpu_tensor *x,
uint64_t n_tok);

int ds4_gpu_shared_gate_up_swiglu_q8_0_tensor(
ds4_gpu_tensor *gate,
ds4_gpu_tensor *up,
Expand Down Expand Up @@ -665,6 +680,7 @@ int ds4_gpu_routed_moe_batch_tensor(
uint32_t n_expert,
float clamp,
const ds4_gpu_tensor *x,
uint32_t layer_index,
uint32_t n_tokens,
bool *mid_is_f16);

Expand Down
Loading