Skip to content
Closed
421 changes: 213 additions & 208 deletions ds4.c

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions ds4_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ void ds4_gpu_print_memory_report(const char *label);
* compressed-attention indexer that chooses visible compressed rows.
*/

/* `weight_type` is the GGUF tensor type code (1 for F16, 8 for Q8_0). The
 * same parameter also appears in ds4_gpu_embed_tokens_hc_tensor() below. */
int ds4_gpu_embed_token_hc_tensor(
ds4_gpu_tensor *out_hc,
const void *model_map,
Expand All @@ -58,7 +59,8 @@ int ds4_gpu_embed_token_hc_tensor(
uint32_t n_vocab,
uint32_t token,
uint32_t n_embd,
uint32_t n_hc);
uint32_t n_hc,
uint32_t weight_type);

int ds4_gpu_embed_tokens_hc_tensor(
ds4_gpu_tensor *out_hc,
Expand All @@ -69,7 +71,8 @@ int ds4_gpu_embed_tokens_hc_tensor(
uint32_t n_vocab,
uint32_t n_tokens,
uint32_t n_embd,
uint32_t n_hc);
uint32_t n_hc,
uint32_t weight_type);

int ds4_gpu_indexer_score_one_tensor(
ds4_gpu_tensor *scores,
Expand Down
Loading