Skip to content
92 changes: 91 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10618,6 +10618,13 @@ def set_gguf_parameters(self):
self.gguf_writer.add_logit_scale(logits_scale)
logger.info("gguf: (granite) logits_scale = %s", logits_scale)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Forward only text-model tensors to the base converter.

    Multimodal weights (audio/vision encoder and projector) are handled by
    the separate mmproj conversion, so they are skipped here; the
    "language_model." prefix is stripped so tensor names match the
    text-only checkpoint layout expected by the parent class.
    """
    # encoder/projector weights belong to the mmproj file, not the text model
    if name.startswith(("encoder.", "projector.")):
        return
    prefix = "language_model."
    if name.startswith(prefix):
        name = name[len(prefix):]
    yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
class GraniteMoeModel(GraniteModel):
Expand Down Expand Up @@ -12355,6 +12362,90 @@ def modify_tensors(self, data_torch, name, bid):
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("GraniteSpeechForConditionalGeneration", ModelType.MMPROJ)
class GraniteSpeechMmprojModel(MmprojModel):
    """Convert the audio encoder and q-former projector of Granite Speech
    checkpoints into a multimodal-projector (mmproj) GGUF.

    The text tower is converted separately; this class only maps the audio
    side's hyper-parameters and tensors.
    """

    # audio-only multimodal model: no vision tower
    has_vision_encoder = False
    has_audio_encoder = True

    # Lazily-allocated per-encoder-layer scratch dicts used by modify_tensors()
    # to collect a layer's four batch-norm tensors before folding them.
    _batch_norm_tensors: list[dict[str, Tensor]] | None = None

    def get_audio_config(self) -> dict[str, Any] | None:
        """Return the audio encoder sub-config ("encoder_config") from the HF config."""
        return self.global_config.get("encoder_config")

    def set_gguf_parameters(self):
        """Alias Granite-specific hyper-parameter names onto the generic keys
        read by the base class, then write the model-specific audio metadata.
        """
        assert self.hparams_audio is not None
        a = self.hparams_audio
        # the base class expects the generic HF names; map Granite's onto them
        a["hidden_size"] = a["hidden_dim"]
        a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"]
        a["num_attention_heads"] = a["num_heads"]
        a["num_hidden_layers"] = a["num_layers"]

        super().set_gguf_parameters()

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
        self.gguf_writer.add_audio_num_mel_bins(a["input_dim"])
        # NOTE(review): eps hard-coded to 1e-5 — confirm it matches the HF config
        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
        self.gguf_writer.add_audio_chunk_size(a["context_size"])
        self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"])
        self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"])

        # projector (q-former) settings live in the top-level config
        p = self.global_config
        self.gguf_writer.add_audio_projector_window_size(p["window_size"])
        self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"])
        self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"])
        self.gguf_writer.add_audio_projector_layernorm_eps(p["projector_config"]["layer_norm_eps"])

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Keep encoder/projector convolution weights in F32; defer everything
        else to the base-class quantization policy.
        """
        if "encoder" in name or "projector" in name:
            if ".conv" in name and ".weight" in name:
                return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Filter, rename and reshape checkpoint tensors for the mmproj GGUF.

        Drops text-model weights and train-time-only buffers, folds each
        encoder layer's batch norm into an affine weight/bias pair, splits the
        fused key/value projection, and squeezes singleton dims from conv
        weights before delegating to the base class for final naming.
        """
        # text-model weights are converted by the text-model converter
        if name.startswith("language_model."):
            return
        # buffers not needed at inference time
        if "attention_dists" in name:
            return
        if "num_batches_tracked" in name:
            return

        # fold running_mean, running_var and eps into weight and bias for batch_norm
        if "batch_norm" in name and "encoder.layers." in name:
            if self._batch_norm_tensors is None:
                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
            assert bid is not None
            self._batch_norm_tensors[bid][name] = data_torch
            # wait until all four tensors of this layer have arrived
            if len(self._batch_norm_tensors[bid]) < 4:
                return
            prefix = f"encoder.layers.{bid}.conv.batch_norm"
            weight = self._batch_norm_tensors[bid][f"{prefix}.weight"]
            bias = self._batch_norm_tensors[bid][f"{prefix}.bias"]
            running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"]
            running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"]
            # NOTE(review): eps hard-coded — confirm it matches the HF BatchNorm eps
            eps = 1e-5
            # inference-mode batch norm is y = a*x + b with
            # a = gamma / sqrt(var + eps), b = beta - mean * a
            a = weight / torch.sqrt(running_var + eps)
            b = bias - running_mean * a
            yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid)
            yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid)
            return

        # fused key/value projection: split into separate K and V tensors
        if ".attn.to_kv.weight" in name:
            k_weight, v_weight = data_torch.chunk(2, dim=0)
            yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid)
            yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid)
            return

        # pointwise convs: drop the trailing singleton kernel dim
        if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"):
            if data_torch.ndim == 3 and data_torch.shape[2] == 1:
                data_torch = data_torch.squeeze(2)

        # depthwise conv: drop the singleton in-channel dim
        if "depth_conv" in name and name.endswith(".weight"):
            if data_torch.ndim == 3 and data_torch.shape[1] == 1:
                data_torch = data_torch.squeeze(1)

        yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Lfm25AudioTokenizer")
class LFM25AudioTokenizer(LFM2Model):
model_arch = gguf.MODEL_ARCH.LFM2
Expand Down Expand Up @@ -13364,7 +13455,6 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
# TODO: refactor this later to avoid adding exception here
if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
return arch

# if "architectures" is found in the sub-config, use that instead
if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
arch = text_config["architectures"][0]
Expand Down
68 changes: 68 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,13 +338,20 @@ class ClipAudio:
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
PROJECTION_DIM = "clip.audio.projection_dim"
BLOCK_COUNT = "clip.audio.block_count"
CHUNK_SIZE = "clip.audio.chunk_size"
CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
MAX_POS_EMB = "clip.audio.max_pos_emb"

class Attention:
    """GGUF metadata keys for the audio encoder's self-attention settings."""
    HEAD_COUNT = "clip.audio.attention.head_count"
    LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon"

class Projector:
    """GGUF metadata keys for the audio-to-language projector."""
    STACK_FACTOR = "clip.audio.projector.stack_factor"
    WINDOW_SIZE = "clip.audio.projector.window_size"
    DOWNSAMPLE_RATE = "clip.audio.projector.downsample_rate"
    HEAD_COUNT = "clip.audio.projector.head_count"
    LAYERNORM_EPS = "clip.audio.projector.layer_norm_epsilon"

class Diffusion:
SHIFT_LOGITS = "diffusion.shift_logits"
Expand Down Expand Up @@ -852,6 +859,26 @@ class MODEL_TENSOR(IntEnum):
A_ENC_CONV_NORM = auto() # SSM conv
A_ENC_CONV_PW1 = auto()
A_ENC_CONV_PW2 = auto()
A_CTC_OUT = auto()
A_CTC_OUT_MID = auto()
A_ENC_ATTN_REL_POS_EMB = auto()
# qformer projector
A_QF_PROJ_QUERY = auto()
A_QF_PROJ_NORM = auto()
A_QF_PROJ_LINEAR = auto()
A_QF_SELF_ATTN_Q = auto()
A_QF_SELF_ATTN_K = auto()
A_QF_SELF_ATTN_V = auto()
A_QF_SELF_ATTN_O = auto()
A_QF_SELF_ATTN_NORM = auto()
A_QF_CROSS_ATTN_Q = auto()
A_QF_CROSS_ATTN_K = auto()
A_QF_CROSS_ATTN_V = auto()
A_QF_CROSS_ATTN_O = auto()
A_QF_CROSS_ATTN_NORM = auto()
A_QF_FFN_UP = auto()
A_QF_FFN_DOWN = auto()
A_QF_FFN_NORM = auto()


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
Expand Down Expand Up @@ -1330,6 +1357,26 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
MODEL_TENSOR.A_CTC_OUT: "a.enc_ctc_out",
MODEL_TENSOR.A_CTC_OUT_MID: "a.enc_ctc_out_mid",
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: "a.blk.{bid}.attn_rel_pos_emb",
# qformer projector
MODEL_TENSOR.A_QF_PROJ_QUERY: "a.proj_query",
MODEL_TENSOR.A_QF_PROJ_NORM: "a.proj_norm",
MODEL_TENSOR.A_QF_PROJ_LINEAR: "a.proj_linear",
MODEL_TENSOR.A_QF_SELF_ATTN_Q: "a.proj_blk.{bid}.self_attn_q",
MODEL_TENSOR.A_QF_SELF_ATTN_K: "a.proj_blk.{bid}.self_attn_k",
MODEL_TENSOR.A_QF_SELF_ATTN_V: "a.proj_blk.{bid}.self_attn_v",
MODEL_TENSOR.A_QF_SELF_ATTN_O: "a.proj_blk.{bid}.self_attn_out",
MODEL_TENSOR.A_QF_SELF_ATTN_NORM: "a.proj_blk.{bid}.self_attn_norm",
MODEL_TENSOR.A_QF_CROSS_ATTN_Q: "a.proj_blk.{bid}.cross_attn_q",
MODEL_TENSOR.A_QF_CROSS_ATTN_K: "a.proj_blk.{bid}.cross_attn_k",
MODEL_TENSOR.A_QF_CROSS_ATTN_V: "a.proj_blk.{bid}.cross_attn_v",
MODEL_TENSOR.A_QF_CROSS_ATTN_O: "a.proj_blk.{bid}.cross_attn_out",
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: "a.proj_blk.{bid}.cross_attn_norm",
MODEL_TENSOR.A_QF_FFN_UP: "a.proj_blk.{bid}.ffn_up",
MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down",
MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm",
# NextN/MTP
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
Expand Down Expand Up @@ -1477,6 +1524,26 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.A_MM_HARD_EMB_NORM,
MODEL_TENSOR.A_PER_DIM_K_SCALE,
MODEL_TENSOR.A_PER_DIM_SCALE,
MODEL_TENSOR.A_CTC_OUT,
MODEL_TENSOR.A_CTC_OUT_MID,
MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB,
# qformer projector
MODEL_TENSOR.A_QF_PROJ_QUERY,
MODEL_TENSOR.A_QF_PROJ_NORM,
MODEL_TENSOR.A_QF_PROJ_LINEAR,
MODEL_TENSOR.A_QF_SELF_ATTN_Q,
MODEL_TENSOR.A_QF_SELF_ATTN_K,
MODEL_TENSOR.A_QF_SELF_ATTN_V,
MODEL_TENSOR.A_QF_SELF_ATTN_O,
MODEL_TENSOR.A_QF_SELF_ATTN_NORM,
MODEL_TENSOR.A_QF_CROSS_ATTN_Q,
MODEL_TENSOR.A_QF_CROSS_ATTN_K,
MODEL_TENSOR.A_QF_CROSS_ATTN_V,
MODEL_TENSOR.A_QF_CROSS_ATTN_O,
MODEL_TENSOR.A_QF_CROSS_ATTN_NORM,
MODEL_TENSOR.A_QF_FFN_UP,
MODEL_TENSOR.A_QF_FFN_DOWN,
MODEL_TENSOR.A_QF_FFN_NORM,
],
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
Expand Down Expand Up @@ -4138,6 +4205,7 @@ class VisionProjectorType:
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
HUNYUANOCR = "hunyuanocr"
GRANITE_SPEECH = "granite_speech" # audio


# Items here are (block size, type size)
Expand Down
21 changes: 21 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1257,6 +1257,27 @@ def add_audio_num_mel_bins(self, value: int) -> None:
def add_audio_stack_factor(self, value: int) -> None:
    """Write the audio projector stack factor metadata key (uint32)."""
    self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)

def add_audio_chunk_size(self, value: int) -> None:
    """Write the audio encoder chunk size metadata key (uint32)."""
    self.add_uint32(Keys.ClipAudio.CHUNK_SIZE, value)

def add_audio_conv_kernel_size(self, value: int) -> None:
    """Write the audio encoder convolution kernel size metadata key (uint32)."""
    self.add_uint32(Keys.ClipAudio.CONV_KERNEL_SIZE, value)

def add_audio_max_pos_emb(self, value: int) -> None:
    """Write the audio encoder maximum position-embedding count metadata key (uint32)."""
    self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)

def add_audio_projector_window_size(self, value: int) -> None:
    """Write the audio projector window size metadata key (uint32)."""
    self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)

def add_audio_projector_downsample_rate(self, value: int) -> None:
    """Write the audio projector downsample rate metadata key (uint32)."""
    self.add_uint32(Keys.ClipAudio.Projector.DOWNSAMPLE_RATE, value)

def add_audio_projector_head_count(self, value: int) -> None:
    """Write the audio projector attention head count metadata key (uint32)."""
    self.add_uint32(Keys.ClipAudio.Projector.HEAD_COUNT, value)

def add_audio_projector_layernorm_eps(self, value: float) -> None:
    """Write the audio projector layer-norm epsilon metadata key (float32)."""
    self.add_float32(Keys.ClipAudio.Projector.LAYERNORM_EPS, value)

def add_xielu_alpha_p(self, values: Sequence[float]) -> None:
    """Write the xIELU alpha_p parameter array metadata key."""
    self.add_array(Keys.xIELU.ALPHA_P, values)

Expand Down
Loading