diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5b4fb79fc1b..8b5eef6595b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -10618,6 +10618,13 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_logit_scale(logits_scale)
         logger.info("gguf: (granite) logits_scale = %s", logits_scale)
 
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith(("encoder.", "projector.")):
+            return
+        if name.startswith("language_model."):
+            name = name[len("language_model."):]
+        yield from super().modify_tensors(data_torch, name, bid)
+
 
 @ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 class GraniteMoeModel(GraniteModel):
@@ -12355,6 +12362,90 @@ def modify_tensors(self, data_torch, name, bid):
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("GraniteSpeechForConditionalGeneration", ModelType.MMPROJ)
+class GraniteSpeechMmprojModel(MmprojModel):
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
+    def get_audio_config(self) -> dict[str, Any] | None:
+        return self.global_config.get("encoder_config")
+
+    def set_gguf_parameters(self):
+        assert self.hparams_audio is not None
+        a = self.hparams_audio
+        a["hidden_size"] = a["hidden_dim"]
+        a["intermediate_size"] = a["hidden_dim"] * a["feedforward_mult"]
+        a["num_attention_heads"] = a["num_heads"]
+        a["num_hidden_layers"] = a["num_layers"]
+
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GRANITE_SPEECH)
+        self.gguf_writer.add_audio_num_mel_bins(a["input_dim"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+        self.gguf_writer.add_audio_chunk_size(a["context_size"])
+        self.gguf_writer.add_audio_conv_kernel_size(a["conv_kernel_size"])
+        self.gguf_writer.add_audio_max_pos_emb(a["max_pos_emb"])
+
+        p = self.global_config
+        self.gguf_writer.add_audio_projector_window_size(p["window_size"])
+        self.gguf_writer.add_audio_projector_downsample_rate(p["downsample_rate"])
+        self.gguf_writer.add_audio_projector_head_count(p["projector_config"]["num_attention_heads"])
+        self.gguf_writer.add_audio_projector_layernorm_eps(p["projector_config"]["layer_norm_eps"])
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if "encoder" in name or "projector" in name:
+            if ".conv" in name and ".weight" in name:
+                return gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("language_model."):
+            return
+        if "attention_dists" in name:
+            return
+        if "num_batches_tracked" in name:
+            return
+
+        # fold running_mean, running_var and eps into weight and bias for batch_norm
+        if "batch_norm" in name and "encoder.layers." in name:
+            if self._batch_norm_tensors is None:
+                self._batch_norm_tensors = [{} for _ in range(self.block_count)]
+            assert bid is not None
+            self._batch_norm_tensors[bid][name] = data_torch
+            if len(self._batch_norm_tensors[bid]) < 4:
+                return
+            prefix = f"encoder.layers.{bid}.conv.batch_norm"
+            weight = self._batch_norm_tensors[bid][f"{prefix}.weight"]
+            bias = self._batch_norm_tensors[bid][f"{prefix}.bias"]
+            running_mean = self._batch_norm_tensors[bid][f"{prefix}.running_mean"]
+            running_var = self._batch_norm_tensors[bid][f"{prefix}.running_var"]
+            eps = 1e-5
+            a = weight / torch.sqrt(running_var + eps)
+            b = bias - running_mean * a
+            yield from super().modify_tensors(a, f"encoder.layers.{bid}.conv.batch_norm.weight", bid)
+            yield from super().modify_tensors(b, f"encoder.layers.{bid}.conv.batch_norm.bias", bid)
+            return
+
+        if ".attn.to_kv.weight" in name:
+            k_weight, v_weight = data_torch.chunk(2, dim=0)
+            yield from super().modify_tensors(k_weight, name.replace("to_kv", "to_k"), bid)
+            yield from super().modify_tensors(v_weight, name.replace("to_kv", "to_v"), bid)
+            return
+
+        if ("up_conv" in name or "down_conv" in name) and name.endswith(".weight"):
+            if data_torch.ndim == 3 and data_torch.shape[2] == 1:
+                data_torch = data_torch.squeeze(2)
+
+        if "depth_conv" in name and name.endswith(".weight"):
+            if data_torch.ndim == 3 and data_torch.shape[1] == 1:
+                data_torch = data_torch.squeeze(1)
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Lfm25AudioTokenizer")
 class LFM25AudioTokenizer(LFM2Model):
     model_arch = gguf.MODEL_ARCH.LFM2
@@ -13364,7 +13455,6 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
     # TODO: refactor this later to avoid adding exception here
     if model_type == ModelType.TEXT and arch == "StepVLForConditionalGeneration":
         return arch
-
     # if "architectures" is found in the sub-config, use that instead
     if model_type == ModelType.TEXT and text_config.get("architectures") is not None:
         arch = text_config["architectures"][0]
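The batch-norm fold in `GraniteSpeechMmprojModel.modify_tensors` relies on the identity that an eval-mode `BatchNorm1d` collapses to an affine transform: `a = weight / sqrt(running_var + eps)` and `b = bias - running_mean * a`. A minimal standalone check (all shapes and values here are made up, not taken from the checkpoint):

```python
# Sanity check: folded (a, b) reproduces BatchNorm1d inference exactly.
import torch

bn = torch.nn.BatchNorm1d(8, eps=1e-5).eval()
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 2.0)
bn.weight.data.uniform_(0.5, 1.5)
bn.bias.data.uniform_(-1.0, 1.0)

# same fold as in modify_tensors above
a = bn.weight / torch.sqrt(bn.running_var + bn.eps)
b = bn.bias - bn.running_mean * a

x = torch.randn(2, 8, 16)  # (batch, channels, time)
with torch.no_grad():
    assert torch.allclose(bn(x), a.view(1, -1, 1) * x + b.view(1, -1, 1), atol=1e-6)
```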
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c5297a2f440..280515ae04d 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -338,6 +338,9 @@ class ClipAudio:
         FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
         PROJECTION_DIM = "clip.audio.projection_dim"
         BLOCK_COUNT = "clip.audio.block_count"
+        CHUNK_SIZE = "clip.audio.chunk_size"
+        CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size"
+        MAX_POS_EMB = "clip.audio.max_pos_emb"
 
         class Attention:
             HEAD_COUNT = "clip.audio.attention.head_count"
@@ -345,6 +348,10 @@ class Attention:
 
         class Projector:
             STACK_FACTOR = "clip.audio.projector.stack_factor"
+            WINDOW_SIZE = "clip.audio.projector.window_size"
+            DOWNSAMPLE_RATE = "clip.audio.projector.downsample_rate"
+            HEAD_COUNT = "clip.audio.projector.head_count"
+            LAYERNORM_EPS = "clip.audio.projector.layer_norm_epsilon"
 
     class Diffusion:
         SHIFT_LOGITS = "diffusion.shift_logits"
@@ -852,6 +859,26 @@ class MODEL_TENSOR(IntEnum):
     A_ENC_CONV_NORM = auto()  # SSM conv
     A_ENC_CONV_PW1 = auto()
     A_ENC_CONV_PW2 = auto()
+    A_CTC_OUT = auto()
+    A_CTC_OUT_MID = auto()
+    A_ENC_ATTN_REL_POS_EMB = auto()
+    # qformer projector
+    A_QF_PROJ_QUERY = auto()
+    A_QF_PROJ_NORM = auto()
+    A_QF_PROJ_LINEAR = auto()
+    A_QF_SELF_ATTN_Q = auto()
+    A_QF_SELF_ATTN_K = auto()
+    A_QF_SELF_ATTN_V = auto()
+    A_QF_SELF_ATTN_O = auto()
+    A_QF_SELF_ATTN_NORM = auto()
+    A_QF_CROSS_ATTN_Q = auto()
+    A_QF_CROSS_ATTN_K = auto()
+    A_QF_CROSS_ATTN_V = auto()
+    A_QF_CROSS_ATTN_O = auto()
+    A_QF_CROSS_ATTN_NORM = auto()
+    A_QF_FFN_UP = auto()
+    A_QF_FFN_DOWN = auto()
+    A_QF_FFN_NORM = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -1330,6 +1357,26 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.A_ENC_CONV_NORM: "a.blk.{bid}.conv_norm",
     MODEL_TENSOR.A_ENC_CONV_PW1: "a.blk.{bid}.conv_pw1",
     MODEL_TENSOR.A_ENC_CONV_PW2: "a.blk.{bid}.conv_pw2",
+    MODEL_TENSOR.A_CTC_OUT: "a.enc_ctc_out",
+    MODEL_TENSOR.A_CTC_OUT_MID: "a.enc_ctc_out_mid",
+    MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: "a.blk.{bid}.attn_rel_pos_emb",
+    # qformer projector
+    MODEL_TENSOR.A_QF_PROJ_QUERY: "a.proj_query",
+    MODEL_TENSOR.A_QF_PROJ_NORM: "a.proj_norm",
+    MODEL_TENSOR.A_QF_PROJ_LINEAR: "a.proj_linear",
+    MODEL_TENSOR.A_QF_SELF_ATTN_Q: "a.proj_blk.{bid}.self_attn_q",
+    MODEL_TENSOR.A_QF_SELF_ATTN_K: "a.proj_blk.{bid}.self_attn_k",
+    MODEL_TENSOR.A_QF_SELF_ATTN_V: "a.proj_blk.{bid}.self_attn_v",
+    MODEL_TENSOR.A_QF_SELF_ATTN_O: "a.proj_blk.{bid}.self_attn_out",
+    MODEL_TENSOR.A_QF_SELF_ATTN_NORM: "a.proj_blk.{bid}.self_attn_norm",
+    MODEL_TENSOR.A_QF_CROSS_ATTN_Q: "a.proj_blk.{bid}.cross_attn_q",
+    MODEL_TENSOR.A_QF_CROSS_ATTN_K: "a.proj_blk.{bid}.cross_attn_k",
+    MODEL_TENSOR.A_QF_CROSS_ATTN_V: "a.proj_blk.{bid}.cross_attn_v",
+    MODEL_TENSOR.A_QF_CROSS_ATTN_O: "a.proj_blk.{bid}.cross_attn_out",
+    MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: "a.proj_blk.{bid}.cross_attn_norm",
+    MODEL_TENSOR.A_QF_FFN_UP: "a.proj_blk.{bid}.ffn_up",
+    MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down",
+    MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm",
     # NextN/MTP
     MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
     MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
@@ -1477,6 +1524,26 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.A_MM_HARD_EMB_NORM,
         MODEL_TENSOR.A_PER_DIM_K_SCALE,
         MODEL_TENSOR.A_PER_DIM_SCALE,
+        MODEL_TENSOR.A_CTC_OUT,
+        MODEL_TENSOR.A_CTC_OUT_MID,
+        MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB,
+        # qformer projector
+        MODEL_TENSOR.A_QF_PROJ_QUERY,
+        MODEL_TENSOR.A_QF_PROJ_NORM,
+        MODEL_TENSOR.A_QF_PROJ_LINEAR,
+        MODEL_TENSOR.A_QF_SELF_ATTN_Q,
+        MODEL_TENSOR.A_QF_SELF_ATTN_K,
+        MODEL_TENSOR.A_QF_SELF_ATTN_V,
+        MODEL_TENSOR.A_QF_SELF_ATTN_O,
+        MODEL_TENSOR.A_QF_SELF_ATTN_NORM,
+        MODEL_TENSOR.A_QF_CROSS_ATTN_Q,
+        MODEL_TENSOR.A_QF_CROSS_ATTN_K,
+        MODEL_TENSOR.A_QF_CROSS_ATTN_V,
+        MODEL_TENSOR.A_QF_CROSS_ATTN_O,
+        MODEL_TENSOR.A_QF_CROSS_ATTN_NORM,
+        MODEL_TENSOR.A_QF_FFN_UP,
+        MODEL_TENSOR.A_QF_FFN_DOWN,
+        MODEL_TENSOR.A_QF_FFN_NORM,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -4138,6 +4205,7 @@ class VisionProjectorType:
     YOUTUVL = "youtuvl"
    NEMOTRON_V2_VL = "nemotron_v2_vl"
     HUNYUANOCR = "hunyuanocr"
+    GRANITE_SPEECH = "granite_speech" # audio
 
 
 # Items here are (block size, type size)
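The new keys can be exercised end-to-end with gguf-py. A minimal round-trip sketch, assuming the stock `GGUFWriter`/`GGUFReader` API and using placeholder values (the writer helpers for these keys are added in the `gguf_writer.py` hunk below):

```python
# Write two of the new audio keys into a scratch GGUF and read one back.
# "scratch.gguf" and all values are made-up placeholders; "clip" is the
# arch string used by mmproj files.
import gguf

w = gguf.GGUFWriter("scratch.gguf", "clip")
w.add_uint32(gguf.Keys.ClipAudio.CHUNK_SIZE, 160)
w.add_uint32(gguf.Keys.ClipAudio.Projector.WINDOW_SIZE, 15)
w.add_float32(gguf.Keys.ClipAudio.Projector.LAYERNORM_EPS, 1e-12)
w.write_header_to_file()
w.write_kv_data_to_file()
w.write_tensors_to_file()
w.close()

r = gguf.GGUFReader("scratch.gguf")
field = r.get_field("clip.audio.chunk_size")
print(field.parts[field.data[0]])  # expected: [160]
```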
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 90d500dc771..e92b0066330 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1257,6 +1257,27 @@ def add_audio_num_mel_bins(self, value: int) -> None:
     def add_audio_stack_factor(self, value: int) -> None:
         self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
 
+    def add_audio_chunk_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.CHUNK_SIZE, value)
+
+    def add_audio_conv_kernel_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.CONV_KERNEL_SIZE, value)
+
+    def add_audio_max_pos_emb(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value)
+
+    def add_audio_projector_window_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value)
+
+    def add_audio_projector_downsample_rate(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.Projector.DOWNSAMPLE_RATE, value)
+
+    def add_audio_projector_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipAudio.Projector.HEAD_COUNT, value)
+
+    def add_audio_projector_layernorm_eps(self, value: float) -> None:
+        self.add_float32(Keys.ClipAudio.Projector.LAYERNORM_EPS, value)
+
     def add_xielu_alpha_p(self, values: Sequence[float]):
         self.add_array(Keys.xIELU.ALPHA_P, values)
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 01a9b236000..e6abddf8ad5 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -158,6 +158,21 @@ class TensorNameMap:
         MODEL_TENSOR.V_ENC_MSFA_NORM: (
             "model.vision_tower.timm_model.msfa.norm", # gemma3n
         ),
+        MODEL_TENSOR.A_CTC_OUT: (
+            "encoder.out",
+        ),
+        MODEL_TENSOR.A_CTC_OUT_MID: (
+            "encoder.out_mid",
+        ),
+        MODEL_TENSOR.A_QF_PROJ_QUERY: (
+            "projector.query",
+        ),
+        MODEL_TENSOR.A_QF_PROJ_NORM: (
+            "projector.qformer.layernorm",
+        ),
+        MODEL_TENSOR.A_QF_PROJ_LINEAR: (
+            "projector.linear",
+        ),
     }
 
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@@ -1890,6 +1905,7 @@ class TensorNameMap:
         MODEL_TENSOR.A_ENC_INP_PROJ: (
             "conformer.subsample_conv_projection.input_proj_linear", # gemma4
+            "encoder.input_linear", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_CONV2D: (
@@ -1912,6 +1928,7 @@ class TensorNameMap:
             "conformer.layers.{bid}.self_attn.linear_q", # lfm2
             "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
             "conformer.layers.{bid}.self_attn.q_proj", # gemma4
+            "encoder.layers.{bid}.attn.to_q", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_ATTN_K: (
@@ -1919,6 +1936,7 @@ class TensorNameMap:
             "conformer.layers.{bid}.self_attn.linear_k", # lfm2
             "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
             "conformer.layers.{bid}.self_attn.k_proj", # gemma4
+            "encoder.layers.{bid}.attn.to_k", # granite_speech (split from to_kv)
         ),
 
         MODEL_TENSOR.A_ENC_ATTN_V: (
@@ -1926,6 +1944,7 @@ class TensorNameMap:
             "conformer.layers.{bid}.self_attn.linear_v", # lfm2
             "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
             "conformer.layers.{bid}.self_attn.v_proj", # gemma4
+            "encoder.layers.{bid}.attn.to_v", # granite_speech (split from to_kv)
         ),
 
         MODEL_TENSOR.A_ENC_ATTN_K_REL: (
@@ -1953,6 +1972,7 @@ class TensorNameMap:
             "audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
             "conformer.layers.{bid}.norm_self_att", # lfm2
             "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
+            "encoder.layers.{bid}.attn.pre_norm", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_OUTPUT: (
@@ -1960,18 +1980,21 @@ class TensorNameMap:
             "conformer.layers.{bid}.self_attn.linear_out", # lfm2
             "conformer.layers.{bid}.attention.post", # gemma3n
             "conformer.layers.{bid}.self_attn.post", # gemma4
+            "encoder.layers.{bid}.attn.to_out", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
             "audio_tower.layers.{bid}.final_layer_norm", # ultravox
             "conformer.layers.{bid}.norm_out", # lfm2
             "conformer.layers.{bid}.attention.post_norm", # gemma3n
+            "encoder.layers.{bid}.post_norm", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_FFN_NORM: (
             "conformer.layers.{bid}.norm_feed_forward1", # lfm2
             "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
             "conformer.layers.{bid}.feed_forward1.pre_layer_norm", # gemma4
+            "encoder.layers.{bid}.ff1.pre_norm", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
@@ -1988,6 +2011,7 @@ class TensorNameMap:
             "conformer.layers.{bid}.feed_forward1.linear1", # lfm2
             "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
             "conformer.layers.{bid}.feed_forward1.ffw_layer_1", # gemma4
+            "encoder.layers.{bid}.ff1.up_proj", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_FFN_GATE: (),
@@ -1997,24 +2021,28 @@ class TensorNameMap:
             "conformer.layers.{bid}.feed_forward1.linear2", # lfm2
             "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
             "conformer.layers.{bid}.feed_forward1.ffw_layer_2", # gemma4
+            "encoder.layers.{bid}.ff1.down_proj", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_FFN_UP_1: (
             "conformer.layers.{bid}.feed_forward2.linear1", # lfm2
             "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
             "conformer.layers.{bid}.feed_forward2.ffw_layer_1", # gemma4
+            "encoder.layers.{bid}.ff2.up_proj", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
             "conformer.layers.{bid}.feed_forward2.linear2", # lfm2
             "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
             "conformer.layers.{bid}.feed_forward2.ffw_layer_2", # gemma4
+            "encoder.layers.{bid}.ff2.down_proj", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_FFN_NORM_1: (
             "conformer.layers.{bid}.norm_feed_forward2", # lfm2
             "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
             "conformer.layers.{bid}.feed_forward2.pre_layer_norm", # gemma4
+            "encoder.layers.{bid}.ff2.pre_norm", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
@@ -2071,26 +2099,31 @@ class TensorNameMap:
         MODEL_TENSOR.A_ENC_CONV_DW: (
             "conformer.layers.{bid}.conv.depthwise_conv", # lfm2
             "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
+            "encoder.layers.{bid}.conv.depth_conv.conv", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_CONV_NORM: (
             "conformer.layers.{bid}.conv.batch_norm", # lfm2
             "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
+            "encoder.layers.{bid}.conv.batch_norm", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_CONV_PW1: (
             "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
             "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
+            "encoder.layers.{bid}.conv.up_conv", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_CONV_PW2: (
             "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
             "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
+            "encoder.layers.{bid}.conv.down_conv", # granite_speech
         ),
 
         MODEL_TENSOR.A_ENC_NORM_CONV: (
             "conformer.layers.{bid}.norm_conv", # lfm2
             "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
+            "encoder.layers.{bid}.conv.norm", # granite_speech
         ),
 
         MODEL_TENSOR.A_PER_DIM_K_SCALE: (
@@ -2114,6 +2147,49 @@ class TensorNameMap:
             "model.embed_audio.soft_embedding_norm", # gemma3n
         ),
 
+        MODEL_TENSOR.A_ENC_ATTN_REL_POS_EMB: (
+            "encoder.layers.{bid}.attn.rel_pos_emb.weight",
+        ),
+        MODEL_TENSOR.A_QF_SELF_ATTN_Q: (
+            "projector.qformer.encoder.layer.{bid}.attention.attention.query",
+        ),
+        MODEL_TENSOR.A_QF_SELF_ATTN_K: (
+            "projector.qformer.encoder.layer.{bid}.attention.attention.key",
+        ),
+        MODEL_TENSOR.A_QF_SELF_ATTN_V: (
+            "projector.qformer.encoder.layer.{bid}.attention.attention.value",
+        ),
+        MODEL_TENSOR.A_QF_SELF_ATTN_O: (
+            "projector.qformer.encoder.layer.{bid}.attention.output.dense",
+        ),
+        MODEL_TENSOR.A_QF_SELF_ATTN_NORM: (
+            "projector.qformer.encoder.layer.{bid}.attention.output.LayerNorm",
+        ),
+        MODEL_TENSOR.A_QF_CROSS_ATTN_Q: (
+            "projector.qformer.encoder.layer.{bid}.crossattention.attention.query",
+        ),
+        MODEL_TENSOR.A_QF_CROSS_ATTN_K: (
+            "projector.qformer.encoder.layer.{bid}.crossattention.attention.key",
+        ),
+        MODEL_TENSOR.A_QF_CROSS_ATTN_V: (
+            "projector.qformer.encoder.layer.{bid}.crossattention.attention.value",
+        ),
+        MODEL_TENSOR.A_QF_CROSS_ATTN_O: (
+            "projector.qformer.encoder.layer.{bid}.crossattention.output.dense",
+        ),
+        MODEL_TENSOR.A_QF_CROSS_ATTN_NORM: (
+            "projector.qformer.encoder.layer.{bid}.crossattention.output.LayerNorm",
+        ),
+        MODEL_TENSOR.A_QF_FFN_UP: (
+            "projector.qformer.encoder.layer.{bid}.intermediate_query.dense",
+        ),
+        MODEL_TENSOR.A_QF_FFN_DOWN: (
+            "projector.qformer.encoder.layer.{bid}.output_query.dense",
+        ),
+        MODEL_TENSOR.A_QF_FFN_NORM: (
+            "projector.qformer.encoder.layer.{bid}.output_query.LayerNorm",
+        ),
+
         # NextN/MTP tensors
         MODEL_TENSOR.NEXTN_EH_PROJ: (
             "model.layers.{bid}.eh_proj",
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt
index 399876128ef..f78e49cea09 100644
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -20,6 +20,7 @@ add_library(mtmd
     models/dotsocr.cpp
     models/gemma4a.cpp
     models/gemma4v.cpp
     models/glm4v.cpp
+    models/granite-speech.cpp
     models/hunyuanocr.cpp
     models/internvl.cpp
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 17cb703f7fb..342601bf91c 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -63,6 +63,13 @@
 #define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
 #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
 #define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
+#define KEY_A_CHUNK_SIZE "clip.audio.chunk_size"
+#define KEY_A_CONV_KERNEL_SIZE "clip.audio.conv_kernel_size"
+#define KEY_A_MAX_POS_EMB "clip.audio.max_pos_emb"
+#define KEY_A_PROJ_WINDOW_SIZE "clip.audio.projector.window_size"
+#define KEY_A_PROJ_DOWNSAMPLE_RATE "clip.audio.projector.downsample_rate"
+#define KEY_A_PROJ_HEAD_COUNT "clip.audio.projector.head_count"
+#define KEY_A_PROJ_LAYERNORM_EPS "clip.audio.projector.layer_norm_epsilon"
 
 //
@@ -182,6 +189,27 @@
 #define TN_CONV_NORM "%s.blk.%d.conv_norm.%s"
 #define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
 #define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
+#define TN_INP_PROJ "a.input_projection.%s"
+#define TN_CTC_OUT "a.enc_ctc_out.%s"
+#define TN_CTC_OUT_MID "a.enc_ctc_out_mid.%s"
+#define TN_ATTN_REL_POS_EMB "%s.blk.%d.attn_rel_pos_emb"
+// qformer projector
+#define TN_QF_PROJ_QUERY "a.proj_query"
+#define TN_QF_PROJ_NORM "a.proj_norm.%s"
+#define TN_QF_PROJ_LINEAR "a.proj_linear.%s"
+#define TN_QF_SELF_ATTN_Q "a.proj_blk.%d.self_attn_q.%s"
+#define TN_QF_SELF_ATTN_K "a.proj_blk.%d.self_attn_k.%s"
+#define TN_QF_SELF_ATTN_V "a.proj_blk.%d.self_attn_v.%s"
+#define TN_QF_SELF_ATTN_O "a.proj_blk.%d.self_attn_out.%s"
+#define TN_QF_SELF_ATTN_N "a.proj_blk.%d.self_attn_norm.%s"
+#define TN_QF_CROSS_ATTN_Q "a.proj_blk.%d.cross_attn_q.%s"
+#define TN_QF_CROSS_ATTN_K "a.proj_blk.%d.cross_attn_k.%s"
+#define TN_QF_CROSS_ATTN_V "a.proj_blk.%d.cross_attn_v.%s"
+#define TN_QF_CROSS_ATTN_O "a.proj_blk.%d.cross_attn_out.%s"
+#define TN_QF_CROSS_ATTN_N "a.proj_blk.%d.cross_attn_norm.%s"
+#define TN_QF_FFN_UP "a.proj_blk.%d.ffn_up.%s"
+#define TN_QF_FFN_DOWN "a.proj_blk.%d.ffn_down.%s"
+#define TN_QF_FFN_NORM "a.proj_blk.%d.ffn_norm.%s"
 
 // gemma4 audio conformer
 #define TN_A_MM_INP_PROJ "mm.a.input_projection.%s"
@@ -293,6 +321,7 @@ enum projector_type {
     PROJECTOR_TYPE_KIMIK25,
     PROJECTOR_TYPE_NEMOTRON_V2_VL,
     PROJECTOR_TYPE_HUNYUANOCR,
+    PROJECTOR_TYPE_GRANITE_SPEECH,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -338,6 +367,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_KIMIK25, "kimik25"},
     { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
     { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
+    { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h
index 9a93584d9be..915243ad227 100644
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -92,6 +92,13 @@ struct clip_hparams {
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
     int32_t proj_stack_factor = 0; // ultravox
+    int32_t audio_chunk_size = 0;
+    int32_t audio_conv_kernel_size = 0;
+    int32_t audio_max_pos_emb = 0;
+    int32_t audio_proj_window_size = 0;
+    int32_t audio_proj_downsample_rate = 0;
+    int32_t audio_proj_head_count = 0;
+    float audio_proj_layernorm_eps = 0.0f;
 
     // audio-to-mel preprocessor params
     int32_t audio_chunk_len = -1; // in seconds
@@ -224,6 +231,9 @@ struct clip_layer {
     ggml_tensor * per_dim_k_scale_w = nullptr;
     ggml_tensor * ff_post_norm_1_w = nullptr;
 
+    // granite_speech conformer per-layer
+    ggml_tensor * attn_rel_pos_emb = nullptr;
+
     bool has_deepstack() const {
         return deepstack_fc1_w != nullptr;
     }
@@ -268,6 +278,37 @@ struct mobilenetv5_block {
     ggml_tensor * attn_norm_w = nullptr;
 };
 
+struct qformer_proj_layer {
+    ggml_tensor * self_attn_q_w = nullptr;
+    ggml_tensor * self_attn_q_b = nullptr;
+    ggml_tensor * self_attn_k_w = nullptr;
+    ggml_tensor * self_attn_k_b = nullptr;
+    ggml_tensor * self_attn_v_w = nullptr;
+    ggml_tensor * self_attn_v_b = nullptr;
+    ggml_tensor * self_attn_o_w = nullptr;
+    ggml_tensor * self_attn_o_b = nullptr;
+    ggml_tensor * self_attn_norm_w = nullptr;
+    ggml_tensor * self_attn_norm_b = nullptr;
+
+    ggml_tensor * cross_attn_q_w = nullptr;
+    ggml_tensor * cross_attn_q_b = nullptr;
+    ggml_tensor * cross_attn_k_w = nullptr;
+    ggml_tensor * cross_attn_k_b = nullptr;
+    ggml_tensor * cross_attn_v_w = nullptr;
+    ggml_tensor * cross_attn_v_b = nullptr;
+    ggml_tensor * cross_attn_o_w = nullptr;
+    ggml_tensor * cross_attn_o_b = nullptr;
+    ggml_tensor * cross_attn_norm_w = nullptr;
+    ggml_tensor * cross_attn_norm_b = nullptr;
+
+    ggml_tensor * ffn_up_w = nullptr;
+    ggml_tensor * ffn_up_b = nullptr;
+    ggml_tensor * ffn_down_w = nullptr;
+    ggml_tensor * ffn_down_b = nullptr;
+    ggml_tensor * ffn_norm_w = nullptr;
+    ggml_tensor * ffn_norm_b = nullptr;
+};
+
 struct clip_model {
     clip_modality modality = CLIP_MODALITY_VISION;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -485,6 +526,21 @@ struct clip_model {
     ggml_tensor * audio_out_proj_w = nullptr;
     ggml_tensor * audio_out_proj_b = nullptr;
 
+    // granite_speech encoder
+    ggml_tensor * inp_proj_w = nullptr;
+    ggml_tensor * inp_proj_b = nullptr;
+    ggml_tensor * ctc_out_w = nullptr;
+    ggml_tensor * ctc_out_b = nullptr;
+    ggml_tensor * ctc_out_mid_w = nullptr;
+    ggml_tensor * ctc_out_mid_b = nullptr;
+    // qformer projector
+    ggml_tensor * qf_proj_query = nullptr;
+    ggml_tensor * qf_proj_norm_w = nullptr;
+    ggml_tensor * qf_proj_norm_b = nullptr;
+    ggml_tensor * qf_proj_linear_w = nullptr;
+    ggml_tensor * qf_proj_linear_b = nullptr;
+    std::vector<qformer_proj_layer> qf_proj_layers;
+
     bool audio_has_avgpool() const {
         return proj_type == PROJECTOR_TYPE_QWEN2A
             || proj_type == PROJECTOR_TYPE_VOXTRAL
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b660..9ef4cd4000c 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -935,6 +935,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique(ctx, img);
             } break;
+        case PROJECTOR_TYPE_GRANITE_SPEECH:
+            {
+                builder = std::make_unique<clip_graph_granite_speech>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_GLM4V:
             {
                 builder = std::make_unique<clip_graph_glm4v>(ctx, img);
@@ -1478,6 +1482,21 @@ struct clip_model_loader {
                 hparams.audio_window_len = 320; // 20ms frame (NOT 25ms/400)
                 hparams.audio_hop_len = 160;
             } break;
+        case PROJECTOR_TYPE_GRANITE_SPEECH:
+            {
+                hparams.audio_chunk_len = 0;
+                hparams.audio_sample_rate = 16000;
+                hparams.audio_n_fft = 512;
+                hparams.audio_window_len = 400;
+                hparams.audio_hop_len = 160;
+                get_u32(KEY_A_CHUNK_SIZE, hparams.audio_chunk_size);
+                get_u32(KEY_A_CONV_KERNEL_SIZE, hparams.audio_conv_kernel_size);
+                get_u32(KEY_A_MAX_POS_EMB, hparams.audio_max_pos_emb);
+                get_u32(KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size);
+                get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate);
+                get_u32(KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count);
+                get_f32(KEY_A_PROJ_LAYERNORM_EPS, hparams.audio_proj_layernorm_eps);
+            } break;
         case PROJECTOR_TYPE_JANUS_PRO:
             {
                 hparams.image_pad_color = {127, 127, 127};
@@ -1629,13 +1648,14 @@ struct clip_model_loader {
 
             model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
 
-            if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
-                hparams.n_layer = 0; // gemma3n does not use normal layer structure
-            }
+            const bool has_standard_layers = (
+                model.proj_type != PROJECTOR_TYPE_GEMMA3NV &&
+                model.proj_type != PROJECTOR_TYPE_GRANITE_SPEECH);
 
             // layers
-            model.layers.resize(hparams.n_layer);
-            for (int il = 0; il < hparams.n_layer; ++il) {
+            const int n_layers_to_load = has_standard_layers ? hparams.n_layer : 0;
+            model.layers.resize(n_layers_to_load);
+            for (int il = 0; il < n_layers_to_load; ++il) {
                 auto & layer = model.layers[il];
                 layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false);
                 layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false);
@@ -2340,6 +2360,99 @@ struct clip_model_loader {
                         layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
                     }
                 } break;
+            case PROJECTOR_TYPE_GRANITE_SPEECH:
+                {
+                    model.layers.resize(hparams.n_layer);
+
+                    model.inp_proj_w = get_tensor(string_format(TN_INP_PROJ, "weight"));
+                    model.inp_proj_b = get_tensor(string_format(TN_INP_PROJ, "bias"));
+                    model.ctc_out_w = get_tensor(string_format(TN_CTC_OUT, "weight"));
+                    model.ctc_out_b = get_tensor(string_format(TN_CTC_OUT, "bias"));
+                    model.ctc_out_mid_w = get_tensor(string_format(TN_CTC_OUT_MID, "weight"));
+                    model.ctc_out_mid_b = get_tensor(string_format(TN_CTC_OUT_MID, "bias"));
+
+                    for (int il = 0; il < hparams.n_layer; ++il) {
+                        auto & layer = model.layers[il];
+
+                        layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
+                        layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
+                        layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
+                        layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
+                        layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"));
+                        layer.attn_rel_pos_emb = get_tensor(string_format(TN_ATTN_REL_POS_EMB, prefix, il));
+
+                        layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"));
+                        layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"));
+
+                        layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"));
+                        layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"));
+
+                        layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"));
+                        layer.ff_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"));
+                        layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
+                        layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"));
+                        layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
+                        layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"));
+
+                        layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight"));
+                        layer.ff_norm_1_b = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "bias"));
+                        layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight"));
+                        layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"));
+                        layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight"));
+                        layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"));
+
+                        layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"));
+                        layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"));
+                        layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"));
+                        layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"));
+                        layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight"));
+                        layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight"));
+                        layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"));
+                        layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight"));
+                        layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"));
+                    }
+
+                    model.qf_proj_query = get_tensor(TN_QF_PROJ_QUERY);
+                    model.qf_proj_norm_w = get_tensor(string_format(TN_QF_PROJ_NORM, "weight"));
+                    model.qf_proj_norm_b = get_tensor(string_format(TN_QF_PROJ_NORM, "bias"));
+                    model.qf_proj_linear_w = get_tensor(string_format(TN_QF_PROJ_LINEAR, "weight"));
+                    model.qf_proj_linear_b = get_tensor(string_format(TN_QF_PROJ_LINEAR, "bias"));
+
+                    const int n_proj_layers = 2;
+                    model.qf_proj_layers.resize(n_proj_layers);
+                    for (int il = 0; il < n_proj_layers; ++il) {
+                        auto & pl = model.qf_proj_layers[il];
+
+                        pl.self_attn_q_w = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "weight"));
+                        pl.self_attn_q_b = get_tensor(string_format(TN_QF_SELF_ATTN_Q, il, "bias"));
+                        pl.self_attn_k_w = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "weight"));
+                        pl.self_attn_k_b = get_tensor(string_format(TN_QF_SELF_ATTN_K, il, "bias"));
+                        pl.self_attn_v_w = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "weight"));
+                        pl.self_attn_v_b = get_tensor(string_format(TN_QF_SELF_ATTN_V, il, "bias"));
+                        pl.self_attn_o_w = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "weight"));
+                        pl.self_attn_o_b = get_tensor(string_format(TN_QF_SELF_ATTN_O, il, "bias"));
+                        pl.self_attn_norm_w = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "weight"));
+                        pl.self_attn_norm_b = get_tensor(string_format(TN_QF_SELF_ATTN_N, il, "bias"));
+
+                        pl.cross_attn_q_w = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "weight"));
+                        pl.cross_attn_q_b = get_tensor(string_format(TN_QF_CROSS_ATTN_Q, il, "bias"));
+                        pl.cross_attn_k_w = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "weight"));
+                        pl.cross_attn_k_b = get_tensor(string_format(TN_QF_CROSS_ATTN_K, il, "bias"));
+                        pl.cross_attn_v_w = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "weight"));
+                        pl.cross_attn_v_b = get_tensor(string_format(TN_QF_CROSS_ATTN_V, il, "bias"));
+                        pl.cross_attn_o_w = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "weight"));
+                        pl.cross_attn_o_b = get_tensor(string_format(TN_QF_CROSS_ATTN_O, il, "bias"));
+                        pl.cross_attn_norm_w = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "weight"));
+                        pl.cross_attn_norm_b = get_tensor(string_format(TN_QF_CROSS_ATTN_N, il, "bias"));
+
+                        pl.ffn_up_w = get_tensor(string_format(TN_QF_FFN_UP, il, "weight"));
+                        pl.ffn_up_b = get_tensor(string_format(TN_QF_FFN_UP, il, "bias"));
+                        pl.ffn_down_w = get_tensor(string_format(TN_QF_FFN_DOWN, il, "weight"));
+                        pl.ffn_down_b = get_tensor(string_format(TN_QF_FFN_DOWN, il, "bias"));
+                        pl.ffn_norm_w = get_tensor(string_format(TN_QF_FFN_NORM, il, "weight"));
+                        pl.ffn_norm_b = get_tensor(string_format(TN_QF_FFN_NORM, il, "bias"));
+                    }
+                } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
         }
@@ -3023,6 +3136,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             }
             n_patches = n;
         } break;
+        case PROJECTOR_TYPE_GRANITE_SPEECH:
+            {
+                const int ws = ctx->model.hparams.audio_proj_window_size;
+                const int ds = ctx->model.hparams.audio_proj_downsample_rate;
+                n_patches = ((img->nx + ws - 1) / ws) * (ws / ds);
+            } break;
         default:
             GGML_ABORT("unsupported projector type");
     }
@@ -3554,6 +3673,39 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             }
             set_input_f32("pos_emb", pos_emb);
         } break;
+        case PROJECTOR_TYPE_GRANITE_SPEECH:
+            {
+                const int context_size = ctx->model.hparams.audio_chunk_size;
+                const int max_pos_emb = ctx->model.hparams.audio_max_pos_emb;
+
+                std::vector<int32_t> dists(context_size * context_size);
+                for (int i = 0; i < context_size; i++) {
+                    for (int j = 0; j < context_size; j++) {
+                        int d = i - j;
+                        if (d < -context_size) d = -context_size;
+                        if (d >  context_size) d =  context_size;
+                        dists[i * context_size + j] = d + max_pos_emb;
+                    }
+                }
+                set_input_i32("attn_dists", dists);
+
+                const int n_frames = image_size_width;
+                const int remainder = n_frames % context_size;
+                if (remainder > 0) {
+                    const int num_blocks = (n_frames + context_size - 1) / context_size;
+                    std::vector<float> mask(context_size * context_size * num_blocks, 0.0f);
+                    const float neg_inf = -INFINITY;
+                    const int last_block_offset = (num_blocks - 1) * context_size * context_size;
+                    for (int q = 0; q < context_size; q++) {
+                        for (int k = 0; k < context_size; k++) {
+                            if (q >= remainder || k >= remainder) {
+                                mask[last_block_offset + q * context_size + k] = neg_inf;
+                            }
+                        }
+                    }
+                    set_input_f32("attn_mask", mask);
+                }
+            } break;
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -3700,6 +3852,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
             return ctx->model.position_embeddings->ne[0];
         case PROJECTOR_TYPE_GEMMA4A:
             return ctx->model.hparams.projection_dim;
+        case PROJECTOR_TYPE_GRANITE_SPEECH:
+            return ctx->model.qf_proj_linear_w->ne[1];
         case PROJECTOR_TYPE_GLM4V:
             return ctx->model.mm_ffn_down_w->ne[1];
         default:
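The `attn_dists` and `attn_mask` buffers filled in `clip_image_batch_encode` above are easier to eyeball in Python. A NumPy mirror of the two loops, with arbitrary example sizes (these are not Granite Speech defaults):

```python
# attn_dists: clamped relative distances shifted by max_pos_emb, used as row
# indices into the per-layer rel_pos_emb table. attn_mask: padded rows/cols of
# the last chunk are set to -inf before softmax.
import numpy as np

context_size, max_pos_emb, n_frames = 4, 512, 10
i = np.arange(context_size)
d = np.clip(i[:, None] - i[None, :], -context_size, context_size)
attn_dists = (d + max_pos_emb).astype(np.int32)

num_blocks = (n_frames + context_size - 1) // context_size
remainder = n_frames % context_size
mask = np.zeros((num_blocks, context_size, context_size), dtype=np.float32)
if remainder:
    mask[-1, remainder:, :] = -np.inf  # padded query rows of the last block
    mask[-1, :, remainder:] = -np.inf  # padded key columns of the last block

print(attn_dists[0])   # [512 511 510 509]
print(mask[-1, 0])     # [0. 0. -inf -inf] for remainder == 2
```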
diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp
new file mode 100644
index 00000000000..1c466fc1d0d
--- /dev/null
+++ b/tools/mtmd/models/granite-speech.cpp
@@ -0,0 +1,276 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_granite_speech::build() {
+    const int n_frames = img.nx;
+    const int context_size = hparams.audio_chunk_size;
+    const int ctc_layer = n_layer / 2;
+    const int conv_kernel = hparams.audio_conv_kernel_size;
+    const int conv_pad = conv_kernel / 2;
+
+    const int num_blocks = (n_frames + context_size - 1) / context_size;
+    const int padded_len = num_blocks * context_size;
+    const int remainder = n_frames % context_size;
+
+    ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size);
+    ggml_set_name(attn_dists, "attn_dists");
+    ggml_set_input(attn_dists);
+    ggml_build_forward_expand(gf, attn_dists);
+
+    ggml_tensor * attn_mask = nullptr;
+    if (remainder > 0) {
+        attn_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,
+            context_size, context_size, 1, num_blocks);
+        ggml_set_name(attn_mask, "attn_mask");
+        ggml_set_input(attn_mask);
+        ggml_build_forward_expand(gf, attn_mask);
+    }
+
+    ggml_tensor * inp = build_inp_raw(1);
+    auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+    cb(cur, "inp_transposed", -1);
+
+    cur = build_mm(model.inp_proj_w, cur);
+    cur = ggml_add(ctx0, cur, model.inp_proj_b);
+    cb(cur, "inp_linear", -1);
+
+    for (int il = 0; il < n_layer; il++) {
+        const auto & layer = model.layers[il];
+        auto * residual = cur;
+
+        // ffn1 (half-step)
+        {
+            auto * ffn1 = build_norm(cur, layer.ff_norm_w, layer.ff_norm_b,
+                NORM_TYPE_NORMAL, eps, il);
+            cb(ffn1, "ffn1_norm", il);
+
+            ffn1 = build_ffn(ffn1,
+                layer.ff_up_w, layer.ff_up_b,
+                nullptr, nullptr,
+                layer.ff_down_w, layer.ff_down_b,
+                FFN_SILU, il);
+            cb(ffn1, "ffn1_out", il);
+
+            residual = ggml_add(ctx0, residual, ggml_scale(ctx0, ffn1, 0.5f));
+            cb(residual, "ffn1_residual", il);
+        }
+
+        // self-attention with Shaw RPE
+        {
+            auto * normed = build_norm(residual, layer.ln_1_w, layer.ln_1_b,
+                NORM_TYPE_NORMAL, eps, il);
+            cb(normed, "attn_norm", il);
+
+            if (n_frames < padded_len) {
+                normed = ggml_pad(ctx0, normed, 0, padded_len - n_frames, 0, 0);
+            }
+
+            ggml_tensor * Q = build_mm(layer.q_w, normed);
+            ggml_tensor * K = build_mm(layer.k_w, normed);
+            ggml_tensor * V = build_mm(layer.v_w, normed);
+
+            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, context_size, num_blocks);
+            K = ggml_reshape_4d(ctx0, K, d_head, n_head, context_size, num_blocks);
+            V = ggml_reshape_4d(ctx0, V, d_head, n_head, context_size, num_blocks);
+
+            ggml_tensor * Q_perm = ggml_permute(ctx0, Q, 0, 2, 1, 3);
+            ggml_tensor * K_perm = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+
+            ggml_tensor * kq = ggml_mul_mat(ctx0, K_perm, Q_perm);
+
+            // Shaw RPE: pos_emb ne[2]=1 broadcasts against Q ne[2]=num_blocks in mul_mat
+            ggml_tensor * pos_emb = ggml_get_rows(ctx0, layer.attn_rel_pos_emb, attn_dists);
+            pos_emb = ggml_reshape_3d(ctx0, pos_emb, d_head, context_size, context_size);
+            pos_emb = ggml_reshape_4d(ctx0, pos_emb, d_head, context_size, 1, context_size);
+
+            ggml_tensor * Q_shaw = ggml_permute(ctx0, Q, 0, 1, 3, 2);
+            ggml_tensor * pos_attn = ggml_mul_mat(ctx0, pos_emb, Q_shaw);
+            pos_attn = ggml_cont(ctx0, ggml_permute(ctx0, pos_attn, 0, 2, 3, 1));
+
+            ggml_tensor * scores = ggml_add(ctx0, kq, pos_attn);
+            ggml_tensor * attn_weights = ggml_soft_max_ext(ctx0, scores, attn_mask,
+                kq_scale, 0.0f);
+
+            ggml_tensor * V_perm = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+            ggml_tensor * attn_out = ggml_mul_mat(ctx0, V_perm, attn_weights);
+
+            attn_out = ggml_permute(ctx0, attn_out, 0, 2, 1, 3);
+            attn_out = ggml_cont_2d(ctx0, attn_out, n_embd, padded_len);
+
+            if (n_frames < padded_len) {
+                attn_out = ggml_view_2d(ctx0, attn_out,
+                    n_embd, n_frames, attn_out->nb[1], 0);
+            }
+
+            cur = build_mm(layer.o_w, attn_out);
+            cur = ggml_add(ctx0, cur, layer.o_b);
+            cb(cur, "attn_out", il);
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+
+        // conv module
+        {
+            cur = build_norm(residual, layer.norm_conv_w, layer.norm_conv_b,
+                NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "conv_norm", il);
+
+            auto * x = build_mm(layer.conv_pw1_w, cur);
+            x = ggml_add(ctx0, x, layer.conv_pw1_b);
+            cb(x, "conv_pw1", il);
+
+            // GLU: ggml has no fused op, manual split + sigmoid gate
+            {
+                int64_t d = x->ne[0] / 2;
+                ggml_tensor * gate = ggml_sigmoid(ctx0,
+                    ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]));
+                x = ggml_mul(ctx0,
+                    ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate);
+                x = ggml_cont(ctx0, ggml_transpose(ctx0, x));
+            }
+            cb(x, "conv_glu", il);
+
+            x = ggml_pad(ctx0, x, conv_pad, 0, 0, 0);
+            x = ggml_roll(ctx0, x, conv_pad, 0, 0, 0);
+            x = ggml_pad(ctx0, x, conv_pad, 0, 0, 0);
+            x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w);
+            cb(x, "conv_dw", il);
+
+            // folded batch norm
+            x = ggml_add(ctx0, ggml_mul(ctx0, x, layer.conv_norm_w), layer.conv_norm_b);
+            x = ggml_silu(ctx0, x);
+            cb(x, "conv_bn_silu", il);
+
+            x = build_mm(layer.conv_pw2_w, x);
+            x = ggml_add(ctx0, x, layer.conv_pw2_b);
+            cb(x, "conv_pw2", il);
+
+            cur = x;
+        }
+
+        residual = ggml_add(ctx0, residual, cur);
+
+        // ffn2 (half-step)
+        {
+            auto * ffn2 = build_norm(residual, layer.ff_norm_1_w, layer.ff_norm_1_b,
+                NORM_TYPE_NORMAL, eps, il);
+            cb(ffn2, "ffn2_norm", il);
+
+            ffn2 = build_ffn(ffn2,
+                layer.ff_up_1_w, layer.ff_up_1_b,
+                nullptr, nullptr,
+                layer.ff_down_1_w, layer.ff_down_1_b,
+                FFN_SILU, il);
+            cb(ffn2, "ffn2_out", il);
+
+            residual = ggml_add(ctx0, residual, ggml_scale(ctx0, ffn2, 0.5f));
+        }
+
+        cur = build_norm(residual, layer.ln_2_w, layer.ln_2_b,
+            NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "layer_out", il);
+
+        // CTC branch
+        if (il + 1 == ctc_layer) {
+            auto * mid = build_mm(model.ctc_out_w, cur);
+            mid = ggml_add(ctx0, mid, model.ctc_out_b);
+            mid = ggml_soft_max(ctx0, mid);
+            mid = build_mm(model.ctc_out_mid_w, mid);
+            mid = ggml_add(ctx0, mid, model.ctc_out_mid_b);
+            cur = ggml_add(ctx0, cur, mid);
+            cb(cur, "ctc_branch", il);
+        }
+    }
+
+    cb(cur, "encoder_out", -1);
+
+    // QFormer projector
+    {
+        const int window_size = hparams.audio_proj_window_size;
+        const int num_queries = window_size / hparams.audio_proj_downsample_rate;
+        const int proj_n_head = hparams.audio_proj_head_count;
+        const int proj_d_head = n_embd / proj_n_head;
+        const float proj_kq_scale = 1.0f / sqrtf((float)proj_d_head);
+        const float proj_eps = hparams.audio_proj_layernorm_eps;
+        const int nblocks_proj = (n_frames + window_size - 1) / window_size;
+        const int padded_proj = nblocks_proj * window_size;
+
+        if (n_frames < padded_proj) {
+            cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0);
+        }
+
+        ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj);
+
+        ggml_tensor * queries = build_norm(model.qf_proj_query,
+            model.qf_proj_norm_w, model.qf_proj_norm_b,
+            NORM_TYPE_NORMAL, proj_eps, -1);
+        {
+            ggml_tensor * q_3d = ggml_reshape_3d(ctx0, queries, n_embd, num_queries, 1);
+            ggml_tensor * q_shape = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32,
+                n_embd, num_queries, nblocks_proj);
+            queries = ggml_repeat(ctx0, q_3d, q_shape);
+        }
+
+        for (int il = 0; il < (int)model.qf_proj_layers.size(); il++) {
+            const auto & pl = model.qf_proj_layers[il];
+
+            // self-attention
+            {
+                ggml_tensor * Q = ggml_add(ctx0, build_mm(pl.self_attn_q_w, queries), pl.self_attn_q_b);
+                ggml_tensor * K = ggml_add(ctx0, build_mm(pl.self_attn_k_w, queries), pl.self_attn_k_b);
+                ggml_tensor * V = ggml_add(ctx0, build_mm(pl.self_attn_v_w, queries), pl.self_attn_v_b);
+
+                Q = ggml_reshape_4d(ctx0, Q, proj_d_head, proj_n_head, num_queries, nblocks_proj);
+                K = ggml_reshape_4d(ctx0, K, proj_d_head, proj_n_head, num_queries, nblocks_proj);
+                V = ggml_reshape_4d(ctx0, V, proj_d_head, proj_n_head, num_queries, nblocks_proj);
+
+                ggml_tensor * sa_out = build_attn(pl.self_attn_o_w, pl.self_attn_o_b,
+                    Q, K, V, nullptr, proj_kq_scale, il);
+                sa_out = ggml_reshape_3d(ctx0, sa_out, n_embd, num_queries, nblocks_proj);
+
+                queries = build_norm(ggml_add(ctx0, sa_out, queries),
+                    pl.self_attn_norm_w, pl.self_attn_norm_b,
+                    NORM_TYPE_NORMAL, proj_eps, il);
+            }
+
+            // cross-attention
+            {
+                ggml_tensor * Q = ggml_add(ctx0, build_mm(pl.cross_attn_q_w, queries), pl.cross_attn_q_b);
+                ggml_tensor * K = ggml_add(ctx0, build_mm(pl.cross_attn_k_w, enc_windows), pl.cross_attn_k_b);
+                ggml_tensor * V = ggml_add(ctx0, build_mm(pl.cross_attn_v_w, enc_windows), pl.cross_attn_v_b);
+
+                Q = ggml_reshape_4d(ctx0, Q, proj_d_head, proj_n_head, num_queries, nblocks_proj);
+                K = ggml_reshape_4d(ctx0, K, proj_d_head, proj_n_head, window_size, nblocks_proj);
+                V = ggml_reshape_4d(ctx0, V, proj_d_head, proj_n_head, window_size, nblocks_proj);
+
+                ggml_tensor * ca_out = build_attn(pl.cross_attn_o_w, pl.cross_attn_o_b,
+                    Q, K, V, nullptr, proj_kq_scale, il);
+                ca_out = ggml_reshape_3d(ctx0, ca_out, n_embd, num_queries, nblocks_proj);
+
+                queries = build_norm(ggml_add(ctx0, ca_out, queries),
+                    pl.cross_attn_norm_w, pl.cross_attn_norm_b,
+                    NORM_TYPE_NORMAL, proj_eps, il);
+            }
+
+            // ffn
+            {
+                ggml_tensor * ffn_out = build_ffn(queries,
+                    pl.ffn_up_w, pl.ffn_up_b,
+                    nullptr, nullptr,
+                    pl.ffn_down_w, pl.ffn_down_b,
+                    FFN_GELU, il);
+
+                queries = build_norm(ggml_add(ctx0, ffn_out, queries),
+                    pl.ffn_norm_w, pl.ffn_norm_b,
+                    NORM_TYPE_NORMAL, proj_eps, il);
+            }
+        }
+
+        cur = ggml_reshape_2d(ctx0, queries, n_embd, num_queries * nblocks_proj);
+        cur = ggml_add(ctx0, build_mm(model.qf_proj_linear_w, cur), model.qf_proj_linear_b);
+        cb(cur, "projector_out", -1);
+    }
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
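The projector's token arithmetic above matches the `clip_n_output_tokens` case added earlier: each encoder window of `window_size` frames is summarized into `window_size / downsample_rate` query tokens, and the last window is padded. A tiny sketch with illustrative (non-default) numbers:

```python
# Same ceil-div windowing as the QFormer projector and clip_n_output_tokens.
def n_output_tokens(n_frames: int, window_size: int, downsample_rate: int) -> int:
    num_queries = window_size // downsample_rate               # queries per window
    num_windows = (n_frames + window_size - 1) // window_size  # last window padded
    return num_windows * num_queries

print(n_output_tokens(100, 15, 5))  # 7 windows * 3 queries = 21 tokens
```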
diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h
index 03d99e15b05..73de49d869b 100644
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@@ -103,6 +103,11 @@ struct clip_graph_conformer : clip_graph {
     ggml_cgraph * build() override;
 };
 
+struct clip_graph_granite_speech : clip_graph {
+    clip_graph_granite_speech(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
 struct clip_graph_gemma4a : clip_graph {
     clip_graph_gemma4a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index 38a8ce4f4a6..55c1effe4c3 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -650,6 +650,108 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float *
     return true;
 }
 
+//
+// mtmd_audio_preprocessor_granite_speech
+//
+
+void mtmd_audio_preprocessor_granite_speech::initialize() {
+    cache.fill_sin_cos_table(hparams.audio_n_fft);
+    cache.fill_hann_window(hparams.audio_window_len, true);
+    cache.fill_mel_filterbank_matrix(
+        hparams.n_mel_bins / 2, hparams.audio_n_fft, hparams.audio_sample_rate,
+        0.0f, -1.0f, false, 1.0f, true);
+}
+
+bool mtmd_audio_preprocessor_granite_speech::preprocess(const float * samples,
+                                                        size_t n_samples,
+                                                        std::vector<mtmd_audio_mel> & output) {
+    if (n_samples == 0) {
+        return false;
+    }
+
+    GGML_ASSERT(!cache.sin_vals.empty());
+    GGML_ASSERT(!cache.cos_vals.empty());
+    GGML_ASSERT(!cache.filters.data.empty());
+
+    const int n_fft = hparams.audio_n_fft;
+    const int pad = n_fft / 2;
+
+    // reflect padding
+    const int n_padded = (int)n_samples + 2 * pad;
+    std::vector<float> padded(n_padded, 0.0f);
+    std::copy(samples, samples + n_samples, padded.data() + pad);
+    for (int i = 0; i < pad; i++) {
+        int src = i + 1;
+        if (src >= (int)n_samples) {
+            src = (int)n_samples - 1;
+        }
+        padded[pad - 1 - i] = samples[src];
+    }
+    for (int i = 0; i < pad; i++) {
+        int src = (int)n_samples - 2 - i;
+        if (src < 0) {
+            src = 0;
+        }
+        padded[pad + (int)n_samples + i] = samples[src];
+    }
+
+    filter_params params;
+    params.n_mel = hparams.n_mel_bins / 2;
+    params.n_fft_bins = 1 + (n_fft / 2);
+    params.hann_window_size = hparams.audio_window_len;
+    params.hop_length = hparams.audio_hop_len;
+    params.sample_rate = hparams.audio_sample_rate;
+    params.no_padding = true;
+    params.center_padding = false;
+    params.preemph = 0.0f;
+    params.use_natural_log = false;
+    params.norm_per_feature = false;
+    params.mel_floor = 1e-10f;
+
+    mtmd_audio_mel mel;
+    if (!log_mel_spectrogram(padded.data(), n_padded, 4, params, cache, mel)) {
+        return false;
+    }
+
+    double mmax = -1e20;
+    for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
+        if (mel.data[i] > mmax) {
+            mmax = mel.data[i];
+        }
+    }
+    mmax -= 8.0;
+
+    for (int i = 0; i < mel.n_mel * mel.n_len; i++) {
+        if (mel.data[i] < mmax) {
+            mel.data[i] = mmax;
+        }
+        mel.data[i] = (mel.data[i] + 4.0) / 4.0;
+    }
+
+    int n_frames = mel.n_len;
+    if (n_frames % 2 == 1) {
+        n_frames--;
+    }
+    const int n_mel = mel.n_mel;
+    const int n_stacked = n_frames / 2;
+
+    mtmd_audio_mel stacked;
+    stacked.n_mel = 2 * n_mel;
+    stacked.n_len = n_stacked;
+    stacked.n_len_org = (int)n_samples;
+    stacked.data.resize(2 * n_mel * n_stacked);
+
+    for (int t = 0; t < n_stacked; t++) {
+        for (int m = 0; m < n_mel; m++) {
+            stacked.data[m * n_stacked + t] = mel.data[m * mel.n_len + 2 * t];
+            stacked.data[(m + n_mel) * n_stacked + t] = mel.data[m * mel.n_len + 2 * t + 1];
+        }
+    }
+
+    output.push_back(std::move(stacked));
+    return true;
+}
+
 //
 // mtmd_audio_preprocessor_gemma4a
 //
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
index efaa14f924f..c1a705de522 100644
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -78,6 +78,15 @@ struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor {
     mtmd_audio_cache cache;
 };
 
+struct mtmd_audio_preprocessor_granite_speech : mtmd_audio_preprocessor {
+    mtmd_audio_preprocessor_granite_speech(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
+    void initialize() override;
+    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
+
+private:
+    mtmd_audio_cache cache;
+};
+
 struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor {
     mtmd_audio_preprocessor_gemma4a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
     void initialize() override;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 52fca4e81b3..b985ccbe6b0 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -485,6 +485,10 @@ struct mtmd_context {
             {
                 audio_preproc = std::make_unique(ctx_a);
             } break;
+        case PROJECTOR_TYPE_GRANITE_SPEECH:
+            {
+                audio_preproc = std::make_unique<mtmd_audio_preprocessor_granite_speech>(ctx_a);
+            } break;
         case PROJECTOR_TYPE_GEMMA4A:
             {
                 aud_beg = "<|audio>";
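The mel post-processing in `mtmd_audio_preprocessor_granite_speech::preprocess` (clamp the log-mel to an 8-unit dynamic range below the max, rescale, drop an odd trailing frame, then stack consecutive frame pairs channel-wise) has a compact NumPy equivalent. Shapes here are illustrative only:

```python
# NumPy mirror of the granite_speech mel post-processing above.
import numpy as np

def clamp_scale_stack(mel: np.ndarray) -> np.ndarray:
    """mel: (n_mel, n_len) log-mel -> (2*n_mel, n_len // 2)."""
    mel = np.maximum(mel, mel.max() - 8.0)     # clamp from below at max - 8
    mel = (mel + 4.0) / 4.0                    # shift/scale normalization
    n_len = mel.shape[1] - (mel.shape[1] % 2)  # drop an odd trailing frame
    even, odd = mel[:, 0:n_len:2], mel[:, 1:n_len:2]
    return np.concatenate([even, odd], axis=0)  # frame pairs stacked channel-wise

print(clamp_scale_stack(np.random.randn(80, 101)).shape)  # (160, 50)
```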