From dcb1f4cef50aa1648b56b9607b10332e9b6e0c41 Mon Sep 17 00:00:00 2001 From: linchuanxie Date: Thu, 23 Apr 2026 13:28:29 +0800 Subject: [PATCH 1/3] support hy3.0 fp8 quantization --- README.md | 7 + README_cn.md | 7 + angelslim/compressor/quant/core/config.py | 22 +- angelslim/compressor/quant/core/hook.py | 22 +- angelslim/compressor/quant/core/save.py | 31 +- angelslim/compressor/quant/ptq.py | 54 +++ angelslim/engine.py | 1 - angelslim/models/llm/__init__.py | 1 + angelslim/models/llm/hunyuan_v3_moe.py | 389 ++++++++++++++++++ .../hunyuanv3_a20b_fp8_static_c8.yaml | 37 ++ 10 files changed, 563 insertions(+), 8 deletions(-) create mode 100644 angelslim/models/llm/hunyuan_v3_moe.py create mode 100644 configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml diff --git a/README.md b/README.md index 6c2d41a4..c03438a0 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ A more accessible, comprehensive, and efficient toolkit for large model compress

## 📣Latest News +- [26/04/23] We now support FP8-Static quantization for **Hunyuan3.0** (MoE A20B). - [26/03/25] We have released **DAQ**, the quantization algorithm that preserves the knowledge acquired while the update of parameters is relatively small during post-training training.[[Paper]](https://arxiv.org/abs/2603.22324) | [[Docs]](docs/source/features/quantization/daq.md) - [26/02/09] We have released HY-1.8B-2Bit, 2bit on-device large language model,[[Huggingface]](https://huggingface.co/AngelSlim/HY-1.8B-2Bit). - [26/01/13] We have released v0.3. We support the training and deployment of Eagle3 for all-scale LLMs/VLMs/Audio models, as detailed in the [guidance documentation](https://angelslim.readthedocs.io/zh-cn/latest/features/speculative_decoding/eagle/index.html). And We released **Sherry**, the hardware-efficient 1.25 bit quantization algorithm [[Paper]](https://arxiv.org/abs/2601.07892) | [[Code]](https://github.com/Tencent/AngelSlim/tree/sherry/Sherry)🔥🔥🔥 @@ -253,6 +254,12 @@ python3 tools/run.py -c configs/qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml This example produces quantized model weights by performing PTQ calibration on a model loaded from HuggingFace. +For **Hunyuan3.0** (MoE A20B) FP8-Static quantization: + +```shell +python tools/run.py -c configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml +``` +
Code-based Start diff --git a/README_cn.md b/README_cn.md index ae1fc891..9d42bc17 100644 --- a/README_cn.md +++ b/README_cn.md @@ -22,6 +22,7 @@

## 📣最新进展 +- [26/04/23] 我们支持了 **Hunyuan3.0**(MoE A20B)模型的 FP8-Static 量化。 - [26/03/25] 我们发布了量化算法DAQ,该方法在后训练参数更新较小时,可保留量化后模型能力 [[论文]](https://arxiv.org/abs/2603.22324) | [[文档]](docs/source/features/quantization/daq.md) - [26/02/09] 我们发布了 HY-1.8B-2Bit, 2比特端侧大模型, 模型可见[[Huggingface]](https://huggingface.co/AngelSlim/HY-1.8B-2Bit). - [26/01/13] 我们发布V0.3版本, 支持了全模态场景的投机采样训练及部署,文档:[Eagle3 for LLM/VLM/Audio](https://angelslim.readthedocs.io/zh-cn/latest/features/speculative_decoding/eagle/index.html)。并且我们发布了 **Sherry** 新的硬件高效的1.25bit三值量化算法 [[论文]](https://arxiv.org/abs/2601.07892) | [[代码]](https://github.com/Tencent/AngelSlim/tree/sherry/Sherry)🔥🔥🔥 @@ -252,6 +253,12 @@ bash scripts/speculative/train_eagle3_online.sh 该示例将会加载`HugggingFace`模型进行PTQ量化校准,最终量化产出模型权重. +对 **Hunyuan3.0**(MoE A20B)进行 FP8-Static 量化: + + ```shell + python tools/run.py -c configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml + ``` +
2、源码启动 diff --git a/angelslim/compressor/quant/core/config.py b/angelslim/compressor/quant/core/config.py index 872cba57..1ae16296 100644 --- a/angelslim/compressor/quant/core/config.py +++ b/angelslim/compressor/quant/core/config.py @@ -31,7 +31,10 @@ "per-group": AbsMaxGroupWiseWeightObserver, } -KVCACHE_OBSERVERS_CLASS = {"per-channel": AbsmaxPerchannelObserver} +KVCACHE_OBSERVERS_CLASS = { + "per-channel": AbsmaxPerchannelObserver, + "per-tensor": AbsmaxPertensorObserver, +} class QuantConfig: @@ -60,6 +63,7 @@ def __init__(self, config, global_config=None): self.quant_helpers = quantization_args.quant_helpers act_quant_method = quantization_args.quant_method.get("activation", None) weight_quant_method = quantization_args.quant_method["weight"] + kv_cache_quant_method = quantization_args.quant_method.get("kv_cache", None) self.cpu_convert = quantization_args.cpu_convert self.save_name = quantization_args.save_name @@ -77,7 +81,11 @@ def __init__(self, config, global_config=None): ACT_OBSERVERS_CLASS[act_quant_method] if "static" in is_dynamic else None ) self.weight_observer = WEIGHT_OBSERVERS_CLASS[weight_quant_method] - self.kv_cache_observer = None + self.kv_cache_observer = ( + KVCACHE_OBSERVERS_CLASS[kv_cache_quant_method] + if kv_cache_quant_method is not None + else None + ) if "w4a8" in self.quant_algo: group_size = ( @@ -98,6 +106,8 @@ def __init__(self, config, global_config=None): if act_quant_method is not None: self.quant_algo_info["a"] = f"fp8_{act_quant_method}-{is_dynamic}" + if kv_cache_quant_method is not None: + self.quant_algo_info["c"] = f"fp8_{kv_cache_quant_method}" self.low_memory = config.quantization.low_memory self.quant_analyse = config.quantization.quant_analyse self.quant_vit = config.quantization.quant_vit @@ -117,13 +127,19 @@ def __init__(self, config, global_config=None): ACT_OBSERVERS_CLASS[act_quant_method] if "static" in is_dynamic else None ) self.weight_observer = WEIGHT_OBSERVERS_CLASS[weight_quant_method] - self.kv_cache_observer = None + self.kv_cache_observer = ( + KVCACHE_OBSERVERS_CLASS[kv_cache_quant_method] + if kv_cache_quant_method is not None + else None + ) self.quant_algo_info = { "w": f"int8_{weight_quant_method}", "ignore_layers": quantization_args.ignore_layers, } if act_quant_method is not None: self.quant_algo_info["a"] = f"int8_{act_quant_method}-{is_dynamic}" + if kv_cache_quant_method is not None: + self.quant_algo_info["c"] = f"int8_{kv_cache_quant_method}" self.low_memory = config.quantization.low_memory self.quant_analyse = config.quantization.quant_analyse elif "int4_awq" in self.quant_algo: diff --git a/angelslim/compressor/quant/core/hook.py b/angelslim/compressor/quant/core/hook.py index 893b34e0..0390a5a8 100644 --- a/angelslim/compressor/quant/core/hook.py +++ b/angelslim/compressor/quant/core/hook.py @@ -51,7 +51,9 @@ def apply_hook(self): sub_layer, act_observer, weight_observer, - kv_cache_observer if name in self.kv_names else None, + # kv_cache_observer is now handled by monkey patching at attention level + # so we pass None here + None, self.quant_model.quant_algo_dict, **extra_kwargs ) @@ -59,6 +61,14 @@ def apply_hook(self): self.observer_dict[sub_layer] = observer self._forward_hook_list.append(forward_hook_handle) + # Apply KV cache observers using monkey patching (for attention-level observation) + if kv_cache_observer is not None and hasattr(self.quant_model, 'apply_kvcache_observers'): + quant_bits = self.quant_model.quant_algo_dict.get("c_quant_bits", 8) + self.quant_model.apply_kvcache_observers( + kv_cache_observer_class=kv_cache_observer, + quant_bits=quant_bits, + ) + def apply_smooth_hook(self, smooth_mapping_layers, smooth_observer): for smooth_layer, _ in smooth_mapping_layers.values(): observer = PTQObserver( @@ -86,6 +96,9 @@ def remove_hook(self): for hook in self._forward_hook_list: hook.remove() self._forward_hook_list = [] + # Remove KV cache observer patches if available + if hasattr(self.quant_model, 'remove_kvcache_observers'): + self.quant_model.remove_kvcache_observers() def post_process(self): maxval = get_fp_maxval(bits=8) @@ -109,5 +122,8 @@ def post_process(self): self.quant_model.act_scales_dict[name] / maxval.type(act_dtype) ) if self.quant_model.quant_algo_dict["c_quant_algo"] == "fp8": - for k, v in self.quant_model.kv_cache_scales_dict.items(): - self.quant_model.kv_cache_scales_dict[k] = v / maxval.type(v.dtype) + # Process KV cache scales from attention-level observers + if hasattr(self.quant_model, 'get_kvcache_scales'): + kv_scales = self.quant_model.get_kvcache_scales() + for k, v in kv_scales.items(): + self.quant_model.kv_cache_scales_dict[k] = v / maxval.type(v.dtype) diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py index 3fab500e..bb964272 100644 --- a/angelslim/compressor/quant/core/save.py +++ b/angelslim/compressor/quant/core/save.py @@ -249,6 +249,17 @@ def save(self, save_path): raise ValueError(f"{self.quant_model.quant_config.quant_algo} not supported") quantization_config = {"quant_method": save_name, ignore_field: ignored_layers} + # Set kv_cache_scheme if kv_cache quantization is enabled + c_quant_algo = self.quant_model.quant_config.quant_algo_info.get("c", None) + if c_quant_algo is not None: + kv_cache_scheme = { + "num_bits": 8, + "strategy": re.search(r"per-([a-zA-Z]+)", c_quant_algo).group(1), + "type": "float", + } + else: + kv_cache_scheme = None + if save_name == "compressed-tensors": quantization_config.update( { @@ -260,13 +271,15 @@ def save(self, save_path): "targets": ["Linear"], } }, - "kv_cache_scheme": None, + "kv_cache_scheme": kv_cache_scheme, "format": quant_format, "quantization_status": "compressed", } ) else: quantization_config["activation_scheme"] = "dynamic" if is_dynamic else "static" + if kv_cache_scheme is not None: + quantization_config["kv_cache_scheme"] = "static" if ( hasattr(self.quant_model.quant_config, "transform_config") @@ -287,6 +300,22 @@ def save(self, save_path): json.dump(trtllm_config, f, indent=4) self.quant_model.tokenizer.save_pretrained(save_path) + # Save KV cache scales if available + if hasattr(self.quant_model, 'kv_cache_scales_dict') and self.quant_model.kv_cache_scales_dict: + kv_scales_path = os.path.join(save_path, "kv_cache_scales.safetensors") + kv_scales_dict = {} + kv_scale_map = {} + for name, scale in self.quant_model.kv_cache_scales_dict.items(): + kv_scales_dict[name] = scale + kv_scale_map[name] = "kv_cache_scales.safetensors" + safe_save(kv_scales_dict, kv_scales_path) + print_info("Save KV cache scales to: {}".format(kv_scales_path)) + new_model_index_file = os.path.join(save_path, "model.safetensors.index.json") + with open(new_model_index_file, "r") as f: + new_model_index = json.load(f) + new_model_index["weight_map"].update(kv_scale_map) + with open(os.path.join(save_path, "model.safetensors.index.json"), "w") as f: + json.dump(new_model_index, f, indent=2) class PTQOnlyScaleSave(PTQSaveBase): diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index 3044d97d..5a2f7e5d 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -205,7 +205,61 @@ def save(self, save_path: str): save_func = self.quant_model.get_save_func()(self.quant_model) save_func.save(save_path) + def get_meta_weights_info(self, model): + """获取所有meta权重的详细信息""" + meta_params = [] + + for name, param in model.named_parameters(): + if param.device.type == 'meta': + meta_params.append({ + 'name': name, + }) + return meta_params + + def set_meta_weights_info(self, model): + """替换所有meta权重""" + orign_w_dict = {} + for name, param in model.named_parameters(): + if param.device.type == 'meta': + with open( + os.path.join( + self.absolute_model_path, "model.safetensors.index.json" + ), + "r", + ) as f: + model_index = json.load(f) + orign_w_file = os.path.join( + self.absolute_model_path, + model_index["weight_map"][name], + ) + if orign_w_file in orign_w_dict.keys(): + orign_w = orign_w_dict[orign_w_file] + else: + orign_w = load_file(orign_w_file, device="cpu") + orign_w_dict[orign_w_file] = orign_w + + empty_tensor = torch.empty(param.data.shape, dtype=param.data.dtype, device='cpu') + new_param = torch.nn.Parameter(empty_tensor) + new_param.data = orign_w[name] + parts = name.split('.') + current_module = model + + # 导航到包含参数的模块 + for part in parts[:-1]: + current_module = getattr(current_module, part) + + # 设置新的参数 + setattr(current_module, parts[-1], new_param) + + del orign_w_dict + + + def _convert(self): + self.set_meta_weights_info(self.quant_model.model) + print_info(f"Meta weight:{self.get_meta_weights_info(self.quant_model.model)}") + + # 1. get act, weight and kv-cache scale for name, sub_layer in self.ptq_hook.quant_layers_dict.items(): if ( diff --git a/angelslim/engine.py b/angelslim/engine.py index 6b21e647..757d8741 100644 --- a/angelslim/engine.py +++ b/angelslim/engine.py @@ -121,7 +121,6 @@ def prepare_model( low_cpu_mem_usage=low_cpu_mem_usage, use_cache=use_cache, using_multi_nodes=using_multi_nodes, - attn_implementation=attn_implementation, ) self.model_path = model_path elif self.series in ["Omni"]: diff --git a/angelslim/models/llm/__init__.py b/angelslim/models/llm/__init__.py index e8735382..6a6a7a1a 100644 --- a/angelslim/models/llm/__init__.py +++ b/angelslim/models/llm/__init__.py @@ -16,6 +16,7 @@ from .glm import GLM # noqa: F401 from .hunyuan_dense import HunyuanDense # noqa: F401 from .hunyuan_moe import HunyuanMoE # noqa: F401 +from .hunyuan_v3_moe import HYV3MoE # noqa: F401 from .kimi_k2 import KimiK2 # noqa: F401 from .llama import Llama # noqa: F401 from .qwen import Qwen # noqa: F401 diff --git a/angelslim/models/llm/hunyuan_v3_moe.py b/angelslim/models/llm/hunyuan_v3_moe.py new file mode 100644 index 00000000..701cb9cc --- /dev/null +++ b/angelslim/models/llm/hunyuan_v3_moe.py @@ -0,0 +1,389 @@ +# Copyright 2025 Tencent Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import torch +import torch.nn as nn +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.models.hy_v3.modeling_hy_v3 import ( + ALL_ATTENTION_FUNCTIONS, + HYV3Experts, + apply_rotary_pos_emb, + eager_attention_forward, +) + +from ...compressor.quant.core import PTQSaveVllmHF +from ...utils.utils import find_layers, find_parent_layer_and_sub_name +from ..base_model import BaseLLMModel +from ..model_factory import SlimModelFactory + + +class HYV3ExpertsWithLinear(HYV3Experts): + """Wrapper around HYV3Experts that exposes per-expert weights as nn.Linear modules. + + HYV3Experts stores all expert weights as 3-D nn.Parameter tensors, which are + invisible to AngelSlim's find_layers() and PTQ hook (both only recognise + nn.Linear). This wrapper splits those tensors into individual nn.Linear + modules at construction time so that the standard quantisation pipeline can + observe and quantise them. + + Weight shape mapping + -------------------- + gate_up_proj : [num_experts, 2*intermediate_dim, hidden_dim] + gate_up_proj[i] → chunk(2, dim=0) + gate_proj[i].weight : [intermediate_dim, hidden_dim] + up_proj[i].weight : [intermediate_dim, hidden_dim] + down_proj : [num_experts, hidden_dim, intermediate_dim] + down_proj[i] → down_proj[i].weight : [hidden_dim, intermediate_dim] + """ + + def __init__(self, experts_layer): + # Bypass HYV3Experts.__init__ to avoid allocating large empty Parameter + # tensors that we would immediately overwrite. HYV3Experts does not + # store self.config, so we copy the required scalar attributes directly. + nn.Module.__init__(self) + self.num_experts = experts_layer.num_experts + self.hidden_dim = experts_layer.hidden_dim + self.intermediate_dim = experts_layer.intermediate_dim + self.act_fn = experts_layer.act_fn + + for expert_idx in range(self.num_experts): + expert = nn.ModuleDict( + { + "gate_proj": nn.Linear(self.hidden_dim, self.intermediate_dim, bias=False), + "up_proj": nn.Linear(self.hidden_dim, self.intermediate_dim, bias=False), + "down_proj": nn.Linear(self.intermediate_dim, self.hidden_dim, bias=False), + } + ) + # gate_up_proj[i]: [2*intermediate_dim, hidden_dim] + # chunk on dim=0 → [intermediate_dim, hidden_dim] each + expert["gate_proj"].weight.data, expert["up_proj"].weight.data = ( + experts_layer.gate_up_proj[expert_idx].chunk(2, dim=0) + ) + # down_proj[i]: [hidden_dim, intermediate_dim] + expert["down_proj"].weight.data = experts_layer.down_proj[expert_idx] + setattr(self, f"{expert_idx}", expert) + + def forward( + self, + hidden_states: torch.Tensor, + top_k_index: torch.Tensor, + top_k_weights: torch.Tensor, + ) -> torch.Tensor: + final_hidden_states = torch.zeros_like(hidden_states) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts) + expert_mask = expert_mask.permute(2, 1, 0) + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + + for expert_idx in expert_hit: + expert_idx = expert_idx[0] + if expert_idx == self.num_experts: + continue + top_k_pos, token_idx = torch.where(expert_mask[expert_idx]) + current_state = hidden_states[token_idx] + expert_layer = getattr(self, f"{expert_idx}") + gate = expert_layer["gate_proj"](current_state) + up = expert_layer["up_proj"](current_state) + current_hidden_states = self.act_fn(gate) * up + current_hidden_states = expert_layer["down_proj"](current_hidden_states) + current_hidden_states = ( + current_hidden_states * top_k_weights[token_idx, top_k_pos, None] + ) + final_hidden_states.index_add_( + 0, token_idx, current_hidden_states.to(final_hidden_states.dtype) + ) + + return final_hidden_states + + +@SlimModelFactory.register +class HYV3MoE(BaseLLMModel): + def __init__( + self, + model=None, + deploy_backend="vllm", + ): + super().__init__( + model=model, + deploy_backend=deploy_backend, + ) + self.block_name = "model.layers" + # Store original forward methods for restoration + self._original_attn_forwards = {} + # Store KV cache observers: {attn_layer_name: {"key_observer": ..., "value_observer": ...}} + self.kv_cache_observers = {} + + def from_pretrained( + self, + model_path, + torch_dtype="auto", + device_map="auto", + trust_remote_code=True, + low_cpu_mem_usage=True, + use_cache=False, + using_multi_nodes=False, + ): + attn_implementation = "eager" + torch_dtype = torch.bfloat16 + self.model = AutoModelForCausalLM.from_pretrained( + model_path, + attn_implementation=attn_implementation, + torch_dtype=torch_dtype, + device_map=device_map, + trust_remote_code=trust_remote_code, + low_cpu_mem_usage=low_cpu_mem_usage, + use_cache=use_cache, + ) + + # Load tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) + + def replace_moe(self): + """Replace HYV3Experts instances with HYV3ExpertsWithLinear. + + This must be called before init_ptq() so that find_layers() can discover + the per-expert nn.Linear modules and register them with the PTQ hook. + """ + for name, module in self.model.named_modules(): + if isinstance(module, HYV3Experts) and not isinstance( + module, HYV3ExpertsWithLinear + ): + parent_layer, sub_name = find_parent_layer_and_sub_name(self.model, name) + moe_linear = HYV3ExpertsWithLinear(module) + del module + setattr(parent_layer, sub_name, moe_linear) + + def init_ptq(self, slim_config): + self.replace_moe() + super().init_ptq(slim_config) + + def get_observer_layers(self): + names = [ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.o_proj", + "mlp.gate_proj", + "mlp.up_proj", + "mlp.down_proj", + "shared_mlp.gate_proj", + "shared_mlp.up_proj", + "shared_mlp.down_proj", + ] + expert_pattern = [ + r"model\.layers\.\d+\.mlp\.experts\.\d+\.gate_proj", + r"model\.layers\.\d+\.mlp\.experts\.\d+\.up_proj", + r"model\.layers\.\d+\.mlp\.experts\.\d+\.down_proj", + ] + + obs_layers = [nn.Linear] + observer_layers_dict = find_layers(self.model, layers=obs_layers) + + compiled_patterns = [re.compile(pattern) for pattern in expert_pattern] + + observer_layers_dict = { + k: v + for k, v in observer_layers_dict.items() + if k.startswith(self.block_name) + and ( + any(name in k for name in names) + or any(pattern.search(k) for pattern in compiled_patterns) + ) + } + + if self.quant_config.custom_observe_layers_names != "default": + for custom_observe_name in self.quant_config.custom_observe_layers_names: + for default_name in observer_layers_dict.keys(): + if custom_observe_name not in default_name: + observer_layers_dict.pop(default_name) + return observer_layers_dict + + def get_parent_dict(self, observer_layers_dict): + parent_mapping = {r"experts\.\d+": "experts"} + parent_dict = {} + for layer_name in observer_layers_dict.keys(): + parent_name = layer_name + for k, v in parent_mapping.items(): + parent_name = re.sub(k, v, layer_name) + if parent_name != layer_name: + parent_dict[layer_name] = parent_name + return parent_dict + + def get_kvcache_observer_layers_names(self, observe_names): + """Return empty list since we use attention-level patching for KV cache.""" + # Return empty list to disable the default k_proj/v_proj output observation + # We will use apply_kvcache_observers() instead for RoPE-after key/value states + return [] + + def get_attention_layers(self): + """Get all attention layers in the model.""" + attention_layers = {} + for name, module in self.model.named_modules(): + if name.endswith(".self_attn") and hasattr(module, "forward"): + # Verify it has k_proj and v_proj attributes + if hasattr(module, "k_proj") and hasattr(module, "v_proj"): + attention_layers[name] = module + return attention_layers + + def apply_kvcache_observers(self, kv_cache_observer_class, quant_bits=8): + """ + Apply KV cache observers to attention layers using monkey patching. + This observes key_states and value_states AFTER RoPE is applied. + + Args: + kv_cache_observer_class: The observer class to use (e.g., AbsmaxPertensorObserver) + quant_bits: Quantization bits for the observer + """ + from ...compressor.quant.observers import AbsmaxPertensorObserver + + if kv_cache_observer_class is None: + kv_cache_observer_class = AbsmaxPertensorObserver + + attention_layers = self.get_attention_layers() + + for attn_name, attn_module in attention_layers.items(): + # Create observers for key and value states + key_observer = kv_cache_observer_class( + layer=attn_module.k_proj, + quant_bits=quant_bits, + ) + value_observer = kv_cache_observer_class( + layer=attn_module.v_proj, + quant_bits=quant_bits, + ) + + # Store observers + self.kv_cache_observers[attn_name] = { + "key_observer": key_observer, + "value_observer": value_observer, + } + + # Save original forward + self._original_attn_forwards[attn_name] = attn_module.forward + + # Create patched forward + self._patch_attention_forward(attn_module, attn_name) + + def _patch_attention_forward(self, attn_module, attn_name): + """ + Patch the attention module's forward method to observe KV cache after RoPE. + + Adapted to the new transformers ``HYV3Attention.forward`` signature, where + rotary embeddings are pre-computed and passed in as ``position_embeddings`` + (a ``(cos, sin)`` tuple), ``q_norm``/``k_norm`` are applied unconditionally + on the pre-transpose view, and attention dispatch goes through + ``ALL_ATTENTION_FUNCTIONS``. + """ + key_observer = self.kv_cache_observers[attn_name]["key_observer"] + value_observer = self.kv_cache_observers[attn_name]["value_observer"] + + def patched_forward( + hidden_states, + position_embeddings, + attention_mask, + past_key_values=None, + cache_position=None, + **kwargs, + ): + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, attn_module.head_dim) + + query_states = attn_module.q_proj(hidden_states).view(hidden_shape) + key_states = attn_module.k_proj(hidden_states).view(hidden_shape) + value_states = attn_module.v_proj(hidden_states).view(hidden_shape) + + query_states = attn_module.q_norm(query_states) + key_states = attn_module.k_norm(key_states) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin + ) + + # === OBSERVE KV CACHE AFTER RoPE === + key_observer(key_states) + value_observer(value_states) + # === END OBSERVE === + + if past_key_values is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update( + key_states, value_states, attn_module.layer_idx, cache_kwargs + ) + + attention_interface = eager_attention_forward + if attn_module.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[ + attn_module.config._attn_implementation + ] + + attn_output, attn_weights = attention_interface( + attn_module, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not attn_module.training else attn_module.attention_dropout, + scaling=attn_module.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = attn_module.o_proj(attn_output) + return attn_output, attn_weights + + # Replace the forward method + attn_module.forward = patched_forward + + def remove_kvcache_observers(self): + """Remove patched forward methods and restore original ones.""" + for attn_name, original_forward in self._original_attn_forwards.items(): + # Find the attention module and restore its forward + parts = attn_name.split(".") + module = self.model + for part in parts: + module = getattr(module, part) + module.forward = original_forward + + self._original_attn_forwards.clear() + + def get_kvcache_scales(self): + """ + Get KV cache scales from observers. + Returns dict with format: {"layer_name.k_cache.scale": scale, + "layer_name.v_cache.scale": scale} + """ + kv_scales = {} + for attn_name, observers in self.kv_cache_observers.items(): + key_scale = observers["key_observer"].scales() + value_scale = observers["value_observer"].scales() + kv_scales[f"{attn_name}.k_cache.scale"] = key_scale + kv_scales[f"{attn_name}.v_cache.scale"] = value_scale + return kv_scales + + def get_save_func(self): + if self.deploy_backend in ["vllm", "huggingface"]: + return PTQSaveVllmHF + else: + raise NotImplementedError( + f"deploy_backend {self.deploy_backend} is not supported for saving." + ) diff --git a/configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml b/configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml new file mode 100644 index 00000000..34217472 --- /dev/null +++ b/configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml @@ -0,0 +1,37 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: HYV3MoE + model_path: tencent/Hy3-preview + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_static + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + kv_cache: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "lm_head" + - "model.embed_tokens" + cpu_convert: true + save_name: "fp8" + +# Dataset for calibration +dataset: + name: TextDataset + data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl + max_seq_length: 2048 + num_samples: 512 + batch_size: 1 From ac18ea22977c28230b6ea357642667d9292f2d59 Mon Sep 17 00:00:00 2001 From: linchuanxie Date: Thu, 23 Apr 2026 13:39:30 +0800 Subject: [PATCH 2/3] support hy3.0 fp8 quantization --- angelslim/compressor/quant/core/hook.py | 6 +++--- angelslim/compressor/quant/core/save.py | 5 ++++- angelslim/compressor/quant/ptq.py | 23 ++++++++++------------- angelslim/models/llm/hunyuan_v3_moe.py | 8 ++------ 4 files changed, 19 insertions(+), 23 deletions(-) diff --git a/angelslim/compressor/quant/core/hook.py b/angelslim/compressor/quant/core/hook.py index 0390a5a8..ecea808f 100644 --- a/angelslim/compressor/quant/core/hook.py +++ b/angelslim/compressor/quant/core/hook.py @@ -62,7 +62,7 @@ def apply_hook(self): self._forward_hook_list.append(forward_hook_handle) # Apply KV cache observers using monkey patching (for attention-level observation) - if kv_cache_observer is not None and hasattr(self.quant_model, 'apply_kvcache_observers'): + if kv_cache_observer is not None and hasattr(self.quant_model, "apply_kvcache_observers"): quant_bits = self.quant_model.quant_algo_dict.get("c_quant_bits", 8) self.quant_model.apply_kvcache_observers( kv_cache_observer_class=kv_cache_observer, @@ -97,7 +97,7 @@ def remove_hook(self): hook.remove() self._forward_hook_list = [] # Remove KV cache observer patches if available - if hasattr(self.quant_model, 'remove_kvcache_observers'): + if hasattr(self.quant_model, "remove_kvcache_observers"): self.quant_model.remove_kvcache_observers() def post_process(self): @@ -123,7 +123,7 @@ def post_process(self): ) if self.quant_model.quant_algo_dict["c_quant_algo"] == "fp8": # Process KV cache scales from attention-level observers - if hasattr(self.quant_model, 'get_kvcache_scales'): + if hasattr(self.quant_model, "get_kvcache_scales"): kv_scales = self.quant_model.get_kvcache_scales() for k, v in kv_scales.items(): self.quant_model.kv_cache_scales_dict[k] = v / maxval.type(v.dtype) diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py index bb964272..53f5e446 100644 --- a/angelslim/compressor/quant/core/save.py +++ b/angelslim/compressor/quant/core/save.py @@ -301,7 +301,10 @@ def save(self, save_path): self.quant_model.tokenizer.save_pretrained(save_path) # Save KV cache scales if available - if hasattr(self.quant_model, 'kv_cache_scales_dict') and self.quant_model.kv_cache_scales_dict: + if ( + hasattr(self.quant_model, "kv_cache_scales_dict") + and self.quant_model.kv_cache_scales_dict + ): kv_scales_path = os.path.join(save_path, "kv_cache_scales.safetensors") kv_scales_dict = {} kv_scale_map = {} diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index 5a2f7e5d..4de5f20e 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -210,21 +210,21 @@ def get_meta_weights_info(self, model): meta_params = [] for name, param in model.named_parameters(): - if param.device.type == 'meta': - meta_params.append({ - 'name': name, - }) + if param.device.type == "meta": + meta_params.append( + { + "name": name, + } + ) return meta_params def set_meta_weights_info(self, model): """替换所有meta权重""" orign_w_dict = {} for name, param in model.named_parameters(): - if param.device.type == 'meta': + if param.device.type == "meta": with open( - os.path.join( - self.absolute_model_path, "model.safetensors.index.json" - ), + os.path.join(self.absolute_model_path, "model.safetensors.index.json"), "r", ) as f: model_index = json.load(f) @@ -238,10 +238,10 @@ def set_meta_weights_info(self, model): orign_w = load_file(orign_w_file, device="cpu") orign_w_dict[orign_w_file] = orign_w - empty_tensor = torch.empty(param.data.shape, dtype=param.data.dtype, device='cpu') + empty_tensor = torch.empty(param.data.shape, dtype=param.data.dtype, device="cpu") new_param = torch.nn.Parameter(empty_tensor) new_param.data = orign_w[name] - parts = name.split('.') + parts = name.split(".") current_module = model # 导航到包含参数的模块 @@ -253,13 +253,10 @@ def set_meta_weights_info(self, model): del orign_w_dict - - def _convert(self): self.set_meta_weights_info(self.quant_model.model) print_info(f"Meta weight:{self.get_meta_weights_info(self.quant_model.model)}") - # 1. get act, weight and kv-cache scale for name, sub_layer in self.ptq_hook.quant_layers_dict.items(): if ( diff --git a/angelslim/models/llm/hunyuan_v3_moe.py b/angelslim/models/llm/hunyuan_v3_moe.py index 701cb9cc..d8919643 100644 --- a/angelslim/models/llm/hunyuan_v3_moe.py +++ b/angelslim/models/llm/hunyuan_v3_moe.py @@ -160,9 +160,7 @@ def replace_moe(self): the per-expert nn.Linear modules and register them with the PTQ hook. """ for name, module in self.model.named_modules(): - if isinstance(module, HYV3Experts) and not isinstance( - module, HYV3ExpertsWithLinear - ): + if isinstance(module, HYV3Experts) and not isinstance(module, HYV3ExpertsWithLinear): parent_layer, sub_name = find_parent_layer_and_sub_name(self.model, name) moe_linear = HYV3ExpertsWithLinear(module) del module @@ -315,9 +313,7 @@ def patched_forward( value_states = value_states.transpose(1, 2) cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin - ) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) # === OBSERVE KV CACHE AFTER RoPE === key_observer(key_states) From 273b399d3efd7e9db7f0bf861fddc1643f9c97a5 Mon Sep 17 00:00:00 2001 From: linchuanxie Date: Thu, 23 Apr 2026 15:54:41 +0800 Subject: [PATCH 3/3] update readme --- README.md | 4 ++-- README_cn.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c03438a0..4de8dd8c 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ A more accessible, comprehensive, and efficient toolkit for large model compress

## 📣Latest News -- [26/04/23] We now support FP8-Static quantization for **Hunyuan3.0** (MoE A20B). +- [26/04/23] We now support FP8-Static quantization for **Hy3-preview** (MoE A20B). - [26/03/25] We have released **DAQ**, the quantization algorithm that preserves the knowledge acquired while the update of parameters is relatively small during post-training training.[[Paper]](https://arxiv.org/abs/2603.22324) | [[Docs]](docs/source/features/quantization/daq.md) - [26/02/09] We have released HY-1.8B-2Bit, 2bit on-device large language model,[[Huggingface]](https://huggingface.co/AngelSlim/HY-1.8B-2Bit). - [26/01/13] We have released v0.3. We support the training and deployment of Eagle3 for all-scale LLMs/VLMs/Audio models, as detailed in the [guidance documentation](https://angelslim.readthedocs.io/zh-cn/latest/features/speculative_decoding/eagle/index.html). And We released **Sherry**, the hardware-efficient 1.25 bit quantization algorithm [[Paper]](https://arxiv.org/abs/2601.07892) | [[Code]](https://github.com/Tencent/AngelSlim/tree/sherry/Sherry)🔥🔥🔥 @@ -254,7 +254,7 @@ python3 tools/run.py -c configs/qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml This example produces quantized model weights by performing PTQ calibration on a model loaded from HuggingFace. -For **Hunyuan3.0** (MoE A20B) FP8-Static quantization: +For **Hy3-preview** (MoE A20B) FP8-Static quantization: ```shell python tools/run.py -c configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml diff --git a/README_cn.md b/README_cn.md index 9d42bc17..e775c4f9 100644 --- a/README_cn.md +++ b/README_cn.md @@ -22,7 +22,7 @@

## 📣最新进展 -- [26/04/23] 我们支持了 **Hunyuan3.0**(MoE A20B)模型的 FP8-Static 量化。 +- [26/04/23] 我们支持了 **Hy3-preview**(MoE A20B)模型的 FP8-Static 量化。 - [26/03/25] 我们发布了量化算法DAQ,该方法在后训练参数更新较小时,可保留量化后模型能力 [[论文]](https://arxiv.org/abs/2603.22324) | [[文档]](docs/source/features/quantization/daq.md) - [26/02/09] 我们发布了 HY-1.8B-2Bit, 2比特端侧大模型, 模型可见[[Huggingface]](https://huggingface.co/AngelSlim/HY-1.8B-2Bit). - [26/01/13] 我们发布V0.3版本, 支持了全模态场景的投机采样训练及部署,文档:[Eagle3 for LLM/VLM/Audio](https://angelslim.readthedocs.io/zh-cn/latest/features/speculative_decoding/eagle/index.html)。并且我们发布了 **Sherry** 新的硬件高效的1.25bit三值量化算法 [[论文]](https://arxiv.org/abs/2601.07892) | [[代码]](https://github.com/Tencent/AngelSlim/tree/sherry/Sherry)🔥🔥🔥 @@ -253,7 +253,7 @@ bash scripts/speculative/train_eagle3_online.sh 该示例将会加载`HugggingFace`模型进行PTQ量化校准,最终量化产出模型权重. -对 **Hunyuan3.0**(MoE A20B)进行 FP8-Static 量化: +对 **Hy3-preview**(MoE A20B)进行 FP8-Static 量化: ```shell python tools/run.py -c configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml