From dcb1f4cef50aa1648b56b9607b10332e9b6e0c41 Mon Sep 17 00:00:00 2001
From: linchuanxie <linchuanxie@tencent.com>
Date: Thu, 23 Apr 2026 13:28:29 +0800
Subject: [PATCH 1/3] support hy3.0 fp8 quantization

---
 README.md                                     |   7 +
 README_cn.md                                  |   7 +
 angelslim/compressor/quant/core/config.py     |  22 +-
 angelslim/compressor/quant/core/hook.py       |  22 +-
 angelslim/compressor/quant/core/save.py       |  31 +-
 angelslim/compressor/quant/ptq.py             |  54 +++
 angelslim/engine.py                           |   1 -
 angelslim/models/llm/__init__.py              |   1 +
 angelslim/models/llm/hunyuan_v3_moe.py        | 389 ++++++++++++++++++
 .../hunyuanv3_a20b_fp8_static_c8.yaml         |  37 ++
 10 files changed, 563 insertions(+), 8 deletions(-)
 create mode 100644 angelslim/models/llm/hunyuan_v3_moe.py
 create mode 100644 configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml
diff --git a/README.md b/README.md
index 6c2d41a4..c03438a0 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ A more accessible, comprehensive, and efficient toolkit for large model compress
 </p>
 
 ## 📣Latest News
+- [26/04/23] We now support FP8-Static quantization for **Hunyuan3.0** (MoE A20B).
 - [26/03/25] We have released **DAQ**, the quantization algorithm that preserves the knowledge acquired while the update of parameters is relatively small during post-training training.[[Paper]](https://arxiv.org/abs/2603.22324) | [[Docs]](docs/source/features/quantization/daq.md)
 - [26/02/09] We have released HY-1.8B-2Bit, 2bit on-device large language model,[[Huggingface]](https://huggingface.co/AngelSlim/HY-1.8B-2Bit).
 - [26/01/13] We have released v0.3. We support the training and deployment of Eagle3 for all-scale LLMs/VLMs/Audio models, as detailed in the [guidance documentation](https://angelslim.readthedocs.io/zh-cn/latest/features/speculative_decoding/eagle/index.html). And We released **Sherry**, the hardware-efficient 1.25 bit quantization algorithm [[Paper]](https://arxiv.org/abs/2601.07892) | [[Code]](https://github.com/Tencent/AngelSlim/tree/sherry/Sherry)🔥🔥🔥
@@ -253,6 +254,12 @@ python3 tools/run.py -c configs/qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml
 
 This example produces quantized model weights by performing PTQ calibration on a model loaded from HuggingFace.
 
+For **Hunyuan3.0** (MoE A20B) FP8-Static quantization:
+
+```shell
+python tools/run.py -c configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml
+```
+
 <details>
 <summary>Code-based Start</summary>
 
diff --git a/README_cn.md b/README_cn.md
index ae1fc891..9d42bc17 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -22,6 +22,7 @@
 </p>
 
 ## 📣最新进展
+- [26/04/23] 我们支持了 **Hunyuan3.0**（MoE A20B）模型的 FP8-Static 量化。
 - [26/03/25] 我们发布了量化算法DAQ，该方法在后训练参数更新较小时，可保留量化后模型能力 [[论文]](https://arxiv.org/abs/2603.22324) | [[文档]](docs/source/features/quantization/daq.md)
 - [26/02/09] 我们发布了 HY-1.8B-2Bit, 2比特端侧大模型, 模型可见[[Huggingface]](https://huggingface.co/AngelSlim/HY-1.8B-2Bit).
 - [26/01/13] 我们发布V0.3版本， 支持了全模态场景的投机采样训练及部署，文档：[Eagle3 for LLM/VLM/Audio](https://angelslim.readthedocs.io/zh-cn/latest/features/speculative_decoding/eagle/index.html)。并且我们发布了 **Sherry** 新的硬件高效的1.25bit三值量化算法 [[论文]](https://arxiv.org/abs/2601.07892) | [[代码]](https://github.com/Tencent/AngelSlim/tree/sherry/Sherry)🔥🔥🔥
@@ -252,6 +253,12 @@ bash scripts/speculative/train_eagle3_online.sh
 
   该示例将会加载`HugggingFace`模型进行PTQ量化校准，最终量化产出模型权重.
 
+对 **Hunyuan3.0**（MoE A20B）进行 FP8-Static 量化：
+
+  ```shell
+  python tools/run.py -c configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml
+  ```
+
 <details>
 <summary>2、源码启动</summary>
 
diff --git a/angelslim/compressor/quant/core/config.py b/angelslim/compressor/quant/core/config.py
index 872cba57..1ae16296 100644
--- a/angelslim/compressor/quant/core/config.py
+++ b/angelslim/compressor/quant/core/config.py
@@ -31,7 +31,10 @@
     "per-group": AbsMaxGroupWiseWeightObserver,
 }
 
-KVCACHE_OBSERVERS_CLASS = {"per-channel": AbsmaxPerchannelObserver}
+KVCACHE_OBSERVERS_CLASS = {
+    "per-channel": AbsmaxPerchannelObserver,
+    "per-tensor": AbsmaxPertensorObserver,
+}
 
 
 class QuantConfig:
@@ -60,6 +63,7 @@ def __init__(self, config, global_config=None):
         self.quant_helpers = quantization_args.quant_helpers
         act_quant_method = quantization_args.quant_method.get("activation", None)
         weight_quant_method = quantization_args.quant_method["weight"]
+        kv_cache_quant_method = quantization_args.quant_method.get("kv_cache", None)
         self.cpu_convert = quantization_args.cpu_convert
         self.save_name = quantization_args.save_name
 
@@ -77,7 +81,11 @@ def __init__(self, config, global_config=None):
                 ACT_OBSERVERS_CLASS[act_quant_method] if "static" in is_dynamic else None
             )
             self.weight_observer = WEIGHT_OBSERVERS_CLASS[weight_quant_method]
-            self.kv_cache_observer = None
+            self.kv_cache_observer = (
+                KVCACHE_OBSERVERS_CLASS[kv_cache_quant_method]
+                if kv_cache_quant_method is not None
+                else None
+            )
 
             if "w4a8" in self.quant_algo:
                 group_size = (
@@ -98,6 +106,8 @@ def __init__(self, config, global_config=None):
 
             if act_quant_method is not None:
                 self.quant_algo_info["a"] = f"fp8_{act_quant_method}-{is_dynamic}"
+            if kv_cache_quant_method is not None:
+                self.quant_algo_info["c"] = f"fp8_{kv_cache_quant_method}"
             self.low_memory = config.quantization.low_memory
             self.quant_analyse = config.quantization.quant_analyse
             self.quant_vit = config.quantization.quant_vit
@@ -117,13 +127,19 @@ def __init__(self, config, global_config=None):
                 ACT_OBSERVERS_CLASS[act_quant_method] if "static" in is_dynamic else None
             )
             self.weight_observer = WEIGHT_OBSERVERS_CLASS[weight_quant_method]
-            self.kv_cache_observer = None
+            self.kv_cache_observer = (
+                KVCACHE_OBSERVERS_CLASS[kv_cache_quant_method]
+                if kv_cache_quant_method is not None
+                else None
+            )
             self.quant_algo_info = {
                 "w": f"int8_{weight_quant_method}",
                 "ignore_layers": quantization_args.ignore_layers,
             }
             if act_quant_method is not None:
                 self.quant_algo_info["a"] = f"int8_{act_quant_method}-{is_dynamic}"
+            if kv_cache_quant_method is not None:
+                self.quant_algo_info["c"] = f"int8_{kv_cache_quant_method}"
             self.low_memory = config.quantization.low_memory
             self.quant_analyse = config.quantization.quant_analyse
         elif "int4_awq" in self.quant_algo:
diff --git a/angelslim/compressor/quant/core/hook.py b/angelslim/compressor/quant/core/hook.py
index 893b34e0..0390a5a8 100644
--- a/angelslim/compressor/quant/core/hook.py
+++ b/angelslim/compressor/quant/core/hook.py
@@ -51,7 +51,9 @@ def apply_hook(self):
                 sub_layer,
                 act_observer,
                 weight_observer,
-                kv_cache_observer if name in self.kv_names else None,
+                # kv_cache_observer is now handled by monkey patching at attention level
+                # so we pass None here
+                None,
                 self.quant_model.quant_algo_dict,
                 **extra_kwargs
             )
@@ -59,6 +61,14 @@ def apply_hook(self):
             self.observer_dict[sub_layer] = observer
             self._forward_hook_list.append(forward_hook_handle)
 
+        # Apply KV cache observers using monkey patching (for attention-level observation)
+        if kv_cache_observer is not None and hasattr(self.quant_model, 'apply_kvcache_observers'):
+            quant_bits = self.quant_model.quant_algo_dict.get("c_quant_bits", 8)
+            self.quant_model.apply_kvcache_observers(
+                kv_cache_observer_class=kv_cache_observer,
+                quant_bits=quant_bits,
+            )
+
     def apply_smooth_hook(self, smooth_mapping_layers, smooth_observer):
         for smooth_layer, _ in smooth_mapping_layers.values():
             observer = PTQObserver(
@@ -86,6 +96,9 @@ def remove_hook(self):
         for hook in self._forward_hook_list:
             hook.remove()
         self._forward_hook_list = []
+        # Remove KV cache observer patches if available
+        if hasattr(self.quant_model, 'remove_kvcache_observers'):
+            self.quant_model.remove_kvcache_observers()
 
     def post_process(self):
         maxval = get_fp_maxval(bits=8)
@@ -109,5 +122,8 @@ def post_process(self):
                                 self.quant_model.act_scales_dict[name] / maxval.type(act_dtype)
                             )
         if self.quant_model.quant_algo_dict["c_quant_algo"] == "fp8":
-            for k, v in self.quant_model.kv_cache_scales_dict.items():
-                self.quant_model.kv_cache_scales_dict[k] = v / maxval.type(v.dtype)
+            # Process KV cache scales from attention-level observers
+            if hasattr(self.quant_model, 'get_kvcache_scales'):
+                kv_scales = self.quant_model.get_kvcache_scales()
+                for k, v in kv_scales.items():
+                    self.quant_model.kv_cache_scales_dict[k] = v / maxval.type(v.dtype)
diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py
index 3fab500e..bb964272 100644
--- a/angelslim/compressor/quant/core/save.py
+++ b/angelslim/compressor/quant/core/save.py
@@ -249,6 +249,17 @@ def save(self, save_path):
             raise ValueError(f"{self.quant_model.quant_config.quant_algo} not supported")
 
         quantization_config = {"quant_method": save_name, ignore_field: ignored_layers}
+        # Set kv_cache_scheme if kv_cache quantization is enabled
+        c_quant_algo = self.quant_model.quant_config.quant_algo_info.get("c", None)
+        if c_quant_algo is not None:
+            kv_cache_scheme = {
+                "num_bits": 8,
+                "strategy": re.search(r"per-([a-zA-Z]+)", c_quant_algo).group(1),
+                "type": "float",
+            }
+        else:
+            kv_cache_scheme = None
+
         if save_name == "compressed-tensors":
             quantization_config.update(
                 {
@@ -260,13 +271,15 @@ def save(self, save_path):
                             "targets": ["Linear"],
                         }
                     },
-                    "kv_cache_scheme": None,
+                    "kv_cache_scheme": kv_cache_scheme,
                     "format": quant_format,
                     "quantization_status": "compressed",
                 }
             )
         else:
             quantization_config["activation_scheme"] = "dynamic" if is_dynamic else "static"
+            if kv_cache_scheme is not None:
+                quantization_config["kv_cache_scheme"] = "static"
 
         if (
             hasattr(self.quant_model.quant_config, "transform_config")
@@ -287,6 +300,22 @@ def save(self, save_path):
             json.dump(trtllm_config, f, indent=4)
 
         self.quant_model.tokenizer.save_pretrained(save_path)
+        # Save KV cache scales if available
+        if hasattr(self.quant_model, 'kv_cache_scales_dict') and self.quant_model.kv_cache_scales_dict:
+            kv_scales_path = os.path.join(save_path, "kv_cache_scales.safetensors")
+            kv_scales_dict = {}
+            kv_scale_map = {}
+            for name, scale in self.quant_model.kv_cache_scales_dict.items():
+                kv_scales_dict[name] = scale
+                kv_scale_map[name] = "kv_cache_scales.safetensors"
+            safe_save(kv_scales_dict, kv_scales_path)
+            print_info("Save KV cache scales to: {}".format(kv_scales_path))
+            new_model_index_file = os.path.join(save_path, "model.safetensors.index.json")
+            with open(new_model_index_file, "r") as f:
+                new_model_index = json.load(f)
+            new_model_index["weight_map"].update(kv_scale_map)
+            with open(os.path.join(save_path, "model.safetensors.index.json"), "w") as f:
+                json.dump(new_model_index, f, indent=2)
 
 
 class PTQOnlyScaleSave(PTQSaveBase):
diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py
index 3044d97d..5a2f7e5d 100644
--- a/angelslim/compressor/quant/ptq.py
+++ b/angelslim/compressor/quant/ptq.py
@@ -205,7 +205,61 @@ def save(self, save_path: str):
             save_func = self.quant_model.get_save_func()(self.quant_model)
             save_func.save(save_path)
 
+    def get_meta_weights_info(self, model):
+        """获取所有meta权重的详细信息"""
+        meta_params = []
+
+        for name, param in model.named_parameters():
+            if param.device.type == 'meta':
+                meta_params.append({
+                    'name': name,
+                })
+        return meta_params
+
+    def set_meta_weights_info(self, model):
+        """替换所有meta权重"""
+        orign_w_dict = {}
+        for name, param in model.named_parameters():
+            if param.device.type == 'meta':
+                with open(
+                    os.path.join(
+                        self.absolute_model_path, "model.safetensors.index.json"
+                    ),
+                    "r",
+                ) as f:
+                    model_index = json.load(f)
+                orign_w_file = os.path.join(
+                    self.absolute_model_path,
+                    model_index["weight_map"][name],
+                )
+                if orign_w_file in orign_w_dict.keys():
+                    orign_w = orign_w_dict[orign_w_file]
+                else:
+                    orign_w = load_file(orign_w_file, device="cpu")
+                    orign_w_dict[orign_w_file] = orign_w
+
+                empty_tensor = torch.empty(param.data.shape, dtype=param.data.dtype, device='cpu')
+                new_param = torch.nn.Parameter(empty_tensor)
+                new_param.data = orign_w[name]
+                parts = name.split('.')
+                current_module = model
+
+                # 导航到包含参数的模块
+                for part in parts[:-1]:
+                    current_module = getattr(current_module, part)
+
+                # 设置新的参数
+                setattr(current_module, parts[-1], new_param)
+
+        del orign_w_dict
+
+
+
     def _convert(self):
+        self.set_meta_weights_info(self.quant_model.model)
+        print_info(f"Meta weight:{self.get_meta_weights_info(self.quant_model.model)}")
+
+
         # 1. get act, weight and kv-cache scale
         for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
             if (
diff --git a/angelslim/engine.py b/angelslim/engine.py
index 6b21e647..757d8741 100644
--- a/angelslim/engine.py
+++ b/angelslim/engine.py
@@ -121,7 +121,6 @@ def prepare_model(
                     low_cpu_mem_usage=low_cpu_mem_usage,
                     use_cache=use_cache,
                     using_multi_nodes=using_multi_nodes,
-                    attn_implementation=attn_implementation,
                 )
                 self.model_path = model_path
         elif self.series in ["Omni"]:
diff --git a/angelslim/models/llm/__init__.py b/angelslim/models/llm/__init__.py
index e8735382..6a6a7a1a 100644
--- a/angelslim/models/llm/__init__.py
+++ b/angelslim/models/llm/__init__.py
@@ -16,6 +16,7 @@
 from .glm import GLM  # noqa: F401
 from .hunyuan_dense import HunyuanDense  # noqa: F401
 from .hunyuan_moe import HunyuanMoE  # noqa: F401
+from .hunyuan_v3_moe import HYV3MoE  # noqa: F401
 from .kimi_k2 import KimiK2  # noqa: F401
 from .llama import Llama  # noqa: F401
 from .qwen import Qwen  # noqa: F401
diff --git a/angelslim/models/llm/hunyuan_v3_moe.py b/angelslim/models/llm/hunyuan_v3_moe.py
new file mode 100644
index 00000000..701cb9cc
--- /dev/null
+++ b/angelslim/models/llm/hunyuan_v3_moe.py
@@ -0,0 +1,389 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+import torch
+import torch.nn as nn
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.models.hy_v3.modeling_hy_v3 import (
+    ALL_ATTENTION_FUNCTIONS,
+    HYV3Experts,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+
+from ...compressor.quant.core import PTQSaveVllmHF
+from ...utils.utils import find_layers, find_parent_layer_and_sub_name
+from ..base_model import BaseLLMModel
+from ..model_factory import SlimModelFactory
+
+
+class HYV3ExpertsWithLinear(HYV3Experts):
+    """Wrapper around HYV3Experts that exposes per-expert weights as nn.Linear modules.
+
+    HYV3Experts stores all expert weights as 3-D nn.Parameter tensors, which are
+    invisible to AngelSlim's find_layers() and PTQ hook (both only recognise
+    nn.Linear).  This wrapper splits those tensors into individual nn.Linear
+    modules at construction time so that the standard quantisation pipeline can
+    observe and quantise them.
+
+    Weight shape mapping
+    --------------------
+    gate_up_proj : [num_experts, 2*intermediate_dim, hidden_dim]
+        gate_up_proj[i]  →  chunk(2, dim=0)
+            gate_proj[i].weight : [intermediate_dim, hidden_dim]
+            up_proj[i].weight   : [intermediate_dim, hidden_dim]
+    down_proj : [num_experts, hidden_dim, intermediate_dim]
+        down_proj[i] → down_proj[i].weight : [hidden_dim, intermediate_dim]
+    """
+
+    def __init__(self, experts_layer):
+        # Bypass HYV3Experts.__init__ to avoid allocating large empty Parameter
+        # tensors that we would immediately overwrite.  HYV3Experts does not
+        # store self.config, so we copy the required scalar attributes directly.
+        nn.Module.__init__(self)
+        self.num_experts = experts_layer.num_experts
+        self.hidden_dim = experts_layer.hidden_dim
+        self.intermediate_dim = experts_layer.intermediate_dim
+        self.act_fn = experts_layer.act_fn
+
+        for expert_idx in range(self.num_experts):
+            expert = nn.ModuleDict(
+                {
+                    "gate_proj": nn.Linear(self.hidden_dim, self.intermediate_dim, bias=False),
+                    "up_proj": nn.Linear(self.hidden_dim, self.intermediate_dim, bias=False),
+                    "down_proj": nn.Linear(self.intermediate_dim, self.hidden_dim, bias=False),
+                }
+            )
+            # gate_up_proj[i]: [2*intermediate_dim, hidden_dim]
+            # chunk on dim=0 → [intermediate_dim, hidden_dim] each
+            expert["gate_proj"].weight.data, expert["up_proj"].weight.data = (
+                experts_layer.gate_up_proj[expert_idx].chunk(2, dim=0)
+            )
+            # down_proj[i]: [hidden_dim, intermediate_dim]
+            expert["down_proj"].weight.data = experts_layer.down_proj[expert_idx]
+            setattr(self, f"{expert_idx}", expert)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        top_k_index: torch.Tensor,
+        top_k_weights: torch.Tensor,
+    ) -> torch.Tensor:
+        final_hidden_states = torch.zeros_like(hidden_states)
+        with torch.no_grad():
+            expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts)
+            expert_mask = expert_mask.permute(2, 1, 0)
+            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+        for expert_idx in expert_hit:
+            expert_idx = expert_idx[0]
+            if expert_idx == self.num_experts:
+                continue
+            top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
+            current_state = hidden_states[token_idx]
+            expert_layer = getattr(self, f"{expert_idx}")
+            gate = expert_layer["gate_proj"](current_state)
+            up = expert_layer["up_proj"](current_state)
+            current_hidden_states = self.act_fn(gate) * up
+            current_hidden_states = expert_layer["down_proj"](current_hidden_states)
+            current_hidden_states = (
+                current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
+            )
+            final_hidden_states.index_add_(
+                0, token_idx, current_hidden_states.to(final_hidden_states.dtype)
+            )
+
+        return final_hidden_states
+
+
+@SlimModelFactory.register
+class HYV3MoE(BaseLLMModel):
+    def __init__(
+        self,
+        model=None,
+        deploy_backend="vllm",
+    ):
+        super().__init__(
+            model=model,
+            deploy_backend=deploy_backend,
+        )
+        self.block_name = "model.layers"
+        # Store original forward methods for restoration
+        self._original_attn_forwards = {}
+        # Store KV cache observers: {attn_layer_name: {"key_observer": ..., "value_observer": ...}}
+        self.kv_cache_observers = {}
+
+    def from_pretrained(
+        self,
+        model_path,
+        torch_dtype="auto",
+        device_map="auto",
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+        use_cache=False,
+        using_multi_nodes=False,
+    ):
+        attn_implementation = "eager"
+        torch_dtype = torch.bfloat16
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            attn_implementation=attn_implementation,
+            torch_dtype=torch_dtype,
+            device_map=device_map,
+            trust_remote_code=trust_remote_code,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+            use_cache=use_cache,
+        )
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=trust_remote_code
+        )
+
+    def replace_moe(self):
+        """Replace HYV3Experts instances with HYV3ExpertsWithLinear.
+
+        This must be called before init_ptq() so that find_layers() can discover
+        the per-expert nn.Linear modules and register them with the PTQ hook.
+        """
+        for name, module in self.model.named_modules():
+            if isinstance(module, HYV3Experts) and not isinstance(
+                module, HYV3ExpertsWithLinear
+            ):
+                parent_layer, sub_name = find_parent_layer_and_sub_name(self.model, name)
+                moe_linear = HYV3ExpertsWithLinear(module)
+                del module
+                setattr(parent_layer, sub_name, moe_linear)
+
+    def init_ptq(self, slim_config):
+        self.replace_moe()
+        super().init_ptq(slim_config)
+
+    def get_observer_layers(self):
+        names = [
+            "self_attn.q_proj",
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.o_proj",
+            "mlp.gate_proj",
+            "mlp.up_proj",
+            "mlp.down_proj",
+            "shared_mlp.gate_proj",
+            "shared_mlp.up_proj",
+            "shared_mlp.down_proj",
+        ]
+        expert_pattern = [
+            r"model\.layers\.\d+\.mlp\.experts\.\d+\.gate_proj",
+            r"model\.layers\.\d+\.mlp\.experts\.\d+\.up_proj",
+            r"model\.layers\.\d+\.mlp\.experts\.\d+\.down_proj",
+        ]
+
+        obs_layers = [nn.Linear]
+        observer_layers_dict = find_layers(self.model, layers=obs_layers)
+
+        compiled_patterns = [re.compile(pattern) for pattern in expert_pattern]
+
+        observer_layers_dict = {
+            k: v
+            for k, v in observer_layers_dict.items()
+            if k.startswith(self.block_name)
+            and (
+                any(name in k for name in names)
+                or any(pattern.search(k) for pattern in compiled_patterns)
+            )
+        }
+
+        if self.quant_config.custom_observe_layers_names != "default":
+            for custom_observe_name in self.quant_config.custom_observe_layers_names:
+                for default_name in observer_layers_dict.keys():
+                    if custom_observe_name not in default_name:
+                        observer_layers_dict.pop(default_name)
+        return observer_layers_dict
+
+    def get_parent_dict(self, observer_layers_dict):
+        parent_mapping = {r"experts\.\d+": "experts"}
+        parent_dict = {}
+        for layer_name in observer_layers_dict.keys():
+            parent_name = layer_name
+            for k, v in parent_mapping.items():
+                parent_name = re.sub(k, v, layer_name)
+            if parent_name != layer_name:
+                parent_dict[layer_name] = parent_name
+        return parent_dict
+
+    def get_kvcache_observer_layers_names(self, observe_names):
+        """Return empty list since we use attention-level patching for KV cache."""
+        # Return empty list to disable the default k_proj/v_proj output observation
+        # We will use apply_kvcache_observers() instead for RoPE-after key/value states
+        return []
+
+    def get_attention_layers(self):
+        """Get all attention layers in the model."""
+        attention_layers = {}
+        for name, module in self.model.named_modules():
+            if name.endswith(".self_attn") and hasattr(module, "forward"):
+                # Verify it has k_proj and v_proj attributes
+                if hasattr(module, "k_proj") and hasattr(module, "v_proj"):
+                    attention_layers[name] = module
+        return attention_layers
+
+    def apply_kvcache_observers(self, kv_cache_observer_class, quant_bits=8):
+        """
+        Apply KV cache observers to attention layers using monkey patching.
+        This observes key_states and value_states AFTER RoPE is applied.
+
+        Args:
+            kv_cache_observer_class: The observer class to use (e.g., AbsmaxPertensorObserver)
+            quant_bits: Quantization bits for the observer
+        """
+        from ...compressor.quant.observers import AbsmaxPertensorObserver
+
+        if kv_cache_observer_class is None:
+            kv_cache_observer_class = AbsmaxPertensorObserver
+
+        attention_layers = self.get_attention_layers()
+
+        for attn_name, attn_module in attention_layers.items():
+            # Create observers for key and value states
+            key_observer = kv_cache_observer_class(
+                layer=attn_module.k_proj,
+                quant_bits=quant_bits,
+            )
+            value_observer = kv_cache_observer_class(
+                layer=attn_module.v_proj,
+                quant_bits=quant_bits,
+            )
+
+            # Store observers
+            self.kv_cache_observers[attn_name] = {
+                "key_observer": key_observer,
+                "value_observer": value_observer,
+            }
+
+            # Save original forward
+            self._original_attn_forwards[attn_name] = attn_module.forward
+
+            # Create patched forward
+            self._patch_attention_forward(attn_module, attn_name)
+
+    def _patch_attention_forward(self, attn_module, attn_name):
+        """
+        Patch the attention module's forward method to observe KV cache after RoPE.
+
+        Adapted to the new transformers ``HYV3Attention.forward`` signature, where
+        rotary embeddings are pre-computed and passed in as ``position_embeddings``
+        (a ``(cos, sin)`` tuple), ``q_norm``/``k_norm`` are applied unconditionally
+        on the pre-transpose view, and attention dispatch goes through
+        ``ALL_ATTENTION_FUNCTIONS``.
+        """
+        key_observer = self.kv_cache_observers[attn_name]["key_observer"]
+        value_observer = self.kv_cache_observers[attn_name]["value_observer"]
+
+        def patched_forward(
+            hidden_states,
+            position_embeddings,
+            attention_mask,
+            past_key_values=None,
+            cache_position=None,
+            **kwargs,
+        ):
+            input_shape = hidden_states.shape[:-1]
+            hidden_shape = (*input_shape, -1, attn_module.head_dim)
+
+            query_states = attn_module.q_proj(hidden_states).view(hidden_shape)
+            key_states = attn_module.k_proj(hidden_states).view(hidden_shape)
+            value_states = attn_module.v_proj(hidden_states).view(hidden_shape)
+
+            query_states = attn_module.q_norm(query_states)
+            key_states = attn_module.k_norm(key_states)
+
+            query_states = query_states.transpose(1, 2)
+            key_states = key_states.transpose(1, 2)
+            value_states = value_states.transpose(1, 2)
+
+            cos, sin = position_embeddings
+            query_states, key_states = apply_rotary_pos_emb(
+                query_states, key_states, cos, sin
+            )
+
+            # === OBSERVE KV CACHE AFTER RoPE ===
+            key_observer(key_states)
+            value_observer(value_states)
+            # === END OBSERVE ===
+
+            if past_key_values is not None:
+                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+                key_states, value_states = past_key_values.update(
+                    key_states, value_states, attn_module.layer_idx, cache_kwargs
+                )
+
+            attention_interface = eager_attention_forward
+            if attn_module.config._attn_implementation != "eager":
+                attention_interface = ALL_ATTENTION_FUNCTIONS[
+                    attn_module.config._attn_implementation
+                ]
+
+            attn_output, attn_weights = attention_interface(
+                attn_module,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask,
+                dropout=0.0 if not attn_module.training else attn_module.attention_dropout,
+                scaling=attn_module.scaling,
+                **kwargs,
+            )
+
+            attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+            attn_output = attn_module.o_proj(attn_output)
+            return attn_output, attn_weights
+
+        # Replace the forward method
+        attn_module.forward = patched_forward
+
+    def remove_kvcache_observers(self):
+        """Remove patched forward methods and restore original ones."""
+        for attn_name, original_forward in self._original_attn_forwards.items():
+            # Find the attention module and restore its forward
+            parts = attn_name.split(".")
+            module = self.model
+            for part in parts:
+                module = getattr(module, part)
+            module.forward = original_forward
+
+        self._original_attn_forwards.clear()
+
+    def get_kvcache_scales(self):
+        """
+        Get KV cache scales from observers.
+        Returns dict with format: {"layer_name.k_cache.scale": scale,
+                                   "layer_name.v_cache.scale": scale}
+        """
+        kv_scales = {}
+        for attn_name, observers in self.kv_cache_observers.items():
+            key_scale = observers["key_observer"].scales()
+            value_scale = observers["value_observer"].scales()
+            kv_scales[f"{attn_name}.k_cache.scale"] = key_scale
+            kv_scales[f"{attn_name}.v_cache.scale"] = value_scale
+        return kv_scales
+
+    def get_save_func(self):
+        if self.deploy_backend in ["vllm", "huggingface"]:
+            return PTQSaveVllmHF
+        else:
+            raise NotImplementedError(
+                f"deploy_backend {self.deploy_backend} is not supported for saving."
+            )
diff --git a/configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml b/configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml
new file mode 100644
index 00000000..34217472
--- /dev/null
+++ b/configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml
@@ -0,0 +1,37 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: HYV3MoE
+  model_path: tencent/Hy3-preview 
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_static 
+    bits: 8
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+      kv_cache: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+    cpu_convert: true
+    save_name: "fp8"
+
+# Dataset for calibration
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl
+  max_seq_length: 2048
+  num_samples: 512
+  batch_size: 1

From ac18ea22977c28230b6ea357642667d9292f2d59 Mon Sep 17 00:00:00 2001
From: linchuanxie <linchuanxie@tencent.com>
Date: Thu, 23 Apr 2026 13:39:30 +0800
Subject: [PATCH 2/3] support hy3.0 fp8 quantization

---
 angelslim/compressor/quant/core/hook.py |  6 +++---
 angelslim/compressor/quant/core/save.py |  5 ++++-
 angelslim/compressor/quant/ptq.py       | 23 ++++++++++-------------
 angelslim/models/llm/hunyuan_v3_moe.py  |  8 ++------
 4 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/angelslim/compressor/quant/core/hook.py b/angelslim/compressor/quant/core/hook.py
index 0390a5a8..ecea808f 100644
--- a/angelslim/compressor/quant/core/hook.py
+++ b/angelslim/compressor/quant/core/hook.py
@@ -62,7 +62,7 @@ def apply_hook(self):
             self._forward_hook_list.append(forward_hook_handle)
 
         # Apply KV cache observers using monkey patching (for attention-level observation)
-        if kv_cache_observer is not None and hasattr(self.quant_model, 'apply_kvcache_observers'):
+        if kv_cache_observer is not None and hasattr(self.quant_model, "apply_kvcache_observers"):
             quant_bits = self.quant_model.quant_algo_dict.get("c_quant_bits", 8)
             self.quant_model.apply_kvcache_observers(
                 kv_cache_observer_class=kv_cache_observer,
@@ -97,7 +97,7 @@ def remove_hook(self):
             hook.remove()
         self._forward_hook_list = []
         # Remove KV cache observer patches if available
-        if hasattr(self.quant_model, 'remove_kvcache_observers'):
+        if hasattr(self.quant_model, "remove_kvcache_observers"):
             self.quant_model.remove_kvcache_observers()
 
     def post_process(self):
@@ -123,7 +123,7 @@ def post_process(self):
                             )
         if self.quant_model.quant_algo_dict["c_quant_algo"] == "fp8":
             # Process KV cache scales from attention-level observers
-            if hasattr(self.quant_model, 'get_kvcache_scales'):
+            if hasattr(self.quant_model, "get_kvcache_scales"):
                 kv_scales = self.quant_model.get_kvcache_scales()
                 for k, v in kv_scales.items():
                     self.quant_model.kv_cache_scales_dict[k] = v / maxval.type(v.dtype)
diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py
index bb964272..53f5e446 100644
--- a/angelslim/compressor/quant/core/save.py
+++ b/angelslim/compressor/quant/core/save.py
@@ -301,7 +301,10 @@ def save(self, save_path):
 
         self.quant_model.tokenizer.save_pretrained(save_path)
         # Save KV cache scales if available
-        if hasattr(self.quant_model, 'kv_cache_scales_dict') and self.quant_model.kv_cache_scales_dict:
+        if (
+            hasattr(self.quant_model, "kv_cache_scales_dict")
+            and self.quant_model.kv_cache_scales_dict
+        ):
             kv_scales_path = os.path.join(save_path, "kv_cache_scales.safetensors")
             kv_scales_dict = {}
             kv_scale_map = {}
diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py
index 5a2f7e5d..4de5f20e 100644
--- a/angelslim/compressor/quant/ptq.py
+++ b/angelslim/compressor/quant/ptq.py
@@ -210,21 +210,21 @@ def get_meta_weights_info(self, model):
         meta_params = []
 
         for name, param in model.named_parameters():
-            if param.device.type == 'meta':
-                meta_params.append({
-                    'name': name,
-                })
+            if param.device.type == "meta":
+                meta_params.append(
+                    {
+                        "name": name,
+                    }
+                )
         return meta_params
 
     def set_meta_weights_info(self, model):
         """替换所有meta权重"""
         orign_w_dict = {}
         for name, param in model.named_parameters():
-            if param.device.type == 'meta':
+            if param.device.type == "meta":
                 with open(
-                    os.path.join(
-                        self.absolute_model_path, "model.safetensors.index.json"
-                    ),
+                    os.path.join(self.absolute_model_path, "model.safetensors.index.json"),
                     "r",
                 ) as f:
                     model_index = json.load(f)
@@ -238,10 +238,10 @@ def set_meta_weights_info(self, model):
                     orign_w = load_file(orign_w_file, device="cpu")
                     orign_w_dict[orign_w_file] = orign_w
 
-                empty_tensor = torch.empty(param.data.shape, dtype=param.data.dtype, device='cpu')
+                empty_tensor = torch.empty(param.data.shape, dtype=param.data.dtype, device="cpu")
                 new_param = torch.nn.Parameter(empty_tensor)
                 new_param.data = orign_w[name]
-                parts = name.split('.')
+                parts = name.split(".")
                 current_module = model
 
                 # 导航到包含参数的模块
@@ -253,13 +253,10 @@ def set_meta_weights_info(self, model):
 
         del orign_w_dict
 
-
-
     def _convert(self):
         self.set_meta_weights_info(self.quant_model.model)
         print_info(f"Meta weight:{self.get_meta_weights_info(self.quant_model.model)}")
 
-
         # 1. get act, weight and kv-cache scale
         for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
             if (
diff --git a/angelslim/models/llm/hunyuan_v3_moe.py b/angelslim/models/llm/hunyuan_v3_moe.py
index 701cb9cc..d8919643 100644
--- a/angelslim/models/llm/hunyuan_v3_moe.py
+++ b/angelslim/models/llm/hunyuan_v3_moe.py
@@ -160,9 +160,7 @@ def replace_moe(self):
         the per-expert nn.Linear modules and register them with the PTQ hook.
         """
         for name, module in self.model.named_modules():
-            if isinstance(module, HYV3Experts) and not isinstance(
-                module, HYV3ExpertsWithLinear
-            ):
+            if isinstance(module, HYV3Experts) and not isinstance(module, HYV3ExpertsWithLinear):
                 parent_layer, sub_name = find_parent_layer_and_sub_name(self.model, name)
                 moe_linear = HYV3ExpertsWithLinear(module)
                 del module
@@ -315,9 +313,7 @@ def patched_forward(
             value_states = value_states.transpose(1, 2)
 
             cos, sin = position_embeddings
-            query_states, key_states = apply_rotary_pos_emb(
-                query_states, key_states, cos, sin
-            )
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
             # === OBSERVE KV CACHE AFTER RoPE ===
             key_observer(key_states)

From 273b399d3efd7e9db7f0bf861fddc1643f9c97a5 Mon Sep 17 00:00:00 2001
From: linchuanxie <linchuanxie@tencent.com>
Date: Thu, 23 Apr 2026 15:54:41 +0800
Subject: [PATCH 3/3] update readme

---
 README.md    | 4 ++--
 README_cn.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index c03438a0..4de8dd8c 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ A more accessible, comprehensive, and efficient toolkit for large model compress
 </p>
 
 ## 📣Latest News
-- [26/04/23] We now support FP8-Static quantization for **Hunyuan3.0** (MoE A20B).
+- [26/04/23] We now support FP8-Static quantization for **Hy3-preview** (MoE A20B).
 - [26/03/25] We have released **DAQ**, the quantization algorithm that preserves the knowledge acquired while the update of parameters is relatively small during post-training training.[[Paper]](https://arxiv.org/abs/2603.22324) | [[Docs]](docs/source/features/quantization/daq.md)
 - [26/02/09] We have released HY-1.8B-2Bit, 2bit on-device large language model,[[Huggingface]](https://huggingface.co/AngelSlim/HY-1.8B-2Bit).
 - [26/01/13] We have released v0.3. We support the training and deployment of Eagle3 for all-scale LLMs/VLMs/Audio models, as detailed in the [guidance documentation](https://angelslim.readthedocs.io/zh-cn/latest/features/speculative_decoding/eagle/index.html). And We released **Sherry**, the hardware-efficient 1.25 bit quantization algorithm [[Paper]](https://arxiv.org/abs/2601.07892) | [[Code]](https://github.com/Tencent/AngelSlim/tree/sherry/Sherry)🔥🔥🔥
@@ -254,7 +254,7 @@ python3 tools/run.py -c configs/qwen3/fp8_static/qwen3-1_7b_fp8_static.yaml
 
 This example produces quantized model weights by performing PTQ calibration on a model loaded from HuggingFace.
 
-For **Hunyuan3.0** (MoE A20B) FP8-Static quantization:
+For **Hy3-preview** (MoE A20B) FP8-Static quantization:
 
 ```shell
 python tools/run.py -c configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml
diff --git a/README_cn.md b/README_cn.md
index 9d42bc17..e775c4f9 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -22,7 +22,7 @@
 </p>
 
 ## 📣最新进展
-- [26/04/23] 我们支持了 **Hunyuan3.0**（MoE A20B）模型的 FP8-Static 量化。
+- [26/04/23] 我们支持了 **Hy3-preview**（MoE A20B）模型的 FP8-Static 量化。
 - [26/03/25] 我们发布了量化算法DAQ，该方法在后训练参数更新较小时，可保留量化后模型能力 [[论文]](https://arxiv.org/abs/2603.22324) | [[文档]](docs/source/features/quantization/daq.md)
 - [26/02/09] 我们发布了 HY-1.8B-2Bit, 2比特端侧大模型, 模型可见[[Huggingface]](https://huggingface.co/AngelSlim/HY-1.8B-2Bit).
 - [26/01/13] 我们发布V0.3版本， 支持了全模态场景的投机采样训练及部署，文档：[Eagle3 for LLM/VLM/Audio](https://angelslim.readthedocs.io/zh-cn/latest/features/speculative_decoding/eagle/index.html)。并且我们发布了 **Sherry** 新的硬件高效的1.25bit三值量化算法 [[论文]](https://arxiv.org/abs/2601.07892) | [[代码]](https://github.com/Tencent/AngelSlim/tree/sherry/Sherry)🔥🔥🔥
@@ -253,7 +253,7 @@ bash scripts/speculative/train_eagle3_online.sh
 
   该示例将会加载`HugggingFace`模型进行PTQ量化校准，最终量化产出模型权重.
 
-对 **Hunyuan3.0**（MoE A20B）进行 FP8-Static 量化：
+对 **Hy3-preview**（MoE A20B）进行 FP8-Static 量化：
 
   ```shell
   python tools/run.py -c configs/hunyuan/fp8_static/hunyuanv3_a20b_fp8_static_c8.yaml