From 048845329459993503848c58106a4c3d7754cb7d Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Thu, 26 Feb 2026 13:56:44 +0800
Subject: [PATCH 01/10] add v4 patch

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 .../modeling/finegrained_fp8_patch_v4.py      | 152 ++++++++++++++++++
 auto_round/modeling/hpu_patch.py              |   8 +-
 2 files changed, 159 insertions(+), 1 deletion(-)
 create mode 100644 auto_round/modeling/finegrained_fp8_patch_v4.py

diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py
new file mode 100644
index 000000000..5889df0a7
--- /dev/null
+++ b/auto_round/modeling/finegrained_fp8_patch_v4.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from ..utils import is_accelerate_available, is_torch_available, logging
+
+
+if is_torch_available():
+    import torch
+    import torch.nn as nn
+    # import triton
+    # import triton.language as tl
+    from torch.nn import functional as F
+
+if is_accelerate_available():
+    from accelerate import init_empty_weights
+
+
+logger = logging.get_logger(__name__)
+
+
+
+
+
+
+logger = logging.get_logger(__name__)
+
+
+_FP8_DTYPE = torch.float8_e4m3fn
+_FP8_MIN = torch.finfo(_FP8_DTYPE).min
+_FP8_MAX = torch.finfo(_FP8_DTYPE).max
+
+
+class FP8Linear(nn.Linear):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = False,
+        dtype=torch.float8_e4m3fn,
+        block_size: tuple[int, int] | None = None,
+        activation_scheme="dynamic",
+    ):
+        super().__init__(in_features, out_features)
+
+        # If block size is None, it means that we are doing per-tensor quantization
+        self.block_size = block_size
+        self.activation_scheme = activation_scheme
+
+        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=dtype))
+
+        if self.block_size is None:
+            self.weight_scale_inv = nn.Parameter(torch.tensor(1.0, dtype=torch.float32))
+        else:
+            scale_out_features = (out_features + self.block_size[0] - 1) // self.block_size[0]
+            scale_in_features = (in_features + self.block_size[1] - 1) // self.block_size[1]
+            self.weight_scale_inv = nn.Parameter(
+                torch.empty(scale_out_features, scale_in_features, dtype=torch.float32)
+            )
+
+        if self.activation_scheme == "static":
+            self.activation_scale = nn.Parameter(torch.tensor(1.0, dtype=torch.float32))
+
+        if bias:
+            self.bias = nn.Parameter(torch.empty(self.out_features))
+        else:
+            self.register_parameter("bias", None)
+
+def _replace_with_fp8_linear(
+    model,
+    tp_plan=None,
+    modules_to_not_convert=None,
+    current_key_name=None,
+    quantization_config=None,
+    has_been_replaced=False,
+):
+    """Recursively swap ``nn.Linear`` children for ``FP8Linear``; returns ``(model, has_been_replaced)``."""
+    if current_key_name is None:
+        current_key_name = []
+
+    for name, module in model.named_children():
+        current_key_name.append(name)
+        # Skip a Linear whose own name, or any substring of its dotted path, matches an excluded key.
+        if isinstance(module, nn.Linear) and name not in (modules_to_not_convert or []):
+            current_key_name_str = ".".join(current_key_name)
+            if not any(key in current_key_name_str for key in (modules_to_not_convert or [])):
+                with init_empty_weights():
+                    model._modules[name] = FP8Linear(
+                        in_features=module.in_features,
+                        out_features=module.out_features,
+                        bias=module.bias is not None,
+                        device=module.weight.device,
+                        dtype=module.weight.dtype,
+                        activation_scheme=quantization_config.activation_scheme,
+                        block_size=quantization_config.weight_block_size,
+                    )
+                    has_been_replaced = True
+            # TODO: when a layer is replaced, the TP plan entry for that layer should be updated as well.
+
+        if len(list(module.children())) > 0:
+            _, has_been_replaced = _replace_with_fp8_linear(
+                module,
+                tp_plan,
+                modules_to_not_convert,
+                current_key_name,
+                quantization_config,
+                has_been_replaced=has_been_replaced,
+            )
+        # Done with this child: pop its name off the path stack shared across the recursion.
+        current_key_name.pop(-1)
+
+    return model, has_been_replaced
+
+
+def replace_with_fp8_linear(
+    model,
+    modules_to_not_convert=None,
+    quantization_config=None,
+):
+    """Swap ``nn.Linear`` layers for ``FP8Linear`` (``lm_head`` is skipped by default) and return the model."""
+    # Copy so that extending the skip list below never mutates the caller's list.
+    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else list(modules_to_not_convert)
+    if quantization_config.modules_to_not_convert is not None:
+        modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
+    modules_to_not_convert = list(set(modules_to_not_convert))
+    model, has_been_replaced = _replace_with_fp8_linear(
+        model,
+        tp_plan=getattr(model, "_tp_plan", None),
+        modules_to_not_convert=modules_to_not_convert,
+        quantization_config=quantization_config,
+    )
+
+    if not has_been_replaced:
+        logger.warning(
+            "You are loading your model using fp8 but no linear modules were found in your model."
+            " Please double check your model architecture."
+        )
+
+    return model
diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py
index 521caec4b..777120625 100644
--- a/auto_round/modeling/hpu_patch.py
+++ b/auto_round/modeling/hpu_patch.py
@@ -16,7 +16,13 @@ def patch_finegrained_fp8():
         import sys
 
         # Import auto-round's HPU-compatible finegrained_fp8_patch module
-        finegrained_fp8_patch = importlib.import_module("auto_round.modeling.finegrained_fp8_patch")
+        from auto_round.utils import is_transformers_version_greater_or_equal_5
+        if is_transformers_version_greater_or_equal_5():
+            patch_file_name = "auto_round.modeling.finegrained_fp8_patch"
+        else:
+            patch_file_name = "auto_round.modeling.finegrained_fp8_patch_v4"
+    
+        finegrained_fp8_patch = importlib.import_module(patch_file_name)
 
         # Replace transformers.integrations.finegrained_fp8 in sys.modules
         sys.modules["transformers.integrations.finegrained_fp8"] = finegrained_fp8_patch

From ce04f767b6376925766ec8342cba741610c679e0 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Thu, 26 Feb 2026 06:03:13 +0000
Subject: [PATCH 02/10] fix import

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 auto_round/modeling/finegrained_fp8_patch_v4.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py
index 5889df0a7..ba0e4b356 100644
--- a/auto_round/modeling/finegrained_fp8_patch_v4.py
+++ b/auto_round/modeling/finegrained_fp8_patch_v4.py
@@ -15,7 +15,7 @@
 
 from typing import Optional
 
-from ..utils import is_accelerate_available, is_torch_available, logging
+from transformers.utils import is_accelerate_available, is_torch_available, logging
 
 
 if is_torch_available():

From 0dda8ab23915b1fee8ff074a5faabccb3c9505fb Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Thu, 26 Feb 2026 06:06:42 +0000
Subject: [PATCH 03/10] quick fix

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 .../modeling/finegrained_fp8_patch_v4.py      | 33 ++++++++++---------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py
index ba0e4b356..822b55486 100644
--- a/auto_round/modeling/finegrained_fp8_patch_v4.py
+++ b/auto_round/modeling/finegrained_fp8_patch_v4.py
@@ -45,40 +45,62 @@
 
 
 class FP8Linear(nn.Linear):
+    """Linear module whose weight is stored as ``torch.float8_e4m3fn``.
+
+    A float32 ``weight_scale_inv`` parameter is created next to the weight, with one entry per
+    ``block_size`` tile (ceil-divided along each dimension).
+
+    Args:
+        in_features: Size of each input sample.
+        out_features: Size of each output sample.
+        bias: Whether to create a ``bias`` parameter.
+        dtype: Accepted but unused; the weight always uses ``FP8Linear.dtype``.
+        block_size: ``(rows, cols)`` tile size for ``weight_scale_inv``; indexed unconditionally for
+            the FP8 weight, so ``None`` raises ``TypeError``.
+        device: Device on which the weight, scale and bias are allocated.
+        activation_scheme: Stored on the instance unchanged.
+    """
+
+    dtype = torch.float8_e4m3fn
+
     def __init__(
         self,
         in_features: int,
         out_features: int,
         bias: bool = False,
-        dtype=torch.float8_e4m3fn,
-        block_size: tuple[int, int] | None = None,
+        dtype=None,
+        block_size: Optional[tuple[int, int]] = None,
+        device=None,
         activation_scheme="dynamic",
     ):
         super().__init__(in_features, out_features)
+        self.in_features = in_features
+        self.out_features = out_features
 
-        # If block size is None, it means that we are doing per-tensor quantization
-        self.block_size = block_size
-        self.activation_scheme = activation_scheme
+        # Replace the weight created by nn.Linear.__init__ with an uninitialized FP8 one.
+        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=FP8Linear.dtype, device=device))
 
-        self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=dtype))
-
-        if self.block_size is None:
-            self.weight_scale_inv = nn.Parameter(torch.tensor(1.0, dtype=torch.float32))
-        else:
-            scale_out_features = (out_features + self.block_size[0] - 1) // self.block_size[0]
-            scale_in_features = (in_features + self.block_size[1] - 1) // self.block_size[1]
+        if self.weight.element_size() == 1:
+            # Ceil-divide so partially filled edge tiles still get a scale entry.
+            scale_out_features = (out_features + block_size[0] - 1) // block_size[0]
+            scale_in_features = (in_features + block_size[1] - 1) // block_size[1]
             self.weight_scale_inv = nn.Parameter(
-                torch.empty(scale_out_features, scale_in_features, dtype=torch.float32)
+                torch.empty(scale_out_features, scale_in_features, dtype=torch.float32, device=device)
             )
+        else:
+            self.register_parameter("weight_scale_inv", None)
 
-        if self.activation_scheme == "static":
-            self.activation_scale = nn.Parameter(torch.tensor(1.0, dtype=torch.float32))
+        self.block_size = block_size
+
+        self.activation_scheme = activation_scheme
 
         if bias:
-            self.bias = nn.Parameter(torch.empty(self.out_features))
+            # Allocate the bias on the requested device, like the weight and its scale.
+            self.bias = nn.Parameter(torch.empty(self.out_features, device=device))
         else:
             self.register_parameter("bias", None)
 
+
 def _replace_with_fp8_linear(
     model,
     tp_plan=None,

From 619d9352ede0dc8cbf1f81a9ba659574231e80b2 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Thu, 26 Feb 2026 06:52:16 +0000
Subject: [PATCH 04/10] add quant code

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/quant_model.py | 74 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 examples/quant_model.py

diff --git a/examples/quant_model.py b/examples/quant_model.py
new file mode 100644
index 000000000..0c0c980d6
--- /dev/null
+++ b/examples/quant_model.py
@@ -0,0 +1,74 @@
+# model_name = "/dataset/meta-llama/Meta-Llama-3-8B/"
+# model_name = "/data5/yliu7/HF_HOME/DeepSeek-R1-bf16-layer4"
+# model_name = "/models/Qwen3-8B-FP8/"
+# model_name = "/data5/yliu7/HF_HOME/DeepSeek-R1-bf16-layer4"
+# model_name = "Qwen/Qwen2.5-0.5B-Instruct"
+# model_name="/models/Qwen3-235B-A22B/"
+model_name = "/mnt/disk5/unsloth/DeepSeek-R1-BF16"
+model_name = "/models/Qwen3-8B-FP8/"
+# model_name = "/mnt/disk8/Qwen/Qwen3-8B-FP8"
+# model_name = "/mnt/disk5/Qwen3-30B-A3B-FP8"
+# model_name = "/models/DeepSeek-V2-Lite-Chat/"
+# model_name = "/mnt/disk8/deepseek-ai/DeepSeek-V2-Lite-Chat"
+model_name = "/mnt/disk8/Qwen/Qwen3-30B-A3B"
+from auto_round import AutoRound
+
+
+def fix_everything(seed):
+    import random
+    import numpy as np
+    import torch
+
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    # torch.cuda.manual_seed_all(seed)
+
+
+def main(args):
+    model_name = args.model
+    scheme = "FP8_STATIC"
+    autoround = AutoRound(
+        model_name,
+        scheme=scheme,
+        enable_torch_compile=True,
+        iters=0,
+        low_gpu_mem_usage=True,
+        low_cpu_mem_usage=True,
+        disable_opt_rtn=True,
+        # disable_trust_remote_code=True,
+
+        # static_kv_dtype="fp8",
+    )
+    model_base_name = model_name.rstrip("/").split("/")[-1]
+    output_dir = args.output_dir
+    if output_dir is None:
+        output_dir = "/mnt/disk5/hf_models/" + model_base_name + "-" + scheme + "-fp8-kv-2-test"
+    print(f"Output dir: {output_dir}")
+
+    model, save_folder = autoround.quantize_and_save(
+        output_dir=output_dir,
+        format="llm_compressor",
+    )
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(
+        description="Auto-round quantization script."
+    )
+    parser.add_argument(
+        "-m",
+        "--model",
+        help="Path to the model.",
+        type=str,
+        default=model_name,
+    )
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        help="Path to the output directory.",
+        type=str,
+        default=None,
+    )
+    args = parser.parse_args()
+    main(args)

From b45122fff60fbf0aae4c46f0f5e2691c3ee68e19 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 6 Mar 2026 03:13:08 +0000
Subject: [PATCH 05/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/modeling/finegrained_fp8_patch_v4.py | 6 +-----
 auto_round/modeling/hpu_patch.py                | 3 ++-
 examples/quant_model.py                         | 8 ++++----
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py
index 822b55486..4275f60db 100644
--- a/auto_round/modeling/finegrained_fp8_patch_v4.py
+++ b/auto_round/modeling/finegrained_fp8_patch_v4.py
@@ -17,10 +17,10 @@
 
 from transformers.utils import is_accelerate_available, is_torch_available, logging
 
-
 if is_torch_available():
     import torch
     import torch.nn as nn
+
     # import triton
     # import triton.language as tl
     from torch.nn import functional as F
@@ -32,10 +32,6 @@
 logger = logging.get_logger(__name__)
 
 
-
-
-
-
 logger = logging.get_logger(__name__)
 
 
diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py
index 777120625..a7ecca40c 100644
--- a/auto_round/modeling/hpu_patch.py
+++ b/auto_round/modeling/hpu_patch.py
@@ -17,11 +17,12 @@ def patch_finegrained_fp8():
 
         # Import auto-round's HPU-compatible finegrained_fp8_patch module
         from auto_round.utils import is_transformers_version_greater_or_equal_5
+
         if is_transformers_version_greater_or_equal_5():
             patch_file_name = "auto_round.modeling.finegrained_fp8_patch"
         else:
             patch_file_name = "auto_round.modeling.finegrained_fp8_patch_v4"
-    
+
         finegrained_fp8_patch = importlib.import_module(patch_file_name)
 
         # Replace transformers.integrations.finegrained_fp8 in sys.modules
diff --git a/examples/quant_model.py b/examples/quant_model.py
index 0c0c980d6..da4f05ade 100644
--- a/examples/quant_model.py
+++ b/examples/quant_model.py
@@ -16,6 +16,7 @@
 
 def fix_everything(seed):
     import random
+
     import numpy as np
     import torch
 
@@ -37,7 +38,6 @@ def main(args):
         low_cpu_mem_usage=True,
         disable_opt_rtn=True,
         # disable_trust_remote_code=True,
-
         # static_kv_dtype="fp8",
     )
     model_base_name = model_name.rstrip("/").split("/")[-1]
@@ -51,11 +51,11 @@ def main(args):
         format="llm_compressor",
     )
 
+
 if __name__ == "__main__":
     import argparse
-    parser = argparse.ArgumentParser(
-        description="Auto-round quantization script."
-    )
+
+    parser = argparse.ArgumentParser(description="Auto-round quantization script.")
     parser.add_argument(
         "-m",
         "--model",

From 15c5ca8cc67c0ea8e73077b9e72d9fa5e23f051d Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 9 Mar 2026 08:51:09 +0800
Subject: [PATCH 06/10] remove example

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 examples/quant_model.py | 74 -----------------------------------------
 1 file changed, 74 deletions(-)
 delete mode 100644 examples/quant_model.py

diff --git a/examples/quant_model.py b/examples/quant_model.py
deleted file mode 100644
index da4f05ade..000000000
--- a/examples/quant_model.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# model_name = "/dataset/meta-llama/Meta-Llama-3-8B/"
-# model_name = "/data5/yliu7/HF_HOME/DeepSeek-R1-bf16-layer4"
-# model_name = "/models/Qwen3-8B-FP8/"
-# model_name = "/data5/yliu7/HF_HOME/DeepSeek-R1-bf16-layer4"
-# model_name = "Qwen/Qwen2.5-0.5B-Instruct"
-# model_name="/models/Qwen3-235B-A22B/"
-model_name = "/mnt/disk5/unsloth/DeepSeek-R1-BF16"
-model_name = "/models/Qwen3-8B-FP8/"
-# model_name = "/mnt/disk8/Qwen/Qwen3-8B-FP8"
-# model_name = "/mnt/disk5/Qwen3-30B-A3B-FP8"
-# model_name = "/models/DeepSeek-V2-Lite-Chat/"
-# model_name = "/mnt/disk8/deepseek-ai/DeepSeek-V2-Lite-Chat"
-model_name = "/mnt/disk8/Qwen/Qwen3-30B-A3B"
-from auto_round import AutoRound
-
-
-def fix_everything(seed):
-    import random
-
-    import numpy as np
-    import torch
-
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    # torch.cuda.manual_seed_all(seed)
-
-
-def main(args):
-    model_name = args.model
-    scheme = "FP8_STATIC"
-    autoround = AutoRound(
-        model_name,
-        scheme=scheme,
-        enable_torch_compile=True,
-        iters=0,
-        low_gpu_mem_usage=True,
-        low_cpu_mem_usage=True,
-        disable_opt_rtn=True,
-        # disable_trust_remote_code=True,
-        # static_kv_dtype="fp8",
-    )
-    model_base_name = model_name.rstrip("/").split("/")[-1]
-    output_dir = args.output_dir
-    if output_dir is None:
-        output_dir = "/mnt/disk5/hf_models/" + model_base_name + "-" + scheme + "-fp8-kv-2-test"
-    print(f"Output dir: {output_dir}")
-
-    model, save_folder = autoround.quantize_and_save(
-        output_dir=output_dir,
-        format="llm_compressor",
-    )
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Auto-round quantization script.")
-    parser.add_argument(
-        "-m",
-        "--model",
-        help="Path to the model.",
-        type=str,
-        default=model_name,
-    )
-    parser.add_argument(
-        "-o",
-        "--output_dir",
-        help="Path to the output directory.",
-        type=str,
-        default=None,
-    )
-    args = parser.parse_args()
-    main(args)

From c24c2bda9f6560ab1cb525e4d244c3480cc89a90 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 9 Mar 2026 00:56:22 +0000
Subject: [PATCH 07/10] fix

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 auto_round/modeling/hpu_patch.py | 12 ++++++++++--
 auto_round/utils/common.py       |  7 +++++++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py
index a7ecca40c..aa12f8ad5 100644
--- a/auto_round/modeling/hpu_patch.py
+++ b/auto_round/modeling/hpu_patch.py
@@ -16,12 +16,20 @@ def patch_finegrained_fp8():
         import sys
 
         # Import auto-round's HPU-compatible finegrained_fp8_patch module
-        from auto_round.utils import is_transformers_version_greater_or_equal_5
+        from auto_round.utils import is_transformers_version_greater_or_equal_5, is_transformers_version_greater_or_equal_4
 
         if is_transformers_version_greater_or_equal_5():
             patch_file_name = "auto_round.modeling.finegrained_fp8_patch"
-        else:
+        elif is_transformers_version_greater_or_equal_4():
             patch_file_name = "auto_round.modeling.finegrained_fp8_patch_v4"
+        else:
+            logger.warning(
+                (
+                "Transformers version is below 4.0.0, skipping finegrained_fp8 patching.",
+                " Please upgrade to Transformers 4.x or later for HPU support."
+                )
+            )
+            return
 
         finegrained_fp8_patch = importlib.import_module(patch_file_name)
 
diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py
index c494c2959..7b72b4a42 100644
--- a/auto_round/utils/common.py
+++ b/auto_round/utils/common.py
@@ -407,3 +407,10 @@ def is_transformers_version_greater_or_equal_5():
     from packaging import version
 
     return version.parse(transformers.__version__) >= version.parse("5.0.0")
+
+@lru_cache(None)
+def is_transformers_version_greater_or_equal_4():
+    import transformers
+    from packaging import version
+
+    return version.parse(transformers.__version__) >= version.parse("4.0.0")

From d814c91eaef43ba3df3e341766e3f87780de32e3 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 9 Mar 2026 00:56:35 +0000
Subject: [PATCH 08/10] add todo

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 auto_round/utils/common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py
index 7b72b4a42..b32947bd4 100644
--- a/auto_round/utils/common.py
+++ b/auto_round/utils/common.py
@@ -408,6 +408,7 @@ def is_transformers_version_greater_or_equal_5():
 
     return version.parse(transformers.__version__) >= version.parse("5.0.0")
 
+# TODO: (yiliu30) refine version check logic
 @lru_cache(None)
 def is_transformers_version_greater_or_equal_4():
     import transformers

From 5ebebd5e9583189ada7dfc13336291cd16154702 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 9 Mar 2026 01:00:17 +0000
Subject: [PATCH 09/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/modeling/hpu_patch.py | 9 ++++++---
 auto_round/utils/common.py       | 1 +
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py
index aa12f8ad5..db321f21a 100644
--- a/auto_round/modeling/hpu_patch.py
+++ b/auto_round/modeling/hpu_patch.py
@@ -16,7 +16,10 @@ def patch_finegrained_fp8():
         import sys
 
         # Import auto-round's HPU-compatible finegrained_fp8_patch module
-        from auto_round.utils import is_transformers_version_greater_or_equal_5, is_transformers_version_greater_or_equal_4
+        from auto_round.utils import (
+            is_transformers_version_greater_or_equal_4,
+            is_transformers_version_greater_or_equal_5,
+        )
 
         if is_transformers_version_greater_or_equal_5():
             patch_file_name = "auto_round.modeling.finegrained_fp8_patch"
@@ -25,8 +28,8 @@ def patch_finegrained_fp8():
         else:
             logger.warning(
                 (
-                "Transformers version is below 4.0.0, skipping finegrained_fp8 patching.",
-                " Please upgrade to Transformers 4.x or later for HPU support."
+                    "Transformers version is below 4.0.0, skipping finegrained_fp8 patching."
+                    " Please upgrade to Transformers 4.x or later for HPU support."
                 )
             )
             return
diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py
index b32947bd4..4e9e54ec8 100644
--- a/auto_round/utils/common.py
+++ b/auto_round/utils/common.py
@@ -408,6 +408,7 @@ def is_transformers_version_greater_or_equal_5():
 
     return version.parse(transformers.__version__) >= version.parse("5.0.0")
 
+
 # TODO: (yiliu30) refine version check logic
 @lru_cache(None)
 def is_transformers_version_greater_or_equal_4():

From 6170a1dd43241370fb711754dbfcd6cfd023e370 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Mon, 9 Mar 2026 00:59:40 +0000
Subject: [PATCH 10/10] update license

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 auto_round/modeling/finegrained_fp8_patch_v4.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py
index 4275f60db..11c2f6882 100644
--- a/auto_round/modeling/finegrained_fp8_patch_v4.py
+++ b/auto_round/modeling/finegrained_fp8_patch_v4.py
@@ -1,18 +1,17 @@
-# coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright (c) 2026 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+# Copied from https://github.com/huggingface/transformers/blob/v4.57.3/src/transformers/integrations/finegrained_fp8.py
 from typing import Optional
 
 from transformers.utils import is_accelerate_available, is_torch_available, logging