From 048845329459993503848c58106a4c3d7754cb7d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 26 Feb 2026 13:56:44 +0800 Subject: [PATCH 01/10] add v4 patch Signed-off-by: yiliu30 --- .../modeling/finegrained_fp8_patch_v4.py | 152 ++++++++++++++++++ auto_round/modeling/hpu_patch.py | 8 +- 2 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 auto_round/modeling/finegrained_fp8_patch_v4.py diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py new file mode 100644 index 000000000..5889df0a7 --- /dev/null +++ b/auto_round/modeling/finegrained_fp8_patch_v4.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from ..utils import is_accelerate_available, is_torch_available, logging + + +if is_torch_available(): + import torch + import torch.nn as nn + # import triton + # import triton.language as tl + from torch.nn import functional as F + +if is_accelerate_available(): + from accelerate import init_empty_weights + + +logger = logging.get_logger(__name__) + + + + + + +logger = logging.get_logger(__name__) + + +_FP8_DTYPE = torch.float8_e4m3fn +_FP8_MIN = torch.finfo(_FP8_DTYPE).min +_FP8_MAX = torch.finfo(_FP8_DTYPE).max + + +class FP8Linear(nn.Linear): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = False, + dtype=torch.float8_e4m3fn, + block_size: tuple[int, int] | None = None, + activation_scheme="dynamic", + ): + super().__init__(in_features, out_features) + + # If block size is None, it means that we are doing per-tensor quantization + self.block_size = block_size + self.activation_scheme = activation_scheme + + self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=dtype)) + + if self.block_size is None: + self.weight_scale_inv = nn.Parameter(torch.tensor(1.0, dtype=torch.float32)) + else: + scale_out_features = (out_features + self.block_size[0] - 1) // self.block_size[0] + scale_in_features = (in_features + self.block_size[1] - 1) // self.block_size[1] + self.weight_scale_inv = nn.Parameter( + torch.empty(scale_out_features, scale_in_features, dtype=torch.float32) + ) + + if self.activation_scheme == "static": + self.activation_scale = nn.Parameter(torch.tensor(1.0, dtype=torch.float32)) + + if bias: + self.bias = nn.Parameter(torch.empty(self.out_features)) + else: + self.register_parameter("bias", None) + +def _replace_with_fp8_linear( + model, + tp_plan=None, + modules_to_not_convert=None, + current_key_name=None, + quantization_config=None, + has_been_replaced=False, +): + """Replace Linear layers with FP8Linear.""" + if current_key_name is None: + current_key_name = [] + + for name, module in model.named_children(): + current_key_name.append(name) + + if isinstance(module, nn.Linear) and name not in (modules_to_not_convert or []): + current_key_name_str = ".".join(current_key_name) + if not any(key in current_key_name_str for key in (modules_to_not_convert or [])): + with init_empty_weights(): + model._modules[name] = FP8Linear( + in_features=module.in_features, + out_features=module.out_features, + bias=module.bias is not None, + device=module.weight.device, + dtype=module.weight.dtype, + activation_scheme=quantization_config.activation_scheme, + block_size=quantization_config.weight_block_size, + ) + has_been_replaced = True + # when changing a layer the TP PLAN for that layer should be updated. TODO + + if len(list(module.children())) > 0: + _, has_been_replaced = _replace_with_fp8_linear( + module, + tp_plan, + modules_to_not_convert, + current_key_name, + quantization_config, + has_been_replaced=has_been_replaced, + ) + + current_key_name.pop(-1) + + return model, has_been_replaced + + +def replace_with_fp8_linear( + model, + modules_to_not_convert=None, + quantization_config=None, +): + """Helper function to replace model layers with FP8 versions.""" + modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert + + if quantization_config.modules_to_not_convert is not None: + modules_to_not_convert.extend(quantization_config.modules_to_not_convert) + modules_to_not_convert = list(set(modules_to_not_convert)) + model, has_been_replaced = _replace_with_fp8_linear( + model, + tp_plan=model._tp_plan, + modules_to_not_convert=modules_to_not_convert, + quantization_config=quantization_config, + ) + + if not has_been_replaced: + logger.warning( + "You are loading your model using fp8 but no linear modules were found in your model." + " Please double check your model architecture." + ) + + return model diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py index 521caec4b..777120625 100644 --- a/auto_round/modeling/hpu_patch.py +++ b/auto_round/modeling/hpu_patch.py @@ -16,7 +16,13 @@ def patch_finegrained_fp8(): import sys # Import auto-round's HPU-compatible finegrained_fp8_patch module - finegrained_fp8_patch = importlib.import_module("auto_round.modeling.finegrained_fp8_patch") + from auto_round.utils import is_transformers_version_greater_or_equal_5 + if is_transformers_version_greater_or_equal_5(): + patch_file_name = "auto_round.modeling.finegrained_fp8_patch" + else: + patch_file_name = "auto_round.modeling.finegrained_fp8_patch_v4" + + finegrained_fp8_patch = importlib.import_module(patch_file_name) # Replace transformers.integrations.finegrained_fp8 in sys.modules sys.modules["transformers.integrations.finegrained_fp8"] = finegrained_fp8_patch From ce04f767b6376925766ec8342cba741610c679e0 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 26 Feb 2026 06:03:13 +0000 Subject: [PATCH 02/10] fix import Signed-off-by: yiliu30 --- auto_round/modeling/finegrained_fp8_patch_v4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py index 5889df0a7..ba0e4b356 100644 --- a/auto_round/modeling/finegrained_fp8_patch_v4.py +++ b/auto_round/modeling/finegrained_fp8_patch_v4.py @@ -15,7 +15,7 @@ from typing import Optional -from ..utils import is_accelerate_available, is_torch_available, logging +from transformers.utils import is_accelerate_available, is_torch_available, logging if is_torch_available(): From 0dda8ab23915b1fee8ff074a5faabccb3c9505fb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 26 Feb 2026 06:06:42 +0000 Subject: [PATCH 03/10] quick fix Signed-off-by: yiliu30 --- .../modeling/finegrained_fp8_patch_v4.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py index ba0e4b356..822b55486 100644 --- a/auto_round/modeling/finegrained_fp8_patch_v4.py +++ b/auto_round/modeling/finegrained_fp8_patch_v4.py @@ -45,40 +45,43 @@ class FP8Linear(nn.Linear): + dtype = torch.float8_e4m3fn + def __init__( self, in_features: int, out_features: int, bias: bool = False, - dtype=torch.float8_e4m3fn, - block_size: tuple[int, int] | None = None, + dtype=None, + block_size: Optional[tuple[int, int]] = None, + device=None, activation_scheme="dynamic", ): super().__init__(in_features, out_features) + self.in_features = in_features + self.out_features = out_features - # If block size is None, it means that we are doing per-tensor quantization - self.block_size = block_size - self.activation_scheme = activation_scheme + self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=FP8Linear.dtype, device=device)) - self.weight = torch.nn.Parameter(torch.empty(out_features, in_features, dtype=dtype)) - - if self.block_size is None: - self.weight_scale_inv = nn.Parameter(torch.tensor(1.0, dtype=torch.float32)) - else: - scale_out_features = (out_features + self.block_size[0] - 1) // self.block_size[0] - scale_in_features = (in_features + self.block_size[1] - 1) // self.block_size[1] + if self.weight.element_size() == 1: + scale_out_features = (out_features + block_size[0] - 1) // block_size[0] + scale_in_features = (in_features + block_size[1] - 1) // block_size[1] self.weight_scale_inv = nn.Parameter( - torch.empty(scale_out_features, scale_in_features, dtype=torch.float32) + torch.empty(scale_out_features, scale_in_features, dtype=torch.float32, device=device) ) + else: + self.register_parameter("weight_scale_inv", None) - if self.activation_scheme == "static": - self.activation_scale = nn.Parameter(torch.tensor(1.0, dtype=torch.float32)) + self.block_size = block_size + + self.activation_scheme = activation_scheme if bias: self.bias = nn.Parameter(torch.empty(self.out_features)) else: self.register_parameter("bias", None) + def _replace_with_fp8_linear( model, tp_plan=None, From 619d9352ede0dc8cbf1f81a9ba659574231e80b2 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 26 Feb 2026 06:52:16 +0000 Subject: [PATCH 04/10] add quant code Signed-off-by: yiliu30 --- examples/quant_model.py | 74 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 examples/quant_model.py diff --git a/examples/quant_model.py b/examples/quant_model.py new file mode 100644 index 000000000..0c0c980d6 --- /dev/null +++ b/examples/quant_model.py @@ -0,0 +1,74 @@ +# model_name = "/dataset/meta-llama/Meta-Llama-3-8B/" +# model_name = "/data5/yliu7/HF_HOME/DeepSeek-R1-bf16-layer4" +# model_name = "/models/Qwen3-8B-FP8/" +# model_name = "/data5/yliu7/HF_HOME/DeepSeek-R1-bf16-layer4" +# model_name = "Qwen/Qwen2.5-0.5B-Instruct" +# model_name="/models/Qwen3-235B-A22B/" +model_name = "/mnt/disk5/unsloth/DeepSeek-R1-BF16" +model_name = "/models/Qwen3-8B-FP8/" +# model_name = "/mnt/disk8/Qwen/Qwen3-8B-FP8" +# model_name = "/mnt/disk5/Qwen3-30B-A3B-FP8" +# model_name = "/models/DeepSeek-V2-Lite-Chat/" +# model_name = "/mnt/disk8/deepseek-ai/DeepSeek-V2-Lite-Chat" +model_name = "/mnt/disk8/Qwen/Qwen3-30B-A3B" +from auto_round import AutoRound + + +def fix_everything(seed): + import random + import numpy as np + import torch + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + # torch.cuda.manual_seed_all(seed) + + +def main(args): + model_name = args.model + scheme = "FP8_STATIC" + autoround = AutoRound( + model_name, + scheme=scheme, + enable_torch_compile=True, + iters=0, + low_gpu_mem_usage=True, + low_cpu_mem_usage=True, + disable_opt_rtn=True, + # disable_trust_remote_code=True, + + # static_kv_dtype="fp8", + ) + model_base_name = model_name.rstrip("/").split("/")[-1] + output_dir = args.output_dir + if output_dir is None: + output_dir = "/mnt/disk5/hf_models/" + model_base_name + "-" + scheme + "-fp8-kv-2-test" + print(f"Output dir: {output_dir}") + + model, save_folder = autoround.quantize_and_save( + output_dir=output_dir, + format="llm_compressor", + ) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser( + description="Auto-round quantization script." + ) + parser.add_argument( + "-m", + "--model", + help="Path to the model.", + type=str, + default=model_name, + ) + parser.add_argument( + "-o", + "--output_dir", + help="Path to the output directory.", + type=str, + default=None, + ) + args = parser.parse_args() + main(args) From b45122fff60fbf0aae4c46f0f5e2691c3ee68e19 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Mar 2026 03:13:08 +0000 Subject: [PATCH 05/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/modeling/finegrained_fp8_patch_v4.py | 6 +----- auto_round/modeling/hpu_patch.py | 3 ++- examples/quant_model.py | 8 ++++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py index 822b55486..4275f60db 100644 --- a/auto_round/modeling/finegrained_fp8_patch_v4.py +++ b/auto_round/modeling/finegrained_fp8_patch_v4.py @@ -17,10 +17,10 @@ from transformers.utils import is_accelerate_available, is_torch_available, logging - if is_torch_available(): import torch import torch.nn as nn + # import triton # import triton.language as tl from torch.nn import functional as F @@ -32,10 +32,6 @@ logger = logging.get_logger(__name__) - - - - logger = logging.get_logger(__name__) diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py index 777120625..a7ecca40c 100644 --- a/auto_round/modeling/hpu_patch.py +++ b/auto_round/modeling/hpu_patch.py @@ -17,11 +17,12 @@ def patch_finegrained_fp8(): # Import auto-round's HPU-compatible finegrained_fp8_patch module from auto_round.utils import is_transformers_version_greater_or_equal_5 + if is_transformers_version_greater_or_equal_5(): patch_file_name = "auto_round.modeling.finegrained_fp8_patch" else: patch_file_name = "auto_round.modeling.finegrained_fp8_patch_v4" - + finegrained_fp8_patch = importlib.import_module(patch_file_name) # Replace transformers.integrations.finegrained_fp8 in sys.modules diff --git a/examples/quant_model.py b/examples/quant_model.py index 0c0c980d6..da4f05ade 100644 --- a/examples/quant_model.py +++ b/examples/quant_model.py @@ -16,6 +16,7 @@ def fix_everything(seed): import random + import numpy as np import torch @@ -37,7 +38,6 @@ def main(args): low_cpu_mem_usage=True, disable_opt_rtn=True, # disable_trust_remote_code=True, - # static_kv_dtype="fp8", ) model_base_name = model_name.rstrip("/").split("/")[-1] @@ -51,11 +51,11 @@ def main(args): format="llm_compressor", ) + if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser( - description="Auto-round quantization script." - ) + + parser = argparse.ArgumentParser(description="Auto-round quantization script.") parser.add_argument( "-m", "--model", From 15c5ca8cc67c0ea8e73077b9e72d9fa5e23f051d Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 9 Mar 2026 08:51:09 +0800 Subject: [PATCH 06/10] remove example Signed-off-by: yiliu30 --- examples/quant_model.py | 74 ----------------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 examples/quant_model.py diff --git a/examples/quant_model.py b/examples/quant_model.py deleted file mode 100644 index da4f05ade..000000000 --- a/examples/quant_model.py +++ /dev/null @@ -1,74 +0,0 @@ -# model_name = "/dataset/meta-llama/Meta-Llama-3-8B/" -# model_name = "/data5/yliu7/HF_HOME/DeepSeek-R1-bf16-layer4" -# model_name = "/models/Qwen3-8B-FP8/" -# model_name = "/data5/yliu7/HF_HOME/DeepSeek-R1-bf16-layer4" -# model_name = "Qwen/Qwen2.5-0.5B-Instruct" -# model_name="/models/Qwen3-235B-A22B/" -model_name = "/mnt/disk5/unsloth/DeepSeek-R1-BF16" -model_name = "/models/Qwen3-8B-FP8/" -# model_name = "/mnt/disk8/Qwen/Qwen3-8B-FP8" -# model_name = "/mnt/disk5/Qwen3-30B-A3B-FP8" -# model_name = "/models/DeepSeek-V2-Lite-Chat/" -# model_name = "/mnt/disk8/deepseek-ai/DeepSeek-V2-Lite-Chat" -model_name = "/mnt/disk8/Qwen/Qwen3-30B-A3B" -from auto_round import AutoRound - - -def fix_everything(seed): - import random - - import numpy as np - import torch - - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - # torch.cuda.manual_seed_all(seed) - - -def main(args): - model_name = args.model - scheme = "FP8_STATIC" - autoround = AutoRound( - model_name, - scheme=scheme, - enable_torch_compile=True, - iters=0, - low_gpu_mem_usage=True, - low_cpu_mem_usage=True, - disable_opt_rtn=True, - # disable_trust_remote_code=True, - # static_kv_dtype="fp8", - ) - model_base_name = model_name.rstrip("/").split("/")[-1] - output_dir = args.output_dir - if output_dir is None: - output_dir = "/mnt/disk5/hf_models/" + model_base_name + "-" + scheme + "-fp8-kv-2-test" - print(f"Output dir: {output_dir}") - - model, save_folder = autoround.quantize_and_save( - output_dir=output_dir, - format="llm_compressor", - ) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Auto-round quantization script.") - parser.add_argument( - "-m", - "--model", - help="Path to the model.", - type=str, - default=model_name, - ) - parser.add_argument( - "-o", - "--output_dir", - help="Path to the output directory.", - type=str, - default=None, - ) - args = parser.parse_args() - main(args) From c24c2bda9f6560ab1cb525e4d244c3480cc89a90 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 9 Mar 2026 00:56:22 +0000 Subject: [PATCH 07/10] fix Signed-off-by: yiliu30 --- auto_round/modeling/hpu_patch.py | 12 ++++++++++-- auto_round/utils/common.py | 7 +++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py index a7ecca40c..aa12f8ad5 100644 --- a/auto_round/modeling/hpu_patch.py +++ b/auto_round/modeling/hpu_patch.py @@ -16,12 +16,20 @@ def patch_finegrained_fp8(): import sys # Import auto-round's HPU-compatible finegrained_fp8_patch module - from auto_round.utils import is_transformers_version_greater_or_equal_5 + from auto_round.utils import is_transformers_version_greater_or_equal_5, is_transformers_version_greater_or_equal_4 if is_transformers_version_greater_or_equal_5(): patch_file_name = "auto_round.modeling.finegrained_fp8_patch" - else: + elif is_transformers_version_greater_or_equal_4(): patch_file_name = "auto_round.modeling.finegrained_fp8_patch_v4" + else: + logger.warning( + ( + "Transformers version is below 4.0.0, skipping finegrained_fp8 patching.", + " Please upgrade to Transformers 4.x or later for HPU support." + ) + ) + return finegrained_fp8_patch = importlib.import_module(patch_file_name) diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index c494c2959..7b72b4a42 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -407,3 +407,10 @@ def is_transformers_version_greater_or_equal_5(): from packaging import version return version.parse(transformers.__version__) >= version.parse("5.0.0") + +@lru_cache(None) +def is_transformers_version_greater_or_equal_4(): + import transformers + from packaging import version + + return version.parse(transformers.__version__) >= version.parse("4.0.0") From d814c91eaef43ba3df3e341766e3f87780de32e3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 9 Mar 2026 00:56:35 +0000 Subject: [PATCH 08/10] add todo Signed-off-by: yiliu30 --- auto_round/utils/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index 7b72b4a42..b32947bd4 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -408,6 +408,7 @@ def is_transformers_version_greater_or_equal_5(): return version.parse(transformers.__version__) >= version.parse("5.0.0") +# TODO: (yiliu30) refine version check logic @lru_cache(None) def is_transformers_version_greater_or_equal_4(): import transformers From 5ebebd5e9583189ada7dfc13336291cd16154702 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 01:00:17 +0000 Subject: [PATCH 09/10] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/modeling/hpu_patch.py | 9 ++++++--- auto_round/utils/common.py | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/auto_round/modeling/hpu_patch.py b/auto_round/modeling/hpu_patch.py index aa12f8ad5..db321f21a 100644 --- a/auto_round/modeling/hpu_patch.py +++ b/auto_round/modeling/hpu_patch.py @@ -16,7 +16,10 @@ def patch_finegrained_fp8(): import sys # Import auto-round's HPU-compatible finegrained_fp8_patch module - from auto_round.utils import is_transformers_version_greater_or_equal_5, is_transformers_version_greater_or_equal_4 + from auto_round.utils import ( + is_transformers_version_greater_or_equal_4, + is_transformers_version_greater_or_equal_5, + ) if is_transformers_version_greater_or_equal_5(): patch_file_name = "auto_round.modeling.finegrained_fp8_patch" @@ -25,8 +28,8 @@ def patch_finegrained_fp8(): else: logger.warning( ( - "Transformers version is below 4.0.0, skipping finegrained_fp8 patching.", - " Please upgrade to Transformers 4.x or later for HPU support." + "Transformers version is below 4.0.0, skipping finegrained_fp8 patching.", + " Please upgrade to Transformers 4.x or later for HPU support.", ) ) return diff --git a/auto_round/utils/common.py b/auto_round/utils/common.py index b32947bd4..4e9e54ec8 100644 --- a/auto_round/utils/common.py +++ b/auto_round/utils/common.py @@ -408,6 +408,7 @@ def is_transformers_version_greater_or_equal_5(): return version.parse(transformers.__version__) >= version.parse("5.0.0") + # TODO: (yiliu30) refine version check logic @lru_cache(None) def is_transformers_version_greater_or_equal_4(): From 6170a1dd43241370fb711754dbfcd6cfd023e370 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 9 Mar 2026 00:59:40 +0000 Subject: [PATCH 10/10] update license Signed-off-by: yiliu30 --- auto_round/modeling/finegrained_fp8_patch_v4.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/auto_round/modeling/finegrained_fp8_patch_v4.py b/auto_round/modeling/finegrained_fp8_patch_v4.py index 4275f60db..11c2f6882 100644 --- a/auto_round/modeling/finegrained_fp8_patch_v4.py +++ b/auto_round/modeling/finegrained_fp8_patch_v4.py @@ -1,18 +1,17 @@ -# coding=utf-8 -# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# Copyright (c) 2026 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +# Copied from https://github.com/huggingface/transformers/blob/v4.57.3/src/transformers/integrations/finegrained_fp8.py from typing import Optional from transformers.utils import is_accelerate_available, is_torch_available, logging