From 6d36ba96b6888ed04c57647a374c7fcc622f9ad0 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 20 May 2026 14:12:29 +0800
Subject: [PATCH 1/3] Fix OOM in bnb int8 FLUX test by moving 8-bit models to CPU after load

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/quantization/bnb/test_mixed_int8.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index 031fdc9f9e27..e90b3432c9dc 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -655,8 +655,18 @@ def setUp(self) -> None:
         backend_empty_cache(torch_device)
 
         model_id = "hf-internal-testing/flux.1-dev-int8-pkg"
+        # Load each bnb 8bit component separately and move to CPU immediately
+        # to avoid having both large models on GPU simultaneously (OOM on <=24GB cards).
         t5_8bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
+        t5_8bit = t5_8bit.to("cpu")
+        gc.collect()
+        backend_empty_cache(torch_device)
+
         transformer_8bit = FluxTransformer2DModel.from_pretrained(model_id, subfolder="transformer")
+        transformer_8bit = transformer_8bit.to("cpu")
+        gc.collect()
+        backend_empty_cache(torch_device)
+
         self.pipeline_8bit = DiffusionPipeline.from_pretrained(
             "black-forest-labs/FLUX.1-dev",
             text_encoder_2=t5_8bit,

From 53e0a7cf0ac06d32b096895059fd7017b20fbcab Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 20 May 2026 14:21:06 +0800
Subject: [PATCH 2/3] Revert per-model CPU moves; use sequential CPU offload in bnb int8 FLUX test

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/quantization/bnb/test_mixed_int8.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index e90b3432c9dc..32ab3bd50951 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -655,25 +655,19 @@ def setUp(self) -> None:
         backend_empty_cache(torch_device)
 
         model_id = "hf-internal-testing/flux.1-dev-int8-pkg"
-        # Load each bnb 8bit component separately and move to CPU immediately
-        # to avoid having both large models on GPU simultaneously (OOM on <=24GB cards).
         t5_8bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2")
-        t5_8bit = t5_8bit.to("cpu")
-        gc.collect()
-        backend_empty_cache(torch_device)
-
         transformer_8bit = FluxTransformer2DModel.from_pretrained(model_id, subfolder="transformer")
-        transformer_8bit = transformer_8bit.to("cpu")
-        gc.collect()
-        backend_empty_cache(torch_device)
-
         self.pipeline_8bit = DiffusionPipeline.from_pretrained(
             "black-forest-labs/FLUX.1-dev",
             text_encoder_2=t5_8bit,
             transformer=transformer_8bit,
             torch_dtype=torch.float16,
         )
-        self.pipeline_8bit.enable_model_cpu_offload()
+        # Use sequential CPU offload to keep peak GPU memory minimal (one layer at a time).
+        # enable_model_cpu_offload moves an entire sub-model to GPU at once, which OOMs on
+        # <=24 GB cards for FLUX.1-dev even with int8 quantization.
+        # This requires the bitsandbytes fix that preserves Int8Params.SCB across .to() calls.
+        self.pipeline_8bit.enable_sequential_cpu_offload()
 
     def tearDown(self):
         del self.pipeline_8bit

From 862eb67251f75b65b3d0dae744dbd8a1e52f88fe Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 21 May 2026 09:58:16 +0800
Subject: [PATCH 3/3] Relax test_lora_loading cosine-distance tolerance from 1e-3 to 2e-3

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/quantization/bnb/test_mixed_int8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py
index 32ab3bd50951..daea8ff27d6d 100644
--- a/tests/quantization/bnb/test_mixed_int8.py
+++ b/tests/quantization/bnb/test_mixed_int8.py
@@ -713,7 +713,7 @@ def test_lora_loading(self):
         expected_slice = np.array([0.3916, 0.3916, 0.3887, 0.4243, 0.4155, 0.4233, 0.4570, 0.4531, 0.4248])
 
         max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice)
-        self.assertTrue(max_diff < 1e-3)
+        self.assertTrue(max_diff < 2e-3)
 
 
 @require_transformers_version_greater("4.44.0")