From 6d36ba96b6888ed04c57647a374c7fcc622f9ad0 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 20 May 2026 14:12:29 +0800 Subject: [PATCH 1/3] fix oom Signed-off-by: jiqing-feng --- tests/quantization/bnb/test_mixed_int8.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 031fdc9f9e27..e90b3432c9dc 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -655,8 +655,18 @@ def setUp(self) -> None: backend_empty_cache(torch_device) model_id = "hf-internal-testing/flux.1-dev-int8-pkg" + # Load each bnb 8bit component separately and move to CPU immediately + # to avoid having both large models on GPU simultaneously (OOM on <=24GB cards). t5_8bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2") + t5_8bit = t5_8bit.to("cpu") + gc.collect() + backend_empty_cache(torch_device) + transformer_8bit = FluxTransformer2DModel.from_pretrained(model_id, subfolder="transformer") + transformer_8bit = transformer_8bit.to("cpu") + gc.collect() + backend_empty_cache(torch_device) + self.pipeline_8bit = DiffusionPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", text_encoder_2=t5_8bit, From 53e0a7cf0ac06d32b096895059fd7017b20fbcab Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 20 May 2026 14:21:06 +0800 Subject: [PATCH 2/3] revert Signed-off-by: jiqing-feng --- tests/quantization/bnb/test_mixed_int8.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index e90b3432c9dc..32ab3bd50951 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -655,25 +655,19 @@ def setUp(self) -> None: backend_empty_cache(torch_device) model_id = "hf-internal-testing/flux.1-dev-int8-pkg" - # Load each bnb 8bit component separately and move to CPU immediately - # to avoid having both large models on GPU simultaneously (OOM on <=24GB cards). t5_8bit = T5EncoderModel.from_pretrained(model_id, subfolder="text_encoder_2") - t5_8bit = t5_8bit.to("cpu") - gc.collect() - backend_empty_cache(torch_device) - transformer_8bit = FluxTransformer2DModel.from_pretrained(model_id, subfolder="transformer") - transformer_8bit = transformer_8bit.to("cpu") - gc.collect() - backend_empty_cache(torch_device) - self.pipeline_8bit = DiffusionPipeline.from_pretrained( "black-forest-labs/FLUX.1-dev", text_encoder_2=t5_8bit, transformer=transformer_8bit, torch_dtype=torch.float16, ) - self.pipeline_8bit.enable_model_cpu_offload() + # Use sequential CPU offload to keep peak GPU memory minimal (one layer at a time). + # enable_model_cpu_offload moves an entire sub-model to GPU at once, which OOMs on + # <=24 GB cards for FLUX.1-dev even with int8 quantization. + # This requires the bitsandbytes fix that preserves Int8Params.SCB across .to() calls. + self.pipeline_8bit.enable_sequential_cpu_offload() def tearDown(self): del self.pipeline_8bit From 862eb67251f75b65b3d0dae744dbd8a1e52f88fe Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 21 May 2026 09:58:16 +0800 Subject: [PATCH 3/3] adjust tol Signed-off-by: jiqing-feng --- tests/quantization/bnb/test_mixed_int8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 32ab3bd50951..daea8ff27d6d 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -713,7 +713,7 @@ def test_lora_loading(self): expected_slice = np.array([0.3916, 0.3916, 0.3887, 0.4243, 0.4155, 0.4233, 0.4570, 0.4531, 0.4248]) max_diff = numpy_cosine_similarity_distance(expected_slice, out_slice) - self.assertTrue(max_diff < 1e-3) + self.assertTrue(max_diff < 2e-3) @require_transformers_version_greater("4.44.0")