| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| import gc |
| import sys |
| import unittest |
|
|
| import numpy as np |
| import torch |
| from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel |
|
|
| from diffusers import ( |
| FlowMatchEulerDiscreteScheduler, |
| SD3Transformer2DModel, |
| StableDiffusion3Img2ImgPipeline, |
| StableDiffusion3Pipeline, |
| ) |
| from diffusers.utils import load_image |
| from diffusers.utils.import_utils import is_accelerate_available |
|
|
| from ..testing_utils import ( |
| backend_empty_cache, |
| is_flaky, |
| nightly, |
| numpy_cosine_similarity_distance, |
| require_big_accelerator, |
| require_peft_backend, |
| require_torch_accelerator, |
| torch_device, |
| ) |
|
|
|
|
# Make the sibling `utils` test module importable when pytest is launched
# from the repository root.
sys.path.append(".")


from .utils import PeftLoraLoaderMixinTests


# `release_memory` is only used by the nightly integration tests below;
# accelerate is an optional dependency, so guard the import.
if is_accelerate_available():
    from accelerate.utils import release_memory
|
|
|
|
@require_peft_backend
class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
    """LoRA/PEFT smoke tests for Stable Diffusion 3, driven by the shared mixin.

    All model components are tiny randomly-initialized variants so that the
    mixin's generic LoRA tests run quickly on CPU.
    """

    pipeline_class = StableDiffusion3Pipeline
    scheduler_cls = FlowMatchEulerDiscreteScheduler
    scheduler_kwargs = {}

    # Minimal SD3 transformer configuration (single layer, small dims).
    transformer_cls = SD3Transformer2DModel
    transformer_kwargs = {
        "sample_size": 32,
        "patch_size": 1,
        "in_channels": 4,
        "num_layers": 1,
        "attention_head_dim": 8,
        "num_attention_heads": 4,
        "caption_projection_dim": 32,
        "joint_attention_dim": 32,
        "pooled_projection_dim": 64,
        "out_channels": 4,
    }

    # Tiny VAE matching the transformer's 4 latent channels.
    vae_kwargs = {
        "sample_size": 32,
        "in_channels": 3,
        "out_channels": 3,
        "block_out_channels": (4,),
        "layers_per_block": 1,
        "latent_channels": 4,
        "norm_num_groups": 1,
        "use_quant_conv": False,
        "use_post_quant_conv": False,
        "shift_factor": 0.0609,
        "scaling_factor": 1.5035,
    }

    # SD3 uses two CLIP text encoders plus a T5 encoder.
    has_three_text_encoders = True
    tokenizer_cls = CLIPTokenizer
    tokenizer_id = "hf-internal-testing/tiny-random-clip"
    tokenizer_2_cls = CLIPTokenizer
    tokenizer_2_id = "hf-internal-testing/tiny-random-clip"
    tokenizer_3_cls = AutoTokenizer
    tokenizer_3_id = "hf-internal-testing/tiny-random-t5"
    text_encoder_cls = CLIPTextModelWithProjection
    text_encoder_id = "hf-internal-testing/tiny-sd3-text_encoder"
    text_encoder_2_cls = CLIPTextModelWithProjection
    text_encoder_2_id = "hf-internal-testing/tiny-sd3-text_encoder-2"
    text_encoder_3_cls = T5EncoderModel
    text_encoder_3_id = "hf-internal-testing/tiny-random-t5"

    @property
    def output_shape(self):
        """Expected (batch, height, width, channels) of the dummy pipeline output."""
        return (1, 32, 32, 3)

    @require_torch_accelerator
    def test_sd3_lora(self):
        """
        Test loading the loras that are saved with the diffusers and peft formats.
        Related PR: https://github.com/huggingface/diffusers/pull/8584
        """
        pipeline_components, *_ = self.get_dummy_components()
        pipeline = self.pipeline_class(**pipeline_components).to(torch_device)
        pipeline.set_progress_bar_config(disable=None)

        repo = "hf-internal-testing/tiny-sd3-loras"

        # diffusers-format checkpoint: load, then unload to reset state.
        pipeline.load_lora_weights(repo, weight_name="lora_diffusers_format.safetensors")
        pipeline.unload_lora_weights()

        # peft-format checkpoint: loading must also succeed.
        pipeline.load_lora_weights(repo, weight_name="lora_peft_format.safetensors")

    @unittest.skip("Not supported in SD3.")
    def test_simple_inference_with_text_denoiser_block_scale(self):
        pass

    @unittest.skip("Not supported in SD3.")
    def test_simple_inference_with_text_denoiser_multi_adapter_block_lora(self):
        pass

    @unittest.skip("Not supported in SD3.")
    def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self):
        pass

    @unittest.skip("Not supported in SD3.")
    def test_modify_padding_mode(self):
        pass

    @is_flaky
    def test_multiple_wrong_adapter_name_raises_error(self):
        super().test_multiple_wrong_adapter_name_raises_error()
|
|
|
|
@nightly
@require_torch_accelerator
@require_peft_backend
@require_big_accelerator
class SD3LoraIntegrationTests(unittest.TestCase):
    """Nightly end-to-end LoRA test against the full SD3-medium img2img pipeline."""

    pipeline_class = StableDiffusion3Img2ImgPipeline
    repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"

    def setUp(self):
        # Start each test with a clean accelerator memory state.
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # Release any memory the test left behind.
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, seed=0):
        """Build deterministic img2img call kwargs for the given device/seed."""
        image_url = (
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
        )
        init_image = load_image(image_url)

        # MPS does not support CPU-device generators, so fall back to the
        # global seed there; everywhere else use an explicit CPU generator.
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device="cpu").manual_seed(seed)

        inputs = {
            "prompt": "corgi",
            "num_inference_steps": 2,
            "guidance_scale": 5.0,
            "output_type": "np",
            "generator": generator,
            "image": init_image,
        }
        return inputs

    def test_sd3_img2img_lora(self):
        # Load the base pipeline in fp16, fuse a community LoRA into the
        # weights, then drop the adapter bookkeeping before moving to device.
        pipeline = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
        pipeline.load_lora_weights("zwloong/sd3-lora-training-rank16-v2")
        pipeline.fuse_lora()
        pipeline.unload_lora_weights()
        pipeline = pipeline.to(torch_device)

        call_kwargs = self.get_inputs(torch_device)
        output_image = pipeline(**call_kwargs).images[0]

        # Compare the bottom-right 3x3 corner of the first row block against
        # the recorded reference values via cosine distance.
        observed = output_image[0, -3:, -3:].flatten()
        expected_slice = np.array([0.5649, 0.5405, 0.5488, 0.5688, 0.5449, 0.5513, 0.5337, 0.5107, 0.5059])
        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), observed)
        assert max_diff < 1e-4, f"Outputs are not close enough, got {max_diff}"

        pipeline.unload_lora_weights()
        release_memory(pipeline)
|
|