import torch
from diffusers import DiffusionPipeline
from diffusers.utils import pt_to_pil

# Stage 1: load the IF-I-M pipeline using the fp16 weight variant.
stage_1 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-I-M-v1.0", variant="fp16", torch_dtype=torch.float16
)
# NOTE(review): the upstream DeepFloyd IF example says the xformers call can
# be dropped when torch >= 2.0 -- confirm for the target environment.
stage_1.enable_xformers_memory_efficient_attention()
stage_1.enable_model_cpu_offload()

# Stage 2: load the IF-II-M pipeline. No text encoder is loaded
# (text_encoder=None); the stage-2 call later in this script is given
# precomputed prompt embeddings rather than a raw prompt.
stage_2 = DiffusionPipeline.from_pretrained(
    "DeepFloyd/IF-II-M-v1.0",
    text_encoder=None,
    variant="fp16",
    torch_dtype=torch.float16,
)
# NOTE(review): the upstream DeepFloyd IF example says the xformers call can
# be dropped when torch >= 2.0 -- confirm for the target environment.
stage_2.enable_xformers_memory_efficient_attention()
stage_2.enable_model_cpu_offload()

# Stage 3: load the Stable Diffusion x4 upscaler, handing it the feature
# extractor, safety checker and watermarker objects already held by stage 1.
safety_modules = {
    "feature_extractor": stage_1.feature_extractor,
    "safety_checker": stage_1.safety_checker,
    "watermarker": stage_1.watermarker,
}
stage_3 = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-x4-upscaler",
    **safety_modules,
    torch_dtype=torch.float16,
)
# NOTE(review): the upstream DeepFloyd IF example says the xformers call can
# be dropped when torch >= 2.0 -- confirm for the target environment.
stage_3.enable_xformers_memory_efficient_attention()
stage_3.enable_model_cpu_offload()

prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'

# Encode the prompt once with stage 1; the resulting embeddings are passed to
# both the stage-1 and stage-2 calls further down.
prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt)

# Seed torch's RNG with a fixed value; the same generator object is passed to
# all three stage calls.
generator = torch.manual_seed(0)

# Stage 1 generation. output_type="pt" is requested so the result can be fed
# directly into stage 2; pt_to_pil converts it to PIL only for saving.
image = stage_1(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    generator=generator,
    output_type="pt",
).images
pt_to_pil(image)[0].save("./if_stage_I.png")

# Stage 2 generation. Takes the stage-1 output plus the same prompt
# embeddings; output_type="pt" is requested again so the result can be fed
# into stage 3, with pt_to_pil used only for saving.
image = stage_2(
    image=image,
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_embeds,
    generator=generator,
    output_type="pt",
).images
pt_to_pil(image)[0].save("./if_stage_II.png")

# Stage 3 generation. The upscaler is given the raw text prompt (not the
# precomputed embeddings) together with the stage-2 output. No output_type is
# passed here, and the first returned image is saved directly via .save().
image = stage_3(
    prompt=prompt, image=image, generator=generator, noise_level=100
).images
image[0].save("./if_stage_III.png")