kandinsky-community
/

kandinsky-2-1

@@ -25,41 +25,23 @@ pip install diffusers transformers
 ### Text to image
 ```python
-from diffusers import KandinskyPipeline, KandinskyPriorPipeline
 import torch
-pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16)
 pipe_prior.to("cuda")
 prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting"
 negative_prompt = "low quality, bad quality"
-image_emb = pipe_prior(
-    prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt
-).images
-zero_image_emb = pipe_prior(
-    negative_prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt
-).images
-pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
-pipe.to("cuda")
-images = pipe(
-    prompt,
-    image_embeds=image_emb,
-    negative_image_embeds=zero_image_emb,
-    num_images_per_prompt=2,
-    height=768,
-    width=768,
-    num_inference_steps=100,
-    guidance_scale=4.0,
-    generator=generator,
-).images[0]
-image.save("./cheeseburger_monster.png")
 ```
 ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png)
@@ -81,7 +63,9 @@ original_image = Image.open(BytesIO(response.content)).convert("RGB")
 original_image = original_image.resize((768, 512))
 # create prior
-pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16)
 pipe_prior.to("cuda")
 # create img2img pipeline
@@ -91,22 +75,16 @@ pipe.to("cuda")
 prompt = "A fantasy landscape, Cinematic lighting"
 negative_prompt = "low quality, bad quality"
-image_emb = pipe_prior(
-    prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt
-).images
-zero_image_emb = pipe_prior(
-    negative_prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt
-).images
 out = pipe(
     prompt,
     image=original_image,
-    image_embeds=image_emb,
-    negative_image_embeds=zero_image_emb,
     height=768,
     width=768,
-    num_inference_steps=500,
     strength=0.3,
 )
@@ -124,9 +102,10 @@ from diffusers.utils import load_image
 import PIL
 import torch
-from torchvision import transforms
-pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16)
 pipe_prior.to("cuda")
 img1 = load_image(
@@ -137,16 +116,20 @@ img2 = load_image(
     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/starry_night.jpeg"
 )
 images_texts = ["a cat", img1, img2]
 weights = [0.3, 0.3, 0.4]
-image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights)
 pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
 pipe.to("cuda")
-image = pipe(
-    "", image_embeds=image_emb, negative_image_embeds=zero_image_emb, height=768, width=768, num_inference_steps=150
-).images[0]
 image.save("starry_cat.png")
 ```

 ### Text to image
 ```python
+from diffusers import DiffusionPipeline
 import torch
+pipe_prior = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16)
 pipe_prior.to("cuda")
+t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
+t2i_pipe.to("cuda")
 prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting"
 negative_prompt = "low quality, bad quality"
+generator = torch.Generator(device="cuda").manual_seed(12)
+image_embeds, negative_image_embeds = pipe_prior(prompt, negative_prompt, generator=generator).to_tuple()
+image = t2i_pipe(prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds).images[0]
+image.save("cheeseburger_monster.png")
 ```
 ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png)
 original_image = original_image.resize((768, 512))
 # create prior
+pipe_prior = KandinskyPriorPipeline.from_pretrained(
+    "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+)
 pipe_prior.to("cuda")
 # create img2img pipeline
 prompt = "A fantasy landscape, Cinematic lighting"
 negative_prompt = "low quality, bad quality"
+generator = torch.Generator(device="cuda").manual_seed(30)
+image_embeds, negative_image_embeds = pipe_prior(prompt, negative_prompt, generator=generator).to_tuple()
 out = pipe(
     prompt,
     image=original_image,
+    image_embeds=image_embeds,
+    negative_image_embeds=negative_image_embeds,
     height=768,
     width=768,
     strength=0.3,
 )
 import PIL
 import torch
+pipe_prior = KandinskyPriorPipeline.from_pretrained(
+    "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
+)
 pipe_prior.to("cuda")
 img1 = load_image(
     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/starry_night.jpeg"
 )
+# add all the conditions we want to interpolate; each one can be either a text prompt or an image
 images_texts = ["a cat", img1, img2]
+# specify the weights for each condition in images_texts
 weights = [0.3, 0.3, 0.4]
+# We can leave the prompt empty
+prompt = ""
+prior_out = pipe_prior.interpolate(images_texts, weights)
 pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
 pipe.to("cuda")
+image = pipe(prompt, **prior_out, height=768, width=768).images[0]
 image.save("starry_cat.png")
 ```