import gradio as gr
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import DiffusionPipeline

# from diffusers.utils.torch_utils import randn_tensor
from src.priors.prior_transformer import (
    PriorTransformer,
)  # original huggingface prior transformer without time conditioning
from src.pipelines.pipeline_kandinsky_prior import KandinskyPriorPipeline
import spaces

# Pick the inference device. CUDA is forced below because the Space runs on GPU hardware.
__DEVICE__ = "cpu"
if torch.cuda.is_available():
    __DEVICE__ = "cuda"
__DEVICE__ = "cuda"  # NOTE: unconditionally overrides the check above


class Ours:
    """ECLIPSE text-to-image prior plugged into the Kandinsky v2.2 decoder."""

    def __init__(self, device):
        text_encoder = (
            CLIPTextModelWithProjection.from_pretrained(
                "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
                projection_dim=1280,
                torch_dtype=torch.float16,
            )
            .eval()
            .requires_grad_(False)
        )
        tokenizer = CLIPTokenizer.from_pretrained(
            "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
        )
        # ECLIPSE prior: maps text embeddings to CLIP image embeddings.
        prior = PriorTransformer.from_pretrained(
            "ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior",
            torch_dtype=torch.float16,
        )
        self.pipe_prior = KandinskyPriorPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-prior",
            prior=prior,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            torch_dtype=torch.float16,
        ).to(device)
        # Kandinsky v2.2 decoder: turns image embeddings into pixels.
        self.pipe = DiffusionPipeline.from_pretrained(
            "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
        ).to(device)

    def inference(self, text, negative_text, steps, guidance_scale, width, height):
        gen_images = []
        # Generate two samples per request: prior -> image embedding -> decoder.
        for _ in range(2):
            image_emb, negative_image_emb = self.pipe_prior(
                text, negative_prompt=negative_text
            ).to_tuple()
            image = self.pipe(
                image_embeds=image_emb,
                negative_image_embeds=negative_image_emb,
                num_inference_steps=steps,
                guidance_scale=guidance_scale,
                width=width,
                height=height,
            ).images
            gen_images.append(image[0])
        return gen_images


selected_model = Ours(device=__DEVICE__)


@spaces.GPU
def get_images(text, negative_text, steps, guidance_scale, width, height, fixed_res):
    # A predefined resolution such as "512x512" overrides the manual width/height fields.
    if fixed_res != "manual":
        print(f"Using {fixed_res} resolution")
        width, height = fixed_res.split("x")
    images = selected_model.inference(
        text,
        negative_text,
        steps,
        guidance_scale,
        width=int(width),
        height=int(height),
    )
    return images


with gr.Blocks() as demo:
    gr.Markdown(
        """

[CVPR 2024] ECLIPSE: Revisiting the Text-to-Image Prior for Efficient Image Generation

Project Page | Paper

""" ) with gr.Group(): with gr.Row(): with gr.Column(): text = gr.Textbox( label="Enter your prompt", show_label=False, max_lines=1, placeholder="Enter your prompt", elem_id="prompt-text-input", ) with gr.Row(): with gr.Column(): negative_text = gr.Textbox( label="Enter your negative prompt", show_label=False, max_lines=1, placeholder="Enter your negative prompt", elem_id="prompt-text-input", ) with gr.Row(): steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=1) guidance_scale = gr.Slider( label="Guidance Scale", minimum=0, maximum=10, value=7.5, step=0.1 ) with gr.Row(): with gr.Group(): width_inp = gr.Textbox( label="Please provide the width", value="512", max_lines=1, ) height_inp = gr.Textbox( label="Please provide the height", max_lines=1, value="512", ) fixed_res = gr.Dropdown( ["manual", "512x512", "1024x1024", "1920x1080", "1280x720"], value="manual", label="Prefined Resolution", info="Either select one or manually define one!" ) with gr.Row(): btn = gr.Button(value="Generate Image") gallery = gr.Gallery( label="Generated images", show_label=False, elem_id="gallery" , columns=[2], rows=[1], object_fit="contain", height="auto") btn.click( get_images, inputs=[ text, negative_text, steps, guidance_scale, width_inp, height_inp, fixed_res, ], outputs=gallery, ) text.submit( get_images, inputs=[ text, negative_text, steps, guidance_scale, width_inp, height_inp, fixed_res, ], outputs=gallery, ) negative_text.submit( get_images, inputs=[ text, negative_text, steps, guidance_scale, width_inp, height_inp, fixed_res, ], outputs=gallery, ) with gr.Accordion(label="Ethics & Privacy", open=False): gr.HTML( """

<b>Privacy</b>
<p>We do not collect any images or user data. This demo is designed solely for fun and to reduce the misuse of AI.</p>

<b>Biases and content acknowledgment</b>
<p>This model inherits the biases of the pre-trained CLIP model.</p>
""" ) if __name__ == "__main__": demo.queue(max_size=20).launch()