import gradio as gr
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from src.priors.prior_transformer import (
PriorTransformer,
) # original huggingface prior transformer without time conditioning
from src.pipelines.pipeline_kandinsky_prior import KandinskyPriorPipeline
from diffusers import DiffusionPipeline
import spaces
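
# Use the GPU when one is available; otherwise fall back to the CPU.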
__DEVICE__ = "cpu"
if torch.cuda.is_available():
__DEVICE__ = "cuda"
__DEVICE__ = "cuda"
class Ours:
def __init__(self, device):
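        # Frozen CLIP ViT-bigG text encoder (fp16) that produces the text embeddings for the prior.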
text_encoder = (
CLIPTextModelWithProjection.from_pretrained(
"laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
projection_dim=1280,
torch_dtype=torch.float16,
)
.eval()
.requires_grad_(False)
)
tokenizer = CLIPTokenizer.from_pretrained(
"laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
)
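        # ECLIPSE prior: a prior transformer without time conditioning that maps
        # text embeddings to image embeddings.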
prior = PriorTransformer.from_pretrained(
"ECLIPSE-Community/ECLIPSE_KandinskyV22_Prior",
torch_dtype=torch.float16,
)
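        # Kandinsky 2.2 prior pipeline with its default prior and text encoder
        # replaced by the ECLIPSE components above.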
self.pipe_prior = KandinskyPriorPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-prior",
prior=prior,
text_encoder=text_encoder,
tokenizer=tokenizer,
torch_dtype=torch.float16,
).to(device)
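        # Kandinsky 2.2 decoder that renders the predicted image embedding into an image.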
self.pipe = DiffusionPipeline.from_pretrained(
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
).to(device)
def inference(self, text, negative_text, steps, guidance_scale, width, height):
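        # Run the prior + decoder twice to fill the two-image gallery; no seed is
        # fixed, so each pass yields a different sample.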
gen_images = []
for i in range(2):
image_emb, negative_image_emb = self.pipe_prior(
text, negative_prompt=negative_text
).to_tuple()
image = self.pipe(
image_embeds=image_emb,
negative_image_embeds=negative_image_emb,
num_inference_steps=steps,
guidance_scale=guidance_scale,
width=width,
height=height,
).images
gen_images.append(image[0])
return gen_images
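
# Minimal sketch of standalone (non-Gradio) usage of the wrapper above; the
# prompt and file name are illustrative only:
#
#   model = Ours(device=__DEVICE__)
#   images = model.inference(
#       "a photo of a red panda", "", steps=50, guidance_scale=7.5, width=512, height=512
#   )
#   images[0].save("sample.png")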
selected_model = Ours(device=__DEVICE__)
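
# On Hugging Face ZeroGPU Spaces, @spaces.GPU allocates a GPU for the duration of each call.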
@spaces.GPU
def get_images(text, negative_text, steps, guidance_scale, width, height, fixed_res):
    # A preset such as "1024x1024" overrides the manual width/height boxes.
    if fixed_res != "manual":
        print(f"Using {fixed_res} resolution")
        width, height = fixed_res.split("x")
    return selected_model.inference(
        text, negative_text, steps, guidance_scale, width=int(width), height=int(height)
    )
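
# Gradio UI: prompt inputs, sampling controls, resolution options, and a two-image gallery.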
with gr.Blocks() as demo:
gr.Markdown(
"""<h1 style="text-align: center;"><b>[CVPR 2024] <i>ECLIPSE</i>: Revisiting the Text-to-Image Prior for Effecient Image Generation</b></h1>
<h1 style='text-align: center;'><a href='https://eclipse-t2i.vercel.app/'>Project Page</a> | <a href='https://arxiv.org/abs/2312.04655'>Paper</a> </h1>
"""
)
with gr.Group():
with gr.Row():
with gr.Column():
text = gr.Textbox(
label="Enter your prompt",
show_label=False,
max_lines=1,
placeholder="Enter your prompt",
elem_id="prompt-text-input",
)
with gr.Row():
with gr.Column():
negative_text = gr.Textbox(
label="Enter your negative prompt",
show_label=False,
max_lines=1,
placeholder="Enter your negative prompt",
elem_id="prompt-text-input",
)
with gr.Row():
steps = gr.Slider(label="Steps", minimum=10, maximum=100, value=50, step=1)
guidance_scale = gr.Slider(
label="Guidance Scale", minimum=0, maximum=10, value=7.5, step=0.1
)
with gr.Row():
with gr.Group():
width_inp = gr.Textbox(
label="Please provide the width",
value="512",
max_lines=1,
)
height_inp = gr.Textbox(
label="Please provide the height",
max_lines=1,
value="512",
)
fixed_res = gr.Dropdown(
["manual", "512x512", "1024x1024", "1920x1080", "1280x720"], value="manual", label="Prefined Resolution", info="Either select one or manually define one!"
)
with gr.Row():
btn = gr.Button(value="Generate Image")
    gallery = gr.Gallery(
        label="Generated images",
        show_label=False,
        elem_id="gallery",
        columns=[2],
        rows=[1],
        object_fit="contain",
        height="auto",
    )
    # The button and both prompt boxes trigger the same generation call.
    run_inputs = [
        text,
        negative_text,
        steps,
        guidance_scale,
        width_inp,
        height_inp,
        fixed_res,
    ]
    btn.click(get_images, inputs=run_inputs, outputs=gallery)
    text.submit(get_images, inputs=run_inputs, outputs=gallery)
    negative_text.submit(get_images, inputs=run_inputs, outputs=gallery)
with gr.Accordion(label="Ethics & Privacy", open=False):
gr.HTML(
"""<div class="acknowledgments">
<p><h4>Privacy</h4>
We do not collect any images or key data. This demo is designed with sole purpose of fun and reducing misuse of AI.
<p><h4>Biases and content acknowledgment</h4>
This model will have the same biases as pre-trained CLIP model. </div>
"""
)
if __name__ == "__main__":
demo.queue(max_size=20).launch()