import gc
import random
import sys
from os.path import exists as path_exists

import gradio as gr
import torch
from git.repo.base import Repo
from huggingface_hub import hf_hub_download

# The sampler and model code live in the v-diffusion-pytorch repo and the CLIP
# tokenizer/encoder in OpenAI's CLIP repo; clone both on first run and import
# them straight from the cloned directories.
if not path_exists("v-diffusion-pytorch"):
    Repo.clone_from("https://github.com/crowsonkb/v-diffusion-pytorch", "v-diffusion-pytorch")
if not path_exists("CLIP"):
    Repo.clone_from("https://github.com/openai/CLIP", "CLIP")
sys.path.append('v-diffusion-pytorch')

from CLIP import clip
from diffusion import get_model, sampling, utils

cc12m_model = hf_hub_download(repo_id="multimodalart/crowsonkb-v-diffusion-cc12m-1-cfg", filename="cc12m_1_cfg.pth")
model = get_model('cc12m_1_cfg')()
_, side_y, side_x = model.shape
model.load_state_dict(torch.load(cc12m_model, map_location='cpu'))
model = model.half().cuda().eval().requires_grad_(False)
clip_model = clip.load(model.clip_model, jit=False, device='cpu')[0]
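# cc12m_1_cfg is a v-objective diffusion model trained on Conceptual 12M with
# CLIP text conditioning and classifier-free guidance. It runs in half precision
# on the GPU (a CUDA device is required), while the matching CLIP text encoder
# stays on the CPU for prompt encoding; side_y and side_x are the checkpoint's
# native output resolution.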

def run_all(prompt, steps, n_images, weight):
    # Draw a fresh seed for every request and encode the prompt with CLIP.
    seed = random.randint(0, 2147483647)
    target_embed = clip_model.encode_text(clip.tokenize(prompt)).float().cuda()

    def cfg_model_fn(x, t):
        """Classifier-free guidance wrapper: evaluate the model on a doubled
        batch (zeroed CLIP embedding = unconditional, prompt embedding =
        conditional) and blend the two v predictions with the guidance weight.
        weight = 1 gives the plain conditional prediction; larger values push
        samples harder toward the prompt."""
        n = x.shape[0]
        x_in = x.repeat([2, 1, 1, 1])
        t_in = t.repeat([2])
        clip_embed_repeat = target_embed.repeat([n, 1])
        clip_embed_in = torch.cat([torch.zeros_like(clip_embed_repeat), clip_embed_repeat])
        v_uncond, v_cond = model(x_in, t_in, clip_embed_in).chunk(2, dim=0)
        return v_uncond + (v_cond - v_uncond) * weight

    gc.collect()
    torch.cuda.empty_cache()
    torch.manual_seed(seed)
    # Start from Gaussian noise at the model's native resolution and sample with
    # PLMS over the spliced DDPM/cosine noise schedule.
    x = torch.randn([n_images, 3, side_y, side_x], device='cuda')
    t = torch.linspace(1, 0, steps + 1, device='cuda')[:-1]
    step_list = utils.get_spliced_ddpm_cosine_schedule(t)
    outs = sampling.plms_sample(cfg_model_fn, x, step_list, {})
    return [utils.to_pil_image(out) for out in outs]
    

##################### START GRADIO HERE ############################
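# Note: the widgets below use the older gr.inputs.* component API from pre-3.x
# Gradio; newer Gradio releases expose the same controls directly as gr.Textbox,
# gr.Slider, etc.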
gallery = gr.Gallery(css={"height": "256px", "width": "256px"})
iface = gr.Interface(
    fn=run_all, 
    inputs=[
        gr.inputs.Textbox(label="Prompt - try adding increments to your prompt such as 'oil on canvas', 'a painting', 'a book cover'", default="chalk pastel drawing of a dog wearing a funny hat"),
        gr.inputs.Slider(label="Steps - more steps can increase quality but will take longer to generate", default=50, maximum=250, minimum=1, step=1),
        gr.inputs.Slider(label="Number of images in parallel", default=2, maximum=4, minimum=1, step=1),
        gr.inputs.Slider(label="Weight - how strongly the image should follow the prompt (classifier-free guidance scale)", default=5, maximum=15, minimum=0, step=1),
    ], 
    outputs=gallery,
    title="Generate images from text with V-Diffusion CC12M",
    )
iface.launch()
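# When run directly (e.g. `python app.py`, or as app.py in a Hugging Face Space),
# launch() serves the interface; Gradio listens on port 7860 by default.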