Spaces:
Running
on
A10G
Running
on
A10G
################################################################################ | |
# Copyright (C) 2023 Xingqian Xu - All Rights Reserved # | |
# # | |
# Please visit Versatile Diffusion's arXiv paper for more details, link at # | |
# arxiv.org/abs/2211.08332 # | |
# # | |
# Besides, this work is also inspired by many established techniques including:# | |
# Denoising Diffusion Probabilistic Model; Denoising Diffusion Implicit Model; # | |
# Latent Diffusion Model; Stable Diffusion; Stable Diffusion - Img2Img; Stable # | |
# Diffusion - Variation; ImageMixer; DreamBooth; Stable Diffusion - Lora; More # | |
# Control for Free; Prompt-to-Prompt; # | |
# # | |
################################################################################ | |
import gradio as gr | |
import os | |
import PIL | |
from PIL import Image | |
from pathlib import Path | |
import numpy as np | |
import numpy.random as npr | |
from contextlib import nullcontext | |
import types | |
import torch | |
import torchvision.transforms as tvtrans | |
from lib.cfg_helper import model_cfg_bank | |
from lib.model_zoo import get_model | |
from cusomized_gradio_blocks import create_myexamples, customized_as_example, customized_postprocess | |
# Number of samples per request; copied onto the inference wrapper and used by
# the image-producing modes, and as the grid width of the result galleries.
n_sample_image = 2
# Number of samples per request; copied onto the inference wrapper and used by
# the text-producing modes.
n_sample_text = 4
# Passed as `cache_examples` to every example widget built in this file.
cache_examples = True
from lib.model_zoo.ddim import DDIMSampler | |
########## | |
# helper # | |
########## | |
def highlight_print(info):
    """Print ``info`` framed by rows of ``#`` with a blank line before and after."""
    border = '#' * (len(info) + 4)
    for row in ('', border, '# ' + info + ' #', border, ''):
        print(row)
def _recompose(u, s, v):
    """Rebuild ``U @ diag(S) @ V^T`` for a batch of low-rank factors."""
    # Scaling the columns of ``u`` by ``s`` is the same product as
    # multiplying by diag(s), without materialising the diagonal matrices.
    return torch.bmm(u * s.unsqueeze(-2), v.transpose(-2, -1))


def decompose(x, q=20, niter=100):
    """Split a batch of matrices into a rank-``q`` part and a residual.

    The mean over the last dimension is removed first, then
    ``torch.pca_lowrank`` (``center=False``) factorises what is left.

    Args:
        x: Batched 3-D tensor; the factors are recombined with ``torch.bmm``.
        q: Number of components requested from ``torch.pca_lowrank``.
        niter: Iteration count forwarded to ``torch.pca_lowrank``.

    Returns:
        ``(u, s, v, x_mean, x_remain)`` such that
        ``u @ diag(s) @ v^T + x_mean + x_remain`` reproduces ``x``.
    """
    x_mean = x.mean(-1, keepdim=True)
    x_input = x - x_mean
    u, s, v = torch.pca_lowrank(x_input, q=q, center=False, niter=niter)
    x_remain = x_input - _recompose(u, s, v)
    return u, s, v, x_mean, x_remain


class adjust_rank(object):
    """Re-weight the leading PCA components of a batch of matrices.

    Calling the object with a level in ``[0, 1]``:

    * ``lvl == 0.5`` returns the input untouched;
    * ``lvl < 0.5`` scales down singular values ``0..max_drop_rank[0]``;
    * ``lvl > 0.5`` scales down singular values ``max_drop_rank[1]..q-1`` and
      discards the residual that the rank-``q`` factorisation left out.

    The result is rescaled so that each sample keeps the standard deviation
    (over its last two dimensions) that it had on input.
    """

    def __init__(self, max_drop_rank=(1, 5), q=20):
        """
        Args:
            max_drop_rank: Two-item sequence; item 0 is the last component
                index touched for ``lvl < 0.5``, item 1 is the first component
                index touched for ``lvl > 0.5``.
            q: Number of PCA components used by ``decompose``.
        """
        self.max_semantic_drop_rank = max_drop_rank[0]
        self.max_style_drop_rank = max_drop_rank[1]
        self.q = q

        def exp_ramp(t0, y00, t1, y01):
            # Maps a level t through exp((t-0.5)*2), then linearly so that
            # the value t0 lands on y00 and t1 lands on y01.
            return lambda t: (np.exp((t-0.5)*2)-t0)/(t1-t0)*(y01-y00)+y00

        def line_through(x0, x1, y1):
            # Straight line over component index x from (x0, y0) to (x1, y1);
            # y0 is supplied per call.
            return lambda x, y0: (x-x0)/(x1-x0)*(y1-y0)+y0

        # lvl < 0.5 side: intercept runs from -max_semantic_drop_rank at
        # lvl=0 up to 1 at lvl=0.5.
        self.t2y0_semf = exp_ramp(
            np.exp((0-0.5)*2), -self.max_semantic_drop_rank,
            np.exp((0.5-0.5)*2), 1)
        self.x2y_semf = line_through(0, self.max_semantic_drop_rank+1, 1)

        # lvl > 0.5 side: intercept runs from 1 at lvl=0.5 down to
        # -(q - max_style_drop_rank) at lvl=1.
        self.t2y0_styf = exp_ramp(
            np.exp((1-0.5)*2), -(q-self.max_style_drop_rank),
            np.exp((0.5-0.5)*2), 1)
        self.x2y_styf = line_through(q-1, self.max_style_drop_rank-1, 1)

    def __call__(self, x, lvl):
        """Return ``x`` with its PCA spectrum adjusted for ``lvl``.

        Args:
            x: Batched 3-D tensor (same requirement as ``decompose``).
            lvl: Level in ``[0, 1]``; ``0.5`` is a no-op.

        Returns:
            Tensor with the shape and dtype of ``x``.

        Raises:
            AssertionError: If ``lvl`` lies outside ``[0, 1]``.
        """
        if lvl == 0.5:
            return x
        # Validate before paying for the decomposition.
        assert lvl >= 0
        assert lvl <= 1

        # The factorisation is done in float32; reduced-precision inputs are
        # cast up here and cast back on return.
        orig_dtype = x.dtype
        reduced = orig_dtype in (torch.float16, torch.bfloat16)
        if reduced:
            x = x.float()

        # keepdim=True keeps one std per sample with shape [B, 1, 1]. Without
        # it the [B]-shaped result is broadcast against the LAST axis of x,
        # which is only correct for a batch of one.
        std_save = x.std(dim=(-2, -1), keepdim=True)

        u, s, v, x_mean, x_remain = decompose(x, q=self.q)

        if lvl < 0.5:
            y0 = self.t2y0_semf(lvl)
            for xi in range(0, self.max_semantic_drop_rank+1):
                # Negative factors are clamped so a component is at most
                # removed, never flipped.
                s[:, xi] *= float(max(self.x2y_semf(xi, y0), 0))
        else:
            y0 = self.t2y0_styf(lvl)
            for xi in range(self.max_style_drop_rank, self.q):
                s[:, xi] *= float(max(self.x2y_styf(xi, y0), 0))
            x_remain = 0

        x_new = _recompose(u, s, v) + x_mean + x_remain

        std_new = x_new.std(dim=(-2, -1), keepdim=True)
        x_new = x_new / std_new * std_save

        if reduced:
            x_new = x_new.to(orig_dtype)
        return x_new
def remove_duplicate_word(tx):
    """Collapse immediately repeated word sequences in ``tx``.

    The text is split on single spaces, and leading ``([{`` / trailing
    ``?!.,:;}])`` characters are split off each word as separate tokens.
    A ``<puncnext>`` marker records that no space belongs between the two
    neighbours. For every run length 1, 2, 3, ... any run that is directly
    followed by an identical run is reduced to one copy. The tokens are then
    joined back with spaces and the markers are removed together with their
    surrounding spaces.

    Args:
        tx: Input string.

    Returns:
        The de-duplicated string; ``''`` for empty or space-only input.
    """
    marker = '<puncnext>'

    def tokenize(text):
        tokens = []
        for word in text.split(' '):
            # Doubled, leading or trailing spaces give empty pieces; indexing
            # them would raise IndexError, so they are skipped.
            if not word:
                continue
            while word and word[0] in '([{':
                tokens += [word[0], marker]
                word = word[1:]
            trailing = []
            while word and word[-1] in '?!.,:;}])':
                trailing = [marker, word[-1]] + trailing
                word = word[:-1]
            if word:
                tokens.append(word)
            tokens += trailing
        return tokens

    def collapse_runs(tokens, length):
        # Drop the second copy of the first adjacent repeated run of
        # ``length`` tokens, then rescan from the start until none is left.
        i = 0
        while i + 2*length <= len(tokens):
            if tokens[i:i+length] == tokens[i+length:i+2*length]:
                del tokens[i+length:i+2*length]
                i = 0
            else:
                i += 1
        return tokens

    tokens = tokenize(tx)
    if not tokens:
        return ''

    length = 1
    while len(tokens) > length:
        tokens = collapse_runs(tokens, length)
        length += 1

    # Padding lets a marker at the very start or end match as well, so it
    # cannot leak into the returned text.
    padded = ' ' + ' '.join(tokens) + ' '
    return padded.replace(' ' + marker + ' ', '').strip(' ')
def get_instruction(mode):
    """Return the description text for the demo tab called ``mode``.

    Fails an assertion when ``mode`` is not one of the seven known tab names.
    """
    table = (
        ("Text-to-Image", ["Generate image from text prompt."]),
        ("Image-Variation", ["Generate image conditioned on reference image.",]),
        ("Image-to-Text", ["Generate text from reference image. "]),
        ("Text-Variation", ["Generate text from reference text prompt. "]),
        ("Dual-Context", ["Generate image conditioned on both text and image."]),
        ("Triple-Context", ["Generate image conditioned on text and up to two images."]),
        ("Multi-Context", ["Generate image from multiple contexts."]),
    )
    for name, lines in table:
        if mode == name:
            return '\n'.join(lines)
    assert False
######## | |
# main # | |
######## | |
class vd_dummy(object):
    """Placeholder with the ``inference_*`` method names and no model behind it.

    Every method accepts and ignores any arguments. All return ``None``
    except ``inference_mcg``, which returns a pair of ``None``.
    """

    def __init__(self, *args, **kwarg):
        self.which = 'Vdummy'

    def inference_t2i(self, *args, **kwarg):
        return None

    def inference_i2i(self, *args, **kwarg):
        return None

    def inference_i2t(self, *args, **kwarg):
        return None

    def inference_t2t(self, *args, **kwarg):
        return None

    def inference_dcg(self, *args, **kwarg):
        return None

    def inference_tcg(self, *args, **kwarg):
        return None

    def inference_mcg(self, *args, **kwarg):
        return (None, None)
class vd_inference(object):
    """Model wrapper behind the demo: one ``inference_*`` method per tab.

    ``__init__`` builds the network, loads its weights and creates a DDIM
    sampler. The sampling settings (step count, eta, guidance scales, output
    size, sample counts) are plain attributes set there.
    """

    def __init__(self, fp16=False, which='v2.0'):
        """Build the network, load weights and set the sampling defaults.

        Args:
            fp16: When true the network is converted with ``.half()`` and
                image inputs are cast to ``torch.float16``.
            which: Model version tag. Only ``'v1.0'`` is handled; any other
                value, including the default, fails the assertion below.
        """
        highlight_print(which)
        self.which = which

        if self.which == 'v1.0':
            cfgm = model_cfg_bank()('vd_four_flow_v1-0')
        else:
            assert False, 'Model type not supported'

        net = get_model()(cfgm)

        if fp16:
            highlight_print('Running in FP16')
            if self.which == 'v1.0':
                net.ctx['text'].fp16 = True
                net.ctx['image'].fp16 = True
            net = net.half()
            self.dtype = torch.float16
        else:
            self.dtype = torch.float32

        if self.which == 'v1.0':
            # Weights are fetched from the Hugging Face Hub; the fp16 and
            # fp32 checkpoints are separate files.
            from huggingface_hub import hf_hub_download
            if fp16:
                temppath = hf_hub_download('shi-labs/versatile-diffusion-model', 'pretrained_pth/vd-four-flow-v1-0-fp16.pth')
            else:
                temppath = hf_hub_download('shi-labs/versatile-diffusion-model', 'pretrained_pth/vd-four-flow-v1-0.pth')
            sd = torch.load(temppath, map_location='cpu')

        net.load_state_dict(sd, strict=False)

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            net.to('cuda')
        self.net = net
        self.sampler = DDIMSampler(net)

        self.output_dim = [512, 512]
        self.n_sample_image = n_sample_image
        self.n_sample_text = n_sample_text
        self.ddim_steps = 50
        self.ddim_eta = 0.0
        self.scale_textto = 7.5
        self.image_latent_dim = 4
        self.text_latent_dim = 768
        self.text_temperature = 1

        if which == 'v1.0':
            self.adjust_rank_f = adjust_rank(max_drop_rank=[1, 5], q=20)
            self.scale_imgto = 7.5
            self.disentanglement_noglobal = True

    # ------------------------------------------------------------------ #
    # private helpers shared by the inference_* entry points
    # ------------------------------------------------------------------ #

    def _seed_all(self, seed):
        """Seed NumPy with ``seed`` and torch with ``seed + 100``."""
        np.random.seed(seed)
        torch.manual_seed(seed + 100)

    def _resize(self, im, resample=None):
        """Resize a PIL image to ``self.output_dim`` (bicubic by default)."""
        h, w = self.output_dim
        if resample is None:
            resample = PIL.Image.Resampling.BICUBIC
        return im.resize([w, h], resample=resample)

    def _image_tensor(self, im):
        """Turn a PIL image into a batch of one on the model device and dtype."""
        return tvtrans.ToTensor()(im)[None].to(self.net.device).to(self.dtype)

    def _encode_text(self, text, n_samples):
        """Encode ``text`` and the empty prompt; return ``(cond, uncond)``."""
        # The empty prompt is encoded first, as in the original call order.
        u = self.net.ctx_encode([""], which='text').repeat(n_samples, 1, 1)
        c = self.net.ctx_encode([text], which='text').repeat(n_samples, 1, 1)
        return c, u

    def _focus_adjusted(self, ctx, fcs_lvl, n_samples):
        """Apply ``adjust_rank_f`` at ``fcs_lvl`` and repeat ``n_samples`` times.

        With ``disentanglement_noglobal`` set, position 0 along dim 1 is
        passed through unchanged and only the remaining positions are adjusted.
        """
        if self.disentanglement_noglobal:
            ctx_glb = ctx[:, 0:1]
            ctx_loc = self.adjust_rank_f(ctx[:, 1:], fcs_lvl)
            ctx = torch.cat([ctx_glb, ctx_loc], dim=1)
        else:
            ctx = self.adjust_rank_f(ctx, fcs_lvl)
        return ctx.repeat(n_samples, 1, 1)

    def _text_mix(self, textctx, textstrength, n_samples):
        """Prepare the optional text context for the mixed-context modes.

        Returns:
            ``(c_info_list, scale, textstrength)``. When the text is missing,
            empty, or has zero strength, the list is empty, the scale is the
            image guidance scale and the strength is forced to 0.
        """
        if (textctx is not None) and (textctx != "") and (textstrength != 0):
            ct, ut = self._encode_text(textctx, n_samples)
            scale = self.scale_imgto*(1-textstrength) + self.scale_textto*textstrength
            info = {
                'type': 'text',
                'conditioning': ct,
                'unconditional_conditioning': ut,
                'unconditional_guidance_scale': scale,
                'ratio': textstrength, }
            return [info], scale, textstrength
        return [], self.scale_imgto, 0

    def _image_latent_shape(self, n_samples):
        h, w = self.output_dim
        return [n_samples, self.image_latent_dim, h//8, w//8]

    def _sample(self, x_info, c_type, c, u, scale, shape):
        """Run the single-context sampler and return the sampled latent."""
        x, _ = self.sampler.sample(
            steps=self.ddim_steps,
            x_info=x_info,
            c_info={'type': c_type, 'conditioning': c, 'unconditional_conditioning': u,
                    'unconditional_guidance_scale': scale},
            shape=shape,
            verbose=False,
            eta=self.ddim_eta)
        return x

    def _sample_multicontext(self, c_info_list, shape):
        """Run the multi-context sampler for an image output."""
        x, _ = self.sampler.sample_multicontext(
            steps=self.ddim_steps,
            x_info={'type': 'image', },
            c_info_list=c_info_list,
            shape=shape,
            verbose=False,
            eta=self.ddim_eta)
        return x

    def _decode_images(self, x):
        """Decode image latents into a list of PIL images."""
        imout = self.net.vae_decode(x, which='image')
        return [tvtrans.ToPILImage()(i) for i in imout]

    def _decode_text(self, x):
        """Decode text latents, de-duplicate each line and join with newlines."""
        tx = self.net.vae_decode(x, which='text', temperature=self.text_temperature)
        return '\n'.join([remove_duplicate_word(txi) for txi in tx])

    # ------------------------------------------------------------------ #
    # public entry points (wired to the Gradio callbacks)
    # ------------------------------------------------------------------ #

    def inference_t2i(self, text, seed):
        """Generate ``n_sample_image`` PIL images from a text prompt."""
        n_samples = self.n_sample_image
        c, u = self._encode_text(text, n_samples)
        shape = self._image_latent_shape(n_samples)
        self._seed_all(seed)
        x = self._sample({'type': 'image'}, 'text', c, u, self.scale_textto, shape)
        return self._decode_images(x)

    def inference_i2i(self, im, fid_lvl, fcs_lvl, clr_adj, seed):
        """Generate variations of a reference image.

        Args:
            im: Reference PIL image.
            fid_lvl: ``1`` returns the resized input unchanged; ``0`` samples
                from scratch; any other value starts from the encoded input
                with ``int(ddim_steps * (1 - fid_lvl))`` forward timesteps.
            fcs_lvl: Level passed to ``adjust_rank_f``.
            clr_adj: ``'Simple'`` matches per-channel mean/std of each output
                to the input; any other value leaves the colours alone.
            seed: Integer seed.

        Returns:
            List of ``n_sample_image`` PIL images.
        """
        n_samples = self.n_sample_image
        im = self._resize(im)

        if fid_lvl == 1:
            return [im]*n_samples

        cx = self._image_tensor(im)
        c = self._focus_adjusted(self.net.ctx_encode(cx, which='image'), fcs_lvl, n_samples)
        u = torch.zeros_like(c)

        shape = self._image_latent_shape(n_samples)
        self._seed_all(seed)

        x_info = {'type': 'image'}
        if fid_lvl != 0:
            # Encoding happens after seeding, as in the original call order.
            x0 = self.net.vae_encode(cx, which='image').repeat(n_samples, 1, 1, 1)
            x_info['x0'] = x0
            x_info['x0_forward_timesteps'] = int(self.ddim_steps * (1-fid_lvl))
        x = self._sample(x_info, 'image', c, u, self.scale_imgto, shape)

        imout = self.net.vae_decode(x, which='image')

        if clr_adj == 'Simple':
            cx_mean = cx.view(3, -1).mean(-1)[:, None, None]
            cx_std = cx.view(3, -1).std(-1)[:, None, None]
            adjusted = []
            for ii in imout:
                mi = ii.view(3, -1).mean(-1)[:, None, None]
                si = ii.view(3, -1).std(-1)[:, None, None]
                adjusted.append(torch.clamp((ii-mi)/si*cx_std+cx_mean, 0, 1))
            imout = adjusted

        return [tvtrans.ToPILImage()(i) for i in imout]

    def inference_i2t(self, im, seed):
        """Generate ``n_sample_text`` captions for an image, one per line."""
        n_samples = self.n_sample_text
        # Cast to the model dtype like the other image entry points do.
        cx = self._image_tensor(self._resize(im))
        c = self.net.ctx_encode(cx, which='image').repeat(n_samples, 1, 1)
        u = self.net.ctx_encode(torch.zeros_like(cx), which='image').repeat(n_samples, 1, 1)

        shape = [n_samples, self.text_latent_dim]
        self._seed_all(seed)
        x = self._sample({'type': 'text', }, 'image', c, u, self.scale_imgto, shape)
        return self._decode_text(x)

    def inference_t2t(self, text, seed):
        """Generate ``n_sample_text`` variations of a text, one per line."""
        n_samples = self.n_sample_text
        c, u = self._encode_text(text, n_samples)

        shape = [n_samples, self.text_latent_dim]
        self._seed_all(seed)
        x = self._sample({'type': 'text', }, 'text', c, u, self.scale_textto, shape)
        return self._decode_text(x)

    def inference_dcg(self, imctx, fcs_lvl, textctx, textstrength, seed):
        """Generate images from one reference image plus an optional text.

        ``textstrength`` is the ratio given to the text context; the image
        context gets ``1 - textstrength``. Returns a list of PIL images.
        """
        n_samples = self.n_sample_image
        c_info_list, scale, textstrength = self._text_mix(textctx, textstrength, n_samples)

        cx = self._image_tensor(self._resize(imctx))
        ci = self._focus_adjusted(self.net.ctx_encode(cx, which='image'), fcs_lvl, n_samples)

        c_info_list.append({
            'type': 'image',
            'conditioning': ci,
            'unconditional_conditioning': torch.zeros_like(ci),
            'unconditional_guidance_scale': scale,
            'ratio': (1-textstrength), })

        shape = self._image_latent_shape(n_samples)
        self._seed_all(seed)
        x = self._sample_multicontext(c_info_list, shape)
        return self._decode_images(x)

    def inference_tcg(self, *args):
        """Two-image front end for ``inference_mcg``.

        Takes the first ten positional values as two image slots, pads the
        other two slots with ``None`` and forwards the remaining values.
        """
        args_imag = list(args[0:10]) + [None, None, None, None, None]*2
        args_rest = args[10:]
        imin, imout = self.inference_mcg(*args_imag, *args_rest)
        return imin, imout

    def inference_mcg(self, *args):
        """Generate images from up to four reference images plus optional text.

        Args:
            *args: Twenty values forming four slots of
                ``(image, masked_image, strength, fcs_lvl, use_mask)``,
                followed by ``textctx, textstrength, seed``. A slot whose
                ``image`` and ``masked_image`` are both ``None`` is skipped.
                With ``use_mask`` set, ``masked_image`` is indexed with
                ``'image'`` and ``'mask'``.

        Returns:
            ``(input_save, imout)``: the (masked) inputs as PIL images and
            the generated PIL images.

        Raises:
            ValueError: If every image slot is empty.
        """
        imctx = [args[0:5], args[5:10], args[10:15], args[15:20]]
        textctx, textstrength, seed = args[20:]

        n_samples = self.n_sample_image
        c_info_list, scale, textstrength = self._text_mix(textctx, textstrength, n_samples)

        input_save = []
        imc = []
        for im, imm, strength, fcs_lvl, use_mask in imctx:
            if (im is None) and (imm is None):
                continue
            if use_mask:
                cx = self._image_tensor(self._resize(imm['image']))
                m = self._resize(imm['mask'], resample=PIL.Image.Resampling.BILINEAR)
                # Keep a single channel, then invert the mask.
                m = 1 - self._image_tensor(m)[:, 0:1]
                cx_show = cx*m
                ci = self.net.ctx_encode(cx, which='image', masks=m)
            else:
                cx = self._image_tensor(self._resize(im))
                ci = self.net.ctx_encode(cx, which='image')
                cx_show = cx

            input_save.append(tvtrans.ToPILImage()(cx_show[0]))
            ci = self._focus_adjusted(ci, fcs_lvl, n_samples)
            imc.append(ci * strength)

        if not imc:
            # torch.cat on an empty list fails with a message that does not
            # point at the real cause, so fail early and clearly.
            raise ValueError('inference_mcg needs at least one reference image.')

        cis = torch.cat(imc, dim=1)
        c_info_list.append({
            'type': 'image',
            'conditioning': cis,
            'unconditional_conditioning': torch.zeros_like(cis),
            'unconditional_guidance_scale': scale,
            'ratio': (1-textstrength), })

        shape = self._image_latent_shape(n_samples)
        self._seed_all(seed)
        x = self._sample_multicontext(c_info_list, shape)
        return input_save, self._decode_images(x)
# Alternative for working on the UI without loading the model:
# vd_inference = vd_dummy()
# NOTE: this rebinds the module-level name `vd_inference` from the class to an
# instance of it; the interface builders below call methods on this instance.
vd_inference = vd_inference(which='v1.0', fp16=True)
################# | |
# sub interface # | |
################# | |
def t2i_interface(with_example=False):
    """Lay out the Text-to-Image tab and connect it to ``inference_t2i``.

    Must be called inside an active Gradio Blocks context. With
    ``with_example`` set, an examples widget is added below the controls.
    """
    description = '<p id=myinst>  Description: ' + get_instruction("Text-to-Image") + '</p>'
    gr.HTML(description)

    with gr.Row():
        with gr.Column():
            prompt_box = gr.Textbox(lines=4, placeholder="Input prompt...", label='Text Input')
            seed_box = gr.Number(20, label="Seed", precision=0)
            run_button = gr.Button("Run")
        with gr.Column():
            result_gallery = gr.Gallery(label="Image Result", elem_id='customized_imbox').style(grid=n_sample_image)

    run_button.click(
        vd_inference.inference_t2i,
        inputs=[prompt_box, seed_box],
        outputs=[result_gallery])

    if not with_example:
        return
    gr.Examples(
        label='Examples',
        examples=get_example('Text-to-Image'),
        fn=vd_inference.inference_t2i,
        inputs=[prompt_box, seed_box],
        outputs=[result_gallery],
        cache_examples=cache_examples)
def i2i_interface(with_example=False):
    """Lay out the Image-Variation tab and connect it to ``inference_i2i``.

    The fidelity, focus and colour controls start hidden; the
    'Show Detail Controls' checkbox toggles their visibility together with
    the explanatory text. Must be called inside an active Gradio Blocks
    context. With ``with_example`` set, an examples widget is added.
    """
    gr.HTML('<p id=myinst>  Description: ' + get_instruction("Image-Variation") + '</p>')
    with gr.Row():
        with gr.Column():
            img_input = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
            sim_flag = gr.Checkbox(label='Show Detail Controls')
            with gr.Row():
                fid_lvl = gr.Slider(label="Fidelity (Dislike -- Same)", minimum=0, maximum=1, value=0, step=0.02, visible=False)
                fcs_lvl = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02, visible=False)
            clr_adj = gr.Radio(label="Color Adjustment", choices=["None", "Simple"], value='Simple', visible=False)
            explain = gr.HTML('<p id=myinst>  Fidelity: How likely the output image looks like the reference image (0-dislike (default), 1-same).</p>'+
                              '<p id=myinst>  Focus: What the output image should focus on (0-semantic, 0.5-balanced (default), 1-style).</p>',
                              visible=False)
            seed = gr.Number(20, label="Seed", precision=0)
            button = gr.Button("Run")
        with gr.Column():
            img_output = gr.Gallery(label="Image Result", elem_id='customized_imbox').style(grid=n_sample_image)

    # The handler returns updates for exactly these four components, so the
    # outputs list names the same four (the seed box is never changed here).
    detail_controls = [explain, fid_lvl, fcs_lvl, clr_adj]
    sim_flag.change(
        fn=lambda x: {
            explain: gr.update(visible=x),
            fid_lvl: gr.update(visible=x),
            fcs_lvl: gr.update(visible=x),
            clr_adj: gr.update(visible=x), },
        inputs=sim_flag,
        outputs=detail_controls,)

    button.click(
        vd_inference.inference_i2i,
        inputs=[img_input, fid_lvl, fcs_lvl, clr_adj, seed],
        outputs=[img_output])

    if with_example:
        gr.Examples(
            label='Examples',
            examples=get_example('Image-Variation'),
            fn=vd_inference.inference_i2i,
            inputs=[img_input, fid_lvl, fcs_lvl, clr_adj, seed],
            outputs=[img_output],
            cache_examples=cache_examples)
def i2t_interface(with_example=False):
    """Lay out the Image-to-Text tab and connect it to ``inference_i2t``.

    Must be called inside an active Gradio Blocks context. With
    ``with_example`` set, an examples widget is added below the controls.
    """
    description = '<p id=myinst>  Description: ' + get_instruction("Image-to-Text") + '</p>'
    gr.HTML(description)

    with gr.Row():
        with gr.Column():
            image_box = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
            seed_box = gr.Number(20, label="Seed", precision=0)
            run_button = gr.Button("Run")
        with gr.Column():
            result_box = gr.Textbox(lines=4, label='Text Result')

    run_button.click(
        vd_inference.inference_i2t,
        inputs=[image_box, seed_box],
        outputs=[result_box])

    if not with_example:
        return
    gr.Examples(
        label='Examples',
        examples=get_example('Image-to-Text'),
        fn=vd_inference.inference_i2t,
        inputs=[image_box, seed_box],
        outputs=[result_box],
        cache_examples=cache_examples)
def t2t_interface(with_example=False):
    """Lay out the Text-Variation tab and connect it to ``inference_t2t``.

    Must be called inside an active Gradio Blocks context. With
    ``with_example`` set, an examples widget is added below the controls.
    """
    description = '<p id=myinst>  Description: ' + get_instruction("Text-Variation") + '</p>'
    gr.HTML(description)

    with gr.Row():
        with gr.Column():
            prompt_box = gr.Textbox(lines=4, placeholder="Input prompt...", label='Text Input')
            seed_box = gr.Number(20, label="Seed", precision=0)
            run_button = gr.Button("Run")
        with gr.Column():
            result_box = gr.Textbox(lines=4, label='Text Result')

    run_button.click(
        vd_inference.inference_t2t,
        inputs=[prompt_box, seed_box],
        outputs=[result_box])

    if not with_example:
        return
    gr.Examples(
        label='Examples',
        examples=get_example('Text-Variation'),
        fn=vd_inference.inference_t2t,
        inputs=[prompt_box, seed_box],
        outputs=[result_box],
        cache_examples=cache_examples, )
class image_mimage_swap(object):
    """Callback that switches between a plain image box and a masked one.

    ``block0`` is the plain image component and ``block1`` the masked one.
    ``which_update`` picks the strategy used on the next call: ``'both'``
    (move the value across and flip visibility), ``'visible'`` (flip
    visibility only) or ``'visible_oneoff'`` (flip visibility once, then fall
    back to ``'both'``).
    """

    def __init__(self, block0, block1):
        self.block0 = block0
        self.block1 = block1
        self.which_update = 'both'

    def __call__(self, x0, x1, flag):
        """Dispatch to the strategy named by ``which_update``."""
        strategies = (
            ('both', self.update_both),
            ('visible', self.update_visible),
            ('visible_oneoff', self.update_visible_oneoff),
        )
        for name, handler in strategies:
            if self.which_update == name:
                return handler(x0, x1, flag)
        assert False

    def update_both(self, x0, x1, flag):
        """Carry the current image over to the box that becomes visible.

        ``x0`` is the plain box value, ``x1`` the masked box value (indexed
        with ``'image'`` / ``'mask'`` when present) and ``flag`` tells whether
        the masked box should be shown.
        """
        if flag:
            hide_plain = gr.update(visible=False)
            if x0 is None:
                show_masked = gr.update(value=None, visible=True)
            else:
                # Reuse an existing mask when the masked box already has one.
                has_mask = (x1 is not None) and ('mask' in x1)
                carried = {'image': x0, 'mask': x1['mask'] if has_mask else None}
                show_masked = gr.update(value=carried, visible=True)
            return {
                self.block0: hide_plain,
                self.block1: show_masked, }

        has_image = (x1 is not None) and ('image' in x1)
        plain_value = x1['image'] if has_image else None
        return {
            self.block0: gr.update(value=plain_value, visible=True),
            self.block1: gr.update(visible=False), }

    def update_visible(self, x0, x1, flag):
        """Flip visibility only; component values are left alone."""
        return {
            self.block0: gr.update(visible=not flag),
            self.block1: gr.update(visible=flag), }

    def update_visible_oneoff(self, x0, x1, flag):
        """Flip visibility only, and switch back to ``'both'`` for later calls."""
        self.which_update = 'both'
        return {
            self.block0: gr.update(visible=not flag),
            self.block1: gr.update(visible=flag), }
class example_visible_only_hack(object):
    """Flag swap callbacks whose checkbox value differs from an incoming value.

    Holds parallel lists of checkboxes and functors. When called with one
    value per checkbox, every functor whose checkbox's ``value`` attribute
    differs from the passed value gets ``which_update = 'visible_oneoff'``.
    """

    def __init__(self, checkbox_list, functor_list):
        self.checkbox_list = checkbox_list
        self.functor_list = functor_list

    def __call__(self, *args):
        triples = zip(self.checkbox_list, self.functor_list, args)
        for checkbox, functor, incoming in triples:
            if checkbox.value != incoming:
                functor.which_update = 'visible_oneoff'
def dcg_interface(with_example=False):
    """Lay out the Dual-Context tab and connect it to ``inference_dcg``.

    The tab takes one reference image with a focus slider, a text prompt with
    a 'Text Domination' slider, and a seed. Must be called inside an active
    Gradio Blocks context. With ``with_example`` set, an examples widget is
    added below the controls.
    """
    gr.HTML('<p id=myinst>  Description: ' + get_instruction("Dual-Context") + '</p>')
    with gr.Row():
        with gr.Column():
            img = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
            fcs = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02)
            gr.HTML('<p id=myinst>  Focus: Focus on what aspect of the image? (0-semantic, 0.5-balanced (default), 1-style).</p>')

            text = gr.Textbox(lines=2, placeholder="Input prompt...", label='Text Input')
            tstrength = gr.Slider(label="Text Domination (NoEffect -- TextOnly)", minimum=0, maximum=1, value=0, step=0.02)

            seed = gr.Number(20, label="Seed", precision=0)
            button = gr.Button("Run")

        with gr.Column():
            output_gallary = gr.Gallery(label="Image Result", elem_id='customized_imbox').style(grid=n_sample_image)

    # The callback takes these five components directly; no per-slot
    # flattening is needed in this tab.
    button.click(
        vd_inference.inference_dcg,
        inputs=[img, fcs, text, tstrength, seed],
        outputs=[output_gallary])

    if with_example:
        gr.Examples(
            label='Examples',
            examples=get_example('Dual-Context'),
            fn=vd_inference.inference_dcg,
            inputs=[img, fcs, text, tstrength, seed],
            outputs=[output_gallary],
            cache_examples=cache_examples)
def tcg_interface(with_example=False):
    """Lay out the Triple-Context tab and connect it to ``inference_tcg``.

    Two image slots sit side by side. Each slot has a plain image box, a
    hidden masked image box, a weight slider, a focus slider and a
    'Use mask?' checkbox that swaps the two boxes. A text prompt, its
    domination slider, a seed and the run button follow. Must be called
    inside an active Gradio Blocks context.
    """
    gr.HTML('<p id=myinst>  Description: ' + get_instruction("Triple-Context") + '</p>')

    slots = []
    with gr.Row():
        with gr.Column(min_width=940):
            with gr.Row():
                for _ in range(2):
                    with gr.Column():
                        plain = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
                        plain.as_example = types.MethodType(customized_as_example, plain)
                        masked = gr.Image(label='Image Input with Mask', type='pil', elem_id='customized_imbox', tool='sketch', source="upload", visible=False)
                        masked.postprocess = types.MethodType(customized_postprocess, masked)
                        masked.as_example = types.MethodType(customized_as_example, masked)
                        weight = gr.Slider(label="Weight", minimum=0, maximum=1, value=1, step=0.02)
                        focus = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02)
                        use_mask = gr.Checkbox(label='Use mask?')
                        swapper = image_mimage_swap(plain, masked)

                        use_mask.change(
                            fn=swapper,
                            inputs=[plain, masked, use_mask],
                            outputs=[plain, masked],)
                        slots.append([plain, masked, weight, focus, use_mask])

            gr.HTML('<p id=myinst>  Weight: The strength of the reference image. This weight is subject to <u>Text Domination</u>).</p>'+
                    '<p id=myinst>  Focus: Focus on what aspect of the image? (0-semantic, 0.5-balanced (default), 1-style).</p>'+
                    '<p id=myinst>  Mask: Remove regions on reference image so they will not influence the output.</p>',)

            text = gr.Textbox(lines=2, placeholder="Input prompt...", label='Text Input')
            tstrength = gr.Slider(label="Text Domination (NoEffect -- TextOnly)", minimum=0, maximum=1, value=0, step=0.02)

            seed = gr.Number(20, label="Seed", precision=0)
            button = gr.Button("Run")

        with gr.Column(min_width=470):
            input_gallary = gr.Gallery(label="Input Display", elem_id="customized_imbox").style(grid=2)
            output_gallary = gr.Gallery(label="Image Result", elem_id="customized_imbox").style(grid=n_sample_image)

    # Flatten the slots in order, then append the three trailing inputs; this
    # is the positional layout the callback slices.
    input_list = [component for slot in slots for component in slot]
    input_list += [text, tstrength, seed]

    button.click(
        vd_inference.inference_tcg,
        inputs=input_list,
        outputs=[input_gallary, output_gallary])

    if with_example:
        create_myexamples(
            label='Examples',
            examples=get_example('Triple-Context'),
            fn=vd_inference.inference_tcg,
            inputs=input_list,
            outputs=[input_gallary, output_gallary, ],
            cache_examples=cache_examples, )

    # NOTE(review): the source's indentation was lost, so whether this help
    # block originally sat inside `if with_example:` could not be confirmed.
    gr.HTML('<br><p id=myinst>  How to add mask: Please see the following instructions.</p><br>'+
            '<div id="maskinst">'+
            '<img src="file/assets/demo/misc/mask_inst1.gif">'+
            '<img src="file/assets/demo/misc/mask_inst2.gif">'+
            '<img src="file/assets/demo/misc/mask_inst3.gif">'+
            '</div>')
def mcg_interface(with_example=False):
    """Lay out the Multi-Context tab and connect it to ``inference_mcg``.

    Four image slots are created, one per tab ('Image1' .. 'Image4'). Each
    has a plain image box, a hidden masked image box, weight and focus
    sliders and a 'Use mask?' checkbox that swaps the two boxes. A text
    prompt, its domination slider, a seed and the run button follow. Must be
    called inside an active Gradio Blocks context.
    """
    num_img_input = 4
    gr.HTML('<p id=myinst>  Description: ' + get_instruction("Multi-Context") + '</p>')

    slots = []
    with gr.Row():
        with gr.Column():
            for idx in range(num_img_input):
                with gr.Tab('Image{}'.format(idx+1)):
                    plain = gr.Image(label='Image Input', type='pil', elem_id='customized_imbox')
                    plain.as_example = types.MethodType(customized_as_example, plain)
                    masked = gr.Image(label='Image Input with Mask', type='pil', elem_id='customized_imbox', tool='sketch', source="upload", visible=False)
                    masked.postprocess = types.MethodType(customized_postprocess, masked)
                    masked.as_example = types.MethodType(customized_as_example, masked)

                    with gr.Row():
                        weight = gr.Slider(label="Weight", minimum=0, maximum=1, value=1, step=0.02)
                        focus = gr.Slider(label="Focus (Semantic -- Style)", minimum=0, maximum=1, value=0.5, step=0.02)
                    use_mask = gr.Checkbox(label='Use mask?')
                    gr.HTML('<p id=myinst>  Weight: The strength of the reference image. This weight is subject to <u>Text Domination</u>).</p>'+
                            '<p id=myinst>  Focus: Focus on what aspect of the image? (0-semantic, 0.5-balanced (default), 1-style).</p>'+
                            '<p id=myinst>  Mask: Remove regions on reference image so they will not influence the output.</p>',)

                    use_mask.change(
                        fn=image_mimage_swap(plain, masked),
                        inputs=[plain, masked, use_mask],
                        outputs=[plain, masked],)
                    slots.append([plain, masked, weight, focus, use_mask])

            text = gr.Textbox(lines=2, placeholder="Input prompt...", label='Text Input')
            tstrength = gr.Slider(label="Text Domination (NoEffect -- TextOnly)", minimum=0, maximum=1, value=0, step=0.02)

            seed = gr.Number(20, label="Seed", precision=0)
            button = gr.Button("Run")

        with gr.Column():
            input_gallary = gr.Gallery(label="Input Display", elem_id='customized_imbox').style(grid=4)
            output_gallary = gr.Gallery(label="Image Result", elem_id='customized_imbox').style(grid=n_sample_image)

    # Flatten the slots in order, then append the three trailing inputs; this
    # is the positional layout the callback slices.
    input_list = [component for slot in slots for component in slot]
    input_list += [text, tstrength, seed]

    button.click(
        vd_inference.inference_mcg,
        inputs=input_list,
        outputs=[input_gallary, output_gallary], )

    if with_example:
        create_myexamples(
            label='Examples',
            examples=get_example('Multi-Context'),
            fn=vd_inference.inference_mcg,
            inputs=input_list,
            outputs=[input_gallary, output_gallary],
            cache_examples=cache_examples, )

    # NOTE(review): the source's indentation was lost, so whether this help
    # block originally sat inside `if with_example:` could not be confirmed.
    gr.HTML('<br><p id=myinst>  How to add mask: Please see the following instructions.</p><br>'+
            '<div id="maskinst">'+
            '<img src="file/assets/demo/misc/mask_inst1.gif">'+
            '<img src="file/assets/demo/misc/mask_inst2.gif">'+
            '<img src="file/assets/demo/misc/mask_inst3.gif">'+
            '</div>')
########### | |
# Example # | |
########### | |
def get_example(mode):
    """Return the built-in example rows for one demo tab.

    Each row is a flat list of input values for a single example. For
    'Multi-Context' the row layout matches the ``inputs`` list that the
    interface hands to ``create_myexamples`` (four groups of image, masked
    image, weight, focus, mask flag, followed by text, text domination and
    seed); the other modes presumably follow the same convention for their
    own interfaces -- verify against the corresponding ``*_interface``.

    Args:
        mode: Name of the example set. One of 'Text-to-Image',
            'Image-Variation', 'Image-to-Text', 'Text-Variation',
            'Dual-Context', 'Triple-Context' or 'Multi-Context'.

    Returns:
        A freshly built list of example rows (safe for the caller to mutate).

    Raises:
        ValueError: If ``mode`` is not one of the names listed above.
    """
    # The table is rebuilt on every call on purpose: callers receive new list
    # objects each time, exactly as with a plain if/elif chain.
    examples = {
        'Text-to-Image': [
            ['a dream of a village in china, by Caspar David Friedrich, matte painting trending on artstation HQ', 23],
            ['a beautiful landscape with mountains and rivers', 20],
        ],
        'Image-Variation': [
            ['assets/demo/reg_example/ghibli.jpg', 0, 0.5, 'None', 20],
            ['assets/demo/reg_example/ghibli.jpg', 0.5, 0.5, 'None', 20],
            ['assets/demo/reg_example/matisse.jpg', 0, 0, 'None', 20],
            ['assets/demo/reg_example/matisse.jpg', 0, 1, 'Simple', 20],
            ['assets/demo/reg_example/vermeer.jpg', 0.2, 0.3, 'None', 30],
        ],
        'Image-to-Text': [
            ['assets/demo/reg_example/house_by_lake.jpg', 20],
        ],
        'Text-Variation': [
            ['heavy arms gundam penguin mech', 20],
        ],
        'Dual-Context': [
            ['assets/demo/reg_example/benz.jpg', 0.5, 'cyberpunk 2077', 0.7, 22],
            ['assets/demo/reg_example/ghibli.jpg', 1, 'Red maple on a hill in golden Autumn.', 0.5, 21],
        ],
        'Triple-Context': [
            [
                'assets/demo/reg_example/night_light.jpg', None, 1, 0.5, False,
                'assets/demo/reg_example/paris.jpg', None, 0.94, 0.5, False,
                "snow on the street", 0.4, 28],
            [
                'assets/demo/tcg_example/e1i0.jpg', None, 1, 0.5, False,
                'assets/demo/tcg_example/e1i1.jpg', None, 0.94, 0.5, False,
                "a painting of an elegant woman in front of the moon", 0.2, 217],
            [
                'assets/demo/tcg_example/e2i0.jpg', None, 1, 0.5, False,
                'assets/demo/reg_example/paris.jpg', None, 1, 0.5, False,
                "", 0, 29],
            [
                'assets/demo/tcg_example/e0i0.jpg', None, 1, 0.5, False,
                'assets/demo/tcg_example/e0i1.jpg', None, 0.9, 0.5, False,
                "rose blooms on the tree", 0.2, 20],
            [
                'assets/demo/reg_example/ghibli.jpg', None, 1, 1, False,
                'assets/demo/reg_example/space.jpg', None, 0.88, 0.5, False,
                "", 0, 20],
            [
                'assets/demo/reg_example/train.jpg', None, 0.8, 0.5, False,
                'assets/demo/reg_example/matisse.jpg', None, 1, 1, False,
                "", 0, 20],
        ],
        'Multi-Context': [
            [
                'assets/demo/mcg_example/e0i0.jpg', None, 1, 0.5, False,
                'assets/demo/mcg_example/e0i1.jpg', None, 1, 0.5, False,
                'assets/demo/mcg_example/e0i2.jpg', None, 0.86, 0.5, False,
                None, None, 1, 0.5, False,
                "", 0, 20],
        ],
    }
    try:
        return examples[mode]
    except (KeyError, TypeError):
        # TypeError covers unhashable ``mode`` values so that every invalid
        # argument still surfaces as ValueError, as callers expect.
        raise ValueError(
            'Unknown example mode {!r}; expected one of: {}'.format(
                mode, ', '.join(examples))) from None
############# | |
# Interface # | |
############# | |
# Custom stylesheet handed to gr.Blocks(css=css) when the page is built.
# The selectors target ids assigned elsewhere in this file:
#   #customized_imbox - elem_id given to the gallery components
#   #myinst           - id on the small instruction paragraphs emitted via gr.HTML
#   #maskinst         - id on the <div> wrapping the "how to add mask" GIFs
css = """
#customized_imbox {
min-height: 450px;
}
#customized_imbox>div[data-testid="image"] {
min-height: 450px;
}
#customized_imbox>div[data-testid="image"]>div {
min-height: 450px;
}
#customized_imbox>div[data-testid="image"]>iframe {
min-height: 450px;
}
#customized_imbox>div.unpadded_box {
min-height: 450px;
}
#myinst {
font-size: 0.8rem;
margin: 0rem;
color: #6B7280;
}
#maskinst {
text-align: justify;
min-width: 1200px;
}
#maskinst>img {
min-width:399px;
max-width:450px;
vertical-align: top;
display: inline-block;
}
#maskinst:after {
content: "";
width: 100%;
display: inline-block;
}
"""
# Banner shown above the tabs: title, abstract, authors and paper/code links.
_header_html = """
<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
<h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
Versatile Diffusion
</h1>
<h2 style="font-weight: 450; font-size: 1rem; margin-top: 0.5rem; margin-bottom: 0.5rem">
We built <b>Versatile Diffusion (VD), the first unified multi-flow multimodal diffusion framework</b>, as a step towards <b>Universal Generative AI</b>.
VD can natively support image-to-text, image-variation, text-to-image, and text-variation,
and can be further extended to other applications such as
semantic-style disentanglement, image-text dual-guided generation, latent image-to-text-to-image editing, and more.
Future versions will support more modalities such as speech, music, video and 3D.
</h2>
<h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
Xingqian Xu, Atlas Wang, Eric Zhang, Kai Wang,
and <a href="https://www.humphreyshi.com/home">Humphrey Shi</a>
[<a href="https://arxiv.org/abs/2211.08332" style="color:blue;">arXiv</a>]
[<a href="https://github.com/SHI-Labs/Versatile-Diffusion" style="color:blue;">GitHub</a>]
</h3>
</div>
"""

# Footer template; the single "{}" placeholder receives the model version.
_footer_template = """
<div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
<b>Version</b>: {}
</h3>
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
<b>Caution</b>:
We would like the raise the awareness of users of this demo of its potential issues and concerns.
Like previous large foundation models, Versatile Diffusion could be problematic in some cases, partially due to the imperfect training data and pretrained network (VAEs / context encoders) with limited scope.
In its future research phase, VD may do better on tasks such as text-to-image, image-to-text, etc., with the help of more powerful VAEs, more sophisticated network designs, and more cleaned data.
So far, we keep all features available for research testing both to show the great potential of the VD framework and to collect important feedback to improve the model in the future.
We welcome researchers and users to report issues with the HuggingFace community discussion feature or email the authors.
</h3>
<h3 style="font-weight: 450; font-size: 0.8rem; margin: 0rem">
<b>Biases and content acknowledgement</b>:
Beware that VD may output content that reinforces or exacerbates societal biases, as well as realistic faces, pornography, and violence.
VD was trained on the LAION-2B dataset, which scraped non-curated online images and text, and may contained unintended exceptions as we removed illegal content.
VD in this demo is meant only for research purposes.
</h3>
</div>
"""

# (tab title, builder) pairs, in the order the tabs appear on the page.
_tab_builders = (
    ('Text-to-Image', t2i_interface),
    ('Image-Variation', i2i_interface),
    ('Image-to-Text', i2t_interface),
    ('Text-Variation', t2t_interface),
    ('Dual-Context Image-Generation', dcg_interface),
    ('Triple-Context Image-Blender', tcg_interface),
    ('Multi-Context Image-Blender', mcg_interface),
)

with gr.Blocks(css=css) as demo:
    gr.HTML(_header_html)

    for _tab_title, _build_tab in _tab_builders:
        with gr.Tab(_tab_title):
            _build_tab(with_example=True)

    # The version is read only after all tabs are built, matching the
    # original statement order.
    gr.HTML(_footer_template.format(' ' + vd_inference.which))

# Alternative for a public share link: demo.launch(share=True)
demo.launch(debug=True)