Spaces: Build error
JiayiGuo821 committed
Commit b6e0092 • Parent(s): e43437a
Add files
- app.py +964 -0
- app_utils.py +102 -0
- assets/.DS_Store +0 -0
- assets/images/.DS_Store +0 -0
- assets/images/editing/banana.png +0 -0
- assets/images/editing/cake.png +0 -0
- assets/images/editing/rabbit.png +0 -0
- assets/images/interpolation/church1.png +0 -0
- assets/images/interpolation/church2.png +0 -0
- assets/images/interpolation/dog1.png +0 -0
- assets/images/interpolation/dog2.png +0 -0
- assets/images/interpolation/horse1.png +0 -0
- assets/images/interpolation/horse2.png +0 -0
- assets/images/interpolation/land1.png +0 -0
- assets/images/interpolation/land2.png +0 -0
- assets/images/interpolation/rabbit1.png +0 -0
- assets/images/interpolation/rabbit2.png +0 -0
- assets/images/interpolation/woman1.png +0 -0
- assets/images/interpolation/woman2.png +0 -0
- assets/images/inversion/000000029596.jpg +0 -0
- assets/images/inversion/000000560011.jpg +0 -0
- nulltxtinv_wrapper.py +450 -0
- requirements.txt +16 -0
app.py
ADDED
@@ -0,0 +1,964 @@
################################################################################
# Copyright (C) 2023 Jiayi Guo, Xingqian Xu, Manushree Vasu - All Rights Reserved #
################################################################################

import gradio as gr
import os
import os.path as osp
import PIL
from PIL import Image
import numpy as np
from collections import OrderedDict
from easydict import EasyDict as edict
from functools import partial

import torch
import torchvision.transforms as tvtrans
import time
import argparse
import json
import hashlib
import copy
from tqdm import tqdm

from diffusers import StableDiffusionPipeline
from diffusers import DDIMScheduler
from app_utils import auto_dropdown

from huggingface_hub import hf_hub_download

version = "Smooth Diffusion Demo v1.0"
refresh_symbol = "\U0001f504" # 🔄
recycle_symbol = '\U0000267b' #

##############
# model_book #
##############

choices = edict()
choices.diffuser = OrderedDict([
    ['SD-v1-5' , "runwayml/stable-diffusion-v1-5"],
    ['OJ-v4' , "prompthero/openjourney-v4"],
    ['RR-v2', "SG161222/Realistic_Vision_V2.0"],
])

choices.lora = OrderedDict([
    ['empty', ""],
    ['Smooth-LoRA-v1', hf_hub_download('shi-labs/smooth-diffusion-lora', 'pytorch_model.bin')],
])

choices.scheduler = OrderedDict([
    ['DDIM', DDIMScheduler],
])

choices.inversion = OrderedDict([
    ['NTI', 'NTI'],
    ['DDIM w/o text', 'DDIM w/o text'],
    ['DDIM', 'DDIM'],
])

default = edict()
default.diffuser = 'SD-v1-5'
default.scheduler = 'DDIM'
default.lora = 'Smooth-LoRA-v1'
default.inversion = 'NTI'
default.step = 50
default.cfg_scale = 7.5
default.framen = 24
default.fps = 16
default.nullinv_inner_step = 10
default.threshold = 0.8
default.variation = 0.8

##########
# helper #
##########

def lerp(t, v0, v1):
    if isinstance(t, float):
        return v0*(1-t) + v1*t
    elif isinstance(t, (list, np.ndarray)):
        return [v0*(1-ti) + v1*ti for ti in t]

def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    # mostly copied from
    # https://gist.github.com/dvschultz/3af50c40df002da3b751efab1daddf2c
    v0_unit = v0 / np.linalg.norm(v0)
    v1_unit = v1 / np.linalg.norm(v1)
    dot = np.sum(v0_unit * v1_unit)
    if np.abs(dot) > DOT_THRESHOLD:
        return lerp(t, v0, v1)
    # Calculate initial angle between v0 and v1
    theta_0 = np.arccos(dot)
    sin_theta_0 = np.sin(theta_0)
    # Angle at timestep t

    if isinstance(t, float):
        tlist = [t]
    elif isinstance(t, (list, np.ndarray)):
        tlist = t

    v2_list = []

    for ti in tlist:
        theta_t = theta_0 * ti
        sin_theta_t = np.sin(theta_t)
        # Finish the slerp algorithm
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = sin_theta_t / sin_theta_0
        v2 = s0 * v0 + s1 * v1
        v2_list.append(v2)

    if isinstance(t, float):
        return v2_list[0]
    else:
        return v2_list

def offset_resize(image, width=512, height=512, left=0, right=0, top=0, bottom=0):

    image = np.array(image)[:, :, :3]
    h, w, c = image.shape
    left = min(left, w-1)
    right = min(right, w - left - 1)
    top = min(top, h - left - 1)
    bottom = min(bottom, h - top - 1)
    image = image[top:h-bottom, left:w-right]
    h, w, c = image.shape
    if h < w:
        offset = (w - h) // 2
        image = image[:, offset:offset + h]
    elif w < h:
        offset = (h - w) // 2
        image = image[offset:offset + w]
    image = Image.fromarray(image).resize((width, height))
    return image

def auto_dtype_device_shape(tlist, v0, v1, func,):
    vshape = v0.shape
    assert v0.shape == v1.shape
    assert isinstance(tlist, (list, np.ndarray))

    if isinstance(v0, torch.Tensor):
        is_torch = True
        dtype, device = v0.dtype, v0.device
        v0 = v0.to('cpu').numpy().astype(float).flatten()
        v1 = v1.to('cpu').numpy().astype(float).flatten()
    else:
        is_torch = False
        dtype = v0.dtype
        assert isinstance(v0, np.ndarray)
        assert isinstance(v1, np.ndarray)
        v0 = v0.astype(float).flatten()
        v1 = v1.astype(float).flatten()

    r = func(tlist, v0, v1)

    if is_torch:
        r = [torch.Tensor(ri).view(*vshape).to(dtype).to(device) for ri in r]
    else:
        r = [ri.astype(dtype) for ri in r]
    return r

auto_lerp = partial(auto_dtype_device_shape, func=lerp)
auto_slerp = partial(auto_dtype_device_shape, func=slerp)

def frames2mp4(vpath, frames, fps):
    import moviepy.editor as mpy
    frames = [np.array(framei) for framei in frames]
    clip = mpy.ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(vpath, fps=fps)

def negseed_to_rndseed(seed):
    if seed < 0:
        seed = np.random.randint(0, np.iinfo(np.uint32).max-100)
    return seed

def regulate_image(pilim):
    w, h = pilim.size
    w = int(round(w/64)) * 64
    h = int(round(h/64)) * 64
    return pilim.resize([w, h], resample=PIL.Image.BILINEAR)

def txt_to_emb(model, prompt):
    text_input = model.tokenizer(
        prompt,
        padding="max_length",
        max_length=model.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",)
    text_embeddings = model.text_encoder(text_input.input_ids.to(model.device))[0]
    return text_embeddings

def hash_pilim(pilim):
    hasha = hashlib.md5(pilim.tobytes()).hexdigest()
    return hasha

def hash_cfgdict(cfgdict):
    hashb = hashlib.md5(json.dumps(cfgdict, sort_keys=True).encode('utf-8')).hexdigest()
    return hashb

def remove_earliest_file(path, max_allowance=500, remove_ratio=0.1, ext=None):
    if len(os.listdir(path)) <= max_allowance:
        return
    def get_mtime(fname):
        return osp.getmtime(osp.join(path, fname))
    if ext is None:
        flist = sorted(os.listdir(path), key=get_mtime)
    else:
        flist = [fi for fi in os.listdir(path) if fi.endswith(ext)]
        flist = sorted(flist, key=get_mtime)
    exceedn = max(len(flist)-max_allowance, 0)
    removen = int(max_allowance*remove_ratio)
    removen = max(1, removen) + exceedn
    for fi in flist[0:removen]:
        os.remove(osp.join(path, fi))

def remove_decoupled_file(path, exta='.mp4', extb='.json'):
    tag_a = [osp.splitext(fi)[0] for fi in os.listdir(path) if fi.endswith(exta)]
    tag_b = [osp.splitext(fi)[0] for fi in os.listdir(path) if fi.endswith(extb)]
    tag_a_extra = set(tag_a) - set(tag_b)
    tag_b_extra = set(tag_b) - set(tag_a)
    [os.remove(osp.join(path, tagi+exta)) for tagi in tag_a_extra]
    [os.remove(osp.join(path, tagi+extb)) for tagi in tag_b_extra]

@torch.no_grad()
def t2i_core(model, xt, emb, nemb, step=30, cfg_scale=7.5, return_list=False):
    from nulltxtinv_wrapper import diffusion_step, latent2image
    model.scheduler.set_timesteps(step)
    xi = xt
    emb = txt_to_emb(model, "") if emb is None else emb
    nemb = txt_to_emb(model, "") if nemb is None else nemb
    if return_list:
        xi_list = [xi.clone()]
    for i, t in enumerate(tqdm(model.scheduler.timesteps)):
        embi = emb[i] if isinstance(emb, list) else emb
        nembi = nemb[i] if isinstance(nemb, list) else nemb
        context = torch.cat([nembi, embi])
        xi = diffusion_step(model, xi, context, t, cfg_scale, low_resource=False)
        if return_list:
            xi_list.append(xi.clone())
    x0 = xi
    im = latent2image(model.vae, x0, return_type='pil')

    if return_list:
        return im, xi_list
    else:
        return im

########
# main #
########

class wrapper(object):
    def __init__(self,
                 fp16=False,
                 tag_diffuser=None,
                 tag_lora=None,
                 tag_scheduler=None,):

        self.device = "cuda"
        if fp16:
            self.torch_dtype = torch.float16
        else:
            self.torch_dtype = torch.float32
        self.load_all(tag_diffuser, tag_lora, tag_scheduler)

        self.image_latent_dim = 4
        self.batchsize = 8
        self.seed = {}

        self.cache_video_folder = "temp/video"
        self.cache_video_maxn = 500
        self.cache_image_folder = "temp/image"
        self.cache_image_maxn = 500
        self.cache_inverse_folder = "temp/inverse"
        self.cache_inverse_maxn = 500

    def load_all(self, tag_diffuser, tag_lora, tag_scheduler):
        self.load_diffuser_lora(tag_diffuser, tag_lora)
        self.load_scheduler(tag_scheduler)
        return tag_diffuser, tag_lora, tag_scheduler

    def load_diffuser_lora(self, tag_diffuser, tag_lora):
        self.net = StableDiffusionPipeline.from_pretrained(
            choices.diffuser[tag_diffuser], torch_dtype=self.torch_dtype).to(self.device)
        self.net.safety_checker = None
        if tag_lora != 'empty':
            self.net.unet.load_attn_procs(
                choices.lora[tag_lora], use_safetensors=False,)
        self.tag_diffuser = tag_diffuser
        self.tag_lora = tag_lora
        return tag_diffuser, tag_lora

    def load_scheduler(self, tag_scheduler):
        self.net.scheduler = choices.scheduler[tag_scheduler].from_config(self.net.scheduler.config)
        self.tag_scheduler = tag_scheduler
        return tag_scheduler

    def reset_seed(self, which='ltintp'):
        return -1

    def recycle_seed(self, which='ltintp'):
        if which not in self.seed:
            return self.reset_seed(which=which)
        else:
            return self.seed[which]

    ##########
    # helper #
    ##########

    def precheck_model(self, tag_diffuser, tag_lora, tag_scheduler):
        if (tag_diffuser != self.tag_diffuser) or (tag_lora != self.tag_lora):
            self.load_all(tag_diffuser, tag_lora, tag_scheduler)
        if tag_scheduler != self.tag_scheduler:
            self.load_scheduler(tag_scheduler)

    ########
    # main #
    ########

    def ddiminv(self, img, cfgdict):
        txt, step, cfg_scale = cfgdict['txt'], cfgdict['step'], cfgdict['cfg_scale']
        from nulltxtinv_wrapper import NullInversion
        null_inversion_model = NullInversion(self.net, step, cfg_scale)
        with torch.no_grad():
            emb = txt_to_emb(self.net, txt)
            nemb = txt_to_emb(self.net, "")
            xt = null_inversion_model.ddim_invert(img, txt)
        data = {
            'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt,
            'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
            'xt': xt, 'emb': emb, 'nemb': nemb,}
        return data

    def nullinv_or_loadcache(self, img, cfgdict, force_reinvert=False):
        hash = hash_pilim(img) + "--" + hash_cfgdict(cfgdict)
        cdir = self.cache_inverse_folder
        cfname = osp.join(cdir, hash+'.pth')

        if osp.isfile(cfname) and (not force_reinvert):
            cache_data = torch.load(cfname)
            dtype = next(self.net.unet.parameters()).dtype
            device = next(self.net.unet.parameters()).device
            cache_data['xt'] = cache_data['xt'].to(device=device, dtype=dtype)
            cache_data['emb'] = cache_data['emb'].to(device=device, dtype=dtype)
            cache_data['nemb'] = [
                nembi.to(device=device, dtype=dtype)
                for nembi in cache_data['nemb']]
            return cache_data
        else:
            txt, step, cfg_scale = cfgdict['txt'], cfgdict['step'], cfgdict['cfg_scale']
            inner_step = cfgdict['inner_step']
            from nulltxtinv_wrapper import NullInversion
            null_inversion_model = NullInversion(self.net, step, cfg_scale)
            with torch.no_grad():
                emb = txt_to_emb(self.net, txt)
            xt, nemb = null_inversion_model.null_invert(img, txt, num_inner_steps=inner_step)
            cache_data = {
                'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt' : xt.to('cpu'),
                'emb' : emb.to('cpu'),
                'nemb' : [nembi.to('cpu') for nembi in nemb],}
            os.makedirs(cdir, exist_ok=True)
            remove_earliest_file(cdir, max_allowance=self.cache_inverse_maxn)
            torch.save(cache_data, cfname)
            data = {
                'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt' : xt, 'emb' : emb, 'nemb' : nemb,}
            return data

    def nullinvdual_or_loadcachedual(self, img0, img1, cfgdict, force_reinvert=False):
        hash = hash_pilim(img0) + "--" + hash_pilim(img1) + "--" + hash_cfgdict(cfgdict)
        cdir = self.cache_inverse_folder
        cfname = osp.join(cdir, hash+'.pth')

        if osp.isfile(cfname) and (not force_reinvert):
            cache_data = torch.load(cfname)
            dtype = next(self.net.unet.parameters()).dtype
            device = next(self.net.unet.parameters()).device
            cache_data['xt0'] = cache_data['xt0'].to(device=device, dtype=dtype)
            cache_data['xt1'] = cache_data['xt1'].to(device=device, dtype=dtype)
            cache_data['emb0'] = cache_data['emb0'].to(device=device, dtype=dtype)
            cache_data['emb1'] = cache_data['emb1'].to(device=device, dtype=dtype)
            cache_data['nemb'] = [
                nembi.to(device=device, dtype=dtype)
                for nembi in cache_data['nemb']]

            cache_data_a = copy.deepcopy(cache_data)
            cache_data_a['xt'] = cache_data_a.pop('xt0')
            cache_data_a['emb'] = cache_data_a.pop('emb0')
            cache_data_a.pop('xt1'); cache_data_a.pop('emb1')

            cache_data_b = cache_data
            cache_data_b['xt'] = cache_data_b.pop('xt1')
            cache_data_b['emb'] = cache_data_b.pop('emb1')
            cache_data_b.pop('xt0'); cache_data_b.pop('emb0')

            return cache_data_a, cache_data_b
        else:
            txt0, txt1, step, cfg_scale, inner_step = \
                cfgdict['txt0'], cfgdict['txt1'], cfgdict['step'], \
                cfgdict['cfg_scale'], cfgdict['inner_step']

            from nulltxtinv_wrapper import NullInversion
            null_inversion_model = NullInversion(self.net, step, cfg_scale)
            with torch.no_grad():
                emb0 = txt_to_emb(self.net, txt0)
                emb1 = txt_to_emb(self.net, txt1)

            xt0, xt1, nemb = null_inversion_model.null_invert_dual(
                img0, img1, txt0, txt1, num_inner_steps=inner_step)
            cache_data = {
                'step' : step, 'cfg_scale' : cfg_scale,
                'txt0' : txt0, 'txt1' : txt1,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt0' : xt0.to('cpu'), 'xt1' : xt1.to('cpu'),
                'emb0' : emb0.to('cpu'), 'emb1' : emb1.to('cpu'),
                'nemb' : [nembi.to('cpu') for nembi in nemb],}
            os.makedirs(cdir, exist_ok=True)
            remove_earliest_file(cdir, max_allowance=self.cache_inverse_maxn)
            torch.save(cache_data, cfname)
            data0 = {
                'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt0,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt' : xt0, 'emb' : emb0, 'nemb' : nemb,}
            data1 = {
                'step' : step, 'cfg_scale' : cfg_scale, 'txt' : txt1,
                'inner_step' : inner_step,
                'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,
                'xt' : xt1, 'emb' : emb1, 'nemb' : nemb,}
            return data0, data1

    def image_inversion(
            self, img, txt,
            cfg_scale, step,
            inversion, inner_step, force_reinvert):
        from nulltxtinv_wrapper import text2image_ldm
        if inversion == 'DDIM w/o text':
            txt = ''
        if not inversion == 'NTI':
            data = self.ddiminv(img, {'txt':txt, 'step':step, 'cfg_scale':cfg_scale,})
        else:
            data = self.nullinv_or_loadcache(
                img, {'txt':txt, 'step':step,
                      'cfg_scale':cfg_scale, 'inner_step':inner_step,
                      'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,}, force_reinvert)

        if inversion == 'NTI':
            img_inv, _ = text2image_ldm(
                self.net, [txt], step, cfg_scale,
                latent=data['xt'], uncond_embeddings=data['nemb'])
        else:
            img_inv, _ = text2image_ldm(
                self.net, [txt], step, cfg_scale,
                latent=data['xt'], uncond_embeddings=None)

        return img_inv

    def image_editing(
            self, img, txt_0, txt_1,
            cfg_scale, step, thresh,
            inversion, inner_step, force_reinvert):
        from nulltxtinv_wrapper import text2image_ldm_imedit
        if inversion == 'DDIM w/o text':
            txt_0 = ''
        if not inversion == 'NTI':
            data = self.ddiminv(img, {'txt':txt_0, 'step':step, 'cfg_scale':cfg_scale,})
            img_edited, _ = text2image_ldm_imedit(
                self.net, thresh, [txt_0], [txt_1], step, cfg_scale,
                latent=data['xt'], uncond_embeddings=None)
        else:
            data = self.nullinv_or_loadcache(
                img, {'txt':txt_0, 'step':step,
                      'cfg_scale':cfg_scale, 'inner_step':inner_step,
                      'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,}, force_reinvert)
            img_edited, _ = text2image_ldm_imedit(
                self.net, thresh, [txt_0], [txt_1], step, cfg_scale,
                latent=data['xt'], uncond_embeddings=data['nemb'])

        return img_edited

    def general_interpolation(
            self, xset0, xset1,
            cfg_scale, step, tlist,):

        xt0, emb0, nemb0 = xset0['xt'], xset0['emb'], xset0['nemb']
        xt1, emb1, nemb1 = xset1['xt'], xset1['emb'], xset1['nemb']
        framen = len(tlist)

        xt_list = auto_slerp(tlist, xt0, xt1)
        emb_list = auto_lerp(tlist, emb0, emb1)

        if isinstance(nemb0, list) and isinstance(nemb1, list):
            assert len(nemb0) == len(nemb1)
            nemb_list = [auto_lerp(tlist, e0, e1) for e0, e1 in zip(nemb0, nemb1)]
            nemb_islist = True
        else:
            nemb_list = auto_lerp(tlist, nemb0, nemb1)
            nemb_islist = False

        im_list = []
        for frameidx in range(0, len(xt_list), self.batchsize):
            xt_batch = [xt_list[idx] for idx in range(frameidx, min(frameidx+self.batchsize, framen))]
            xt_batch = torch.cat(xt_batch, dim=0)
            emb_batch = [emb_list[idx] for idx in range(frameidx, min(frameidx+self.batchsize, framen))]
            emb_batch = torch.cat(emb_batch, dim=0)
            if nemb_islist:
                nemb_batch = []
                for nembi in nemb_list:
                    nembi_batch = [nembi[idx] for idx in range(frameidx, min(frameidx+self.batchsize, framen))]
                    nembi_batch = torch.cat(nembi_batch, dim=0)
                    nemb_batch.append(nembi_batch)
            else:
                nemb_batch = [nemb_list[idx] for idx in range(frameidx, min(frameidx+self.batchsize, framen))]
                nemb_batch = torch.cat(nemb_batch, dim=0)

            im = t2i_core(
                self.net, xt_batch, emb_batch, nemb_batch, step, cfg_scale)
            im_list += im if isinstance(im, list) else [im]

        return im_list

    def run_iminvs(
            self, img, text,
            cfg_scale, step,
            force_resize, width, height,
            inversion, inner_step, force_reinvert,
            tag_diffuser, tag_lora, tag_scheduler, ):

        self.precheck_model(tag_diffuser, tag_lora, tag_scheduler)

        if force_resize:
            img = offset_resize(img, width, height)
        else:
            img = regulate_image(img)

        recon_output = self.image_inversion(
            img, text, cfg_scale, step,
            inversion, inner_step, force_reinvert)

        idir = self.cache_image_folder
        os.makedirs(idir, exist_ok=True)
        remove_earliest_file(idir, max_allowance=self.cache_image_maxn)
        sname = "time{}_iminvs_{}_{}".format(
            int(time.time()), self.tag_diffuser, self.tag_lora,)
        ipath = osp.join(idir, sname+'.png')
        recon_output.save(ipath)

        return [recon_output]

    def run_imedit(
            self, img, txt_0,txt_1,
            threshold, cfg_scale, step,
            force_resize, width, height,
            inversion, inner_step, force_reinvert,
            tag_diffuser, tag_lora, tag_scheduler, ):

        self.precheck_model(tag_diffuser, tag_lora, tag_scheduler)
        if force_resize:
            img = offset_resize(img, width, height)
        else:
            img = regulate_image(img)

        edited_img= self.image_editing(
            img, txt_0,txt_1, cfg_scale, step, threshold,
            inversion, inner_step, force_reinvert)

        idir = self.cache_image_folder
        os.makedirs(idir, exist_ok=True)
        remove_earliest_file(idir, max_allowance=self.cache_image_maxn)
        sname = "time{}_imedit_{}_{}".format(
            int(time.time()), self.tag_diffuser, self.tag_lora,)
        ipath = osp.join(idir, sname+'.png')
        edited_img.save(ipath)

        return [edited_img]


    def run_imintp(
            self,
            img0, img1, txt0, txt1,
            cfg_scale, step,
            framen, fps,
            force_resize, width, height,
            inversion, inner_step, force_reinvert,
            tag_diffuser, tag_lora, tag_scheduler,):

        self.precheck_model(tag_diffuser, tag_lora, tag_scheduler)
        if txt1 == '':
            txt1 = txt0
        if force_resize:
            img0 = offset_resize(img0, width, height)
            img1 = offset_resize(img1, width, height)
        else:
            img0 = regulate_image(img0)
            img1 = regulate_image(img1)

        if inversion == 'DDIM':
            data0 = self.ddiminv(img0, {'txt':txt0, 'step':step, 'cfg_scale':cfg_scale,})
            data1 = self.ddiminv(img1, {'txt':txt1, 'step':step, 'cfg_scale':cfg_scale,})
        elif inversion == 'DDIM w/o text':
            data0 = self.ddiminv(img0, {'txt':"", 'step':step, 'cfg_scale':cfg_scale,})
            data1 = self.ddiminv(img1, {'txt':"", 'step':step, 'cfg_scale':cfg_scale,})
        else:
            data0, data1 = self.nullinvdual_or_loadcachedual(
                img0, img1, {'txt0':txt0, 'txt1':txt1, 'step':step,
                             'cfg_scale':cfg_scale, 'inner_step':inner_step,
                             'diffuser' : self.tag_diffuser, 'lora' : self.tag_lora,}, force_reinvert)

        tlist = np.linspace(0.0, 1.0, framen)

        iminv0 = t2i_core(self.net, data0['xt'], data0['emb'], data0['nemb'], step, cfg_scale)
        iminv1 = t2i_core(self.net, data1['xt'], data1['emb'], data1['nemb'], step, cfg_scale)
        frames = self.general_interpolation(data0, data1, cfg_scale, step, tlist)

        vdir = self.cache_video_folder
        os.makedirs(vdir, exist_ok=True)
        remove_earliest_file(vdir, max_allowance=self.cache_video_maxn)
        sname = "time{}_imintp_{}_{}_framen{}_fps{}".format(
            int(time.time()), self.tag_diffuser, self.tag_lora, framen, fps)
        vpath = osp.join(vdir, sname+'.mp4')
        frames2mp4(vpath, frames, fps)
        jpath = osp.join(vdir, sname+'.json')
        cfgdict = {
            "method" : "image_interpolation",
            "txt0" : txt0, "txt1" : txt1,
            "cfg_scale" : cfg_scale, "step" : step,
            "framen" : framen, "fps" : fps,
            "force_resize" : force_resize, "width" : width, "height" : height,
            "inversion" : inversion, "inner_step" : inner_step,
            "force_reinvert" : force_reinvert,
            "tag_diffuser" : tag_diffuser, "tag_lora" : tag_lora, "tag_scheduler" : tag_scheduler,}
        with open(jpath, 'w') as f:
            json.dump(cfgdict, f, indent=4)

        return frames, vpath, [iminv0, iminv1]

#################
# get examples #
#################
cache_examples = False
def get_imintp_example():
    case = [
        [
            'assets/images/interpolation/cityview1.png',
            'assets/images/interpolation/cityview2.png',
            'A city view',],
        [
            'assets/images/interpolation/woman1.png',
            'assets/images/interpolation/woman2.png',
            'A woman face',],
        [
            'assets/images/interpolation/land1.png',
            'assets/images/interpolation/land2.png',
            'A beautiful landscape',],
        [
            'assets/images/interpolation/dog1.png',
            'assets/images/interpolation/dog2.png',
            'A realistic dog',],
        [
            'assets/images/interpolation/church1.png',
            'assets/images/interpolation/church2.png',
            'A church',],
        [
            'assets/images/interpolation/rabbit1.png',
            'assets/images/interpolation/rabbit2.png',
            'A cute rabbit',],
        [
            'assets/images/interpolation/horse1.png',
            'assets/images/interpolation/horse2.png',
            'A robot horse',],
    ]
    return case

def get_iminvs_example():
    case = [
        [
            'assets/images/inversion/000000560011.jpg',
            'A mouse is next to a keyboard on a desk',],
        [
            'assets/images/inversion/000000029596.jpg',
            'A room with a couch, table set with dinnerware and a television.',],
    ]
    return case


def get_imedit_example():
    case = [
        [
            'assets/images/editing/rabbit.png',
            'A rabbit is eating a watermelon on the table',
            'A cat is eating a watermelon on the table',
            0.7,],
        [
            'assets/images/editing/cake.png',
            'A chocolate cake with cream on it',
            'A chocolate cake with strawberries on it',
            0.9,],
        [
            'assets/images/editing/banana.png',
            'A banana on the table',
            'A banana and an apple on the table',
            0.8,],

    ]
    return case


#################
# sub interface #
#################


def interface_imintp(wrapper_obj):
    with gr.Row():
        with gr.Column():
            img0 = gr.Image(label="Image Input 0", type='pil', elem_id='customized_imbox')
        with gr.Column():
            img1 = gr.Image(label="Image Input 1", type='pil', elem_id='customized_imbox')
        with gr.Column():
            video_output = gr.Video(label="Video Result", format='mp4', elem_id='customized_imbox')
    with gr.Row():
        with gr.Column():
            txt0 = gr.Textbox(label='Text Input', lines=1, placeholder="Input prompt...", )
        with gr.Column():
            with gr.Row():
                inversion = auto_dropdown('Inversion', choices.inversion, default.inversion)
                inner_step = gr.Slider(label="Inner Step (NTI)", value=default.nullinv_inner_step, minimum=1, maximum=10, step=1)
                force_reinvert = gr.Checkbox(label="Force ReInvert (NTI)", value=False)


    with gr.Row():
        with gr.Column():
            with gr.Row():
                framen = gr.Slider(label="Frame Number", minimum=8, maximum=default.framen, value=default.framen, step=1)
                fps = gr.Slider(label="Video FPS", minimum=4, maximum=default.fps, value=default.fps, step=4)
            with gr.Row():
                button_run = gr.Button("Run")


        with gr.Column():
            with gr.Accordion('Frame Results', open=False):
                frame_output = gr.Gallery(label="Frames", elem_id='customized_imbox')
            with gr.Accordion("Inversion Results", open=False):
                inv_output = gr.Gallery(label="Inversion Results", elem_id='customized_imbox')
            with gr.Accordion('Advanced Settings', open=False):
                with gr.Row():
                    tag_diffuser = auto_dropdown('Diffuser', choices.diffuser, default.diffuser)
                    tag_lora = auto_dropdown('Use LoRA', choices.lora, default.lora)
                    tag_scheduler = auto_dropdown('Scheduler', choices.scheduler, default.scheduler)
                with gr.Row():
                    cfg_scale = gr.Number(label="Scale", minimum=1, maximum=10, value=default.cfg_scale, step=0.5)
                    step = gr.Number(default.step, label="Step", precision=0)
                with gr.Row():
                    force_resize = gr.Checkbox(label="Force Resize", value=True)
                    inp_width = gr.Slider(label="Width", minimum=256, maximum=1024, value=512, step=64)
                    inp_height = gr.Slider(label="Height", minimum=256, maximum=1024, value=512, step=64)
                with gr.Row():
                    txt1 = gr.Textbox(label='Optional Different Text Input for Image Input 1', lines=1, placeholder="Input prompt...", )


    tag_diffuser.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_lora.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_scheduler.change(
        wrapper_obj.load_scheduler,
        inputs = [tag_scheduler],
        outputs = [tag_scheduler],)

    button_run.click(
        wrapper_obj.run_imintp,
        inputs=[img0, img1, txt0, txt1,
                cfg_scale, step,
                framen, fps,
                force_resize, inp_width, inp_height,
                inversion, inner_step, force_reinvert,
                tag_diffuser, tag_lora, tag_scheduler,],
        outputs=[frame_output, video_output, inv_output])

    gr.Examples(
        label='Examples',
        examples=get_imintp_example(),
        fn=wrapper_obj.run_imintp,
        inputs=[img0, img1, txt0,],
        outputs=[frame_output, video_output, inv_output],
        cache_examples=cache_examples,)

def interface_iminvs(wrapper_obj):
    with gr.Row():
        image_input = gr.Image(label="Image input", type='pil', elem_id='customized_imbox')
        recon_output = gr.Gallery(label="Reconstruction output", elem_id='customized_imbox')
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(label='Text Input', lines=1, placeholder="Input prompt...", )
            with gr.Row():
                button_run = gr.Button("Run")


        with gr.Column():
            with gr.Row():
                inversion = auto_dropdown('Inversion', choices.inversion, default.inversion)
                inner_step = gr.Slider(label="Inner Step (NTI)", value=default.nullinv_inner_step, minimum=1, maximum=10, step=1)
                force_reinvert = gr.Checkbox(label="Force ReInvert (NTI)", value=False)
            with gr.Accordion('Advanced Settings', open=False):
                with gr.Row():
                    tag_diffuser = auto_dropdown('Diffuser', choices.diffuser, default.diffuser)
                    tag_lora = auto_dropdown('Use LoRA', choices.lora, default.lora)
                    tag_scheduler = auto_dropdown('Scheduler', choices.scheduler, default.scheduler)
                with gr.Row():
                    cfg_scale = gr.Number(label="Scale", minimum=1, maximum=10, value=default.cfg_scale, step=0.5)
                    step = gr.Number(default.step, label="Step", precision=0)
                with gr.Row():
                    force_resize = gr.Checkbox(label="Force Resize", value=True)
                    inp_width = gr.Slider(label="Width", minimum=256, maximum=1024, value=512, step=64)
                    inp_height = gr.Slider(label="Height", minimum=256, maximum=1024, value=512, step=64)


    tag_diffuser.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_lora.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_scheduler.change(
        wrapper_obj.load_scheduler,
        inputs = [tag_scheduler],
        outputs = [tag_scheduler],)

    button_run.click(
        wrapper_obj.run_iminvs,
        inputs=[image_input, prompt,
                cfg_scale, step,
                force_resize, inp_width, inp_height,
                inversion, inner_step, force_reinvert,
                tag_diffuser, tag_lora, tag_scheduler,],
        outputs=[recon_output])

    gr.Examples(
        label='Examples',
        examples=get_iminvs_example(),
        fn=wrapper_obj.run_iminvs,
        inputs=[image_input, prompt,],
        outputs=[recon_output],
        cache_examples=cache_examples,)


def interface_imedit(wrapper_obj):
    with gr.Row():
        image_input = gr.Image(label="Image input", type='pil', elem_id='customized_imbox')
        edited_output = gr.Gallery(label="Edited output", elem_id='customized_imbox')
    with gr.Row():
        with gr.Column():
            prompt_0 = gr.Textbox(label='Source Text', lines=1, placeholder="Source prompt...", )
            prompt_1 = gr.Textbox(label='Target Text', lines=1, placeholder="Target prompt...", )
            with gr.Row():
                button_run = gr.Button("Run")

        with gr.Column():
            with gr.Row():
                inversion = auto_dropdown('Inversion', choices.inversion, default.inversion)
                inner_step = gr.Slider(label="Inner Step (NTI)", value=default.nullinv_inner_step, minimum=1, maximum=10, step=1)
                force_reinvert = gr.Checkbox(label="Force ReInvert (NTI)", value=False)
                threshold = gr.Slider(label="Threshold", minimum=0, maximum=1, value=default.threshold, step=0.1)
            with gr.Accordion('Advanced Settings', open=False):
                with gr.Row():
                    tag_diffuser = auto_dropdown('Diffuser', choices.diffuser, default.diffuser)
                    tag_lora = auto_dropdown('Use LoRA', choices.lora, default.lora)
                    tag_scheduler = auto_dropdown('Scheduler', choices.scheduler, default.scheduler)
                with gr.Row():
                    cfg_scale = gr.Number(label="Scale", minimum=1, maximum=10, value=default.cfg_scale, step=0.5)
                    step = gr.Number(default.step, label="Step", precision=0)
                with gr.Row():
                    force_resize = gr.Checkbox(label="Force Resize", value=True)
                    inp_width = gr.Slider(label="Width", minimum=256, maximum=1024, value=512, step=64)
                    inp_height = gr.Slider(label="Height", minimum=256, maximum=1024, value=512, step=64)


    tag_diffuser.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_lora.change(
        wrapper_obj.load_all,
        inputs = [tag_diffuser, tag_lora, tag_scheduler],
        outputs = [tag_diffuser, tag_lora, tag_scheduler],)

    tag_scheduler.change(
        wrapper_obj.load_scheduler,
        inputs = [tag_scheduler],
        outputs = [tag_scheduler],)

    button_run.click(
        wrapper_obj.run_imedit,
        inputs=[image_input, prompt_0, prompt_1,
                threshold, cfg_scale, step,
                force_resize, inp_width, inp_height,
                inversion, inner_step, force_reinvert,
                tag_diffuser, tag_lora, tag_scheduler,],
        outputs=[edited_output])

    gr.Examples(
        label='Examples',
        examples=get_imedit_example(),
        fn=wrapper_obj.run_imedit,
        inputs=[image_input, prompt_0, prompt_1, threshold,],
        outputs=[edited_output],
        cache_examples=cache_examples,)


#############
# Interface #
#############

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--port', type=int, default=None)
    args = parser.parse_args()
    from app_utils import css_empty, css_version_4_11_0
    # css = css_empty
    css = css_version_4_11_0

    wrapper_obj = wrapper(
        fp16=False,
        tag_diffuser=default.diffuser,
        tag_lora=default.lora,
        tag_scheduler=default.scheduler)

    if True:
        with gr.Blocks(css=css) as demo:
            gr.HTML(
                """
                <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
                <h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
                {}
                </h1>
                </div>
                """.format(version))

            with gr.Tab('Image Interpolation'):
                interface_imintp(wrapper_obj)
            with gr.Tab('Image Inversion'):
                interface_iminvs(wrapper_obj)
            with gr.Tab('Image Editing'):
                interface_imedit(wrapper_obj)

        demo.launch()
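For a quick, standalone sanity check of the lerp/slerp interpolation helpers defined in app.py above, a minimal sketch could look like the following. This is illustrative only and not part of the commit; the vectors are made up, and note that importing app also runs its module-level setup, which resolves the Smooth-LoRA weights via hf_hub_download and therefore requires the Space's dependencies and network access.

import numpy as np
from app import lerp, slerp  # helpers defined in app.py above

rng = np.random.default_rng(0)
v0 = rng.standard_normal(4 * 64 * 64)  # illustrative latent-sized vectors
v1 = rng.standard_normal(4 * 64 * 64)

ts = np.linspace(0.0, 1.0, 5)
linear_path = lerp(ts, v0, v1)      # list of 5 linearly blended vectors
spherical_path = slerp(ts, v0, v1)  # list of 5 spherically blended vectors

# Both schedules reproduce the endpoints at t=0 and t=1.
assert np.allclose(linear_path[0], v0) and np.allclose(linear_path[-1], v1)
assert np.allclose(spherical_path[0], v0) and np.allclose(spherical_path[-1], v1)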
app_utils.py
ADDED
@@ -0,0 +1,102 @@
import os
import os.path as osp
import cv2
import numpy as np
import numpy.random as npr
import torch
import torch.nn.functional as F
import torchvision.transforms as tvtrans
import PIL.Image
from tqdm import tqdm
from PIL import Image
import copy
import json
from collections import OrderedDict

#######
# css #
#######

css_empty = ""

css_version_4_11_0 = """
#customized_imbox {
    min-height: 450px;
    max-height: 450px;
}
#customized_imbox>div[data-testid="image"] {
    min-height: 450px;
}
#customized_imbox>div[data-testid="image"]>span[data-testid="source-select"] {
    max-height: 0px;
}
#customized_imbox>div[data-testid="image"]>span[data-testid="source-select"]>button {
    max-height: 0px;
}
#customized_imbox>div[data-testid="image"]>div.upload-container>div.image-frame>img {
    position: absolute;
    top: 50%;
    left: 50%;
    transform: translateX(-50%) translateY(-50%);
    width: unset;
    height: unset;
    max-height: 450px;
}
#customized_imbox>div.unpadded_box {
    min-height: 450px;
}
#myinst {
    font-size: 0.8rem;
    margin: 0rem;
    color: #6B7280;
}
#maskinst {
    text-align: justify;
    min-width: 1200px;
}
#maskinst>img {
    min-width:399px;
    max-width:450px;
    vertical-align: top;
    display: inline-block;
}
#maskinst:after {
    content: "";
    width: 100%;
    display: inline-block;
}
"""

##########
# helper #
##########

def highlight_print(info):
    print('')
    print(''.join(['#']*(len(info)+4)))
    print('# '+info+' #')
    print(''.join(['#']*(len(info)+4)))
    print('')

def auto_dropdown(name, choices_od, value):
    import gradio as gr
    option_list = [pi for pi in choices_od.keys()]
    return gr.Dropdown(label=name, choices=option_list, value=value)

def load_sd_from_file(target):
    if osp.splitext(target)[-1] == '.ckpt':
        sd = torch.load(target, map_location='cpu')['state_dict']
    elif osp.splitext(target)[-1] == '.pth':
        sd = torch.load(target, map_location='cpu')
    elif osp.splitext(target)[-1] == '.safetensors':
        from safetensors.torch import load_file as stload
        sd = OrderedDict(stload(target, device='cpu'))
    else:
        assert False, "File type must be .ckpt or .pth or .safetensors"
    return sd

def torch_to_numpy(x):
    return x.detach().to('cpu').numpy()

if __name__ == '__main__':
    pass
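app_utils.load_sd_from_file is not referenced by app.py in this commit; a hypothetical usage sketch is shown below (the checkpoint path is a placeholder for illustration, not a file shipped with the Space).

from app_utils import load_sd_from_file

# Placeholder path; the loader dispatches on .ckpt, .pth and .safetensors extensions.
state_dict = load_sd_from_file('checkpoints/example_model.safetensors')
print('loaded', len(state_dict), 'tensors')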
assets/.DS_Store
ADDED
Binary file (6.15 kB)

assets/images/.DS_Store
ADDED
Binary file (6.15 kB)
assets/images/editing/banana.png
ADDED
assets/images/editing/cake.png
ADDED
assets/images/editing/rabbit.png
ADDED
assets/images/interpolation/church1.png
ADDED
assets/images/interpolation/church2.png
ADDED
assets/images/interpolation/dog1.png
ADDED
assets/images/interpolation/dog2.png
ADDED
assets/images/interpolation/horse1.png
ADDED
assets/images/interpolation/horse2.png
ADDED
assets/images/interpolation/land1.png
ADDED
assets/images/interpolation/land2.png
ADDED
assets/images/interpolation/rabbit1.png
ADDED
assets/images/interpolation/rabbit2.png
ADDED
assets/images/interpolation/woman1.png
ADDED
assets/images/interpolation/woman2.png
ADDED
assets/images/inversion/000000029596.jpg
ADDED
assets/images/inversion/000000560011.jpg
ADDED
nulltxtinv_wrapper.py
ADDED
@@ -0,0 +1,450 @@
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
import PIL.Image
|
4 |
+
from tqdm import tqdm
|
5 |
+
from typing import Optional, Union, List
|
6 |
+
import warnings
|
7 |
+
warnings.filterwarnings('ignore')
|
8 |
+
|
9 |
+
from torch.optim.adam import Adam
|
10 |
+
import torch.nn.functional as nnf
|
11 |
+
|
12 |
+
from diffusers import DDIMScheduler
|
13 |
+
|
14 |
+
##########
|
15 |
+
# helper #
|
16 |
+
##########
|
17 |
+
|
18 |
+
def diffusion_step(model, latents, context, t, guidance_scale, low_resource=False):
|
19 |
+
if low_resource:
|
20 |
+
noise_pred_uncond = model.unet(latents, t, encoder_hidden_states=context[0])["sample"]
|
21 |
+
noise_prediction_text = model.unet(latents, t, encoder_hidden_states=context[1])["sample"]
|
22 |
+
else:
|
23 |
+
latents_input = torch.cat([latents] * 2)
|
24 |
+
noise_pred = model.unet(latents_input, t, encoder_hidden_states=context)["sample"]
|
25 |
+
noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
|
26 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
|
27 |
+
latents = model.scheduler.step(noise_pred, t, latents)["prev_sample"]
|
28 |
+
return latents
|
29 |
+
|
30 |
+
def image2latent(vae, image):
|
31 |
+
with torch.no_grad():
|
32 |
+
if isinstance(image, PIL.Image.Image):
|
33 |
+
image = np.array(image)
|
34 |
+
if isinstance(image, np.ndarray):
|
35 |
+
dtype = next(vae.parameters()).dtype
|
36 |
+
device = next(vae.parameters()).device
|
37 |
+
image = torch.from_numpy(image).float() / 127.5 - 1
|
38 |
+
image = image.permute(2, 0, 1).unsqueeze(0).to(device=device, dtype=dtype)
|
39 |
+
latents = vae.encode(image)['latent_dist'].mean
|
40 |
+
latents = latents * 0.18215
|
41 |
+
return latents
|
42 |
+
|
43 |
+
def latent2image(vae, latents, return_type='np'):
|
44 |
+
assert isinstance(latents, torch.Tensor)
|
45 |
+
latents = 1 / 0.18215 * latents.detach()
|
46 |
+
image = vae.decode(latents)['sample']
|
47 |
+
if return_type in ['np', 'pil']:
|
48 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
49 |
+
image = image.cpu().permute(0, 2, 3, 1).numpy()
|
50 |
+
image = (image * 255).astype(np.uint8)
|
51 |
+
if return_type == 'pil':
|
52 |
+
pilim = [PIL.Image.fromarray(imi) for imi in image]
|
53 |
+
pilim = pilim[0] if len(pilim)==1 else pilim
|
54 |
+
return pilim
|
55 |
+
else:
|
56 |
+
return image
|
57 |
+
|
58 |
+
def init_latent(latent, model, height, width, generator, batch_size):
|
59 |
+
if latent is None:
|
60 |
+
latent = torch.randn(
|
61 |
+
(1, model.unet.in_channels, height // 8, width // 8),
|
62 |
+
generator=generator,
|
63 |
+
)
|
64 |
+
latents = latent.expand(batch_size, model.unet.in_channels, height // 8, width // 8).to(model.device)
|
65 |
+
return latent, latents
|
66 |
+
|
67 |
+
def txt_to_emb(model, prompt):
|
68 |
+
text_input = model.tokenizer(
|
69 |
+
prompt,
|
70 |
+
padding="max_length",
|
71 |
+
max_length=model.tokenizer.model_max_length,
|
72 |
+
truncation=True,
|
73 |
+
return_tensors="pt",)
|
74 |
+
text_embeddings = model.text_encoder(text_input.input_ids.to(model.device))[0]
|
75 |
+
return text_embeddings
|
76 |
+
|
77 |
+
@torch.no_grad()
|
78 |
+
def text2image_ldm(
|
79 |
+
model,
|
80 |
+
prompt: List[str],
|
81 |
+
num_inference_steps: int = 50,
|
82 |
+
guidance_scale: Optional[float] = 7.5,
|
83 |
+
generator: Optional[torch.Generator] = None,
|
84 |
+
latent: Optional[torch.FloatTensor] = None,
|
85 |
+
uncond_embeddings=None,
|
86 |
+
start_time=50,
|
87 |
+
return_type='pil', ):
|
88 |
+
|
89 |
+
batch_size = len(prompt)
|
90 |
+
height = width = 512
|
91 |
+
if latent is not None:
|
92 |
+
height = latent.shape[-2] * 8
|
93 |
+
width = latent.shape[-1] * 8
|
94 |
+
|
95 |
+
text_input = model.tokenizer(
|
96 |
+
prompt,
|
97 |
+
padding="max_length",
|
98 |
+
max_length=model.tokenizer.model_max_length,
|
99 |
+
truncation=True,
|
100 |
+
return_tensors="pt",)
|
101 |
+
text_embeddings = model.text_encoder(text_input.input_ids.to(model.device))[0]
|
102 |
+
max_length = text_input.input_ids.shape[-1]
|
103 |
+
if uncond_embeddings is None:
|
104 |
+
uncond_input = model.tokenizer(
|
105 |
+
[""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt",)
|
106 |
+
uncond_embeddings_ = model.text_encoder(uncond_input.input_ids.to(model.device))[0]
|
107 |
+
else:
|
108 |
+
uncond_embeddings_ = None
|
109 |
+
|
110 |
+
latent, latents = init_latent(latent, model, height, width, generator, batch_size)
|
111 |
+
model.scheduler.set_timesteps(num_inference_steps)
|
112 |
+
for i, t in enumerate(tqdm(model.scheduler.timesteps[-start_time:])):
|
113 |
+
if uncond_embeddings_ is None:
|
114 |
+
context = torch.cat([uncond_embeddings[i].expand(*text_embeddings.shape), text_embeddings])
|
115 |
+
else:
|
116 |
+
context = torch.cat([uncond_embeddings_, text_embeddings])
|
117 |
+
latents = diffusion_step(model, latents, context, t, guidance_scale, low_resource=False)
|
118 |
+
|
119 |
+
if return_type in ['pil', 'np']:
|
120 |
+
image = latent2image(model.vae, latents, return_type=return_type)
|
121 |
+
else:
|
122 |
+
image = latents
|
123 |
+
return image, latent
|
124 |
+
|
125 |
@torch.no_grad()
def text2image_ldm_imedit(
        model,
        thresh,
        prompt: List[str],
        target_prompt: List[str],
        num_inference_steps: int = 50,
        guidance_scale: Optional[float] = 7.5,
        generator: Optional[torch.Generator] = None,
        latent: Optional[torch.FloatTensor] = None,
        uncond_embeddings=None,
        start_time=50,
        return_type='pil',
):
    # Same sampling loop as text2image_ldm, but the conditioning is switched from
    # `prompt` to `target_prompt` once the step index reaches
    # (1 - thresh) * num_inference_steps, which is what realizes the image edit.
    batch_size = len(prompt)
    height = width = 512

    text_input = model.tokenizer(
        prompt,
        padding="max_length",
        max_length=model.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    target_text_input = model.tokenizer(
        target_prompt,
        padding="max_length",
        max_length=model.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    text_embeddings = model.text_encoder(text_input.input_ids.to(model.device))[0]
    target_text_embeddings = model.text_encoder(target_text_input.input_ids.to(model.device))[0]

    max_length = text_input.input_ids.shape[-1]
    if uncond_embeddings is None:
        uncond_input = model.tokenizer(
            [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
        )
        uncond_embeddings_ = model.text_encoder(uncond_input.input_ids.to(model.device))[0]
    else:
        uncond_embeddings_ = None

    latent, latents = init_latent(latent, model, height, width, generator, batch_size)
    model.scheduler.set_timesteps(num_inference_steps)
    for i, t in enumerate(tqdm(model.scheduler.timesteps[-start_time:])):
        if i < (1 - thresh) * num_inference_steps:
            if uncond_embeddings_ is None:
                context = torch.cat([uncond_embeddings[i].expand(*text_embeddings.shape), text_embeddings])
            else:
                context = torch.cat([uncond_embeddings_, text_embeddings])
            latents = diffusion_step(model, latents, context, t, guidance_scale, low_resource=False)
        else:
            if uncond_embeddings_ is None:
                context = torch.cat([uncond_embeddings[i].expand(*target_text_embeddings.shape), target_text_embeddings])
            else:
                context = torch.cat([uncond_embeddings_, target_text_embeddings])
            latents = diffusion_step(model, latents, context, t, guidance_scale, low_resource=False)

    if return_type in ['pil', 'np']:
        image = latent2image(model.vae, latents, return_type=return_type)
    else:
        image = latents
    return image, latent

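The only difference from text2image_ldm is the prompt switch controlled by thresh: the first (1 - thresh) fraction of the denoising steps is conditioned on the source prompt and the remaining thresh fraction on the target prompt. A minimal editorial sketch of that arithmetic, with hypothetical values:

# Editorial sketch: where text2image_ldm_imedit switches prompts for given settings.
num_inference_steps = 50
thresh = 0.3
switch_step = int((1 - thresh) * num_inference_steps)  # steps with index >= 35 use the target prompt
print(f"source prompt for steps 0..{switch_step - 1}, "
      f"target prompt for steps {switch_step}..{num_inference_steps - 1}")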
###########
# wrapper #
###########

class NullInversion(object):
    def __init__(self, model, num_ddim_steps, guidance_scale, device='cuda'):
        self.model = model
        self.device = device
        self.num_ddim_steps = num_ddim_steps
        self.guidance_scale = guidance_scale
        self.tokenizer = self.model.tokenizer
        self.prompt = None
        self.context = None

    def prev_step(self, model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, sample: Union[torch.FloatTensor, np.ndarray]):
        # One deterministic DDIM denoising step (t -> t_prev).
        prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
        beta_prod_t = 1 - alpha_prod_t
        pred_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
        pred_sample_direction = (1 - alpha_prod_t_prev) ** 0.5 * model_output
        prev_sample = alpha_prod_t_prev ** 0.5 * pred_original_sample + pred_sample_direction
        return prev_sample

    def next_step(self, noise_pred, timestep, sample):
        # One inverse DDIM step (t -> t_next), moving the latent toward higher noise.
        timestep, next_timestep = min(timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps, 999), timestep
        alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
        alpha_prod_t_next = self.scheduler.alphas_cumprod[next_timestep]
        beta_prod_t = 1 - alpha_prod_t
        next_original_sample = (sample - beta_prod_t ** 0.5 * noise_pred) / alpha_prod_t ** 0.5
        next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * noise_pred
        next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction
        return next_sample

    def get_noise_pred_single(self, latents, t, context):
        noise_pred = self.model.unet(latents, t, encoder_hidden_states=context)["sample"]
        return noise_pred

    def get_noise_pred(self, latents, t, is_forward=True, context=None):
        # Classifier-free-guidance noise prediction followed by one DDIM step
        # (inverse step when is_forward=True, denoising step otherwise).
        latents_input = torch.cat([latents] * 2)
        if context is None:
            context = self.context
        guidance_scale = 1 if is_forward else self.guidance_scale
        noise_pred = self.model.unet(latents_input, t, encoder_hidden_states=context)["sample"]
        noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
        if is_forward:
            latents = self.next_step(noise_pred, t, latents)
        else:
            latents = self.prev_step(noise_pred, t, latents)
        return latents

    @torch.no_grad()
    def init_prompt(self, prompt: str):
        uncond_input = self.model.tokenizer(
            [""], padding="max_length", max_length=self.model.tokenizer.model_max_length,
            return_tensors="pt"
        )
        uncond_embeddings = self.model.text_encoder(uncond_input.input_ids.to(self.model.device))[0]
        text_input = self.model.tokenizer(
            [prompt],
            padding="max_length",
            max_length=self.model.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_embeddings = self.model.text_encoder(text_input.input_ids.to(self.model.device))[0]
        self.context = torch.cat([uncond_embeddings, text_embeddings])
        self.prompt = prompt

    @torch.no_grad()
    def ddim_loop(self, latent, emb):
        # Run DDIM inversion from the clean latent up to x_T, keeping every intermediate latent.
        all_latent = [latent]
        latent = latent.clone().detach()
        for i in range(self.num_ddim_steps):
            t = self.model.scheduler.timesteps[len(self.model.scheduler.timesteps) - i - 1]
            noise_pred = self.get_noise_pred_single(latent, t, emb)
            latent = self.next_step(noise_pred, t, latent)
            all_latent.append(latent)
        return all_latent

    @property
    def scheduler(self):
        return self.model.scheduler

    @torch.no_grad()
    def ddim_invert(self, image, prompt):
        assert isinstance(image, PIL.Image.Image)

        scheduler_save = self.model.scheduler
        self.model.scheduler = DDIMScheduler.from_config(self.model.scheduler.config)
        self.model.scheduler.set_timesteps(self.num_ddim_steps)

        with torch.no_grad():
            emb = txt_to_emb(self.model, prompt)
            latent = image2latent(self.model.vae, image)
            ddim_latents = self.ddim_loop(latent, emb)

        self.model.scheduler = scheduler_save
        return ddim_latents[-1]

    def null_optimization(self, latents, emb, nemb=None, num_inner_steps=10, epsilon=1e-5):
        # Optimize one null-text (unconditional) embedding per timestep so that the
        # guided denoising trajectory matches the recorded DDIM-inversion trajectory.
        # force fp32
        dtype = latents[0].dtype
        uncond_embeddings = nemb.float() if nemb is not None else txt_to_emb(self.model, "").float()
        cond_embeddings = emb.float()
        latents = [li.float() for li in latents]
        self.model.unet.to(torch.float32)

        uncond_embeddings_list = []
        latent_cur = latents[-1]
        bar = tqdm(total=num_inner_steps * self.num_ddim_steps)
        for i in range(self.num_ddim_steps):
            uncond_embeddings = uncond_embeddings.clone().detach()
            uncond_embeddings.requires_grad = True
            optimizer = Adam([uncond_embeddings], lr=1e-2 * (1. - i / 100.))
            latent_prev = latents[len(latents) - i - 2]
            t = self.model.scheduler.timesteps[i]
            with torch.no_grad():
                noise_pred_cond = self.get_noise_pred_single(latent_cur, t, cond_embeddings)
            for j in range(num_inner_steps):
                noise_pred_uncond = self.get_noise_pred_single(latent_cur, t, uncond_embeddings)
                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
                latents_prev_rec = self.prev_step(noise_pred, t, latent_cur)
                loss = nnf.mse_loss(latents_prev_rec, latent_prev)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                loss_item = loss.item()
                bar.update()
                if loss_item < epsilon + i * 2e-5:
                    break
            for j in range(j + 1, num_inner_steps):
                bar.update()
            uncond_embeddings_list.append(uncond_embeddings[:1].detach())
            with torch.no_grad():
                context = torch.cat([uncond_embeddings, cond_embeddings])
                latent_cur = self.get_noise_pred(latent_cur, t, False, context)
        bar.close()

        uncond_embeddings_list = [ui.to(dtype) for ui in uncond_embeddings_list]
        self.model.unet.to(dtype)
        return uncond_embeddings_list

    def null_invert(self, im, txt, ntxt=None, num_inner_steps=10, early_stop_epsilon=1e-5):
        assert isinstance(im, PIL.Image.Image)

        scheduler_save = self.model.scheduler
        self.model.scheduler = DDIMScheduler.from_config(self.model.scheduler.config)
        self.model.scheduler.set_timesteps(self.num_ddim_steps)

        with torch.no_grad():
            nemb = txt_to_emb(self.model, ntxt) \
                if ntxt is not None else txt_to_emb(self.model, "")
            emb = txt_to_emb(self.model, txt)
            latent = image2latent(self.model.vae, im)

        # ddim inversion
        ddim_latents = self.ddim_loop(latent, emb)
        # nulltext inversion
        uncond_embeddings = self.null_optimization(
            ddim_latents, emb, nemb, num_inner_steps, early_stop_epsilon)

        self.model.scheduler = scheduler_save
        return ddim_latents[-1], uncond_embeddings

    def null_optimization_dual(
            self, latents0, latents1, emb0, emb1, nemb=None,
            num_inner_steps=10, epsilon=1e-5):
        # Same as null_optimization, but a single shared null-text embedding is optimized
        # against two inversion trajectories at once (used when two images must share one
        # null embedding, e.g. for interpolation between them).

        # force fp32
        dtype = latents0[0].dtype
        uncond_embeddings = nemb.float() if nemb is not None else txt_to_emb(self.model, "").float()
        cond_embeddings0, cond_embeddings1 = emb0.float(), emb1.float()
        latents0 = [li.float() for li in latents0]
        latents1 = [li.float() for li in latents1]
        self.model.unet.to(torch.float32)

        uncond_embeddings_list = []
        latent_cur0 = latents0[-1]
        latent_cur1 = latents1[-1]

        bar = tqdm(total=num_inner_steps * self.num_ddim_steps)
        for i in range(self.num_ddim_steps):
            uncond_embeddings = uncond_embeddings.clone().detach()
            uncond_embeddings.requires_grad = True
            optimizer = Adam([uncond_embeddings], lr=1e-2 * (1. - i / 100.))

            latent_prev0 = latents0[len(latents0) - i - 2]
            latent_prev1 = latents1[len(latents1) - i - 2]

            t = self.model.scheduler.timesteps[i]
            with torch.no_grad():
                noise_pred_cond0 = self.get_noise_pred_single(latent_cur0, t, cond_embeddings0)
                noise_pred_cond1 = self.get_noise_pred_single(latent_cur1, t, cond_embeddings1)
            for j in range(num_inner_steps):
                noise_pred_uncond0 = self.get_noise_pred_single(latent_cur0, t, uncond_embeddings)
                noise_pred_uncond1 = self.get_noise_pred_single(latent_cur1, t, uncond_embeddings)

                noise_pred0 = noise_pred_uncond0 + self.guidance_scale * (noise_pred_cond0 - noise_pred_uncond0)
                noise_pred1 = noise_pred_uncond1 + self.guidance_scale * (noise_pred_cond1 - noise_pred_uncond1)

                latents_prev_rec0 = self.prev_step(noise_pred0, t, latent_cur0)
                latents_prev_rec1 = self.prev_step(noise_pred1, t, latent_cur1)

                loss = nnf.mse_loss(latents_prev_rec0, latent_prev0) + \
                    nnf.mse_loss(latents_prev_rec1, latent_prev1)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                loss_item = loss.item()
                bar.update()
                if loss_item < epsilon + i * 2e-5:
                    break
            for j in range(j + 1, num_inner_steps):
                bar.update()
            uncond_embeddings_list.append(uncond_embeddings[:1].detach())

            with torch.no_grad():
                context0 = torch.cat([uncond_embeddings, cond_embeddings0])
                context1 = torch.cat([uncond_embeddings, cond_embeddings1])
                latent_cur0 = self.get_noise_pred(latent_cur0, t, False, context0)
                latent_cur1 = self.get_noise_pred(latent_cur1, t, False, context1)

        bar.close()

        uncond_embeddings_list = [ui.to(dtype) for ui in uncond_embeddings_list]
        self.model.unet.to(dtype)
        return uncond_embeddings_list

    def null_invert_dual(
            self, im0, im1, txt0, txt1, ntxt=None,
            num_inner_steps=10, early_stop_epsilon=1e-5):
        assert isinstance(im0, PIL.Image.Image)
        assert isinstance(im1, PIL.Image.Image)

        scheduler_save = self.model.scheduler
        self.model.scheduler = DDIMScheduler.from_config(self.model.scheduler.config)
        self.model.scheduler.set_timesteps(self.num_ddim_steps)

        with torch.no_grad():
            nemb = txt_to_emb(self.model, ntxt) \
                if ntxt is not None else txt_to_emb(self.model, "")
            latent0 = image2latent(self.model.vae, im0)
            latent1 = image2latent(self.model.vae, im1)
            emb0 = txt_to_emb(self.model, txt0)
            emb1 = txt_to_emb(self.model, txt1)

        # ddim inversion
        ddim_latents_0 = self.ddim_loop(latent0, emb0)
        ddim_latents_1 = self.ddim_loop(latent1, emb1)

        # nulltext inversion
        nembs = self.null_optimization_dual(
            ddim_latents_0, ddim_latents_1, emb0, emb1, nemb, num_inner_steps, early_stop_epsilon)

        self.model.scheduler = scheduler_save
        return ddim_latents_0[-1], ddim_latents_1[-1], nembs

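Taken together, a typical round trip with this wrapper is: encode the image to a latent, run DDIM inversion up to x_T, optimize one null-text embedding per timestep, then resample with text2image_ldm. The snippet below is an editorial usage sketch, not code from this Space: the model id, image path, and caption are placeholders, and it assumes a CUDA device with enough memory for Stable Diffusion.

# Editorial usage sketch for NullInversion + text2image_ldm (placeholders throughout).
import PIL.Image
import torch
from diffusers import DDIMScheduler, StableDiffusionPipeline

# Hypothetical checkpoint; any Stable Diffusion v1.x pipeline should fit this wrapper.
ldm_stable = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to("cuda")
# Use a DDIM scheduler for sampling so it matches the DDIM inversion.
ldm_stable.scheduler = DDIMScheduler.from_config(ldm_stable.scheduler.config)

num_ddim_steps, guidance_scale = 50, 7.5
inverter = NullInversion(ldm_stable, num_ddim_steps, guidance_scale)

image = PIL.Image.open("example.png").convert("RGB").resize((512, 512))  # placeholder path
prompt = "a photo of a dog"  # placeholder caption describing the input image

# DDIM inversion followed by per-step null-text optimization.
x_T, uncond_embeddings = inverter.null_invert(image, prompt, num_inner_steps=10)

# Reconstruct the image (or edit it by passing a different prompt) from the inverted latent.
recon, _ = text2image_ldm(
    ldm_stable, [prompt],
    num_inference_steps=num_ddim_steps,
    guidance_scale=guidance_scale,
    latent=x_T,
    uncond_embeddings=uncond_embeddings,
)

null_invert_dual follows the same pattern for a pair of images: it returns two inverted latents plus a single shared list of null embeddings, which is what keeps latent-space interpolation between the two inputs consistent.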
requirements.txt
ADDED
@@ -0,0 +1,16 @@
accelerate==0.20.3
bitsandbytes==0.42.0
datasets==2.14.4
diffusers==0.20.1
easydict==1.11
gradio==4.19.2
huggingface_hub==0.19.3
moviepy==1.0.3
opencv_python==4.7.0.72
packaging==23.2
pypatchify==0.1.4
safetensors==0.3.1
tqdm==4.65.0
transformers==4.30.1
wandb==0.16.3
xformers==0.0.17
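These pins are the versions the Space was built against; assuming a compatible Python environment, they can be installed in one step:

pip install -r requirements.txt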