Bahjat Kawar committed
Commit 3f7ead4 · 1 Parent(s): aae0ff3
first commit
- README.md +1 -1
- app.py +35 -0
- requirements.txt +5 -0
- time_main.py +138 -0
- time_utils.py +105 -0
- train_funcs.py +65 -0
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: Editing Implicit Assumptions in Text-to-Image Diffusion Models
 emoji: 🐢
 colorFrom: red
 colorTo: red
app.py
ADDED
@@ -0,0 +1,35 @@
+import gradio as gr
+from time_main import edit_model, generate_for_text
+
+with gr.Blocks() as demo:
+    gr.Markdown("<center><h2>TIME: Text-to-Image Model Editing</h2>Demo for the paper <a href=\"https://time-diffusion.github.io/\" style=\"color:black;\">\"Editing Implicit Assumptions in Text-to-Image Diffusion Models\"</a>. Implemented with Stable Diffusion v1.4.</center>")
+
+    with gr.Box():
+        gr.Markdown("1. Edit a concept in a text-to-image model by specifying an under-specified \"source\" prompt, and a similar \"destination\" prompt with an additional specification.")
+        with gr.Row():
+            src = gr.Textbox(label = "Source Prompt", placeholder="e.g., A pack of roses")
+            dst = gr.Textbox(label = "Destination Prompt", placeholder="e.g., A pack of blue roses")
+        with gr.Row():
+            lamb_val = gr.Slider(value = 0.1, minimum=0.01, maximum=10000, label = "Strength of regularization (lambda)", interactive = True)
+        with gr.Row():
+            edit_btn = gr.Button("Edit Model")
+        with gr.Row():
+            gr.HTML(value = "<br />")
+        with gr.Row():
+            edit_status = gr.HTML(value="<b>Current model status:</b> Unedited")
+        edit_btn.click(fn=edit_model, inputs=[src, dst, lamb_val], outputs=edit_status)
+
+    with gr.Box():
+        gr.Markdown("2. After editing, try any test prompt and see the effect on the generated images!")
+        with gr.Row():
+            tst = gr.Textbox(label = "Test Prompt", placeholder="e.g., A field of roses")
+        with gr.Row():
+            gen_btn = gr.Button("Generate Image")
+        with gr.Row():
+            gr.HTML(value = "<br />")
+        with gr.Row():
+            out_img = gr.Image(label="Generated Image")
+
+        gen_btn.click(fn=generate_for_text, inputs=tst, outputs=out_img)
+
+demo.launch()
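Note: the two callbacks wired up above live in time_main.py; edit_model applies the closed-form edit to the cross-attention projections and generate_for_text samples from the (possibly edited) pipeline. For reference, the same flow without the Gradio UI would look roughly like the sketch below. It is a minimal illustration, not part of the Space, and it assumes that importing time_main (which loads Stable Diffusion at import time) succeeds on the target machine; the prompts are just the placeholder examples from the UI.

    from time_main import edit_model, generate_for_text

    # Edit the model: under-specified source prompt -> destination prompt with the extra attribute.
    status = edit_model("A pack of roses", "A pack of blue roses", lamb=0.1)
    print(status)  # HTML status string shown in the demo

    # Test the edit with a related prompt; returns a PIL image (via time_utils.view_images).
    image = generate_for_text("A field of roses")
    image.save("roses_after_edit.png")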
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+--extra-index-url https://download.pytorch.org/whl/cu113
+torch==1.13.1
+diffusers
+numpy
+Pillow
time_main.py
ADDED
@@ -0,0 +1,138 @@
+import torch
+from diffusers import StableDiffusionPipeline
+import numpy as np
+import abc
+import time_utils
+import copy
+import os
+from train_funcs import TRAIN_FUNC_DICT
+
+## get arguments for our script
+with_to_k = True
+with_augs = True
+train_func = "train_closed_form"
+
+### load model
+LOW_RESOURCE = True
+NUM_DIFFUSION_STEPS = 50
+GUIDANCE_SCALE = 7.5
+MAX_NUM_WORDS = 77
+device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+ldm_stable = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(device)
+tokenizer = ldm_stable.tokenizer
+
+### get layers
+ca_layers = []
+def append_ca(net_):
+    if net_.__class__.__name__ == 'CrossAttention':
+        ca_layers.append(net_)
+    elif hasattr(net_, 'children'):
+        for net__ in net_.children():
+            append_ca(net__)
+
+sub_nets = ldm_stable.unet.named_children()
+for net in sub_nets:
+    if "down" in net[0]:
+        append_ca(net[1])
+    elif "up" in net[0]:
+        append_ca(net[1])
+    elif "mid" in net[0]:
+        append_ca(net[1])
+
+### get projection matrices
+ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768]
+projection_matrices = [l.to_v for l in ca_clip_layers]
+og_matrices = [copy.deepcopy(l.to_v) for l in ca_clip_layers]
+if with_to_k:
+    projection_matrices = projection_matrices + [l.to_k for l in ca_clip_layers]
+    og_matrices = og_matrices + [copy.deepcopy(l.to_k) for l in ca_clip_layers]
+
+def edit_model(old_text_, new_text_, lamb=0.1):
+    #### restart LDM parameters
+    num_ca_clip_layers = len(ca_clip_layers)
+    for idx_, l in enumerate(ca_clip_layers):
+        l.to_v = copy.deepcopy(og_matrices[idx_])
+        projection_matrices[idx_] = l.to_v
+        if with_to_k:
+            l.to_k = copy.deepcopy(og_matrices[num_ca_clip_layers + idx_])
+            projection_matrices[num_ca_clip_layers + idx_] = l.to_k
+
+    try:
+        #### set up sentences
+        old_texts = [old_text_]
+        new_texts = [new_text_]
+        if with_augs:
+            base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:]
+            old_texts.append("A photo of " + base)
+            old_texts.append("An image of " + base)
+            old_texts.append("A picture of " + base)
+            base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:]
+            new_texts.append("A photo of " + base)
+            new_texts.append("An image of " + base)
+            new_texts.append("A picture of " + base)
+
+        #### prepare input k* and v*
+        old_embs, new_embs = [], []
+        for old_text, new_text in zip(old_texts, new_texts):
+            text_input = ldm_stable.tokenizer(
+                [old_text, new_text],
+                padding="max_length",
+                max_length=ldm_stable.tokenizer.model_max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            text_embeddings = ldm_stable.text_encoder(text_input.input_ids.to(ldm_stable.device))[0]
+            old_emb, new_emb = text_embeddings
+            old_embs.append(old_emb)
+            new_embs.append(new_emb)
+
+        #### identify corresponding destinations for each token in old_emb
+        idxs_replaces = []
+        for old_text, new_text in zip(old_texts, new_texts):
+            tokens_a = tokenizer(old_text).input_ids
+            tokens_b = tokenizer(new_text).input_ids
+            tokens_a = [tokenizer.encode("a ")[1] if tokenizer.decode(t) == 'an' else t for t in tokens_a]
+            tokens_b = [tokenizer.encode("a ")[1] if tokenizer.decode(t) == 'an' else t for t in tokens_b]
+            num_orig_tokens = len(tokens_a)
+            num_new_tokens = len(tokens_b)
+            idxs_replace = []
+            j = 0
+            for i in range(num_orig_tokens):
+                curr_token = tokens_a[i]
+                while tokens_b[j] != curr_token:
+                    j += 1
+                idxs_replace.append(j)
+                j += 1
+            while j < 77:
+                idxs_replace.append(j)
+                j += 1
+            while len(idxs_replace) < 77:
+                idxs_replace.append(76)
+            idxs_replaces.append(idxs_replace)
+
+        #### prepare batch: for each pair of sentences, old context and new values
+        contexts, valuess = [], []
+        for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces):
+            context = old_emb.detach()
+            values = []
+            with torch.no_grad():
+                for layer in projection_matrices:
+                    values.append(layer(new_emb[idxs_replace]).detach())
+            contexts.append(context)
+            valuess.append(values)
+
+        #### define training function
+        train = TRAIN_FUNC_DICT[train_func]
+
+        #### train the model
+        train(ldm_stable, projection_matrices, og_matrices, contexts, valuess, old_texts, new_texts, lamb=lamb)
+
+        return f"<b>Current model status:</b> Edited \"{old_text_}\" into \"{new_text_}\""
+    except:
+        return "<b>Current model status:</b> An error occurred"
+
+def generate_for_text(test_text):
+    g = torch.Generator(device='cpu')
+    g.seed()
+    images = time_utils.text2image_ldm_stable(ldm_stable, [test_text], latent=None, num_inference_steps=NUM_DIFFUSION_STEPS, guidance_scale=GUIDANCE_SCALE, generator=g, low_resource=LOW_RESOURCE)
+    return time_utils.view_images(images)
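Note on the index-matching loop in edit_model: it aligns every source-prompt token with its position in the destination prompt (skipping destination tokens that have no source counterpart, i.e. the added attribute), then pads the remaining positions, so the target values v* can be read off the destination embedding at the matching indices. A tiny self-contained sketch of that logic, using made-up token ID lists purely for illustration:

    # Hypothetical IDs for source "a pack of roses" vs. destination "a pack of blue roses".
    tokens_a = [101, 5, 71, 9, 42, 102]        # <start> a pack of roses <end>
    tokens_b = [101, 5, 71, 9, 33, 42, 102]    # <start> a pack of blue roses <end>

    idxs_replace, j = [], 0
    for curr_token in tokens_a:
        while tokens_b[j] != curr_token:   # skip "blue", which only exists in the destination
            j += 1
        idxs_replace.append(j)
        j += 1
    while j < 77:                          # pass the remaining (padded) positions through unchanged
        idxs_replace.append(j)
        j += 1
    while len(idxs_replace) < 77:
        idxs_replace.append(76)

    print(idxs_replace[:6])  # [0, 1, 2, 3, 5, 6]: "roses" now points at index 5, after "blue"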
time_utils.py
ADDED
@@ -0,0 +1,105 @@
+import numpy as np
+import torch
+from PIL import Image
+
+
+def view_images(images, num_rows=1, offset_ratio=0.02):
+    if type(images) is list:
+        num_empty = len(images) % num_rows
+    elif images.ndim == 4:
+        num_empty = images.shape[0] % num_rows
+    else:
+        images = [images]
+        num_empty = 0
+
+    empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
+    images = [image.astype(np.uint8) for image in images] + [empty_images] * num_empty
+    num_items = len(images)
+
+    h, w, c = images[0].shape
+    offset = int(h * offset_ratio)
+    num_cols = num_items // num_rows
+    image_ = np.ones((h * num_rows + offset * (num_rows - 1),
+                      w * num_cols + offset * (num_cols - 1), 3), dtype=np.uint8) * 255
+    for i in range(num_rows):
+        for j in range(num_cols):
+            image_[i * (h + offset): i * (h + offset) + h:, j * (w + offset): j * (w + offset) + w] = images[
+                i * num_cols + j]
+
+    pil_img = Image.fromarray(image_)
+    return pil_img
+
+
+def diffusion_step(model, latents, context, t, guidance_scale, low_resource=False):
+    if low_resource:
+        noise_pred_uncond = model.unet(latents, t, encoder_hidden_states=context[0])["sample"]
+        noise_prediction_text = model.unet(latents, t, encoder_hidden_states=context[1])["sample"]
+    else:
+        latents_input = torch.cat([latents] * 2)
+        noise_pred = model.unet(latents_input, t, encoder_hidden_states=context)["sample"]
+        noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
+    noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
+    latents = model.scheduler.step(noise_pred, t, latents)["prev_sample"]
+    return latents
+
+
+def latent2image(vae, latents):
+    latents = 1 / 0.18215 * latents
+    image = vae.decode(latents)['sample']
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.cpu().permute(0, 2, 3, 1).numpy()
+    image = (image * 255).astype(np.uint8)
+    return image
+
+
+def init_latent(latent, model, height, width, generator, batch_size):
+    if latent is None:
+        latent = torch.randn(
+            (1, model.unet.in_channels, height // 8, width // 8),
+            generator=generator,
+        )
+    latents = latent.expand(batch_size, model.unet.in_channels, height // 8, width // 8).to(model.device)
+    return latent, latents
+
+
+@torch.no_grad()
+def text2image_ldm_stable(
+    model,
+    prompt,
+    num_inference_steps = 50,
+    guidance_scale = 7.5,
+    generator = None,
+    latent = None,
+    low_resource = False,
+):
+    height = width = 512
+    batch_size = len(prompt)
+
+    text_input = model.tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=model.tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_embeddings = model.text_encoder(text_input.input_ids.to(model.device))[0]
+    max_length = text_input.input_ids.shape[-1]
+    uncond_input = model.tokenizer(
+        [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+    )
+    uncond_embeddings = model.text_encoder(uncond_input.input_ids.to(model.device))[0]
+
+    context = [uncond_embeddings, text_embeddings]
+    if not low_resource:
+        context = torch.cat(context)
+    latent, latents = init_latent(latent, model, height, width, generator, batch_size)
+
+    model.scheduler.set_timesteps(num_inference_steps)
+    for t in model.scheduler.timesteps:
+        latents = diffusion_step(model, latents, context, t, guidance_scale, low_resource)
+
+    image = latent2image(model.vae, latents)
+
+    image, _ = model.run_safety_checker(image=image, device=model.device, dtype=text_embeddings.dtype)
+
+    return image
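Note: diffusion_step is standard classifier-free guidance. The UNet predicts noise for the unconditional and the text-conditional embeddings (two separate passes when low_resource=True, one batched pass otherwise), and the two predictions are combined before the scheduler step. In LaTeX, with s the guidance_scale, c the text embedding and \varnothing the empty-prompt embedding:

    \hat{\epsilon}_t = \epsilon_\theta(z_t, \varnothing) + s \left( \epsilon_\theta(z_t, c) - \epsilon_\theta(z_t, \varnothing) \right)

which corresponds line-for-line to noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond); the scheduler then maps \hat{\epsilon}_t and z_t to z_{t-1}.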
train_funcs.py
ADDED
@@ -0,0 +1,65 @@
+import torch
+import numpy as np
+import ast
+
+"""
+TRAIN FUNCTION DEFINITION:
+train(model: StableDiffusionPipeline,
+      projection_matrices: list[size=L](nn.Module),
+      og_matrices: list[size=L](nn.Module),
+      contexts: list[size=N](torch.tensor[size=MAX_LEN,...]),
+      valuess: list[size=N](list[size=L](torch.tensor[size=MAX_LEN,...])),
+      old_texts: list[size=N](str),
+      new_texts: list[size=N](str),
+      **kwargs)
+where L is the number of matrices to edit, and N is the number of sentences to train on (batch size).
+
+PARAMS:
+model: the model to use.
+projection_matrices: list of projection matrices to edit from the model.
+og_matrices: list of original values for the projection matrices, detached from the model.
+contexts: list of context vectors (inputs to the matrices) to edit.
+valuess: list of results from all matrices for each context vector.
+old_texts: list of sentences to be edited.
+new_texts: list of target sentences to be aimed at.
+**kwargs: additional command line arguments.
+
+TRAIN_FUNC_DICT is defined at the bottom of the file.
+"""
+
+def baseline_train(model, projection_matrices, og_matrices, contexts, valuess, old_texts, new_texts):
+    return None
+
+def train_closed_form(ldm_stable, projection_matrices, og_matrices, contexts, valuess, old_texts,
+                      new_texts, layers_to_edit=None, lamb=0.1):
+    layers_to_edit = ast.literal_eval(layers_to_edit) if type(layers_to_edit) == str else layers_to_edit
+    lamb = ast.literal_eval(lamb) if type(lamb) == str else lamb
+
+    for layer_num in range(len(projection_matrices)):
+        if (layers_to_edit is not None) and (layer_num not in layers_to_edit):
+            continue
+
+        with torch.no_grad():
+            #mat1 = \lambda W + \sum{v k^T}
+            mat1 = lamb * projection_matrices[layer_num].weight
+
+            #mat2 = \lambda I + \sum{k k^T}
+            mat2 = lamb * torch.eye(projection_matrices[layer_num].weight.shape[1], device = projection_matrices[layer_num].weight.device)
+
+            #aggregate sums for mat1, mat2
+            for context, values in zip(contexts, valuess):
+                context_vector = context.reshape(context.shape[0], context.shape[1], 1)
+                context_vector_T = context.reshape(context.shape[0], 1, context.shape[1])
+                value_vector = values[layer_num].reshape(values[layer_num].shape[0], values[layer_num].shape[1], 1)
+                for_mat1 = (value_vector @ context_vector_T).sum(dim=0)
+                for_mat2 = (context_vector @ context_vector_T).sum(dim=0)
+                mat1 += for_mat1
+                mat2 += for_mat2
+
+            #update projection matrix
+            projection_matrices[layer_num].weight = torch.nn.Parameter(mat1 @ torch.inverse(mat2))
+
+TRAIN_FUNC_DICT = {
+    "baseline": baseline_train,
+    "train_closed_form": train_closed_form,
+}
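Note: train_closed_form implements its #mat1 / #mat2 comments literally. Writing k_i for the source context vectors (the rows of each context), v_i for the corresponding target values, and W_old for the original projection weight, the update is the closed-form minimizer of the ridge-regularized least-squares objective implied by those comments. A sketch of the algebra:

    \min_W \; \sum_i \| W k_i - v_i \|_2^2 + \lambda \| W - W_{\mathrm{old}} \|_F^2

Setting the gradient with respect to W to zero gives

    W \Big( \sum_i k_i k_i^\top + \lambda I \Big) = \sum_i v_i k_i^\top + \lambda W_{\mathrm{old}}
    \;\Longrightarrow\;
    W = \Big( \sum_i v_i k_i^\top + \lambda W_{\mathrm{old}} \Big) \Big( \sum_i k_i k_i^\top + \lambda I \Big)^{-1}

which is exactly mat1 @ torch.inverse(mat2) in the loop above; lamb trades off fidelity to the new values against staying close to the original weights.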