tetrisd committed
Commit 5780ef2 • 1 Parent(s): 31ccd79

Improve efficiency

app.py CHANGED
@@ -1,15 +1,159 @@
 import gradio as gr
-from transformers import pipeline
-
-pipeline = pipeline(task="image-classification", model="julien-c/hotdog-not-hotdog")
-
-def predict(image):
-    predictions = pipeline(image)
-    return {p["label"]: p["score"] for p in predictions}
-
-gr.Interface(
-    predict,
-    inputs=gr.inputs.Image(label="Upload hot dog candidate", type="filepath"),
-    outputs=gr.outputs.Label(num_top_classes=2),
-    title="Hot Dog? Or Not?",
-).launch()
 
+from threading import Lock
+import math
+import os
+import random
+
+from diffusers import StableDiffusionPipeline
+from diffusers.models.attention import get_global_heat_map, clear_heat_maps
+from matplotlib import pyplot as plt
 import gradio as gr
+import torch
+import torch.nn.functional as F
+import spacy
+
+
+if not os.environ.get('NO_DOWNLOAD_SPACY'):
+    spacy.cli.download('en_core_web_sm')
+
+
+model_id = "CompVis/stable-diffusion-v1-4"
+device = "cuda"
+
+gen = torch.Generator(device='cuda')
+gen.manual_seed(12758672)
+orig_state = gen.get_state()
+pipe = StableDiffusionPipeline.from_pretrained(model_id, use_auth_token=True).to(device)
+lock = Lock()
+nlp = spacy.load('en_core_web_sm')
+
+
+def expand_m(m, n: int = 1, o=512, mode='bicubic'):
+    m = m.unsqueeze(0).unsqueeze(0) / n
+    m = F.interpolate(m.float().detach(), size=(o, o), mode=mode, align_corners=False)
+    m = (m - m.min()) / (m.max() - m.min() + 1e-8)
+    m = m.cpu().detach()
+
+    return m
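For reference, `expand_m` averages the `n` accumulated subword maps, bicubic-upsamples the result to the output resolution `o`, and min-max normalizes it into [0, 1]. A minimal standalone sketch of the same transform on a dummy map (the 16×16 input shape is an assumption, not taken from the commit):

import torch
import torch.nn.functional as F

m = torch.rand(16, 16) * 2  # dummy heat map, as if summed over n = 2 subword tokens
n = 2

m = m.unsqueeze(0).unsqueeze(0) / n             # (1, 1, 16, 16); average the subword maps
m = F.interpolate(m, size=(512, 512), mode='bicubic', align_corners=False)
m = (m - m.min()) / (m.max() - m.min() + 1e-8)  # min-max normalize; eps guards flat maps

print(m.shape, float(m.min()), float(m.max()))  # torch.Size([1, 1, 512, 512]), 0.0, ~1.0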
+
+
+@torch.no_grad()
+def predict(prompt, inf_steps, threshold):
+    global lock
+    with torch.cuda.amp.autocast(), lock:
+        try:
+            plt.close('all')
+        except:
+            pass
+
+        gen.set_state(orig_state.clone())
+        clear_heat_maps()
+
+        out = pipe(prompt, guidance_scale=7.5, height=512, width=512, do_intermediates=False, generator=gen, num_inference_steps=int(inf_steps))
+        heat_maps = get_global_heat_map()
+
+    with torch.cuda.amp.autocast(dtype=torch.float32):
+        m = 0
+        n = 0
+        w = ''
+        w_idx = 0
+
+        fig, ax = plt.subplots()
+        ax.imshow(out.images[0].cpu().float().detach().permute(1, 2, 0).numpy())
+        ax.set_xticks([])
+        ax.set_yticks([])
+
+        fig1, axs1 = plt.subplots(math.ceil(len(out.words) / 4), 4)  # , figsize=(20, 20))
+        fig2, axs2 = plt.subplots(math.ceil(len(out.words) / 4), 4)  # , figsize=(20, 20))
+
+        for idx in range(len(out.words) + 1):
+            if idx == 0:
+                continue
+
+            word = out.words[idx - 1]
+            m += heat_maps[idx]
+            n += 1
+            w += word
+
+            if '</w>' not in word:
+                continue
+            else:
+                mplot = expand_m(m, n)
+                spotlit_im = out.images[0].cpu().float().detach()
+                w = w.replace('</w>', '')
+                spotlit_im2 = torch.cat((spotlit_im, (1 - mplot.squeeze(0)).pow(1)), dim=0)
+
+                if len(out.words) <= 4:
+                    a1 = axs1[w_idx % 4]
+                    a2 = axs2[w_idx % 4]
+                else:
+                    a1 = axs1[w_idx // 4, w_idx % 4]
+                    a2 = axs2[w_idx // 4, w_idx % 4]
+
+                a1.set_xticks([])
+                a1.set_yticks([])
+                a1.imshow(mplot.squeeze().numpy(), cmap='jet')
+                a1.imshow(spotlit_im2.permute(1, 2, 0).numpy())
+                a1.set_title(w)
+
+                mask = torch.ones_like(mplot)
+                mask[mplot < threshold * mplot.max()] = 0
+                im2 = spotlit_im * mask.squeeze(0)
+                a2.set_xticks([])
+                a2.set_yticks([])
+                a2.imshow(im2.permute(1, 2, 0).numpy())
+                a2.set_title(w)
+                m = 0
+                n = 0
+                w_idx += 1
+                w = ''
+
+        for idx in range(w_idx, len(axs1.flatten())):
+            fig1.delaxes(axs1.flatten()[idx])
+            fig2.delaxes(axs2.flatten()[idx])
+
+    return fig, fig1, fig2
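The accumulation loop above merges CLIP subword tokens back into whole words: maps are summed until a token carrying the `</w>` end-of-word marker arrives, then averaged through `expand_m` and plotted. A minimal sketch of that merging with hypothetical tokens and maps (index 0 is reserved for the start token, hence the 1-based offset):

import torch

words = ['research', 'er</w>', 'runs</w>']  # hypothetical tokenization: two pieces, then one whole word
heat_maps = {i + 1: torch.full((16, 16), float(i + 1)) for i in range(len(words))}

m, n, w, merged = 0, 0, '', []
for idx, word in enumerate(words, start=1):
    m = m + heat_maps[idx]  # accumulate subword maps
    n += 1
    w += word
    if '</w>' in word:      # end of word: emit the averaged map and reset
        merged.append((w.replace('</w>', ''), m / n))
        m, n, w = 0, 0, ''

print([word for word, _ in merged])  # ['researcher', 'runs']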
+
+
+def set_prompt(prompt):
+    return prompt
+
+
+with gr.Blocks() as demo:
+    md = '''# DAAM: Attention Maps for Interpreting Stable Diffusion
+Check out the paper: [What the DAAM: Interpreting Stable Diffusion Using Cross Attention](http://arxiv.org/abs/2210.04885).
+'''
+    gr.Markdown(md)
+
+    with gr.Row():
+        with gr.Column():
+            dropdown = gr.Dropdown([
+                'An angry, bald man doing research',
+                'Doing research at Comcast Applied AI labs',
+                'Professor Jimmy Lin from the University of Waterloo',
+                'Yann Lecun teaching machine learning on a chalkboard',
+                'A cat eating cake for her birthday',
+                'Steak and dollars on a plate',
+                'A fox, a dog, and a wolf in a field'
+            ], label='Examples', value='An angry, bald man doing research')
+
+            text = gr.Textbox(label='Prompt', value='An angry, bald man doing research')
+            slider1 = gr.Slider(15, 35, value=25, interactive=True, step=1, label='Inference steps')
+            slider2 = gr.Slider(0, 1.0, value=0.4, interactive=True, step=0.05, label='Threshold (tau)')
+            submit_btn = gr.Button('Submit')
+
+        with gr.Tab('Original Image'):
+            p0 = gr.Plot()
+
+        with gr.Tab('Soft DAAM Maps'):
+            p1 = gr.Plot()
+
+        with gr.Tab('Hard DAAM Maps'):
+            p2 = gr.Plot()
+
+    submit_btn.click(fn=predict, inputs=[text, slider1, slider2], outputs=[p0, p1, p2])
+    dropdown.change(set_prompt, dropdown, text)
+    dropdown.update()
+
+
+demo.launch()  # server_name='0.0.0.0', server_port=8080)
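The 'Hard DAAM Maps' tab binarizes each soft map at `tau` (the threshold slider) times the map's maximum, blacking out every pixel below the cutoff. A small sketch of that masking on dummy tensors:

import torch

image = torch.rand(3, 512, 512)     # stand-in generated image, channels first
mplot = torch.rand(1, 1, 512, 512)  # stand-in normalized heat map from expand_m
threshold = 0.4                     # the tau slider's default

mask = torch.ones_like(mplot)
mask[mplot < threshold * mplot.max()] = 0  # hard cutoff at tau * max
hard = image * mask.squeeze(0)             # (1, 512, 512) broadcasts over the channels

print(f"{(mask == 0).float().mean().item():.0%} of pixels blacked out")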
diffusers/models/attention.py CHANGED
@@ -324,12 +324,12 @@ class CrossAttention(nn.Module):
         for map_ in x:
             map_ = map_.unsqueeze(1).view(map_.size(0), 1, h, w)
             if method == 'bicubic':
-                map_ = F.interpolate(map_, size=(55, 55), mode="bicubic", align_corners=False)
                 maps.append(map_.squeeze(1))
             else:
                 maps.append(F.conv_transpose2d(map_, weight, stride=factor).squeeze(1).cpu())

-        maps = torch.stack(maps, 0).cpu()
         return maps

     def _attention(self, query, key, value, sequence_length, dim, use_context: bool = True):
@@ -347,7 +347,7 @@
         factor = int(math.sqrt(4096 // attn_slice.shape[1]))
         attn_slice = attn_slice.softmax(-1)

-        if use_context:
             if factor >= 1:
                 factor //= 1
                 maps = self._up_sample_attn(attn_slice, factor)
 
         for map_ in x:
             map_ = map_.unsqueeze(1).view(map_.size(0), 1, h, w)
             if method == 'bicubic':
+                map_ = F.interpolate(map_, size=(64, 64), mode="bicubic", align_corners=False)
                 maps.append(map_.squeeze(1))
             else:
                 maps.append(F.conv_transpose2d(map_, weight, stride=factor).squeeze(1).cpu())

+        maps = torch.stack(maps, 0).sum(1, keepdim=True).cpu()
         return maps

     def _attention(self, query, key, value, sequence_length, dim, use_context: bool = True):

         factor = int(math.sqrt(4096 // attn_slice.shape[1]))
         attn_slice = attn_slice.softmax(-1)

+        if use_context and attn_slice.shape[-1] == 77:
             if factor >= 1:
                 factor //= 1
                 maps = self._up_sample_attn(attn_slice, factor)
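Two of these edits carry the commit's efficiency gain. The `attn_slice.shape[-1] == 77` guard collects heat maps only from cross-attention over the prompt (77 is CLIP's fixed text length), skipping the far larger image self-attention maps; `sum(1, keepdim=True)` collapses what is presumably the per-head dimension before the stack is moved to the CPU. The new 64×64 interpolation target also matches the latent grid of a 512-pixel image (512 / 8). A rough sketch of the storage saving, with assumed shapes:

import torch

heads, h, w, tokens = 8, 64, 64, 77             # assumed: 8 heads, 64x64 maps, 77 text tokens
per_head = torch.rand(tokens, heads, h, w)      # hypothetical per-head maps for one layer

old_keep = per_head.cpu()                       # before: every head kept
new_keep = per_head.sum(1, keepdim=True).cpu()  # after: heads summed before storing

print(old_keep.numel() // new_keep.numel(), 'x fewer floats per layer')  # 8 x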
diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py CHANGED
@@ -346,4 +346,8 @@ class StableDiffusionPipeline(DiffusionPipeline):
         if not return_dict:
             return (image, has_nsfw_concept)

         return StableDiffusionPipelineOutput(images=gpu_image, pil_images=image, nsfw_content_detected=has_nsfw_concept, text_embeddings=text_embeddings, words=words, intermediates=inters)
 
         if not return_dict:
             return (image, has_nsfw_concept)

+        if any(has_nsfw_concept):
+            gpu_image.zero_()
+            image[0] = None
+
         return StableDiffusionPipelineOutput(images=gpu_image, pil_images=image, nsfw_content_detected=has_nsfw_concept, text_embeddings=text_embeddings, words=words, intermediates=inters)
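Together with the safety-checker change below, this guard blanks the GPU-side tensor and drops the PIL image in place whenever any output is flagged, presumably so the heat-map overlays cannot resurface flagged content. A sketch of its effect on stand-in values:

import torch

gpu_image = torch.rand(1, 3, 512, 512)  # stand-in for the decoded images kept on the GPU path
image = ['<PIL.Image>']                 # stand-in for the PIL image list
has_nsfw_concept = [True]               # as returned by the safety checker

if any(has_nsfw_concept):
    gpu_image.zero_()                   # blank the tensor in place
    image[0] = None                     # drop the PIL image

assert gpu_image.abs().sum() == 0 and image[0] is None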
diffusers/pipelines/stable_diffusion/safety_checker.py CHANGED
@@ -72,6 +72,7 @@ class StableDiffusionSafetyChecker(PreTrainedModel):
             images[idx] = np.zeros(images[idx].shape)  # black image

         if any(has_nsfw_concepts):
             logger.warning(
                 "Potential NSFW content was detected in one or more images. A black image will be returned instead."
                 " Try again with a different prompt and/or seed."
 
             images[idx] = np.zeros(images[idx].shape)  # black image

         if any(has_nsfw_concepts):
+            images = []
             logger.warning(
                 "Potential NSFW content was detected in one or more images. A black image will be returned instead."
                 " Try again with a different prompt and/or seed."
requirements.txt CHANGED
@@ -24,3 +24,6 @@ tensorboard
 torch>=1.4
 torchvision
 transformers>=4.21.0
 torch>=1.4
 torchvision
 transformers>=4.21.0
+spacy
+gradio
+ftfy