omer11a committed
Commit 056b358
1 Parent(s): 1127ecd

Improved memory requirements

Files changed (2):
  1. bounded_attention.py +36 -30
  2. injection_utils.py +15 -122
bounded_attention.py CHANGED
@@ -44,6 +44,7 @@ class BoundedAttention(injection_utils.AttentionBase):
         pca_rank=None,
         num_clusters=None,
         num_clusters_per_box=3,
+        max_resolution=32,
         map_dir=None,
         debug=False,
         delta_debug_attention_steps=20,
@@ -81,6 +82,7 @@ class BoundedAttention(injection_utils.AttentionBase):
         self.clustering = KMeans(n_clusters=num_clusters, num_init=100)
         self.centers = None

+        self.max_resolution = max_resolution
         self.map_dir = map_dir
         self.debug = debug
         self.delta_debug_attention_steps = delta_debug_attention_steps
@@ -124,24 +126,34 @@ class BoundedAttention(injection_utils.AttentionBase):
         self.clear_values(include_maps=True)
         super().reset()

-    def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
-        self._display_attention_maps(attn, is_cross, num_heads)
-
-        _, n, d = sim.shape
-        sim_u, sim_c = sim.reshape(-1, num_heads, n, d).chunk(2)  # b h n d
+    def forward(self, q, k, v, is_cross, place_in_unet, num_heads, **kwargs):
+        batch_size = q.size(0) // num_heads
+        n = q.size(1)
+        d = k.size(1)
+        dtype = q.dtype
+        device = q.device
         if is_cross:
-            sim_c = self._hide_other_subjects_from_tokens(sim_c)
+            masks = self._hide_other_subjects_from_tokens(batch_size // 2, n, d, dtype, device)
         else:
-            sim_u = self._hide_other_subjects_from_subjects(sim_u)
-            sim_c = self._hide_other_subjects_from_subjects(sim_c)
-
-        sim = torch.cat((sim_u, sim_c)).reshape(-1, n, d)
-        attn = sim.softmax(-1)
-        self._save(attn, is_cross, num_heads)
-        self._display_attention_maps(attn, is_cross, num_heads, prefix='masked')
-        self._debug_hook(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+            masks = self._hide_other_subjects_from_subjects(batch_size // 2, n, dtype, device)
+
+        if int(n ** 0.5) > self.max_resolution:
+            q = q.reshape(batch_size, num_heads, n, -1)
+            k = k.reshape(batch_size, num_heads, d, -1)
+            v = v.reshape(batch_size, num_heads, d, -1)
+            out = F.scaled_dot_product_attention(q, k, v, attn_mask=masks)
+            out = out.reshape(batch_size * num_heads, n, -1)
+        else:
+            sim = torch.einsum('b i d, b j d -> b i j', q, k) * kwargs['scale']
+            attn = sim.softmax(-1)
+            self._display_attention_maps(attn, is_cross, num_heads)
+            sim = sim.reshape(batch_size, num_heads, n, d) + masks
+            attn = sim.reshape(-1, n, d).softmax(-1)
+            self._save(attn, is_cross, num_heads)
+            self._display_attention_maps(attn, is_cross, num_heads, prefix='masked')
+            self._debug_hook(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+            out = torch.bmm(attn, v)

-        out = torch.bmm(attn, v)
         out = einops.rearrange(out, '(b h) n d -> b n (h d)', h=num_heads)
         return out

@@ -235,16 +247,13 @@ class BoundedAttention(injection_utils.AttentionBase):
         references = references.reshape(-1, *references_unconditional.shape[2:])
         return batch, references

-    def _hide_other_subjects_from_tokens(self, sim):  # b h i j
-        dtype = sim.dtype
-        device = sim.device
-        batch_size = sim.size(0)
-        resolution = int(sim.size(2) ** 0.5)
+    def _hide_other_subjects_from_tokens(self, batch_size, n, d, dtype, device):  # b h i j
+        resolution = int(n ** 0.5)
         subject_masks, background_masks = self._obtain_masks(resolution, batch_size=batch_size, device=device)  # b s n
         include_background = self.optimized or (not self.mask_cross_during_guidance and self.cur_step < self.max_guidance_iter_per_step)
         subject_masks = torch.logical_or(subject_masks, background_masks.unsqueeze(1)) if include_background else subject_masks
-        min_value = torch.finfo(sim.dtype).min
-        sim_masks = torch.zeros_like(sim[:, 0, :, :])  # b i j
+        min_value = torch.finfo(dtype).min
+        sim_masks = torch.zeros((batch_size, n, d), dtype=dtype, device=device)  # b i j
         for token_indices in (*self.subject_token_indices, self.filter_token_indices):
             sim_masks[:, :, token_indices] = min_value

@@ -257,16 +266,13 @@ class BoundedAttention(injection_utils.AttentionBase):
         for batch_index, background_mask in zip(range(batch_size), background_masks):
             sim_masks[batch_index, background_mask, self.eos_token_index] = min_value

-        return sim + sim_masks.unsqueeze(1)
+        return torch.cat((torch.zeros_like(sim_masks), sim_masks)).unsqueeze(1)

-    def _hide_other_subjects_from_subjects(self, sim):  # b h i j
-        dtype = sim.dtype
-        device = sim.device
-        batch_size = sim.size(0)
-        resolution = int(sim.size(2) ** 0.5)
+    def _hide_other_subjects_from_subjects(self, batch_size, n, dtype, device):  # b h i j
+        resolution = int(n ** 0.5)
         subject_masks, background_masks = self._obtain_masks(resolution, batch_size=batch_size, device=device)  # b s n
         min_value = torch.finfo(dtype).min
-        sim_masks = torch.zeros_like(sim[:, 0, :, :])  # b i j
+        sim_masks = torch.zeros((batch_size, n, n), dtype=dtype, device=device)  # b i j
         for batch_index, background_mask in zip(range(batch_size), background_masks):
             sim_masks[batch_index, ~background_mask, ~background_mask] = min_value

@@ -276,7 +282,7 @@ class BoundedAttention(injection_utils.AttentionBase):
             condition = torch.logical_or(subject_sim_mask == 0, subject_mask.unsqueeze(0))
             sim_masks[batch_index, subject_mask] = torch.where(condition, 0, min_value).to(dtype=dtype)

-        return sim + sim_masks.unsqueeze(1)
+        return torch.cat((sim_masks, sim_masks)).unsqueeze(1)

     def _save(self, attn, is_cross, num_heads):
         _, attn = attn.chunk(2)
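
For context on what the bounded_attention.py change buys: the old forward() received a fully materialized sim/attn pair and added its masks to it, so every layer held a (batch*heads, n, d) score matrix regardless of resolution. The rewritten forward() builds the additive masks directly at mask size and, above max_resolution, hands them to F.scaled_dot_product_attention, which never materializes that matrix. The snippet below is a minimal sketch, not code from the repository; the shapes and the default 1/sqrt(head_dim) scaling are illustrative assumptions. It only checks that the fused call with an additive float mask matches the explicit sim -> softmax -> bmm path the old code used.

# Minimal equivalence check (illustrative, not from this repo): SDPA with an
# additive float mask reproduces the explicit sim -> softmax -> bmm path.
import math
import torch
import torch.nn.functional as F

b, h, n, d_kv, e = 2, 8, 64 ** 2, 77, 40     # latent tokens x text tokens (assumed sizes)
q = torch.randn(b, h, n, e)
k = torch.randn(b, h, d_kv, e)
v = torch.randn(b, h, d_kv, e)

# Additive mask, broadcast over heads: a large negative value hides tokens.
masks = torch.zeros(b, 1, n, d_kv)
masks[..., -5:] = torch.finfo(masks.dtype).min

# Fused kernel: the (b, h, n, d_kv) attention matrix is never materialized.
out_fused = F.scaled_dot_product_attention(q, k, v, attn_mask=masks)

# Explicit path, as in the old forward(): sim is a full attention-sized tensor.
sim = torch.einsum('b h i e, b h j e -> b h i j', q, k) / math.sqrt(e)
out_explicit = (sim + masks).softmax(-1) @ v

assert torch.allclose(out_fused, out_explicit, atol=1e-4)

Below max_resolution the commit keeps the explicit branch, since _save() and _display_attention_maps() still need per-token attention maps at those layers.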
injection_utils.py CHANGED
@@ -22,21 +22,29 @@ class AttentionBase:
     def after_step(self):
         pass

-    def __call__(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
+    def __call__(self, q, k, v, is_cross, place_in_unet, num_heads, **kwargs):
         if self.cur_att_layer == 0:
             self.before_step()

-        out = self.forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
+        out = self.forward(q, k, v, is_cross, place_in_unet, num_heads, **kwargs)
         self.cur_att_layer += 1
         if self.cur_att_layer == self.num_att_layers:
             self.cur_att_layer = 0
             self.cur_step += 1
-            # after step
             self.after_step()
+
         return out

     def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
-        out = torch.einsum('b i j, b j d -> b i d', attn, v)
+        batch_size = q.size(0) // num_heads
+        n = q.size(1)
+        d = k.size(1)
+
+        q = q.reshape(batch_size, num_heads, n, -1)
+        k = k.reshape(batch_size, num_heads, d, -1)
+        v = v.reshape(batch_size, num_heads, d, -1)
+        out = F.scaled_dot_product_attention(q, k, v, attn_mask=kwargs['mask'])
+        out = out.reshape(batch_size * num_heads, n, -1)
         out = rearrange(out, '(b h) n d -> b n (h d)', h=num_heads)
         return out

@@ -45,42 +53,6 @@ class AttentionBase:
         self.cur_att_layer = 0


-class AttentionStore(AttentionBase):
-    def __init__(self, res=[32], min_step=0, max_step=1000):
-        super().__init__()
-        self.res = res
-        self.min_step = min_step
-        self.max_step = max_step
-        self.valid_steps = 0
-
-        self.self_attns = []  # store the all attns
-        self.cross_attns = []
-
-        self.self_attns_step = []  # store the attns in each step
-        self.cross_attns_step = []
-
-    def after_step(self):
-        if self.cur_step > self.min_step and self.cur_step < self.max_step:
-            self.valid_steps += 1
-            if len(self.self_attns) == 0:
-                self.self_attns = self.self_attns_step
-                self.cross_attns = self.cross_attns_step
-            else:
-                for i in range(len(self.self_attns)):
-                    self.self_attns[i] += self.self_attns_step[i]
-                    self.cross_attns[i] += self.cross_attns_step[i]
-        self.self_attns_step.clear()
-        self.cross_attns_step.clear()
-
-    def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
-        if attn.shape[1] <= 64 ** 2:  # avoid OOM
-            if is_cross:
-                self.cross_attns_step.append(attn)
-            else:
-                self.self_attns_step.append(attn)
-        return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
-
-
 def regiter_attention_editor_diffusers(model, editor: AttentionBase):
     """
     Register a attention editor to Diffuser Pipeline, refer from [Prompt-to-Prompt]
@@ -109,21 +81,9 @@ def regiter_attention_editor_diffusers(model, editor: AttentionBase):
             k = self.to_k(context)
             v = self.to_v(context)
             q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
-
-            sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
-
-            if mask is not None:
-                mask = rearrange(mask, 'b ... -> b (...)')
-                max_neg_value = -torch.finfo(sim.dtype).max
-                mask = repeat(mask, 'b j -> (b h) () j', h=h)
-                mask = mask[:, None, :].repeat(h, 1, 1)
-                sim.masked_fill_(~mask, max_neg_value)
-
-            attn = sim.softmax(dim=-1)
-            # the only difference
             out = editor(
-                q, k, v, sim, attn, is_cross, place_in_unet,
-                self.heads, scale=self.scale)
+                q, k, v, is_cross, place_in_unet,
+                self.heads, scale=self.scale, mask=mask)

             return to_out(out)

@@ -146,74 +106,7 @@ def regiter_attention_editor_diffusers(model, editor: AttentionBase):
             cross_att_count += register_editor(net, 0, "mid")
         elif "up" in net_name:
             cross_att_count += register_editor(net, 0, "up")
+
     editor.num_att_layers = cross_att_count
     editor.model = model
     model.editor = editor
-
-
-def regiter_attention_editor_ldm(model, editor: AttentionBase):
-    """
-    Register a attention editor to Stable Diffusion model, refer from [Prompt-to-Prompt]
-    """
-    def ca_forward(self, place_in_unet):
-        def forward(x, encoder_hidden_states=None, attention_mask=None, context=None, mask=None):
-            """
-            The attention is similar to the original implementation of LDM CrossAttention class
-            except adding some modifications on the attention
-            """
-            if encoder_hidden_states is not None:
-                context = encoder_hidden_states
-            if attention_mask is not None:
-                mask = attention_mask
-
-            to_out = self.to_out
-            if isinstance(to_out, nn.modules.container.ModuleList):
-                to_out = self.to_out[0]
-            else:
-                to_out = self.to_out
-
-            h = self.heads
-            q = self.to_q(x)
-            is_cross = context is not None
-            context = context if is_cross else x
-            k = self.to_k(context)
-            v = self.to_v(context)
-            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
-
-            sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
-
-            if mask is not None:
-                mask = rearrange(mask, 'b ... -> b (...)')
-                max_neg_value = -torch.finfo(sim.dtype).max
-                mask = repeat(mask, 'b j -> (b h) () j', h=h)
-                mask = mask[:, None, :].repeat(h, 1, 1)
-                sim.masked_fill_(~mask, max_neg_value)
-
-            attn = sim.softmax(dim=-1)
-            # the only difference
-            out = editor(
-                q, k, v, sim, attn, is_cross, place_in_unet,
-                self.heads, scale=self.scale)
-
-            return to_out(out)
-
-        return forward
-
-    def register_editor(net, count, place_in_unet):
-        for name, subnet in net.named_children():
-            if net.__class__.__name__ == 'CrossAttention':  # spatial Transformer layer
-                net.forward = ca_forward(net, place_in_unet)
-                return count + 1
-            elif hasattr(net, 'children'):
-                count = register_editor(subnet, count, place_in_unet)
-        return count
-
-    cross_att_count = 0
-    for net_name, net in model.model.diffusion_model.named_children():
-        if "input" in net_name:
-            cross_att_count += register_editor(net, 0, "input")
-        elif "middle" in net_name:
-            cross_att_count += register_editor(net, 0, "middle")
-        elif "output" in net_name:
-            cross_att_count += register_editor(net, 0, "output")
-    editor.num_att_layers = cross_att_count
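
A note on the mask=mask keyword now threaded through to the editor: the patched attention forward no longer pre-computes sim and attn, and the default AttentionBase.forward feeds the pipeline's mask straight into F.scaled_dot_product_attention. If that mask is the boolean padding mask the removed masked_fill_ branch suggests (True = attend), SDPA accepts it as-is; an additive float mask behaves equivalently. A minimal sketch with made-up shapes, not taken from the repository:

# Illustrative only: both mask flavors accepted by F.scaled_dot_product_attention.
import torch
import torch.nn.functional as F

b, h, n, d_kv, e = 1, 4, 16, 10, 8
q = torch.randn(b, h, n, e)
k = torch.randn(b, h, d_kv, e)
v = torch.randn(b, h, d_kv, e)

bool_mask = torch.ones(b, 1, 1, d_kv, dtype=torch.bool)
bool_mask[..., -3:] = False               # False = do not attend (e.g. padding)

float_mask = torch.zeros(b, 1, 1, d_kv)
float_mask[..., -3:] = float('-inf')      # additive equivalent

out_bool = F.scaled_dot_product_attention(q, k, v, attn_mask=bool_mask)
out_float = F.scaled_dot_product_attention(q, k, v, attn_mask=float_mask)
assert torch.allclose(out_bool, out_float, atol=1e-6)

BoundedAttention overrides forward() with its own float masks, so this default path mainly matters for plain AttentionBase editors registered through regiter_attention_editor_diffusers.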