cbensimon (HF Staff) committed
Commit d70d883 · Parent: bd3cfcb
Files changed (4)
  1. aoti.py +17 -0
  2. app.py +9 -1
  3. fa3.py +115 -0
  4. requirements.txt +2 -1
aoti.py ADDED
@@ -0,0 +1,17 @@
+ """
+ """
+
+ import torch
+ from huggingface_hub import hf_hub_download
+ from spaces.zero.torch.aoti import ZeroGPUCompiledModel
+ from spaces.zero.torch.aoti import ZeroGPUWeights
+
+
+ def aoti_load(module: torch.nn.Module, repo_id: str):
+     repeated_blocks = module._repeated_blocks
+     aoti_files = {name: hf_hub_download(repo_id, f'{name}.pt2') for name in repeated_blocks}
+     for block_name, aoti_file in aoti_files.items():
+         for block in module.modules():
+             if block.__class__.__name__ == block_name:
+                 weights = ZeroGPUWeights(block.state_dict())
+                 block.forward = ZeroGPUCompiledModel(aoti_file, weights)
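
For context (not part of the commit): a minimal sketch of how `aoti_load` is meant to be called. It assumes a diffusers `FluxPipeline` whose transformer lists its repeated block classes in `_repeated_blocks`; the checkpoint name is illustrative, while the artifact repo `zerogpu-aoti/FLUX.1` is the one app.py below uses:

```python
import torch
from diffusers import FluxPipeline

from aoti import aoti_load

# Hypothetical checkpoint; any Flux pipeline with a matching transformer would do
pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

# Fuse QKV first so each block's state_dict matches the weights the .pt2 graphs expect
pipe.transformer.fuse_qkv_projections()
aoti_load(pipe.transformer, "zerogpu-aoti/FLUX.1")  # replaces each repeated block's forward
```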
app.py CHANGED
@@ -1,9 +1,11 @@
+ import os
+ os.system("pip install --upgrade spaces")
+
  import sys
  sys.path.append('./')

  import gradio as gr
  import spaces
- import os
  import sys
  import subprocess
  import numpy as np
@@ -57,6 +59,12 @@ canny = CannyDetector()
  anyline = AnylineDetector.from_pretrained("TheMistoAI/MistoLine", filename="MTEED.pth", subfolder="Anyline")
  open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")

+ import fa3
+ from aoti import aoti_load
+
+ pipe.transformer.fuse_qkv_projections()
+ aoti_load(pipe.transformer, 'zerogpu-aoti/FLUX.1')
+
  def convert_from_image_to_cv2(img: Image) -> np.ndarray:
      return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

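Ordering in the hunks above matters: `import fa3` registers the `flash::flash_attn_func` custom op that the AOT-compiled graphs presumably reference, so it has to happen before `aoti_load` deserializes them, and `fuse_qkv_projections()` has to precede `aoti_load` so that the `ZeroGPUWeights` captured from each block's state dict are the fused ones. A quick hypothetical sanity check of the op registration:

```python
import torch
import fa3  # the custom op is registered as a side effect of the import

# After the import, the op resolves through the torch.ops namespace
assert hasattr(torch.ops.flash, "flash_attn_func")
```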
fa3.py ADDED
@@ -0,0 +1,115 @@
+ """
+ """
+
+ import torch
+ from kernels import get_kernel
+
+
+ _flash_attn_func = get_kernel("kernels-community/vllm-flash-attn3").flash_attn_func
+
+
+ @torch.library.custom_op("flash::flash_attn_func", mutates_args=())
+ def flash_attn_func(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+     outputs, lse = _flash_attn_func(q, k, v)
+     return outputs
+
+ @flash_attn_func.register_fake
+ def _(q, k, v, **kwargs):
+     # two outputs:
+     # 1. output: (batch, seq_len, num_heads, head_dim)
+     # 2. softmax_lse: (batch, num_heads, seq_len) with dtype=torch.float32
+     meta_q = torch.empty_like(q).contiguous()
+     return meta_q #, q.new_empty((q.size(0), q.size(2), q.size(1)), dtype=torch.float32)
+
+ # Copied from FusedFluxAttnProcessor2_0 but using flash-attn v3 instead of SDPA
+ class FlashFusedFluxAttnProcessor3_0:
+     """Attention processor used typically in processing the SD3-like self-attention projections."""
+
+     def __call__(
+         self,
+         attn,
+         hidden_states: torch.FloatTensor,
+         encoder_hidden_states: torch.FloatTensor | None = None,
+         attention_mask: torch.FloatTensor | None = None,
+         image_rotary_emb: torch.Tensor | None = None,
+     ) -> torch.FloatTensor:
+         batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+
+         # `sample` projections.
+         qkv = attn.to_qkv(hidden_states)
+         split_size = qkv.shape[-1] // 3
+         query, key, value = torch.split(qkv, split_size, dim=-1)
+
+         inner_dim = key.shape[-1]
+         head_dim = inner_dim // attn.heads
+
+         query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+         key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+         value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+         if attn.norm_q is not None:
+             query = attn.norm_q(query)
+         if attn.norm_k is not None:
+             key = attn.norm_k(key)
+
+         # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
+         # `context` projections.
+         if encoder_hidden_states is not None:
+             encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
+             split_size = encoder_qkv.shape[-1] // 3
+             (
+                 encoder_hidden_states_query_proj,
+                 encoder_hidden_states_key_proj,
+                 encoder_hidden_states_value_proj,
+             ) = torch.split(encoder_qkv, split_size, dim=-1)
+
+             encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+                 batch_size, -1, attn.heads, head_dim
+             ).transpose(1, 2)
+             encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+                 batch_size, -1, attn.heads, head_dim
+             ).transpose(1, 2)
+             encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+                 batch_size, -1, attn.heads, head_dim
+             ).transpose(1, 2)
+
+             if attn.norm_added_q is not None:
+                 encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+             if attn.norm_added_k is not None:
+                 encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+             # attention
+             query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+             key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+             value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+
+         if image_rotary_emb is not None:
+             from diffusers.models.embeddings import apply_rotary_emb
+
+             query = apply_rotary_emb(query, image_rotary_emb)
+             key = apply_rotary_emb(key, image_rotary_emb)
+
+         # NB: flash-attn expects (batch, seq, heads, head_dim), hence the transposes; the custom op returns one tensor, so no [0] indexing
+         hidden_states = flash_attn_func(
+             query.transpose(1, 2),
+             key.transpose(1, 2),
+             value.transpose(1, 2)).transpose(1, 2)
+
+         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+         hidden_states = hidden_states.to(query.dtype)
+
+         if encoder_hidden_states is not None:
+             encoder_hidden_states, hidden_states = (
+                 hidden_states[:, : encoder_hidden_states.shape[1]],
+                 hidden_states[:, encoder_hidden_states.shape[1] :],
+             )
+
+             # linear proj
+             hidden_states = attn.to_out[0](hidden_states)
+             # dropout
+             hidden_states = attn.to_out[1](hidden_states)
+             encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+             return hidden_states, encoder_hidden_states
+         else:
+             return hidden_states
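
Note that app.py only imports this module; nothing there instantiates `FlashFusedFluxAttnProcessor3_0` (the compiled graphs call the registered op directly). For an eager, uncompiled run the processor could plausibly be attached by hand; a sketch assuming diffusers' standard `set_attn_processor` API and an already-built `pipe`:

```python
from fa3 import FlashFusedFluxAttnProcessor3_0

# The processor reads attn.to_qkv / attn.to_added_qkv, so fuse projections first
pipe.transformer.fuse_qkv_projections()
pipe.transformer.set_attn_processor(FlashFusedFluxAttnProcessor3_0())
```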
requirements.txt CHANGED
@@ -14,4 +14,5 @@ xformers
  sentencepiece
  peft
  scipy
- scikit-image
+ scikit-image
+ kernels