.gitattributes CHANGED
@@ -33,13 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- celeb_meme.jpg filter=lfs diff=lfs merge=lfs -text
37
- cookie.png filter=lfs diff=lfs merge=lfs -text
38
- leather.jpg filter=lfs diff=lfs merge=lfs -text
39
- no_cookie.png filter=lfs diff=lfs merge=lfs -text
40
- poster_orig.jpg filter=lfs diff=lfs merge=lfs -text
41
- examples[[:space:]]2/celeb_meme.jpg filter=lfs diff=lfs merge=lfs -text
42
- examples[[:space:]]2/cookie.png filter=lfs diff=lfs merge=lfs -text
43
- examples[[:space:]]2/leather.jpg filter=lfs diff=lfs merge=lfs -text
44
- examples[[:space:]]2/no_cookie.png filter=lfs diff=lfs merge=lfs -text
45
- examples[[:space:]]2/poster_orig.jpg filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
__init__.cpython-310.pyc DELETED
Binary file (128 Bytes)
 
__init__.py DELETED
File without changes
app.py CHANGED
@@ -8,6 +8,7 @@ import spaces
8
  import time
9
  from pathlib import Path
10
 
 
11
  import gradio as gr
12
  import numpy as np
13
  import torch
@@ -60,7 +61,7 @@ def load_models(
60
  dit_path=None,
61
  ae_path=None,
62
  qwen2vl_model_path=None,
63
- device="cpu",
64
  max_length=256,
65
  dtype=torch.bfloat16,
66
  ):
@@ -117,7 +118,7 @@ class ImageGenerator:
117
  dit_path=None,
118
  ae_path=None,
119
  qwen2vl_model_path=None,
120
- device="cpu",
121
  max_length=640,
122
  dtype=torch.bfloat16,
123
  ) -> None:
@@ -134,9 +135,9 @@ class ImageGenerator:
134
  self.llm_encoder = self.llm_encoder.to(device=self.device, dtype=dtype)
135
 
136
  def to_cuda(self):
137
- self.ae.to(device='cpu', dtype=torch.float32)
138
- self.dit.to(device='cpu', dtype=torch.bfloat16)
139
- self.llm_encoder.to(device='cpu', dtype=torch.bfloat16)
140
 
141
  def prepare(self, prompt, img, ref_image, ref_image_raw):
142
  bs, _, h, w = img.shape
@@ -487,4 +488,5 @@ with gr.Blocks() as demo:
487
  fn=generate_examples,
488
  cache_examples=True
489
  )
 
490
  demo.launch()
 
8
  import time
9
  from pathlib import Path
10
 
11
+
12
  import gradio as gr
13
  import numpy as np
14
  import torch
 
61
  dit_path=None,
62
  ae_path=None,
63
  qwen2vl_model_path=None,
64
+ device="cuda",
65
  max_length=256,
66
  dtype=torch.bfloat16,
67
  ):
 
118
  dit_path=None,
119
  ae_path=None,
120
  qwen2vl_model_path=None,
121
+ device="cuda",
122
  max_length=640,
123
  dtype=torch.bfloat16,
124
  ) -> None:
 
135
  self.llm_encoder = self.llm_encoder.to(device=self.device, dtype=dtype)
136
 
137
  def to_cuda(self):
138
+ self.ae.to(device='cuda', dtype=torch.float32)
139
+ self.dit.to(device='cuda', dtype=torch.bfloat16)
140
+ self.llm_encoder.to(device='cuda', dtype=torch.bfloat16)
141
 
142
  def prepare(self, prompt, img, ref_image, ref_image_raw):
143
  bs, _, h, w = img.shape
 
488
  fn=generate_examples,
489
  cache_examples=True
490
  )
491
+
492
  demo.launch()
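
Note on the app.py change above: the default `device` flips from "cpu" to "cuda", and `to_cuda()` now actually moves the three sub-modules onto the GPU instead of pinning them to the CPU. The snippet below is a minimal, self-contained sketch of that placement pattern only; `DummyPipeline` and the `Identity` stand-ins are illustrative and not part of the Space.

import torch

class DummyPipeline:
    """Illustrative stand-in for ImageGenerator's device handling after this change."""
    def __init__(self, ae, dit, llm_encoder, device="cuda", dtype=torch.bfloat16):
        self.ae, self.dit, self.llm_encoder = ae, dit, llm_encoder
        self.device, self.dtype = device, dtype

    def to_cuda(self):
        # VAE stays in float32 for numerical stability; DiT and text encoder run in bfloat16.
        self.ae.to(device="cuda", dtype=torch.float32)
        self.dit.to(device="cuda", dtype=torch.bfloat16)
        self.llm_encoder.to(device="cuda", dtype=torch.bfloat16)

pipe = DummyPipeline(torch.nn.Identity(), torch.nn.Identity(), torch.nn.Identity())
if torch.cuda.is_available():
    pipe.to_cuda()
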
attention.cpython-310.pyc DELETED
Binary file (3.13 kB)
 
attention.py DELETED
@@ -1,133 +0,0 @@
1
- import math
2
-
3
- import torch
4
- import torch.nn.functional as F
5
-
6
-
7
- try:
8
- import flash_attn
9
- from flash_attn.flash_attn_interface import (
10
- _flash_attn_forward,
11
- flash_attn_func,
12
- flash_attn_varlen_func,
13
- )
14
- except ImportError:
15
- flash_attn = None
16
- flash_attn_varlen_func = None
17
- _flash_attn_forward = None
18
- flash_attn_func = None
19
-
20
- MEMORY_LAYOUT = {
21
- # "flash" mode:
22
- # preprocess: input is [batch_size, seq_len, num_heads, head_dim]
23
- # postprocess: shape is left unchanged
24
- "flash": (
25
- lambda x: x, # keep shape
26
- lambda x: x, # keep shape
27
- ),
28
- # "torch"/"vanilla" mode:
29
- # preprocess: swap the sequence and head dimensions [B,S,A,D] -> [B,A,S,D]
30
- # postprocess: swap back to the original order [B,A,S,D] -> [B,S,A,D]
31
- "torch": (
32
- lambda x: x.transpose(1, 2), # (B,S,A,D) -> (B,A,S,D)
33
- lambda x: x.transpose(1, 2), # (B,A,S,D) -> (B,S,A,D)
34
- ),
35
- "vanilla": (
36
- lambda x: x.transpose(1, 2),
37
- lambda x: x.transpose(1, 2),
38
- ),
39
- }
40
-
41
-
42
- def attention(
43
- q,
44
- k,
45
- v,
46
- mode="torch",
47
- drop_rate=0,
48
- attn_mask=None,
49
- causal=False,
50
- ):
51
- """
52
- Perform QKV self-attention.
53
-
54
- Args:
55
- q (torch.Tensor): query tensor of shape [batch_size, seq_len, num_heads, head_dim]
56
- k (torch.Tensor): key tensor of shape [batch_size, seq_len_kv, num_heads, head_dim]
57
- v (torch.Tensor): value tensor of shape [batch_size, seq_len_kv, num_heads, head_dim]
58
- mode (str): attention mode, one of 'flash', 'torch', 'vanilla'
59
- drop_rate (float): dropout probability applied to the attention matrix
60
- attn_mask (torch.Tensor): attention mask; its shape depends on the mode
61
- causal (bool): whether to use causal attention (attend only to earlier positions)
62
-
63
- Returns:
64
- torch.Tensor: attention output of shape [batch_size, seq_len, num_heads * head_dim]
65
- """
66
- # look up the preprocessing and postprocessing functions
67
- pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
68
-
69
- # apply the preprocessing transform
70
- q = pre_attn_layout(q) # shape depends on the mode
71
- k = pre_attn_layout(k)
72
- v = pre_attn_layout(v)
73
-
74
- if mode == "torch":
75
- # use PyTorch's native scaled_dot_product_attention
76
- if attn_mask is not None and attn_mask.dtype != torch.bool:
77
- attn_mask = attn_mask.to(q.dtype)
78
- x = F.scaled_dot_product_attention(
79
- q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
80
- )
81
- elif mode == "flash":
82
- assert flash_attn_func is not None, "flash_attn_func is not available"
83
- assert attn_mask is None, "attn_mask is not supported in flash mode"
84
- x: torch.Tensor = flash_attn_func(
85
- q, k, v, dropout_p=drop_rate, causal=causal, softmax_scale=None
86
- ) # type: ignore
87
- elif mode == "vanilla":
88
- # manual (reference) attention implementation
89
- scale_factor = 1 / math.sqrt(q.size(-1)) # scaling factor 1/sqrt(d_k)
90
-
91
- b, a, s, _ = q.shape # unpack shape parameters
92
- s1 = k.size(2) # key/value sequence length
93
-
94
- # initialize the attention bias
95
- attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
96
-
97
- # handle the causal mask
98
- if causal:
99
- assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
100
- # build a lower-triangular causal mask
101
- temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(
102
- diagonal=0
103
- )
104
- attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
105
- attn_bias = attn_bias.to(q.dtype)
106
-
107
- # handle a user-provided attention mask
108
- if attn_mask is not None:
109
- if attn_mask.dtype == torch.bool:
110
- attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
111
- else:
112
- attn_bias += attn_mask # allows ALiBi-style positional biases
113
-
114
- # compute the attention matrix
115
- attn = (q @ k.transpose(-2, -1)) * scale_factor # [B,A,S,S1]
116
- attn += attn_bias
117
-
118
- # softmax and dropout
119
- attn = attn.softmax(dim=-1)
120
- attn = torch.dropout(attn, p=drop_rate, train=True)
121
-
122
- # compute the output
123
- x = attn @ v # [B,A,S,D]
124
- else:
125
- raise NotImplementedError(f"Unsupported attention mode: {mode}")
126
-
127
- # apply the postprocessing transform
128
- x = post_attn_layout(x) # restore the original dimension order
129
-
130
- # merge the attention-head dimension
131
- b, s, a, d = x.shape
132
- out = x.reshape(b, s, -1) # [B,S,A*D]
133
- return out
 
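For reference, the deleted attention.py exposed a single attention() helper with three interchangeable backends ("flash", "torch", "vanilla") that all take [batch, seq, heads, head_dim] inputs and return [batch, seq, heads * head_dim]. A minimal usage sketch against that signature (shapes are arbitrary; "torch" mode needs only PyTorch, not flash-attn):

import torch
from attention import attention  # the module removed in this diff

B, S, H, D = 2, 16, 8, 64
q = torch.randn(B, S, H, D)
k = torch.randn(B, S, H, D)
v = torch.randn(B, S, H, D)

# "torch" mode transposes to [B, H, S, D], calls F.scaled_dot_product_attention,
# then merges the heads back into the last dimension.
out = attention(q, k, v, mode="torch", causal=False)
print(out.shape)  # torch.Size([2, 16, 512])
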
autoencoder.cpython-310.pyc DELETED
Binary file (8.78 kB)
 
autoencoder.py DELETED
@@ -1,326 +0,0 @@
1
- # Modified from Flux
2
- #
3
- # Copyright 2024 Black Forest Labs
4
-
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
-
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
-
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
- # This source code is licensed under the license found in the
18
- # LICENSE file in the root directory of this source tree.
19
- import torch
20
- from einops import rearrange
21
- from torch import Tensor, nn
22
-
23
-
24
- def swish(x: Tensor) -> Tensor:
25
- return x * torch.sigmoid(x)
26
-
27
-
28
- class AttnBlock(nn.Module):
29
- def __init__(self, in_channels: int):
30
- super().__init__()
31
- self.in_channels = in_channels
32
-
33
- self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
34
-
35
- self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
36
- self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
37
- self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
38
- self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
39
-
40
- def attention(self, h_: Tensor) -> Tensor:
41
- h_ = self.norm(h_)
42
- q = self.q(h_)
43
- k = self.k(h_)
44
- v = self.v(h_)
45
-
46
- b, c, h, w = q.shape
47
- q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
48
- k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
49
- v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
50
- h_ = nn.functional.scaled_dot_product_attention(q, k, v)
51
-
52
- return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
53
-
54
- def forward(self, x: Tensor) -> Tensor:
55
- return x + self.proj_out(self.attention(x))
56
-
57
-
58
- class ResnetBlock(nn.Module):
59
- def __init__(self, in_channels: int, out_channels: int):
60
- super().__init__()
61
- self.in_channels = in_channels
62
- out_channels = in_channels if out_channels is None else out_channels
63
- self.out_channels = out_channels
64
-
65
- self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
66
- self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
67
- self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
68
- self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
69
- if self.in_channels != self.out_channels:
70
- self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
71
-
72
- def forward(self, x):
73
- h = x
74
- h = self.norm1(h)
75
- h = swish(h)
76
- h = self.conv1(h)
77
-
78
- h = self.norm2(h)
79
- h = swish(h)
80
- h = self.conv2(h)
81
-
82
- if self.in_channels != self.out_channels:
83
- x = self.nin_shortcut(x)
84
-
85
- return x + h
86
-
87
-
88
- class Downsample(nn.Module):
89
- def __init__(self, in_channels: int):
90
- super().__init__()
91
- # no asymmetric padding in torch conv, must do it ourselves
92
- self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
93
-
94
- def forward(self, x: Tensor):
95
- pad = (0, 1, 0, 1)
96
- x = nn.functional.pad(x, pad, mode="constant", value=0)
97
- x = self.conv(x)
98
- return x
99
-
100
-
101
- class Upsample(nn.Module):
102
- def __init__(self, in_channels: int):
103
- super().__init__()
104
- self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
105
-
106
- def forward(self, x: Tensor):
107
- x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
108
- x = self.conv(x)
109
- return x
110
-
111
-
112
- class Encoder(nn.Module):
113
- def __init__(
114
- self,
115
- resolution: int,
116
- in_channels: int,
117
- ch: int,
118
- ch_mult: list[int],
119
- num_res_blocks: int,
120
- z_channels: int,
121
- ):
122
- super().__init__()
123
- self.ch = ch
124
- self.num_resolutions = len(ch_mult)
125
- self.num_res_blocks = num_res_blocks
126
- self.resolution = resolution
127
- self.in_channels = in_channels
128
- # downsampling
129
- self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
130
-
131
- curr_res = resolution
132
- in_ch_mult = (1, *tuple(ch_mult))
133
- self.in_ch_mult = in_ch_mult
134
- self.down = nn.ModuleList()
135
- block_in = self.ch
136
- for i_level in range(self.num_resolutions):
137
- block = nn.ModuleList()
138
- attn = nn.ModuleList()
139
- block_in = ch * in_ch_mult[i_level]
140
- block_out = ch * ch_mult[i_level]
141
- for _ in range(self.num_res_blocks):
142
- block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
143
- block_in = block_out
144
- down = nn.Module()
145
- down.block = block
146
- down.attn = attn
147
- if i_level != self.num_resolutions - 1:
148
- down.downsample = Downsample(block_in)
149
- curr_res = curr_res // 2
150
- self.down.append(down)
151
-
152
- # middle
153
- self.mid = nn.Module()
154
- self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
155
- self.mid.attn_1 = AttnBlock(block_in)
156
- self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
157
-
158
- # end
159
- self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
160
- self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
161
-
162
- def forward(self, x: Tensor) -> Tensor:
163
- # downsampling
164
- hs = [self.conv_in(x)]
165
- for i_level in range(self.num_resolutions):
166
- for i_block in range(self.num_res_blocks):
167
- h = self.down[i_level].block[i_block](hs[-1])
168
- if len(self.down[i_level].attn) > 0:
169
- h = self.down[i_level].attn[i_block](h)
170
- hs.append(h)
171
- if i_level != self.num_resolutions - 1:
172
- hs.append(self.down[i_level].downsample(hs[-1]))
173
-
174
- # middle
175
- h = hs[-1]
176
- h = self.mid.block_1(h)
177
- h = self.mid.attn_1(h)
178
- h = self.mid.block_2(h)
179
- # end
180
- h = self.norm_out(h)
181
- h = swish(h)
182
- h = self.conv_out(h)
183
- return h
184
-
185
-
186
- class Decoder(nn.Module):
187
- def __init__(
188
- self,
189
- ch: int,
190
- out_ch: int,
191
- ch_mult: list[int],
192
- num_res_blocks: int,
193
- in_channels: int,
194
- resolution: int,
195
- z_channels: int,
196
- ):
197
- super().__init__()
198
- self.ch = ch
199
- self.num_resolutions = len(ch_mult)
200
- self.num_res_blocks = num_res_blocks
201
- self.resolution = resolution
202
- self.in_channels = in_channels
203
- self.ffactor = 2 ** (self.num_resolutions - 1)
204
-
205
- # compute in_ch_mult, block_in and curr_res at lowest res
206
- block_in = ch * ch_mult[self.num_resolutions - 1]
207
- curr_res = resolution // 2 ** (self.num_resolutions - 1)
208
- self.z_shape = (1, z_channels, curr_res, curr_res)
209
-
210
- # z to block_in
211
- self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
212
-
213
- # middle
214
- self.mid = nn.Module()
215
- self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
216
- self.mid.attn_1 = AttnBlock(block_in)
217
- self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
218
-
219
- # upsampling
220
- self.up = nn.ModuleList()
221
- for i_level in reversed(range(self.num_resolutions)):
222
- block = nn.ModuleList()
223
- attn = nn.ModuleList()
224
- block_out = ch * ch_mult[i_level]
225
- for _ in range(self.num_res_blocks + 1):
226
- block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
227
- block_in = block_out
228
- up = nn.Module()
229
- up.block = block
230
- up.attn = attn
231
- if i_level != 0:
232
- up.upsample = Upsample(block_in)
233
- curr_res = curr_res * 2
234
- self.up.insert(0, up) # prepend to get consistent order
235
-
236
- # end
237
- self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
238
- self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
239
-
240
- def forward(self, z: Tensor) -> Tensor:
241
- # z to block_in
242
- h = self.conv_in(z)
243
-
244
- # middle
245
- h = self.mid.block_1(h)
246
- h = self.mid.attn_1(h)
247
- h = self.mid.block_2(h)
248
-
249
- # upsampling
250
- for i_level in reversed(range(self.num_resolutions)):
251
- for i_block in range(self.num_res_blocks + 1):
252
- h = self.up[i_level].block[i_block](h)
253
- if len(self.up[i_level].attn) > 0:
254
- h = self.up[i_level].attn[i_block](h)
255
- if i_level != 0:
256
- h = self.up[i_level].upsample(h)
257
-
258
- # end
259
- h = self.norm_out(h)
260
- h = swish(h)
261
- h = self.conv_out(h)
262
- return h
263
-
264
-
265
- class DiagonalGaussian(nn.Module):
266
- def __init__(self, sample: bool = True, chunk_dim: int = 1):
267
- super().__init__()
268
- self.sample = sample
269
- self.chunk_dim = chunk_dim
270
-
271
- def forward(self, z: Tensor) -> Tensor:
272
- mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
273
- if self.sample:
274
- std = torch.exp(0.5 * logvar)
275
- return mean + std * torch.randn_like(mean)
276
- else:
277
- return mean
278
-
279
-
280
- class AutoEncoder(nn.Module):
281
- def __init__(
282
- self,
283
- resolution: int,
284
- in_channels: int,
285
- ch: int,
286
- out_ch: int,
287
- ch_mult: list[int],
288
- num_res_blocks: int,
289
- z_channels: int,
290
- scale_factor: float,
291
- shift_factor: float,
292
- ):
293
- super().__init__()
294
- self.encoder = Encoder(
295
- resolution=resolution,
296
- in_channels=in_channels,
297
- ch=ch,
298
- ch_mult=ch_mult,
299
- num_res_blocks=num_res_blocks,
300
- z_channels=z_channels,
301
- )
302
- self.decoder = Decoder(
303
- resolution=resolution,
304
- in_channels=in_channels,
305
- ch=ch,
306
- out_ch=out_ch,
307
- ch_mult=ch_mult,
308
- num_res_blocks=num_res_blocks,
309
- z_channels=z_channels,
310
- )
311
- self.reg = DiagonalGaussian()
312
-
313
- self.scale_factor = scale_factor
314
- self.shift_factor = shift_factor
315
-
316
- def encode(self, x: Tensor) -> Tensor:
317
- z = self.reg(self.encoder(x))
318
- z = self.scale_factor * (z - self.shift_factor)
319
- return z
320
-
321
- def decode(self, z: Tensor) -> Tensor:
322
- z = z / self.scale_factor + self.shift_factor
323
- return self.decoder(z)
324
-
325
- def forward(self, x: Tensor) -> Tensor:
326
- return self.decode(self.encode(x))
 
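For reference, the deleted autoencoder.py was a Flux-style KL autoencoder: encode() samples a diagonal-Gaussian latent and applies the scale/shift, decode() inverts it. A round-trip sketch with illustrative hyperparameters (the actual checkpoint configuration may differ):

import torch
from autoencoder import AutoEncoder  # the module removed in this diff

ae = AutoEncoder(
    resolution=256, in_channels=3, ch=128, out_ch=3,
    ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16,
    scale_factor=0.3611, shift_factor=0.1159,  # illustrative values
)

x = torch.randn(1, 3, 256, 256)
z = ae.encode(x)      # latent [1, 16, 32, 32]; downsample factor is 2 ** (len(ch_mult) - 1)
x_rec = ae.decode(z)  # reconstruction [1, 3, 256, 256]
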
conditioner.cpython-310.pyc DELETED
Binary file (4.94 kB)
 
conditioner.py DELETED
@@ -1,216 +0,0 @@
1
- import torch
2
- from qwen_vl_utils import process_vision_info
3
- from transformers import (
4
- AutoProcessor,
5
- Qwen2VLForConditionalGeneration,
6
- Qwen2_5_VLForConditionalGeneration,
7
- )
8
- from torchvision.transforms import ToPILImage
9
-
10
- to_pil = ToPILImage()
11
-
12
- Qwen25VL_7b_PREFIX = '''Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:
13
- - If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.
14
- - If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.\n
15
- Here are examples of how to transform or refine prompts:
16
- - User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.
17
- - User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.\n
18
- Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:
19
- User Prompt:'''
20
-
21
-
22
- def split_string(s):
23
- # replace Chinese quotation marks with English ones
24
- s = s.replace("“", '"').replace("”", '"') # use english quotes
25
- result = []
26
- # flag marking whether we are inside quotes
27
- in_quotes = False
28
- temp = ""
29
-
30
- # iterate over every character in the string together with its index
31
- for idx, char in enumerate(s):
32
- # if the character is a quote and its index is greater than 155
33
- if char == '"' and idx > 155:
34
- # append the quote to the temporary string
35
- temp += char
36
- # if we are not currently inside quotes
37
- if not in_quotes:
38
- # append the temporary string to the result list
39
- result.append(temp)
40
- # reset the temporary string
41
- temp = ""
42
-
43
- # toggle the in-quotes state
44
- in_quotes = not in_quotes
45
- continue
46
- # if we are inside quotes
47
- if in_quotes:
48
- # if the character is whitespace
49
- if char.isspace():
50
- pass # have space token
51
-
52
- # wrap the character in Chinese quotes and append it to the result list
53
- result.append("“" + char + "”")
54
- else:
55
- # append the character to the temporary string
56
- temp += char
57
-
58
- # if the temporary string is not empty
59
- if temp:
60
- # append the temporary string to the result list
61
- result.append(temp)
62
-
63
- return result
64
-
65
-
66
- class Qwen25VL_7b_Embedder(torch.nn.Module):
67
- def __init__(self, model_path, max_length=640, dtype=torch.bfloat16, device="cuda"):
68
- super(Qwen25VL_7b_Embedder, self).__init__()
69
- self.max_length = max_length
70
- self.dtype = dtype
71
- self.device = device
72
-
73
- self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
74
- model_path,
75
- torch_dtype=dtype,
76
- attn_implementation="eager",
77
- ).to(torch.cuda.current_device())
78
-
79
- self.model.requires_grad_(False)
80
- self.processor = AutoProcessor.from_pretrained(
81
- model_path, min_pixels=256 * 28 * 28, max_pixels=324 * 28 * 28
82
- )
83
-
84
- self.prefix = Qwen25VL_7b_PREFIX
85
-
86
- def forward(self, caption, ref_images):
87
- text_list = caption
88
- embs = torch.zeros(
89
- len(text_list),
90
- self.max_length,
91
- self.model.config.hidden_size,
92
- dtype=torch.bfloat16,
93
- device=torch.cuda.current_device(),
94
- )
95
- hidden_states = torch.zeros(
96
- len(text_list),
97
- self.max_length,
98
- self.model.config.hidden_size,
99
- dtype=torch.bfloat16,
100
- device=torch.cuda.current_device(),
101
- )
102
- masks = torch.zeros(
103
- len(text_list),
104
- self.max_length,
105
- dtype=torch.long,
106
- device=torch.cuda.current_device(),
107
- )
108
- input_ids_list = []
109
- attention_mask_list = []
110
- emb_list = []
111
-
112
- def split_string(s):
113
- s = s.replace("“", '"').replace("”", '"').replace("'", '''"''') # use english quotes
114
- result = []
115
- in_quotes = False
116
- temp = ""
117
-
118
- for idx,char in enumerate(s):
119
- if char == '"' and idx>155:
120
- temp += char
121
- if not in_quotes:
122
- result.append(temp)
123
- temp = ""
124
-
125
- in_quotes = not in_quotes
126
- continue
127
- if in_quotes:
128
- if char.isspace():
129
- pass # have space token
130
-
131
- result.append("“" + char + "”")
132
- else:
133
- temp += char
134
-
135
- if temp:
136
- result.append(temp)
137
-
138
- return result
139
-
140
- for idx, (txt, imgs) in enumerate(zip(text_list, ref_images)):
141
-
142
- messages = [{"role": "user", "content": []}]
143
-
144
- messages[0]["content"].append({"type": "text", "text": f"{self.prefix}"})
145
-
146
- messages[0]["content"].append({"type": "image", "image": to_pil(imgs)})
147
-
148
- # then append the text
149
- messages[0]["content"].append({"type": "text", "text": f"{txt}"})
150
-
151
- # Preparation for inference
152
- text = self.processor.apply_chat_template(
153
- messages, tokenize=False, add_generation_prompt=True, add_vision_id=True
154
- )
155
-
156
- image_inputs, video_inputs = process_vision_info(messages)
157
-
158
- inputs = self.processor(
159
- text=[text],
160
- images=image_inputs,
161
- padding=True,
162
- return_tensors="pt",
163
- )
164
-
165
- old_inputs_ids = inputs.input_ids
166
- text_split_list = split_string(text)
167
-
168
- token_list = []
169
- for text_each in text_split_list:
170
- txt_inputs = self.processor(
171
- text=text_each,
172
- images=None,
173
- videos=None,
174
- padding=True,
175
- return_tensors="pt",
176
- )
177
- token_each = txt_inputs.input_ids
178
- if token_each[0][0] == 2073 and token_each[0][-1] == 854:
179
- token_each = token_each[:, 1:-1]
180
- token_list.append(token_each)
181
- else:
182
- token_list.append(token_each)
183
-
184
- new_txt_ids = torch.cat(token_list, dim=1).to("cuda")
185
-
186
- new_txt_ids = new_txt_ids.to(old_inputs_ids.device)
187
-
188
- idx1 = (old_inputs_ids == 151653).nonzero(as_tuple=True)[1][0]
189
- idx2 = (new_txt_ids == 151653).nonzero(as_tuple=True)[1][0]
190
- inputs.input_ids = (
191
- torch.cat([old_inputs_ids[0, :idx1], new_txt_ids[0, idx2:]], dim=0)
192
- .unsqueeze(0)
193
- .to("cuda")
194
- )
195
- inputs.attention_mask = (inputs.input_ids > 0).long().to("cuda")
196
- outputs = self.model(
197
- input_ids=inputs.input_ids,
198
- attention_mask=inputs.attention_mask,
199
- pixel_values=inputs.pixel_values.to("cuda"),
200
- image_grid_thw=inputs.image_grid_thw.to("cuda"),
201
- output_hidden_states=True,
202
- )
203
-
204
- emb = outputs["hidden_states"][-1]
205
-
206
- embs[idx, : min(self.max_length, emb.shape[1] - 217)] = emb[0, 217:][
207
- : self.max_length
208
- ]
209
-
210
- masks[idx, : min(self.max_length, emb.shape[1] - 217)] = torch.ones(
211
- (min(self.max_length, emb.shape[1] - 217)),
212
- dtype=torch.long,
213
- device=torch.cuda.current_device(),
214
- )
215
-
216
- return embs, masks
 
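For reference, the deleted conditioner.py wrapped Qwen2.5-VL-7B as the prompt encoder: each caption is prefixed with the prompt-enhancement instruction, paired with one reference image, and the last hidden states past the 217 prefix/vision tokens are packed into fixed-length embeddings plus a validity mask. A usage sketch against that interface (the model path is a placeholder; a CUDA device is required because the buffers are allocated on torch.cuda.current_device()):

import torch
from conditioner import Qwen25VL_7b_Embedder  # the module removed in this diff

embedder = Qwen25VL_7b_Embedder(
    "Qwen/Qwen2.5-VL-7B-Instruct",  # placeholder model path
    max_length=640, dtype=torch.bfloat16, device="cuda",
)

captions = ["Replace the cookie on the plate with a red apple"]
ref_images = [torch.rand(3, 448, 448)]  # one CHW tensor in [0, 1] per caption; converted to PIL internally

embs, masks = embedder(captions, ref_images)
# embs: [1, 640, hidden_size] bfloat16 prompt embeddings; masks: [1, 640] long validity mask
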
connector_edit.cpython-310.pyc DELETED
Binary file (11.8 kB)
 
connector_edit.py DELETED
@@ -1,486 +0,0 @@
1
- from typing import Optional
2
-
3
- import torch
4
- import torch.nn
5
- from einops import rearrange
6
- from torch import nn
7
-
8
- from .layers import MLP, TextProjection, TimestepEmbedder, apply_gate, attention
9
-
10
-
11
- class RMSNorm(nn.Module):
12
- def __init__(
13
- self,
14
- dim: int,
15
- elementwise_affine=True,
16
- eps: float = 1e-6,
17
- device=None,
18
- dtype=None,
19
- ):
20
- """
21
- Initialize the RMSNorm normalization layer.
22
-
23
- Args:
24
- dim (int): The dimension of the input tensor.
25
- eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
26
-
27
- Attributes:
28
- eps (float): A small value added to the denominator for numerical stability.
29
- weight (nn.Parameter): Learnable scaling parameter.
30
-
31
- """
32
- factory_kwargs = {"device": device, "dtype": dtype}
33
- super().__init__()
34
- self.eps = eps
35
- if elementwise_affine:
36
- self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
37
-
38
- def _norm(self, x):
39
- """
40
- Apply the RMSNorm normalization to the input tensor.
41
-
42
- Args:
43
- x (torch.Tensor): The input tensor.
44
-
45
- Returns:
46
- torch.Tensor: The normalized tensor.
47
-
48
- """
49
- return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
50
-
51
- def forward(self, x):
52
- """
53
- Forward pass through the RMSNorm layer.
54
-
55
- Args:
56
- x (torch.Tensor): The input tensor.
57
-
58
- Returns:
59
- torch.Tensor: The output tensor after applying RMSNorm.
60
-
61
- """
62
- output = self._norm(x.float()).type_as(x)
63
- if hasattr(self, "weight"):
64
- output = output * self.weight
65
- return output
66
-
67
-
68
- def get_norm_layer(norm_layer):
69
- """
70
- Get the normalization layer.
71
-
72
- Args:
73
- norm_layer (str): The type of normalization layer.
74
-
75
- Returns:
76
- norm_layer (nn.Module): The normalization layer.
77
- """
78
- if norm_layer == "layer":
79
- return nn.LayerNorm
80
- elif norm_layer == "rms":
81
- return RMSNorm
82
- else:
83
- raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
84
-
85
-
86
- def get_activation_layer(act_type):
87
- """get activation layer
88
-
89
- Args:
90
- act_type (str): the activation type
91
-
92
- Returns:
93
- torch.nn.functional: the activation layer
94
- """
95
- if act_type == "gelu":
96
- return lambda: nn.GELU()
97
- elif act_type == "gelu_tanh":
98
- return lambda: nn.GELU(approximate="tanh")
99
- elif act_type == "relu":
100
- return nn.ReLU
101
- elif act_type == "silu":
102
- return nn.SiLU
103
- else:
104
- raise ValueError(f"Unknown activation type: {act_type}")
105
-
106
- class IndividualTokenRefinerBlock(torch.nn.Module):
107
- def __init__(
108
- self,
109
- hidden_size,
110
- heads_num,
111
- mlp_width_ratio: str = 4.0,
112
- mlp_drop_rate: float = 0.0,
113
- act_type: str = "silu",
114
- qk_norm: bool = False,
115
- qk_norm_type: str = "layer",
116
- qkv_bias: bool = True,
117
- need_CA: bool = False,
118
- dtype: Optional[torch.dtype] = None,
119
- device: Optional[torch.device] = None,
120
- ):
121
- factory_kwargs = {"device": device, "dtype": dtype}
122
- super().__init__()
123
- self.need_CA = need_CA
124
- self.heads_num = heads_num
125
- head_dim = hidden_size // heads_num
126
- mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
127
-
128
- self.norm1 = nn.LayerNorm(
129
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
130
- )
131
- self.self_attn_qkv = nn.Linear(
132
- hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
133
- )
134
- qk_norm_layer = get_norm_layer(qk_norm_type)
135
- self.self_attn_q_norm = (
136
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
137
- if qk_norm
138
- else nn.Identity()
139
- )
140
- self.self_attn_k_norm = (
141
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
142
- if qk_norm
143
- else nn.Identity()
144
- )
145
- self.self_attn_proj = nn.Linear(
146
- hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
147
- )
148
-
149
- self.norm2 = nn.LayerNorm(
150
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
151
- )
152
- act_layer = get_activation_layer(act_type)
153
- self.mlp = MLP(
154
- in_channels=hidden_size,
155
- hidden_channels=mlp_hidden_dim,
156
- act_layer=act_layer,
157
- drop=mlp_drop_rate,
158
- **factory_kwargs,
159
- )
160
-
161
- self.adaLN_modulation = nn.Sequential(
162
- act_layer(),
163
- nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
164
- )
165
-
166
- if self.need_CA:
167
- self.cross_attnblock=CrossAttnBlock(hidden_size=hidden_size,
168
- heads_num=heads_num,
169
- mlp_width_ratio=mlp_width_ratio,
170
- mlp_drop_rate=mlp_drop_rate,
171
- act_type=act_type,
172
- qk_norm=qk_norm,
173
- qk_norm_type=qk_norm_type,
174
- qkv_bias=qkv_bias,
175
- **factory_kwargs,)
176
- # Zero-initialize the modulation
177
- nn.init.zeros_(self.adaLN_modulation[1].weight)
178
- nn.init.zeros_(self.adaLN_modulation[1].bias)
179
-
180
- def forward(
181
- self,
182
- x: torch.Tensor,
183
- c: torch.Tensor, # timestep_aware_representations + context_aware_representations
184
- attn_mask: torch.Tensor = None,
185
- y: torch.Tensor = None,
186
- ):
187
- gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
188
-
189
- norm_x = self.norm1(x)
190
- qkv = self.self_attn_qkv(norm_x)
191
- q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
192
- # Apply QK-Norm if needed
193
- q = self.self_attn_q_norm(q).to(v)
194
- k = self.self_attn_k_norm(k).to(v)
195
-
196
- # Self-Attention
197
- attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
198
-
199
- x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
200
-
201
- if self.need_CA:
202
- x = self.cross_attnblock(x, c, attn_mask, y)
203
-
204
- # FFN Layer
205
- x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)
206
-
207
- return x
208
-
209
-
210
-
211
-
212
- class CrossAttnBlock(torch.nn.Module):
213
- def __init__(
214
- self,
215
- hidden_size,
216
- heads_num,
217
- mlp_width_ratio: str = 4.0,
218
- mlp_drop_rate: float = 0.0,
219
- act_type: str = "silu",
220
- qk_norm: bool = False,
221
- qk_norm_type: str = "layer",
222
- qkv_bias: bool = True,
223
- dtype: Optional[torch.dtype] = None,
224
- device: Optional[torch.device] = None,
225
- ):
226
- factory_kwargs = {"device": device, "dtype": dtype}
227
- super().__init__()
228
- self.heads_num = heads_num
229
- head_dim = hidden_size // heads_num
230
-
231
- self.norm1 = nn.LayerNorm(
232
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
233
- )
234
- self.norm1_2 = nn.LayerNorm(
235
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
236
- )
237
- self.self_attn_q = nn.Linear(
238
- hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
239
- )
240
- self.self_attn_kv = nn.Linear(
241
- hidden_size, hidden_size*2, bias=qkv_bias, **factory_kwargs
242
- )
243
- qk_norm_layer = get_norm_layer(qk_norm_type)
244
- self.self_attn_q_norm = (
245
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
246
- if qk_norm
247
- else nn.Identity()
248
- )
249
- self.self_attn_k_norm = (
250
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
251
- if qk_norm
252
- else nn.Identity()
253
- )
254
- self.self_attn_proj = nn.Linear(
255
- hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
256
- )
257
-
258
- self.norm2 = nn.LayerNorm(
259
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
260
- )
261
- act_layer = get_activation_layer(act_type)
262
-
263
- self.adaLN_modulation = nn.Sequential(
264
- act_layer(),
265
- nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
266
- )
267
- # Zero-initialize the modulation
268
- nn.init.zeros_(self.adaLN_modulation[1].weight)
269
- nn.init.zeros_(self.adaLN_modulation[1].bias)
270
-
271
- def forward(
272
- self,
273
- x: torch.Tensor,
274
- c: torch.Tensor, # timestep_aware_representations + context_aware_representations
275
- attn_mask: torch.Tensor = None,
276
- y: torch.Tensor=None,
277
-
278
- ):
279
- gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
280
-
281
- norm_x = self.norm1(x)
282
- norm_y = self.norm1_2(y)
283
- q = self.self_attn_q(norm_x)
284
- q = rearrange(q, "B L (H D) -> B L H D", H=self.heads_num)
285
- kv = self.self_attn_kv(norm_y)
286
- k, v = rearrange(kv, "B L (K H D) -> K B L H D", K=2, H=self.heads_num)
287
- # Apply QK-Norm if needed
288
- q = self.self_attn_q_norm(q).to(v)
289
- k = self.self_attn_k_norm(k).to(v)
290
-
291
- # Self-Attention
292
- attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
293
-
294
- x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
295
-
296
- return x
297
-
298
-
299
-
300
- class IndividualTokenRefiner(torch.nn.Module):
301
- def __init__(
302
- self,
303
- hidden_size,
304
- heads_num,
305
- depth,
306
- mlp_width_ratio: float = 4.0,
307
- mlp_drop_rate: float = 0.0,
308
- act_type: str = "silu",
309
- qk_norm: bool = False,
310
- qk_norm_type: str = "layer",
311
- qkv_bias: bool = True,
312
- need_CA:bool=False,
313
- dtype: Optional[torch.dtype] = None,
314
- device: Optional[torch.device] = None,
315
- ):
316
-
317
- factory_kwargs = {"device": device, "dtype": dtype}
318
- super().__init__()
319
- self.need_CA = need_CA
320
- self.blocks = nn.ModuleList(
321
- [
322
- IndividualTokenRefinerBlock(
323
- hidden_size=hidden_size,
324
- heads_num=heads_num,
325
- mlp_width_ratio=mlp_width_ratio,
326
- mlp_drop_rate=mlp_drop_rate,
327
- act_type=act_type,
328
- qk_norm=qk_norm,
329
- qk_norm_type=qk_norm_type,
330
- qkv_bias=qkv_bias,
331
- need_CA=self.need_CA,
332
- **factory_kwargs,
333
- )
334
- for _ in range(depth)
335
- ]
336
- )
337
-
338
-
339
- def forward(
340
- self,
341
- x: torch.Tensor,
342
- c: torch.LongTensor,
343
- mask: Optional[torch.Tensor] = None,
344
- y:torch.Tensor=None,
345
- ):
346
- self_attn_mask = None
347
- if mask is not None:
348
- batch_size = mask.shape[0]
349
- seq_len = mask.shape[1]
350
- mask = mask.to(x.device)
351
- # batch_size x 1 x seq_len x seq_len
352
- self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(
353
- 1, 1, seq_len, 1
354
- )
355
- # batch_size x 1 x seq_len x seq_len
356
- self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
357
- # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of heads_num
358
- self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
359
- # avoids self-attention weight being NaN for padding tokens
360
- self_attn_mask[:, :, :, 0] = True
361
-
362
-
363
- for block in self.blocks:
364
- x = block(x, c, self_attn_mask,y)
365
-
366
- return x
367
-
368
-
369
- class SingleTokenRefiner(torch.nn.Module):
370
- """
371
- A single token refiner block for llm text embedding refine.
372
- """
373
- def __init__(
374
- self,
375
- in_channels,
376
- hidden_size,
377
- heads_num,
378
- depth,
379
- mlp_width_ratio: float = 4.0,
380
- mlp_drop_rate: float = 0.0,
381
- act_type: str = "silu",
382
- qk_norm: bool = False,
383
- qk_norm_type: str = "layer",
384
- qkv_bias: bool = True,
385
- need_CA:bool=False,
386
- attn_mode: str = "torch",
387
- dtype: Optional[torch.dtype] = None,
388
- device: Optional[torch.device] = None,
389
- ):
390
- factory_kwargs = {"device": device, "dtype": dtype}
391
- super().__init__()
392
- self.attn_mode = attn_mode
393
- self.need_CA = need_CA
394
- assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."
395
-
396
- self.input_embedder = nn.Linear(
397
- in_channels, hidden_size, bias=True, **factory_kwargs
398
- )
399
- if self.need_CA:
400
- self.input_embedder_CA = nn.Linear(
401
- in_channels, hidden_size, bias=True, **factory_kwargs
402
- )
403
-
404
- act_layer = get_activation_layer(act_type)
405
- # Build timestep embedding layer
406
- self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
407
- # Build context embedding layer
408
- self.c_embedder = TextProjection(
409
- in_channels, hidden_size, act_layer, **factory_kwargs
410
- )
411
-
412
- self.individual_token_refiner = IndividualTokenRefiner(
413
- hidden_size=hidden_size,
414
- heads_num=heads_num,
415
- depth=depth,
416
- mlp_width_ratio=mlp_width_ratio,
417
- mlp_drop_rate=mlp_drop_rate,
418
- act_type=act_type,
419
- qk_norm=qk_norm,
420
- qk_norm_type=qk_norm_type,
421
- qkv_bias=qkv_bias,
422
- need_CA=need_CA,
423
- **factory_kwargs,
424
- )
425
-
426
- def forward(
427
- self,
428
- x: torch.Tensor,
429
- t: torch.LongTensor,
430
- mask: Optional[torch.LongTensor] = None,
431
- y: torch.LongTensor=None,
432
- ):
433
- timestep_aware_representations = self.t_embedder(t)
434
-
435
- if mask is None:
436
- context_aware_representations = x.mean(dim=1)
437
- else:
438
- mask_float = mask.unsqueeze(-1) # [b, s1, 1]
439
- context_aware_representations = (x * mask_float).sum(
440
- dim=1
441
- ) / mask_float.sum(dim=1)
442
- context_aware_representations = self.c_embedder(context_aware_representations)
443
- c = timestep_aware_representations + context_aware_representations
444
-
445
- x = self.input_embedder(x)
446
- if self.need_CA:
447
- y = self.input_embedder_CA(y)
448
- x = self.individual_token_refiner(x, c, mask, y)
449
- else:
450
- x = self.individual_token_refiner(x, c, mask)
451
-
452
- return x
453
-
454
-
455
-
456
- class Qwen2Connector(torch.nn.Module):
457
- def __init__(
458
- self,
459
- # biclip_dim=1024,
460
- in_channels=3584,
461
- hidden_size=4096,
462
- heads_num=32,
463
- depth=2,
464
- need_CA=False,
465
- device=None,
466
- dtype=torch.bfloat16,
467
- ):
468
- super().__init__()
469
- factory_kwargs = {"device": device, "dtype":dtype}
470
-
471
- self.S =SingleTokenRefiner(in_channels=in_channels,hidden_size=hidden_size,heads_num=heads_num,depth=depth,need_CA=need_CA,**factory_kwargs)
472
- self.global_proj_out=nn.Linear(in_channels,768)
473
-
474
- self.scale_factor = nn.Parameter(torch.zeros(1))
475
- with torch.no_grad():
476
- self.scale_factor.data += -(1 - 0.09)
477
-
478
- def forward(self, x,t,mask):
479
- mask_float = mask.unsqueeze(-1) # [b, s1, 1]
480
- x_mean = (x * mask_float).sum(
481
- dim=1
482
- ) / mask_float.sum(dim=1) * (1 + self.scale_factor)
483
-
484
- global_out=self.global_proj_out(x_mean)
485
- encoder_hidden_states = self.S(x,t,mask)
486
- return encoder_hidden_states,global_out
 
 
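For reference, the deleted connector_edit.py refined the Qwen2.5-VL hidden states for the DiT: Qwen2Connector runs a SingleTokenRefiner over the token sequence and additionally projects a masked mean of the inputs to a 768-d global vector. A shape-level sketch (float32 is used here because global_proj_out is created without a dtype; the import assumes the removed files are loadable as a package, since they use relative imports):

import torch
from connector_edit import Qwen2Connector  # removed in this diff; uses package-relative imports

connector = Qwen2Connector(
    in_channels=3584, hidden_size=4096, heads_num=32, depth=2, dtype=torch.float32
)

B, S = 1, 640
x = torch.randn(B, S, 3584)                # Qwen2.5-VL last hidden states
t = torch.zeros(B)                         # one timestep per sample
mask = torch.ones(B, S, dtype=torch.long)  # 1 = valid token

encoder_hidden_states, global_out = connector(x, t, mask)
# encoder_hidden_states: [1, 640, 4096] refined tokens; global_out: [1, 768] global conditioning vector
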
cookie.png → examples 2.zip RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:328bf4f4779cd6235016a217eaf5dc1ef7a8f1cb95e8fbd7ee538ac6824e75b0
3
- size 542518
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5de0f67d94e0e46599bc9619a912a05898b8053ddc0f1f6563a3ee3b4dd1f7c7
3
+ size 1878523
examples 2/celeb_meme.jpg DELETED

Git LFS Details

  • SHA256: 4fb2ccab4218dba753781d65e8f5933f8ab7613543b59a7b4512a6654fe55a4f
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB
examples 2/cookie.png DELETED

Git LFS Details

  • SHA256: 328bf4f4779cd6235016a217eaf5dc1ef7a8f1cb95e8fbd7ee538ac6824e75b0
  • Pointer size: 131 Bytes
  • Size of remote file: 543 kB
examples 2/ghibli_meme.jpg DELETED
Binary file (38.1 kB)
 
examples 2/leather.jpg DELETED

Git LFS Details

  • SHA256: efa1eab6d7fa83b2bb39631b194012cf01cca24356b624f32e0fd05346af3ec2
  • Pointer size: 131 Bytes
  • Size of remote file: 250 kB
examples 2/meme.jpg DELETED
Binary file (49.8 kB)
 
examples 2/no_cookie.png DELETED

Git LFS Details

  • SHA256: 4ee90a1e41774e2dae54ca436874341e750f2c7a6196b8360aee1952e98066f8
  • Pointer size: 131 Bytes
  • Size of remote file: 162 kB
examples 2/poster.jpg DELETED
Binary file (65.4 kB)
 
examples 2/poster_orig.jpg DELETED

Git LFS Details

  • SHA256: 92a4178a56e7fefd7dfd418c675c1ab6b6b2e00e17b45a778a1100ab62f9bfba
  • Pointer size: 131 Bytes
  • Size of remote file: 458 kB
ghibli_meme.jpg DELETED
Binary file (38.1 kB)
 
layers.cpython-310.pyc DELETED
Binary file (19.1 kB)
 
layers.py DELETED
@@ -1,640 +0,0 @@
1
- # Modified from Flux
2
- #
3
- # Copyright 2024 Black Forest Labs
4
-
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
-
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
-
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
- # This source code is licensed under the license found in the
18
- # LICENSE file in the root directory of this source tree.
19
-
20
- import math # noqa: I001
21
- from dataclasses import dataclass
22
- from functools import partial
23
-
24
- import torch
25
- import torch.nn.functional as F
26
- from einops import rearrange
27
- # from liger_kernel.ops.rms_norm import LigerRMSNormFunction
28
- from torch import Tensor, nn
29
-
30
-
31
- try:
32
- import flash_attn
33
- from flash_attn.flash_attn_interface import (
34
- _flash_attn_forward,
35
- flash_attn_varlen_func,
36
- )
37
- except ImportError:
38
- flash_attn = None
39
- flash_attn_varlen_func = None
40
- _flash_attn_forward = None
41
-
42
-
43
- MEMORY_LAYOUT = {
44
- "flash": (
45
- lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
46
- lambda x: x,
47
- ),
48
- "torch": (
49
- lambda x: x.transpose(1, 2),
50
- lambda x: x.transpose(1, 2),
51
- ),
52
- "vanilla": (
53
- lambda x: x.transpose(1, 2),
54
- lambda x: x.transpose(1, 2),
55
- ),
56
- }
57
-
58
-
59
- def attention(
60
- q,
61
- k,
62
- v,
63
- mode="torch",
64
- drop_rate=0,
65
- attn_mask=None,
66
- causal=False,
67
- cu_seqlens_q=None,
68
- cu_seqlens_kv=None,
69
- max_seqlen_q=None,
70
- max_seqlen_kv=None,
71
- batch_size=1,
72
- ):
73
- """
74
- Perform QKV self attention.
75
-
76
- Args:
77
- q (torch.Tensor): Query tensor with shape [b, s, a, d], where a is the number of heads.
78
- k (torch.Tensor): Key tensor with shape [b, s1, a, d]
79
- v (torch.Tensor): Value tensor with shape [b, s1, a, d]
80
- mode (str): Attention mode. Choose from 'flash', 'torch', and 'vanilla'.
81
- drop_rate (float): Dropout rate in attention map. (default: 0)
82
- attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
83
- (default: None)
84
- causal (bool): Whether to use causal attention. (default: False)
85
- cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
86
- used to index into q.
87
- cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
88
- used to index into kv.
89
- max_seqlen_q (int): The maximum sequence length in the batch of q.
90
- max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
91
-
92
- Returns:
93
- torch.Tensor: Output tensor after self attention with shape [b, s, ad]
94
- """
95
- pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
96
- q = pre_attn_layout(q)
97
- k = pre_attn_layout(k)
98
- v = pre_attn_layout(v)
99
-
100
- if mode == "torch":
101
- if attn_mask is not None and attn_mask.dtype != torch.bool:
102
- attn_mask = attn_mask.to(q.dtype)
103
- x = F.scaled_dot_product_attention(
104
- q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
105
- )
106
- elif mode == "flash":
107
- assert flash_attn_varlen_func is not None
108
- x: torch.Tensor = flash_attn_varlen_func(
109
- q,
110
- k,
111
- v,
112
- cu_seqlens_q,
113
- cu_seqlens_kv,
114
- max_seqlen_q,
115
- max_seqlen_kv,
116
- ) # type: ignore
117
- # x with shape [(bxs), a, d]
118
- x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]) # type: ignore # reshape x to [b, s, a, d]
119
- elif mode == "vanilla":
120
- scale_factor = 1 / math.sqrt(q.size(-1))
121
-
122
- b, a, s, _ = q.shape
123
- s1 = k.size(2)
124
- attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
125
- if causal:
126
- # Only applied to self attention
127
- assert attn_mask is None, (
128
- "Causal mask and attn_mask cannot be used together"
129
- )
130
- temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(
131
- diagonal=0
132
- )
133
- attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
134
- attn_bias.to(q.dtype)
135
-
136
- if attn_mask is not None:
137
- if attn_mask.dtype == torch.bool:
138
- attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
139
- else:
140
- attn_bias += attn_mask
141
-
142
- # TODO: Maybe force q and k to be float32 to avoid numerical overflow
143
- attn = (q @ k.transpose(-2, -1)) * scale_factor
144
- attn += attn_bias
145
- attn = attn.softmax(dim=-1)
146
- attn = torch.dropout(attn, p=drop_rate, train=True)
147
- x = attn @ v
148
- else:
149
- raise NotImplementedError(f"Unsupported attention mode: {mode}")
150
-
151
- x = post_attn_layout(x)
152
- b, s, a, d = x.shape
153
- out = x.reshape(b, s, -1)
154
- return out
155
-
156
-
157
- def apply_gate(x, gate=None, tanh=False):
158
- """Apply an optional gating tensor to x.
159
-
160
- Args:
161
- x (torch.Tensor): input tensor.
162
- gate (torch.Tensor, optional): gate tensor. Defaults to None.
163
- tanh (bool, optional): whether to use tanh function. Defaults to False.
164
-
165
- Returns:
166
- torch.Tensor: the output tensor after apply gate.
167
- """
168
- if gate is None:
169
- return x
170
- if tanh:
171
- return x * gate.unsqueeze(1).tanh()
172
- else:
173
- return x * gate.unsqueeze(1)
174
-
175
-
176
- class MLP(nn.Module):
177
- """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
178
-
179
- def __init__(
180
- self,
181
- in_channels,
182
- hidden_channels=None,
183
- out_features=None,
184
- act_layer=nn.GELU,
185
- norm_layer=None,
186
- bias=True,
187
- drop=0.0,
188
- use_conv=False,
189
- device=None,
190
- dtype=None,
191
- ):
192
- super().__init__()
193
- out_features = out_features or in_channels
194
- hidden_channels = hidden_channels or in_channels
195
- bias = (bias, bias)
196
- drop_probs = (drop, drop)
197
- linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
198
-
199
- self.fc1 = linear_layer(
200
- in_channels, hidden_channels, bias=bias[0], device=device, dtype=dtype
201
- )
202
- self.act = act_layer()
203
- self.drop1 = nn.Dropout(drop_probs[0])
204
- self.norm = (
205
- norm_layer(hidden_channels, device=device, dtype=dtype)
206
- if norm_layer is not None
207
- else nn.Identity()
208
- )
209
- self.fc2 = linear_layer(
210
- hidden_channels, out_features, bias=bias[1], device=device, dtype=dtype
211
- )
212
- self.drop2 = nn.Dropout(drop_probs[1])
213
-
214
- def forward(self, x):
215
- x = self.fc1(x)
216
- x = self.act(x)
217
- x = self.drop1(x)
218
- x = self.norm(x)
219
- x = self.fc2(x)
220
- x = self.drop2(x)
221
- return x
222
-
223
-
224
- class TextProjection(nn.Module):
225
- """
226
- Projects text embeddings. Also handles dropout for classifier-free guidance.
227
-
228
- Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
229
- """
230
-
231
- def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
232
- factory_kwargs = {"dtype": dtype, "device": device}
233
- super().__init__()
234
- self.linear_1 = nn.Linear(
235
- in_features=in_channels,
236
- out_features=hidden_size,
237
- bias=True,
238
- **factory_kwargs,
239
- )
240
- self.act_1 = act_layer()
241
- self.linear_2 = nn.Linear(
242
- in_features=hidden_size,
243
- out_features=hidden_size,
244
- bias=True,
245
- **factory_kwargs,
246
- )
247
-
248
- def forward(self, caption):
249
- hidden_states = self.linear_1(caption)
250
- hidden_states = self.act_1(hidden_states)
251
- hidden_states = self.linear_2(hidden_states)
252
- return hidden_states
253
-
254
-
255
- class TimestepEmbedder(nn.Module):
256
- """
257
- Embeds scalar timesteps into vector representations.
258
- """
259
-
260
- def __init__(
261
- self,
262
- hidden_size,
263
- act_layer,
264
- frequency_embedding_size=256,
265
- max_period=10000,
266
- out_size=None,
267
- dtype=None,
268
- device=None,
269
- ):
270
- factory_kwargs = {"dtype": dtype, "device": device}
271
- super().__init__()
272
- self.frequency_embedding_size = frequency_embedding_size
273
- self.max_period = max_period
274
- if out_size is None:
275
- out_size = hidden_size
276
-
277
- self.mlp = nn.Sequential(
278
- nn.Linear(
279
- frequency_embedding_size, hidden_size, bias=True, **factory_kwargs
280
- ),
281
- act_layer(),
282
- nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
283
- )
284
- nn.init.normal_(self.mlp[0].weight, std=0.02) # type: ignore
285
- nn.init.normal_(self.mlp[2].weight, std=0.02) # type: ignore
286
-
287
- @staticmethod
288
- def timestep_embedding(t, dim, max_period=10000):
289
- """
290
- Create sinusoidal timestep embeddings.
291
-
292
- Args:
293
- t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
294
- dim (int): the dimension of the output.
295
- max_period (int): controls the minimum frequency of the embeddings.
296
-
297
- Returns:
298
- embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.
299
-
300
- .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
301
- """
302
- half = dim // 2
303
- freqs = torch.exp(
304
- -math.log(max_period)
305
- * torch.arange(start=0, end=half, dtype=torch.float32)
306
- / half
307
- ).to(device=t.device)
308
- args = t[:, None].float() * freqs[None]
309
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
310
- if dim % 2:
311
- embedding = torch.cat(
312
- [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
313
- )
314
- return embedding
315
-
316
- def forward(self, t):
317
- t_freq = self.timestep_embedding(
318
- t, self.frequency_embedding_size, self.max_period
319
- ).type(self.mlp[0].weight.dtype) # type: ignore
320
- t_emb = self.mlp(t_freq)
321
- return t_emb
322
-
323
-
324
- class EmbedND(nn.Module):
325
- def __init__(self, dim: int, theta: int, axes_dim: list[int]):
326
- super().__init__()
327
- self.dim = dim
328
- self.theta = theta
329
- self.axes_dim = axes_dim
330
-
331
- def forward(self, ids: Tensor) -> Tensor:
332
- n_axes = ids.shape[-1]
333
- emb = torch.cat(
334
- [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
335
- dim=-3,
336
- )
337
-
338
- return emb.unsqueeze(1)
339
-
340
-
341
- class MLPEmbedder(nn.Module):
342
- def __init__(self, in_dim: int, hidden_dim: int):
343
- super().__init__()
344
- self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
345
- self.silu = nn.SiLU()
346
- self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
347
-
348
- def forward(self, x: Tensor) -> Tensor:
349
- return self.out_layer(self.silu(self.in_layer(x)))
350
-
351
-
352
- def rope(pos, dim: int, theta: int):
353
- assert dim % 2 == 0
354
- scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
355
- omega = 1.0 / (theta**scale)
356
- out = torch.einsum("...n,d->...nd", pos, omega)
357
- out = torch.stack(
358
- [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1
359
- )
360
- out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
361
- return out.float()
362
-
363
-
364
- def attention_after_rope(q, k, v, pe):
365
- q, k = apply_rope(q, k, pe)
366
-
367
- from .attention import attention
368
-
369
- x = attention(q, k, v, mode="torch")
370
- return x
371
-
372
-
373
- @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
374
- def apply_rope(xq, xk, freqs_cis):
375
- # swap the num_heads and seq_len dimensions back to the order the original function expects
376
- xq = xq.transpose(1, 2) # [batch, num_heads, seq_len, head_dim]
377
- xk = xk.transpose(1, 2)
378
-
379
- # split head_dim into complex components (real and imaginary parts)
380
- xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
381
- xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
382
-
383
- # apply the rotary position embedding (complex multiplication)
384
- xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
385
- xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
386
-
387
- # restore the tensor shape and transpose back to the target dimension order
388
- xq_out = xq_out.reshape(*xq.shape).type_as(xq).transpose(1, 2)
389
- xk_out = xk_out.reshape(*xk.shape).type_as(xk).transpose(1, 2)
390
-
391
- return xq_out, xk_out
392
-
393
-
394
- @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
395
- def scale_add_residual(
396
- x: torch.Tensor, scale: torch.Tensor, residual: torch.Tensor
397
- ) -> torch.Tensor:
398
- return x * scale + residual
399
-
400
-
401
- @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
402
- def layernorm_and_scale_shift(
403
- x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor
404
- ) -> torch.Tensor:
405
- return torch.nn.functional.layer_norm(x, (x.size(-1),)) * (scale + 1) + shift
406
-
407
-
408
- class SelfAttention(nn.Module):
409
- def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
410
- super().__init__()
411
- self.num_heads = num_heads
412
- head_dim = dim // num_heads
413
-
414
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
415
- self.norm = QKNorm(head_dim)
416
- self.proj = nn.Linear(dim, dim)
417
-
418
- def forward(self, x: Tensor, pe: Tensor) -> Tensor:
419
- qkv = self.qkv(x)
420
- q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
421
- q, k = self.norm(q, k, v)
422
- x = attention_after_rope(q, k, v, pe=pe)
423
- x = self.proj(x)
424
- return x
425
-
426
-
427
- @dataclass
428
- class ModulationOut:
429
- shift: Tensor
430
- scale: Tensor
431
- gate: Tensor
432
-
433
-
434
- class RMSNorm(torch.nn.Module):
435
- def __init__(self, dim: int):
436
- super().__init__()
437
- self.scale = nn.Parameter(torch.ones(dim))
438
-
439
- # @staticmethod
440
- # def rms_norm_fast(x, weight, eps):
441
- # return LigerRMSNormFunction.apply(
442
- # x,
443
- # weight,
444
- # eps,
445
- # 0.0,
446
- # "gemma",
447
- # True,
448
- # )
449
-
450
- @staticmethod
451
- def rms_norm(x, weight, eps):
452
- x_dtype = x.dtype
453
- x = x.float()
454
- rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
455
- return (x * rrms).to(dtype=x_dtype) * weight
456
-
457
- def forward(self, x: Tensor):
458
- # return self.rms_norm_fast(x, self.scale, 1e-6)
459
- return self.rms_norm(x, self.scale, 1e-6)
460
-
461
-
462
- class QKNorm(torch.nn.Module):
463
- def __init__(self, dim: int):
464
- super().__init__()
465
- self.query_norm = RMSNorm(dim)
466
- self.key_norm = RMSNorm(dim)
467
-
468
- def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
469
- q = self.query_norm(q)
470
- k = self.key_norm(k)
471
- return q.to(v), k.to(v)
472
-
473
-
474
- class Modulation(nn.Module):
475
- def __init__(self, dim: int, double: bool):
476
- super().__init__()
477
- self.is_double = double
478
- self.multiplier = 6 if double else 3
479
- self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
480
-
481
- def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
482
- out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(
483
- self.multiplier, dim=-1
484
- )
485
-
486
- return (
487
- ModulationOut(*out[:3]),
488
- ModulationOut(*out[3:]) if self.is_double else None,
489
- )
490
-
491
-
492
- class DoubleStreamBlock(nn.Module):
493
- def __init__(
494
- self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False
495
- ):
496
- super().__init__()
497
-
498
- mlp_hidden_dim = int(hidden_size * mlp_ratio)
499
- self.num_heads = num_heads
500
- self.hidden_size = hidden_size
501
- self.img_mod = Modulation(hidden_size, double=True)
502
- self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
503
- self.img_attn = SelfAttention(
504
- dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
505
- )
506
-
507
- self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
508
- self.img_mlp = nn.Sequential(
509
- nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
510
- nn.GELU(approximate="tanh"),
511
- nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
512
- )
513
-
514
- self.txt_mod = Modulation(hidden_size, double=True)
515
- self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
516
- self.txt_attn = SelfAttention(
517
- dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
518
- )
519
-
520
- self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
521
- self.txt_mlp = nn.Sequential(
522
- nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
523
- nn.GELU(approximate="tanh"),
524
- nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
525
- )
526
-
527
- def forward(
528
- self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor
529
- ) -> tuple[Tensor, Tensor]:
530
- img_mod1, img_mod2 = self.img_mod(vec)
531
- txt_mod1, txt_mod2 = self.txt_mod(vec)
532
-
533
- # prepare image for attention
534
- img_modulated = self.img_norm1(img)
535
- img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
536
- img_qkv = self.img_attn.qkv(img_modulated)
537
- img_q, img_k, img_v = rearrange(
538
- img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads
539
- )
540
- img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
541
-
542
- # prepare txt for attention
543
- txt_modulated = self.txt_norm1(txt)
544
- txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
545
- txt_qkv = self.txt_attn.qkv(txt_modulated)
546
- txt_q, txt_k, txt_v = rearrange(
547
- txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads
548
- )
549
- txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
550
-
551
- # run actual attention
552
- q = torch.cat((txt_q, img_q), dim=1)
553
- k = torch.cat((txt_k, img_k), dim=1)
554
- v = torch.cat((txt_v, img_v), dim=1)
555
-
556
- attn = attention_after_rope(q, k, v, pe=pe)
557
- txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
558
-
559
- # calculate the img blocks
560
- img = img + img_mod1.gate * self.img_attn.proj(img_attn)
561
- img_mlp = self.img_mlp(
562
- (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
563
- )
564
- img = scale_add_residual(img_mlp, img_mod2.gate, img)
565
-
566
- # calculate the txt blocks
567
- txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
568
- txt_mlp = self.txt_mlp(
569
- (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
570
- )
571
- txt = scale_add_residual(txt_mlp, txt_mod2.gate, txt)
572
- return img, txt
573
-
574
-
575
- class SingleStreamBlock(nn.Module):
576
- """
577
- A DiT block with parallel linear layers as described in
578
- https://arxiv.org/abs/2302.05442 and adapted modulation interface.
579
- """
580
-
581
- def __init__(
582
- self,
583
- hidden_size: int,
584
- num_heads: int,
585
- mlp_ratio: float = 4.0,
586
- qk_scale: float | None = None,
587
- ):
588
- super().__init__()
589
- self.hidden_dim = hidden_size
590
- self.num_heads = num_heads
591
- head_dim = hidden_size // num_heads
592
- self.scale = qk_scale or head_dim**-0.5
593
-
594
- self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
595
- # qkv and mlp_in
596
- self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
597
- # proj and mlp_out
598
- self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
599
-
600
- self.norm = QKNorm(head_dim)
601
-
602
- self.hidden_size = hidden_size
603
- self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
604
-
605
- self.mlp_act = nn.GELU(approximate="tanh")
606
- self.modulation = Modulation(hidden_size, double=False)
607
-
608
- def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
609
- mod, _ = self.modulation(vec)
610
- x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
611
- qkv, mlp = torch.split(
612
- self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1
613
- )
614
-
615
- q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
616
- q, k = self.norm(q, k, v)
617
-
618
- # compute attention
619
- attn = attention_after_rope(q, k, v, pe=pe)
620
- # compute activation in mlp stream, cat again and run second linear layer
621
- output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
622
- return scale_add_residual(output, mod.gate, x)
623
-
624
-
625
- class LastLayer(nn.Module):
626
- def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
627
- super().__init__()
628
- self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
629
- self.linear = nn.Linear(
630
- hidden_size, patch_size * patch_size * out_channels, bias=True
631
- )
632
- self.adaLN_modulation = nn.Sequential(
633
- nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
634
- )
635
-
636
- def forward(self, x: Tensor, vec: Tensor) -> Tensor:
637
- shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
638
- x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
639
- x = self.linear(x)
640
- return x
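
For orientation, a minimal sketch of how the rope() helper deleted above is used: EmbedND calls it once per positional axis and concatenates the results, so axes_dim must sum to the per-head dimension. The shapes and theta value below are illustrative assumptions, not configuration taken from this repo.

    import torch

    pos = torch.arange(4, dtype=torch.float64)[None, :]   # (1, 4): one positional axis, four token positions
    rot = rope(pos, dim=16, theta=10_000)                  # (1, 4, 8, 2, 2): a 2x2 rotation matrix per frequency
    # EmbedND concatenates one such block per axis along dim=-3 and unsqueezes a head dimension,
    # which is what attention_after_rope()/apply_rope() later consume as the positional encoding `pe`.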
 
leather.jpg DELETED

Git LFS Details

  • SHA256: efa1eab6d7fa83b2bb39631b194012cf01cca24356b624f32e0fd05346af3ec2
  • Pointer size: 131 Bytes
  • Size of remote file: 250 kB
meme.jpg DELETED
Binary file (49.8 kB)
 
model_edit.cpython-310.pyc DELETED
Binary file (4.21 kB)
 
model_edit.py DELETED
@@ -1,143 +0,0 @@
1
- import math
2
- from dataclasses import dataclass
3
-
4
- import numpy as np
5
- import torch
6
- from torch import Tensor, nn
7
-
8
- from .connector_edit import Qwen2Connector
9
- from .layers import DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder, SingleStreamBlock
10
-
11
-
12
- @dataclass
13
- class Step1XParams:
14
- in_channels: int
15
- out_channels: int
16
- vec_in_dim: int
17
- context_in_dim: int
18
- hidden_size: int
19
- mlp_ratio: float
20
- num_heads: int
21
- depth: int
22
- depth_single_blocks: int
23
- axes_dim: list[int]
24
- theta: int
25
- qkv_bias: bool
26
-
27
-
28
- class Step1XEdit(nn.Module):
29
- """
30
- Transformer model for flow matching on sequences.
31
- """
32
-
33
- def __init__(self, params: Step1XParams):
34
- super().__init__()
35
-
36
- self.params = params
37
- self.in_channels = params.in_channels
38
- self.out_channels = params.out_channels
39
- if params.hidden_size % params.num_heads != 0:
40
- raise ValueError(
41
- f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
42
- )
43
- pe_dim = params.hidden_size // params.num_heads
44
- if sum(params.axes_dim) != pe_dim:
45
- raise ValueError(
46
- f"Got {params.axes_dim} but expected positional dim {pe_dim}"
47
- )
48
- self.hidden_size = params.hidden_size
49
- self.num_heads = params.num_heads
50
- self.pe_embedder = EmbedND(
51
- dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim
52
- )
53
- self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
54
- self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
55
- self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
56
- self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
57
-
58
- self.double_blocks = nn.ModuleList(
59
- [
60
- DoubleStreamBlock(
61
- self.hidden_size,
62
- self.num_heads,
63
- mlp_ratio=params.mlp_ratio,
64
- qkv_bias=params.qkv_bias,
65
- )
66
- for _ in range(params.depth)
67
- ]
68
- )
69
-
70
- self.single_blocks = nn.ModuleList(
71
- [
72
- SingleStreamBlock(
73
- self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio
74
- )
75
- for _ in range(params.depth_single_blocks)
76
- ]
77
- )
78
-
79
- self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
80
-
81
- self.connector = Qwen2Connector()
82
-
83
- @staticmethod
84
- def timestep_embedding(
85
- t: Tensor, dim, max_period=10000, time_factor: float = 1000.0
86
- ):
87
- """
88
- Create sinusoidal timestep embeddings.
89
- :param t: a 1-D Tensor of N indices, one per batch element.
90
- These may be fractional.
91
- :param dim: the dimension of the output.
92
- :param max_period: controls the minimum frequency of the embeddings.
93
- :return: an (N, D) Tensor of positional embeddings.
94
- """
95
- t = time_factor * t
96
- half = dim // 2
97
- freqs = torch.exp(
98
- -math.log(max_period)
99
- * torch.arange(start=0, end=half, dtype=torch.float32)
100
- / half
101
- ).to(t.device)
102
-
103
- args = t[:, None].float() * freqs[None]
104
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
105
- if dim % 2:
106
- embedding = torch.cat(
107
- [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
108
- )
109
- if torch.is_floating_point(t):
110
- embedding = embedding.to(t)
111
- return embedding
112
-
113
- def forward(
114
- self,
115
- img: Tensor,
116
- img_ids: Tensor,
117
- txt: Tensor,
118
- txt_ids: Tensor,
119
- timesteps: Tensor,
120
- y: Tensor,
121
- ) -> Tensor:
122
- if img.ndim != 3 or txt.ndim != 3:
123
- raise ValueError("Input img and txt tensors must have 3 dimensions.")
124
-
125
- img = self.img_in(img)
126
- vec = self.time_in(self.timestep_embedding(timesteps, 256))
127
-
128
- vec = vec + self.vector_in(y)
129
- txt = self.txt_in(txt)
130
-
131
- ids = torch.cat((txt_ids, img_ids), dim=1)
132
- pe = self.pe_embedder(ids)
133
-
134
- for block in self.double_blocks:
135
- img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
136
-
137
- img = torch.cat((txt, img), 1)
138
- for block in self.single_blocks:
139
- img = block(img, vec=vec, pe=pe)
140
- img = img[:, txt.shape[1] :, ...]
141
-
142
- img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
143
- return img
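
As a quick reference for the two constructor checks above, a hedged sketch of a Step1XParams instance that satisfies them; every number here is illustrative and not the released model's configuration.

    params = Step1XParams(
        in_channels=64, out_channels=64, vec_in_dim=768, context_in_dim=4096,
        hidden_size=1024, mlp_ratio=4.0, num_heads=16,
        depth=2, depth_single_blocks=4,
        axes_dim=[16, 24, 24],   # must sum to hidden_size // num_heads == 64
        theta=10_000, qkv_bias=True,
    )
    model = Step1XEdit(params)   # raises ValueError if either shape constraint is violated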
 
celeb_meme.jpg → modules.zip RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fb2ccab4218dba753781d65e8f5933f8ab7613543b59a7b4512a6654fe55a4f
3
- size 266588
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c432d89999f0ae531c09c6ccf1d4a69bf5c2bb878f23411fafdf64b7370c8afe
3
+ size 45293
modules/__init__.py DELETED
File without changes
modules/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (128 Bytes)
 
modules/__pycache__/attention.cpython-310.pyc DELETED
Binary file (3.13 kB)
 
modules/__pycache__/autoencoder.cpython-310.pyc DELETED
Binary file (8.78 kB)
 
modules/__pycache__/conditioner.cpython-310.pyc DELETED
Binary file (4.94 kB)
 
modules/__pycache__/connector_edit.cpython-310.pyc DELETED
Binary file (11.8 kB)
 
modules/__pycache__/layers.cpython-310.pyc DELETED
Binary file (19.1 kB)
 
modules/__pycache__/model_edit.cpython-310.pyc DELETED
Binary file (4.21 kB)
 
modules/attention.py DELETED
@@ -1,133 +0,0 @@
1
- import math
2
-
3
- import torch
4
- import torch.nn.functional as F
5
-
6
-
7
- try:
8
- import flash_attn
9
- from flash_attn.flash_attn_interface import (
10
- _flash_attn_forward,
11
- flash_attn_func,
12
- flash_attn_varlen_func,
13
- )
14
- except ImportError:
15
- flash_attn = None
16
- flash_attn_varlen_func = None
17
- _flash_attn_forward = None
18
- flash_attn_func = None
19
-
20
- MEMORY_LAYOUT = {
21
- # flash mode:
22
- # pre-processing: input is [batch_size, seq_len, num_heads, head_dim]
23
- # post-processing: keep the shape unchanged
24
- "flash": (
25
- lambda x: x, # keep shape
26
- lambda x: x, # keep shape
27
- ),
28
- # torch/vanilla mode:
29
- # pre-processing: swap the sequence and attention-head dimensions [B,S,A,D] -> [B,A,S,D]
30
- # post-processing: swap back to the original order [B,A,S,D] -> [B,S,A,D]
31
- "torch": (
32
- lambda x: x.transpose(1, 2), # (B,S,A,D) -> (B,A,S,D)
33
- lambda x: x.transpose(1, 2), # (B,A,S,D) -> (B,S,A,D)
34
- ),
35
- "vanilla": (
36
- lambda x: x.transpose(1, 2),
37
- lambda x: x.transpose(1, 2),
38
- ),
39
- }
40
-
41
-
42
- def attention(
43
- q,
44
- k,
45
- v,
46
- mode="torch",
47
- drop_rate=0,
48
- attn_mask=None,
49
- causal=False,
50
- ):
51
- """
52
- Perform QKV self-attention.
53
-
54
- Args:
55
- q (torch.Tensor): query tensor, shape [batch_size, seq_len, num_heads, head_dim]
56
- k (torch.Tensor): key tensor, shape [batch_size, seq_len_kv, num_heads, head_dim]
57
- v (torch.Tensor): value tensor, shape [batch_size, seq_len_kv, num_heads, head_dim]
58
- mode (str): attention mode, one of 'flash', 'torch', 'vanilla'
59
- drop_rate (float): dropout probability applied to the attention map
60
- attn_mask (torch.Tensor): attention mask; its shape depends on the mode
61
- causal (bool): whether to use causal attention (attend only to preceding positions)
62
-
63
- Returns:
64
- torch.Tensor: attention output, shape [batch_size, seq_len, num_heads * head_dim]
65
- """
66
- # get the pre- and post-processing functions for this mode
67
- pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
68
-
69
- # apply the pre-processing transform
70
- q = pre_attn_layout(q) # shape depends on the mode
71
- k = pre_attn_layout(k)
72
- v = pre_attn_layout(v)
73
-
74
- if mode == "torch":
75
- # use PyTorch's native scaled_dot_product_attention
76
- if attn_mask is not None and attn_mask.dtype != torch.bool:
77
- attn_mask = attn_mask.to(q.dtype)
78
- x = F.scaled_dot_product_attention(
79
- q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
80
- )
81
- elif mode == "flash":
82
- assert flash_attn_func is not None, "flash_attn_func is not available"
83
- assert attn_mask is None, "attn_mask is not supported in flash mode"
84
- x: torch.Tensor = flash_attn_func(
85
- q, k, v, dropout_p=drop_rate, causal=causal, softmax_scale=None
86
- ) # type: ignore
87
- elif mode == "vanilla":
88
- # manual (vanilla) attention implementation
89
- scale_factor = 1 / math.sqrt(q.size(-1)) # scaling factor 1/sqrt(d_k)
90
-
91
- b, a, s, _ = q.shape # unpack shape parameters
92
- s1 = k.size(2) # key/value sequence length
93
-
94
- # initialize the attention bias
95
- attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
96
-
97
- # handle the causal mask
98
- if causal:
99
- assert attn_mask is None, "Causal mask and attn_mask cannot be used together"
100
- # generate the lower-triangular causal mask
101
- temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(
102
- diagonal=0
103
- )
104
- attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
105
- attn_bias = attn_bias.to(q.dtype)
106
-
107
- # handle a custom attention mask
108
- if attn_mask is not None:
109
- if attn_mask.dtype == torch.bool:
110
- attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
111
- else:
112
- attn_bias += attn_mask # allows ALiBi-like positional biases
113
-
114
- # compute the attention matrix
115
- attn = (q @ k.transpose(-2, -1)) * scale_factor # [B,A,S,S1]
116
- attn += attn_bias
117
-
118
- # softmax and dropout
119
- attn = attn.softmax(dim=-1)
120
- attn = torch.dropout(attn, p=drop_rate, train=True)
121
-
122
- # compute the output
123
- x = attn @ v # [B,A,S,D]
124
- else:
125
- raise NotImplementedError(f"Unsupported attention mode: {mode}")
126
-
127
- # apply the post-processing transform
128
- x = post_attn_layout(x) # restore the original dimension order
129
-
130
- # merge the attention-head dimensions
131
- b, s, a, d = x.shape
132
- out = x.reshape(b, s, -1) # [B,S,A*D]
133
- return out
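
A minimal usage sketch of the attention() helper above in its default "torch" mode; the tensor sizes are arbitrary assumptions chosen only to show the expected [batch, seq, heads, head_dim] layout.

    import torch

    B, S, H, D = 2, 128, 8, 64
    q = torch.randn(B, S, H, D)
    k = torch.randn(B, S, H, D)
    v = torch.randn(B, S, H, D)
    out = attention(q, k, v, mode="torch")   # (B, S, H * D) == (2, 128, 512)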
 
modules/autoencoder.py DELETED
@@ -1,326 +0,0 @@
1
- # Modified from Flux
2
- #
3
- # Copyright 2024 Black Forest Labs
4
-
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
-
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
-
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
- # This source code is licensed under the license found in the
18
- # LICENSE file in the root directory of this source tree.
19
- import torch
20
- from einops import rearrange
21
- from torch import Tensor, nn
22
-
23
-
24
- def swish(x: Tensor) -> Tensor:
25
- return x * torch.sigmoid(x)
26
-
27
-
28
- class AttnBlock(nn.Module):
29
- def __init__(self, in_channels: int):
30
- super().__init__()
31
- self.in_channels = in_channels
32
-
33
- self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
34
-
35
- self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
36
- self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
37
- self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
38
- self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
39
-
40
- def attention(self, h_: Tensor) -> Tensor:
41
- h_ = self.norm(h_)
42
- q = self.q(h_)
43
- k = self.k(h_)
44
- v = self.v(h_)
45
-
46
- b, c, h, w = q.shape
47
- q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
48
- k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
49
- v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
50
- h_ = nn.functional.scaled_dot_product_attention(q, k, v)
51
-
52
- return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
53
-
54
- def forward(self, x: Tensor) -> Tensor:
55
- return x + self.proj_out(self.attention(x))
56
-
57
-
58
- class ResnetBlock(nn.Module):
59
- def __init__(self, in_channels: int, out_channels: int):
60
- super().__init__()
61
- self.in_channels = in_channels
62
- out_channels = in_channels if out_channels is None else out_channels
63
- self.out_channels = out_channels
64
-
65
- self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
66
- self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
67
- self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
68
- self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
69
- if self.in_channels != self.out_channels:
70
- self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
71
-
72
- def forward(self, x):
73
- h = x
74
- h = self.norm1(h)
75
- h = swish(h)
76
- h = self.conv1(h)
77
-
78
- h = self.norm2(h)
79
- h = swish(h)
80
- h = self.conv2(h)
81
-
82
- if self.in_channels != self.out_channels:
83
- x = self.nin_shortcut(x)
84
-
85
- return x + h
86
-
87
-
88
- class Downsample(nn.Module):
89
- def __init__(self, in_channels: int):
90
- super().__init__()
91
- # no asymmetric padding in torch conv, must do it ourselves
92
- self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
93
-
94
- def forward(self, x: Tensor):
95
- pad = (0, 1, 0, 1)
96
- x = nn.functional.pad(x, pad, mode="constant", value=0)
97
- x = self.conv(x)
98
- return x
99
-
100
-
101
- class Upsample(nn.Module):
102
- def __init__(self, in_channels: int):
103
- super().__init__()
104
- self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
105
-
106
- def forward(self, x: Tensor):
107
- x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
108
- x = self.conv(x)
109
- return x
110
-
111
-
112
- class Encoder(nn.Module):
113
- def __init__(
114
- self,
115
- resolution: int,
116
- in_channels: int,
117
- ch: int,
118
- ch_mult: list[int],
119
- num_res_blocks: int,
120
- z_channels: int,
121
- ):
122
- super().__init__()
123
- self.ch = ch
124
- self.num_resolutions = len(ch_mult)
125
- self.num_res_blocks = num_res_blocks
126
- self.resolution = resolution
127
- self.in_channels = in_channels
128
- # downsampling
129
- self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
130
-
131
- curr_res = resolution
132
- in_ch_mult = (1, *tuple(ch_mult))
133
- self.in_ch_mult = in_ch_mult
134
- self.down = nn.ModuleList()
135
- block_in = self.ch
136
- for i_level in range(self.num_resolutions):
137
- block = nn.ModuleList()
138
- attn = nn.ModuleList()
139
- block_in = ch * in_ch_mult[i_level]
140
- block_out = ch * ch_mult[i_level]
141
- for _ in range(self.num_res_blocks):
142
- block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
143
- block_in = block_out
144
- down = nn.Module()
145
- down.block = block
146
- down.attn = attn
147
- if i_level != self.num_resolutions - 1:
148
- down.downsample = Downsample(block_in)
149
- curr_res = curr_res // 2
150
- self.down.append(down)
151
-
152
- # middle
153
- self.mid = nn.Module()
154
- self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
155
- self.mid.attn_1 = AttnBlock(block_in)
156
- self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
157
-
158
- # end
159
- self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
160
- self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
161
-
162
- def forward(self, x: Tensor) -> Tensor:
163
- # downsampling
164
- hs = [self.conv_in(x)]
165
- for i_level in range(self.num_resolutions):
166
- for i_block in range(self.num_res_blocks):
167
- h = self.down[i_level].block[i_block](hs[-1])
168
- if len(self.down[i_level].attn) > 0:
169
- h = self.down[i_level].attn[i_block](h)
170
- hs.append(h)
171
- if i_level != self.num_resolutions - 1:
172
- hs.append(self.down[i_level].downsample(hs[-1]))
173
-
174
- # middle
175
- h = hs[-1]
176
- h = self.mid.block_1(h)
177
- h = self.mid.attn_1(h)
178
- h = self.mid.block_2(h)
179
- # end
180
- h = self.norm_out(h)
181
- h = swish(h)
182
- h = self.conv_out(h)
183
- return h
184
-
185
-
186
- class Decoder(nn.Module):
187
- def __init__(
188
- self,
189
- ch: int,
190
- out_ch: int,
191
- ch_mult: list[int],
192
- num_res_blocks: int,
193
- in_channels: int,
194
- resolution: int,
195
- z_channels: int,
196
- ):
197
- super().__init__()
198
- self.ch = ch
199
- self.num_resolutions = len(ch_mult)
200
- self.num_res_blocks = num_res_blocks
201
- self.resolution = resolution
202
- self.in_channels = in_channels
203
- self.ffactor = 2 ** (self.num_resolutions - 1)
204
-
205
- # compute in_ch_mult, block_in and curr_res at lowest res
206
- block_in = ch * ch_mult[self.num_resolutions - 1]
207
- curr_res = resolution // 2 ** (self.num_resolutions - 1)
208
- self.z_shape = (1, z_channels, curr_res, curr_res)
209
-
210
- # z to block_in
211
- self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
212
-
213
- # middle
214
- self.mid = nn.Module()
215
- self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
216
- self.mid.attn_1 = AttnBlock(block_in)
217
- self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
218
-
219
- # upsampling
220
- self.up = nn.ModuleList()
221
- for i_level in reversed(range(self.num_resolutions)):
222
- block = nn.ModuleList()
223
- attn = nn.ModuleList()
224
- block_out = ch * ch_mult[i_level]
225
- for _ in range(self.num_res_blocks + 1):
226
- block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
227
- block_in = block_out
228
- up = nn.Module()
229
- up.block = block
230
- up.attn = attn
231
- if i_level != 0:
232
- up.upsample = Upsample(block_in)
233
- curr_res = curr_res * 2
234
- self.up.insert(0, up) # prepend to get consistent order
235
-
236
- # end
237
- self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
238
- self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
239
-
240
- def forward(self, z: Tensor) -> Tensor:
241
- # z to block_in
242
- h = self.conv_in(z)
243
-
244
- # middle
245
- h = self.mid.block_1(h)
246
- h = self.mid.attn_1(h)
247
- h = self.mid.block_2(h)
248
-
249
- # upsampling
250
- for i_level in reversed(range(self.num_resolutions)):
251
- for i_block in range(self.num_res_blocks + 1):
252
- h = self.up[i_level].block[i_block](h)
253
- if len(self.up[i_level].attn) > 0:
254
- h = self.up[i_level].attn[i_block](h)
255
- if i_level != 0:
256
- h = self.up[i_level].upsample(h)
257
-
258
- # end
259
- h = self.norm_out(h)
260
- h = swish(h)
261
- h = self.conv_out(h)
262
- return h
263
-
264
-
265
- class DiagonalGaussian(nn.Module):
266
- def __init__(self, sample: bool = True, chunk_dim: int = 1):
267
- super().__init__()
268
- self.sample = sample
269
- self.chunk_dim = chunk_dim
270
-
271
- def forward(self, z: Tensor) -> Tensor:
272
- mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
273
- if self.sample:
274
- std = torch.exp(0.5 * logvar)
275
- return mean + std * torch.randn_like(mean)
276
- else:
277
- return mean
278
-
279
-
280
- class AutoEncoder(nn.Module):
281
- def __init__(
282
- self,
283
- resolution: int,
284
- in_channels: int,
285
- ch: int,
286
- out_ch: int,
287
- ch_mult: list[int],
288
- num_res_blocks: int,
289
- z_channels: int,
290
- scale_factor: float,
291
- shift_factor: float,
292
- ):
293
- super().__init__()
294
- self.encoder = Encoder(
295
- resolution=resolution,
296
- in_channels=in_channels,
297
- ch=ch,
298
- ch_mult=ch_mult,
299
- num_res_blocks=num_res_blocks,
300
- z_channels=z_channels,
301
- )
302
- self.decoder = Decoder(
303
- resolution=resolution,
304
- in_channels=in_channels,
305
- ch=ch,
306
- out_ch=out_ch,
307
- ch_mult=ch_mult,
308
- num_res_blocks=num_res_blocks,
309
- z_channels=z_channels,
310
- )
311
- self.reg = DiagonalGaussian()
312
-
313
- self.scale_factor = scale_factor
314
- self.shift_factor = shift_factor
315
-
316
- def encode(self, x: Tensor) -> Tensor:
317
- z = self.reg(self.encoder(x))
318
- z = self.scale_factor * (z - self.shift_factor)
319
- return z
320
-
321
- def decode(self, z: Tensor) -> Tensor:
322
- z = z / self.scale_factor + self.shift_factor
323
- return self.decoder(z)
324
-
325
- def forward(self, x: Tensor) -> Tensor:
326
- return self.decode(self.encode(x))
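
For reference, a rough round-trip sketch of the AutoEncoder above; the resolution, channel multipliers, and the neutral scale/shift factors are assumptions for illustration, not the values shipped with this Space.

    import torch

    ae = AutoEncoder(
        resolution=256, in_channels=3, ch=128, out_ch=3,
        ch_mult=[1, 2, 4, 4], num_res_blocks=2, z_channels=16,
        scale_factor=1.0, shift_factor=0.0,
    )
    x = torch.randn(1, 3, 256, 256)
    z = ae.encode(x)       # (1, 16, 32, 32): spatial size divided by 2 ** (len(ch_mult) - 1)
    x_rec = ae.decode(z)   # (1, 3, 256, 256)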
 
modules/conditioner.py DELETED
@@ -1,216 +0,0 @@
1
- import torch
2
- from qwen_vl_utils import process_vision_info
3
- from transformers import (
4
- AutoProcessor,
5
- Qwen2VLForConditionalGeneration,
6
- Qwen2_5_VLForConditionalGeneration,
7
- )
8
- from torchvision.transforms import ToPILImage
9
-
10
- to_pil = ToPILImage()
11
-
12
- Qwen25VL_7b_PREFIX = '''Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:
13
- - If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.
14
- - If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.\n
15
- Here are examples of how to transform or refine prompts:
16
- - User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.
17
- - User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.\n
18
- Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:
19
- User Prompt:'''
20
-
21
-
22
- def split_string(s):
23
- # replace Chinese quotation marks with English quotes
24
- s = s.replace("“", '"').replace("”", '"') # use english quotes
25
- result = []
26
- # flag: whether we are currently inside quotes
27
- in_quotes = False
28
- temp = ""
29
-
30
- # iterate over every character in the string and its index
31
- for idx, char in enumerate(s):
32
- # if the character is a quote and the index is greater than 155
33
- if char == '"' and idx > 155:
34
- # append the quote to the temporary string
35
- temp += char
36
- # if we are not inside quotes
37
- if not in_quotes:
38
- # append the temporary string to the result list
39
- result.append(temp)
40
- # clear the temporary string
41
- temp = ""
42
-
43
- # toggle the in-quotes state
44
- in_quotes = not in_quotes
45
- continue
46
- # if we are inside quotes
47
- if in_quotes:
48
- # if the character is whitespace
49
- if char.isspace():
50
- pass # have space token
51
-
52
- # wrap the character in Chinese quotes and append it to the result list
53
- result.append("“" + char + "”")
54
- else:
55
- # append the character to the temporary string
56
- temp += char
57
-
58
- # if the temporary string is not empty
59
- if temp:
60
- # append the temporary string to the result list
61
- result.append(temp)
62
-
63
- return result
64
-
65
-
66
- class Qwen25VL_7b_Embedder(torch.nn.Module):
67
- def __init__(self, model_path, max_length=640, dtype=torch.bfloat16, device="cuda"):
68
- super(Qwen25VL_7b_Embedder, self).__init__()
69
- self.max_length = max_length
70
- self.dtype = dtype
71
- self.device = device
72
-
73
- self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
74
- model_path,
75
- torch_dtype=dtype,
76
- attn_implementation="eager",
77
- ).to(torch.cuda.current_device())
78
-
79
- self.model.requires_grad_(False)
80
- self.processor = AutoProcessor.from_pretrained(
81
- model_path, min_pixels=256 * 28 * 28, max_pixels=324 * 28 * 28
82
- )
83
-
84
- self.prefix = Qwen25VL_7b_PREFIX
85
-
86
- def forward(self, caption, ref_images):
87
- text_list = caption
88
- embs = torch.zeros(
89
- len(text_list),
90
- self.max_length,
91
- self.model.config.hidden_size,
92
- dtype=torch.bfloat16,
93
- device=torch.cuda.current_device(),
94
- )
95
- hidden_states = torch.zeros(
96
- len(text_list),
97
- self.max_length,
98
- self.model.config.hidden_size,
99
- dtype=torch.bfloat16,
100
- device=torch.cuda.current_device(),
101
- )
102
- masks = torch.zeros(
103
- len(text_list),
104
- self.max_length,
105
- dtype=torch.long,
106
- device=torch.cuda.current_device(),
107
- )
108
- input_ids_list = []
109
- attention_mask_list = []
110
- emb_list = []
111
-
112
- def split_string(s):
113
- s = s.replace("“", '"').replace("”", '"').replace("'", '''"''') # use english quotes
114
- result = []
115
- in_quotes = False
116
- temp = ""
117
-
118
- for idx,char in enumerate(s):
119
- if char == '"' and idx>155:
120
- temp += char
121
- if not in_quotes:
122
- result.append(temp)
123
- temp = ""
124
-
125
- in_quotes = not in_quotes
126
- continue
127
- if in_quotes:
128
- if char.isspace():
129
- pass # have space token
130
-
131
- result.append("“" + char + "”")
132
- else:
133
- temp += char
134
-
135
- if temp:
136
- result.append(temp)
137
-
138
- return result
139
-
140
- for idx, (txt, imgs) in enumerate(zip(text_list, ref_images)):
141
-
142
- messages = [{"role": "user", "content": []}]
143
-
144
- messages[0]["content"].append({"type": "text", "text": f"{self.prefix}"})
145
-
146
- messages[0]["content"].append({"type": "image", "image": to_pil(imgs)})
147
-
148
- # then add the text
149
- messages[0]["content"].append({"type": "text", "text": f"{txt}"})
150
-
151
- # Preparation for inference
152
- text = self.processor.apply_chat_template(
153
- messages, tokenize=False, add_generation_prompt=True, add_vision_id=True
154
- )
155
-
156
- image_inputs, video_inputs = process_vision_info(messages)
157
-
158
- inputs = self.processor(
159
- text=[text],
160
- images=image_inputs,
161
- padding=True,
162
- return_tensors="pt",
163
- )
164
-
165
- old_inputs_ids = inputs.input_ids
166
- text_split_list = split_string(text)
167
-
168
- token_list = []
169
- for text_each in text_split_list:
170
- txt_inputs = self.processor(
171
- text=text_each,
172
- images=None,
173
- videos=None,
174
- padding=True,
175
- return_tensors="pt",
176
- )
177
- token_each = txt_inputs.input_ids
178
- if token_each[0][0] == 2073 and token_each[0][-1] == 854:
179
- token_each = token_each[:, 1:-1]
180
- token_list.append(token_each)
181
- else:
182
- token_list.append(token_each)
183
-
184
- new_txt_ids = torch.cat(token_list, dim=1).to("cuda")
185
-
186
- new_txt_ids = new_txt_ids.to(old_inputs_ids.device)
187
-
188
- idx1 = (old_inputs_ids == 151653).nonzero(as_tuple=True)[1][0]
189
- idx2 = (new_txt_ids == 151653).nonzero(as_tuple=True)[1][0]
190
- inputs.input_ids = (
191
- torch.cat([old_inputs_ids[0, :idx1], new_txt_ids[0, idx2:]], dim=0)
192
- .unsqueeze(0)
193
- .to("cuda")
194
- )
195
- inputs.attention_mask = (inputs.input_ids > 0).long().to("cuda")
196
- outputs = self.model(
197
- input_ids=inputs.input_ids,
198
- attention_mask=inputs.attention_mask,
199
- pixel_values=inputs.pixel_values.to("cuda"),
200
- image_grid_thw=inputs.image_grid_thw.to("cuda"),
201
- output_hidden_states=True,
202
- )
203
-
204
- emb = outputs["hidden_states"][-1]
205
-
206
- embs[idx, : min(self.max_length, emb.shape[1] - 217)] = emb[0, 217:][
207
- : self.max_length
208
- ]
209
-
210
- masks[idx, : min(self.max_length, emb.shape[1] - 217)] = torch.ones(
211
- (min(self.max_length, emb.shape[1] - 217)),
212
- dtype=torch.long,
213
- device=torch.cuda.current_device(),
214
- )
215
-
216
- return embs, masks
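
A hedged interface sketch for the embedder above; the model path, prompt, and image shape are placeholders, and the call assumes a CUDA device since the module pins its buffers to torch.cuda.current_device().

    # embedder = Qwen25VL_7b_Embedder(qwen2vl_model_path, max_length=640)   # path supplied by the caller
    # ref = torch.rand(1, 3, 512, 512)            # batch of reference images in [0, 1], CHW layout
    # embs, masks = embedder(["replace the sky with a sunset"], ref)
    # embs:  (1, 640, hidden_size) prompt embeddings taken from the last hidden states
    # masks: (1, 640) long tensor marking which of the max_length positions are valid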
 
modules/connector_edit.py DELETED
@@ -1,486 +0,0 @@
1
- from typing import Optional
2
-
3
- import torch
4
- import torch.nn
5
- from einops import rearrange
6
- from torch import nn
7
-
8
- from .layers import MLP, TextProjection, TimestepEmbedder, apply_gate, attention
9
-
10
-
11
- class RMSNorm(nn.Module):
12
- def __init__(
13
- self,
14
- dim: int,
15
- elementwise_affine=True,
16
- eps: float = 1e-6,
17
- device=None,
18
- dtype=None,
19
- ):
20
- """
21
- Initialize the RMSNorm normalization layer.
22
-
23
- Args:
24
- dim (int): The dimension of the input tensor.
25
- eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
26
-
27
- Attributes:
28
- eps (float): A small value added to the denominator for numerical stability.
29
- weight (nn.Parameter): Learnable scaling parameter.
30
-
31
- """
32
- factory_kwargs = {"device": device, "dtype": dtype}
33
- super().__init__()
34
- self.eps = eps
35
- if elementwise_affine:
36
- self.weight = nn.Parameter(torch.ones(dim, **factory_kwargs))
37
-
38
- def _norm(self, x):
39
- """
40
- Apply the RMSNorm normalization to the input tensor.
41
-
42
- Args:
43
- x (torch.Tensor): The input tensor.
44
-
45
- Returns:
46
- torch.Tensor: The normalized tensor.
47
-
48
- """
49
- return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
50
-
51
- def forward(self, x):
52
- """
53
- Forward pass through the RMSNorm layer.
54
-
55
- Args:
56
- x (torch.Tensor): The input tensor.
57
-
58
- Returns:
59
- torch.Tensor: The output tensor after applying RMSNorm.
60
-
61
- """
62
- output = self._norm(x.float()).type_as(x)
63
- if hasattr(self, "weight"):
64
- output = output * self.weight
65
- return output
66
-
67
-
68
- def get_norm_layer(norm_layer):
69
- """
70
- Get the normalization layer.
71
-
72
- Args:
73
- norm_layer (str): The type of normalization layer.
74
-
75
- Returns:
76
- norm_layer (nn.Module): The normalization layer.
77
- """
78
- if norm_layer == "layer":
79
- return nn.LayerNorm
80
- elif norm_layer == "rms":
81
- return RMSNorm
82
- else:
83
- raise NotImplementedError(f"Norm layer {norm_layer} is not implemented")
84
-
85
-
86
- def get_activation_layer(act_type):
87
- """get activation layer
88
-
89
- Args:
90
- act_type (str): the activation type
91
-
92
- Returns:
93
- torch.nn.functional: the activation layer
94
- """
95
- if act_type == "gelu":
96
- return lambda: nn.GELU()
97
- elif act_type == "gelu_tanh":
98
- return lambda: nn.GELU(approximate="tanh")
99
- elif act_type == "relu":
100
- return nn.ReLU
101
- elif act_type == "silu":
102
- return nn.SiLU
103
- else:
104
- raise ValueError(f"Unknown activation type: {act_type}")
105
-
106
- class IndividualTokenRefinerBlock(torch.nn.Module):
107
- def __init__(
108
- self,
109
- hidden_size,
110
- heads_num,
111
- mlp_width_ratio: float = 4.0,
112
- mlp_drop_rate: float = 0.0,
113
- act_type: str = "silu",
114
- qk_norm: bool = False,
115
- qk_norm_type: str = "layer",
116
- qkv_bias: bool = True,
117
- need_CA: bool = False,
118
- dtype: Optional[torch.dtype] = None,
119
- device: Optional[torch.device] = None,
120
- ):
121
- factory_kwargs = {"device": device, "dtype": dtype}
122
- super().__init__()
123
- self.need_CA = need_CA
124
- self.heads_num = heads_num
125
- head_dim = hidden_size // heads_num
126
- mlp_hidden_dim = int(hidden_size * mlp_width_ratio)
127
-
128
- self.norm1 = nn.LayerNorm(
129
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
130
- )
131
- self.self_attn_qkv = nn.Linear(
132
- hidden_size, hidden_size * 3, bias=qkv_bias, **factory_kwargs
133
- )
134
- qk_norm_layer = get_norm_layer(qk_norm_type)
135
- self.self_attn_q_norm = (
136
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
137
- if qk_norm
138
- else nn.Identity()
139
- )
140
- self.self_attn_k_norm = (
141
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
142
- if qk_norm
143
- else nn.Identity()
144
- )
145
- self.self_attn_proj = nn.Linear(
146
- hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
147
- )
148
-
149
- self.norm2 = nn.LayerNorm(
150
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
151
- )
152
- act_layer = get_activation_layer(act_type)
153
- self.mlp = MLP(
154
- in_channels=hidden_size,
155
- hidden_channels=mlp_hidden_dim,
156
- act_layer=act_layer,
157
- drop=mlp_drop_rate,
158
- **factory_kwargs,
159
- )
160
-
161
- self.adaLN_modulation = nn.Sequential(
162
- act_layer(),
163
- nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
164
- )
165
-
166
- if self.need_CA:
167
- self.cross_attnblock=CrossAttnBlock(hidden_size=hidden_size,
168
- heads_num=heads_num,
169
- mlp_width_ratio=mlp_width_ratio,
170
- mlp_drop_rate=mlp_drop_rate,
171
- act_type=act_type,
172
- qk_norm=qk_norm,
173
- qk_norm_type=qk_norm_type,
174
- qkv_bias=qkv_bias,
175
- **factory_kwargs,)
176
- # Zero-initialize the modulation
177
- nn.init.zeros_(self.adaLN_modulation[1].weight)
178
- nn.init.zeros_(self.adaLN_modulation[1].bias)
179
-
180
- def forward(
181
- self,
182
- x: torch.Tensor,
183
- c: torch.Tensor, # timestep_aware_representations + context_aware_representations
184
- attn_mask: torch.Tensor = None,
185
- y: torch.Tensor = None,
186
- ):
187
- gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
188
-
189
- norm_x = self.norm1(x)
190
- qkv = self.self_attn_qkv(norm_x)
191
- q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.heads_num)
192
- # Apply QK-Norm if needed
193
- q = self.self_attn_q_norm(q).to(v)
194
- k = self.self_attn_k_norm(k).to(v)
195
-
196
- # Self-Attention
197
- attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
198
-
199
- x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
200
-
201
- if self.need_CA:
202
- x = self.cross_attnblock(x, c, attn_mask, y)
203
-
204
- # FFN Layer
205
- x = x + apply_gate(self.mlp(self.norm2(x)), gate_mlp)
206
-
207
- return x
208
-
209
-
210
-
211
-
212
- class CrossAttnBlock(torch.nn.Module):
213
- def __init__(
214
- self,
215
- hidden_size,
216
- heads_num,
217
- mlp_width_ratio: float = 4.0,
218
- mlp_drop_rate: float = 0.0,
219
- act_type: str = "silu",
220
- qk_norm: bool = False,
221
- qk_norm_type: str = "layer",
222
- qkv_bias: bool = True,
223
- dtype: Optional[torch.dtype] = None,
224
- device: Optional[torch.device] = None,
225
- ):
226
- factory_kwargs = {"device": device, "dtype": dtype}
227
- super().__init__()
228
- self.heads_num = heads_num
229
- head_dim = hidden_size // heads_num
230
-
231
- self.norm1 = nn.LayerNorm(
232
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
233
- )
234
- self.norm1_2 = nn.LayerNorm(
235
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
236
- )
237
- self.self_attn_q = nn.Linear(
238
- hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
239
- )
240
- self.self_attn_kv = nn.Linear(
241
- hidden_size, hidden_size*2, bias=qkv_bias, **factory_kwargs
242
- )
243
- qk_norm_layer = get_norm_layer(qk_norm_type)
244
- self.self_attn_q_norm = (
245
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
246
- if qk_norm
247
- else nn.Identity()
248
- )
249
- self.self_attn_k_norm = (
250
- qk_norm_layer(head_dim, elementwise_affine=True, eps=1e-6, **factory_kwargs)
251
- if qk_norm
252
- else nn.Identity()
253
- )
254
- self.self_attn_proj = nn.Linear(
255
- hidden_size, hidden_size, bias=qkv_bias, **factory_kwargs
256
- )
257
-
258
- self.norm2 = nn.LayerNorm(
259
- hidden_size, elementwise_affine=True, eps=1e-6, **factory_kwargs
260
- )
261
- act_layer = get_activation_layer(act_type)
262
-
263
- self.adaLN_modulation = nn.Sequential(
264
- act_layer(),
265
- nn.Linear(hidden_size, 2 * hidden_size, bias=True, **factory_kwargs),
266
- )
267
- # Zero-initialize the modulation
268
- nn.init.zeros_(self.adaLN_modulation[1].weight)
269
- nn.init.zeros_(self.adaLN_modulation[1].bias)
270
-
271
- def forward(
272
- self,
273
- x: torch.Tensor,
274
- c: torch.Tensor, # timestep_aware_representations + context_aware_representations
275
- attn_mask: torch.Tensor = None,
276
- y: torch.Tensor=None,
277
-
278
- ):
279
- gate_msa, gate_mlp = self.adaLN_modulation(c).chunk(2, dim=1)
280
-
281
- norm_x = self.norm1(x)
282
- norm_y = self.norm1_2(y)
283
- q = self.self_attn_q(norm_x)
284
- q = rearrange(q, "B L (H D) -> B L H D", H=self.heads_num)
285
- kv = self.self_attn_kv(norm_y)
286
- k, v = rearrange(kv, "B L (K H D) -> K B L H D", K=2, H=self.heads_num)
287
- # Apply QK-Norm if needed
288
- q = self.self_attn_q_norm(q).to(v)
289
- k = self.self_attn_k_norm(k).to(v)
290
-
291
- # Self-Attention
292
- attn = attention(q, k, v, mode="torch", attn_mask=attn_mask)
293
-
294
- x = x + apply_gate(self.self_attn_proj(attn), gate_msa)
295
-
296
- return x
297
-
298
-
299
-
300
- class IndividualTokenRefiner(torch.nn.Module):
301
- def __init__(
302
- self,
303
- hidden_size,
304
- heads_num,
305
- depth,
306
- mlp_width_ratio: float = 4.0,
307
- mlp_drop_rate: float = 0.0,
308
- act_type: str = "silu",
309
- qk_norm: bool = False,
310
- qk_norm_type: str = "layer",
311
- qkv_bias: bool = True,
312
- need_CA:bool=False,
313
- dtype: Optional[torch.dtype] = None,
314
- device: Optional[torch.device] = None,
315
- ):
316
-
317
- factory_kwargs = {"device": device, "dtype": dtype}
318
- super().__init__()
319
- self.need_CA = need_CA
320
- self.blocks = nn.ModuleList(
321
- [
322
- IndividualTokenRefinerBlock(
323
- hidden_size=hidden_size,
324
- heads_num=heads_num,
325
- mlp_width_ratio=mlp_width_ratio,
326
- mlp_drop_rate=mlp_drop_rate,
327
- act_type=act_type,
328
- qk_norm=qk_norm,
329
- qk_norm_type=qk_norm_type,
330
- qkv_bias=qkv_bias,
331
- need_CA=self.need_CA,
332
- **factory_kwargs,
333
- )
334
- for _ in range(depth)
335
- ]
336
- )
337
-
338
-
339
- def forward(
340
- self,
341
- x: torch.Tensor,
342
- c: torch.LongTensor,
343
- mask: Optional[torch.Tensor] = None,
344
- y:torch.Tensor=None,
345
- ):
346
- self_attn_mask = None
347
- if mask is not None:
348
- batch_size = mask.shape[0]
349
- seq_len = mask.shape[1]
350
- mask = mask.to(x.device)
351
- # batch_size x 1 x seq_len x seq_len
352
- self_attn_mask_1 = mask.view(batch_size, 1, 1, seq_len).repeat(
353
- 1, 1, seq_len, 1
354
- )
355
- # batch_size x 1 x seq_len x seq_len
356
- self_attn_mask_2 = self_attn_mask_1.transpose(2, 3)
357
- # batch_size x 1 x seq_len x seq_len, 1 for broadcasting of heads_num
358
- self_attn_mask = (self_attn_mask_1 & self_attn_mask_2).bool()
359
- # avoids self-attention weight being NaN for padding tokens
360
- self_attn_mask[:, :, :, 0] = True
361
-
362
-
363
- for block in self.blocks:
364
- x = block(x, c, self_attn_mask,y)
365
-
366
- return x
367
-
368
-
369
- class SingleTokenRefiner(torch.nn.Module):
370
- """
371
- A single token refiner block for llm text embedding refine.
372
- """
373
- def __init__(
374
- self,
375
- in_channels,
376
- hidden_size,
377
- heads_num,
378
- depth,
379
- mlp_width_ratio: float = 4.0,
380
- mlp_drop_rate: float = 0.0,
381
- act_type: str = "silu",
382
- qk_norm: bool = False,
383
- qk_norm_type: str = "layer",
384
- qkv_bias: bool = True,
385
- need_CA:bool=False,
386
- attn_mode: str = "torch",
387
- dtype: Optional[torch.dtype] = None,
388
- device: Optional[torch.device] = None,
389
- ):
390
- factory_kwargs = {"device": device, "dtype": dtype}
391
- super().__init__()
392
- self.attn_mode = attn_mode
393
- self.need_CA = need_CA
394
- assert self.attn_mode == "torch", "Only support 'torch' mode for token refiner."
395
-
396
- self.input_embedder = nn.Linear(
397
- in_channels, hidden_size, bias=True, **factory_kwargs
398
- )
399
- if self.need_CA:
400
- self.input_embedder_CA = nn.Linear(
401
- in_channels, hidden_size, bias=True, **factory_kwargs
402
- )
403
-
404
- act_layer = get_activation_layer(act_type)
405
- # Build timestep embedding layer
406
- self.t_embedder = TimestepEmbedder(hidden_size, act_layer, **factory_kwargs)
407
- # Build context embedding layer
408
- self.c_embedder = TextProjection(
409
- in_channels, hidden_size, act_layer, **factory_kwargs
410
- )
411
-
412
- self.individual_token_refiner = IndividualTokenRefiner(
413
- hidden_size=hidden_size,
414
- heads_num=heads_num,
415
- depth=depth,
416
- mlp_width_ratio=mlp_width_ratio,
417
- mlp_drop_rate=mlp_drop_rate,
418
- act_type=act_type,
419
- qk_norm=qk_norm,
420
- qk_norm_type=qk_norm_type,
421
- qkv_bias=qkv_bias,
422
- need_CA=need_CA,
423
- **factory_kwargs,
424
- )
425
-
426
- def forward(
427
- self,
428
- x: torch.Tensor,
429
- t: torch.LongTensor,
430
- mask: Optional[torch.LongTensor] = None,
431
- y: torch.LongTensor=None,
432
- ):
433
- timestep_aware_representations = self.t_embedder(t)
434
-
435
- if mask is None:
436
- context_aware_representations = x.mean(dim=1)
437
- else:
438
- mask_float = mask.unsqueeze(-1) # [b, s1, 1]
439
- context_aware_representations = (x * mask_float).sum(
440
- dim=1
441
- ) / mask_float.sum(dim=1)
442
- context_aware_representations = self.c_embedder(context_aware_representations)
443
- c = timestep_aware_representations + context_aware_representations
444
-
445
- x = self.input_embedder(x)
446
- if self.need_CA:
447
- y = self.input_embedder_CA(y)
448
- x = self.individual_token_refiner(x, c, mask, y)
449
- else:
450
- x = self.individual_token_refiner(x, c, mask)
451
-
452
- return x
453
-
454
-
455
-
456
- class Qwen2Connector(torch.nn.Module):
457
- def __init__(
458
- self,
459
- # biclip_dim=1024,
460
- in_channels=3584,
461
- hidden_size=4096,
462
- heads_num=32,
463
- depth=2,
464
- need_CA=False,
465
- device=None,
466
- dtype=torch.bfloat16,
467
- ):
468
- super().__init__()
469
- factory_kwargs = {"device": device, "dtype":dtype}
470
-
471
- self.S =SingleTokenRefiner(in_channels=in_channels,hidden_size=hidden_size,heads_num=heads_num,depth=depth,need_CA=need_CA,**factory_kwargs)
472
- self.global_proj_out=nn.Linear(in_channels,768)
473
-
474
- self.scale_factor = nn.Parameter(torch.zeros(1))
475
- with torch.no_grad():
476
- self.scale_factor.data += -(1 - 0.09)
477
-
478
- def forward(self, x,t,mask):
479
- mask_float = mask.unsqueeze(-1) # [b, s1, 1]
480
- x_mean = (x * mask_float).sum(
481
- dim=1
482
- ) / mask_float.sum(dim=1) * (1 + self.scale_factor)
483
-
484
- global_out=self.global_proj_out(x_mean)
485
- encoder_hidden_states = self.S(x,t,mask)
486
- return encoder_hidden_states,global_out
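
A shape-level sketch of the Qwen2Connector interface above, using its default construction arguments; the batch and sequence sizes are assumptions.

    # connector = Qwen2Connector(in_channels=3584, hidden_size=4096, heads_num=32, depth=2)
    # x:    (B, L, 3584)  last hidden states from the Qwen2.5-VL encoder
    # t:    (B,)          diffusion timestep per sample
    # mask: (B, L)        1 for valid tokens, 0 for padding
    # tokens, global_vec = connector(x, t, mask)   # tokens: (B, L, 4096), global_vec: (B, 768)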
 
modules/layers.py DELETED
@@ -1,640 +0,0 @@
1
- # Modified from Flux
2
- #
3
- # Copyright 2024 Black Forest Labs
4
-
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
-
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
-
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
- # This source code is licensed under the license found in the
18
- # LICENSE file in the root directory of this source tree.
19
-
20
- import math # noqa: I001
21
- from dataclasses import dataclass
22
- from functools import partial
23
-
24
- import torch
25
- import torch.nn.functional as F
26
- from einops import rearrange
27
- # from liger_kernel.ops.rms_norm import LigerRMSNormFunction
28
- from torch import Tensor, nn
29
-
30
-
31
- try:
32
- import flash_attn
33
- from flash_attn.flash_attn_interface import (
34
- _flash_attn_forward,
35
- flash_attn_varlen_func,
36
- )
37
- except ImportError:
38
- flash_attn = None
39
- flash_attn_varlen_func = None
40
- _flash_attn_forward = None
41
-
42
-
43
- MEMORY_LAYOUT = {
44
- "flash": (
45
- lambda x: x.view(x.shape[0] * x.shape[1], *x.shape[2:]),
46
- lambda x: x,
47
- ),
48
- "torch": (
49
- lambda x: x.transpose(1, 2),
50
- lambda x: x.transpose(1, 2),
51
- ),
52
- "vanilla": (
53
- lambda x: x.transpose(1, 2),
54
- lambda x: x.transpose(1, 2),
55
- ),
56
- }
57
-
58
-
59
- def attention(
60
- q,
61
- k,
62
- v,
63
- mode="torch",
64
- drop_rate=0,
65
- attn_mask=None,
66
- causal=False,
67
- cu_seqlens_q=None,
68
- cu_seqlens_kv=None,
69
- max_seqlen_q=None,
70
- max_seqlen_kv=None,
71
- batch_size=1,
72
- ):
73
- """
74
- Perform QKV self attention.
75
-
76
- Args:
77
- q (torch.Tensor): Query tensor with shape [b, s, a, d], where a is the number of heads.
78
- k (torch.Tensor): Key tensor with shape [b, s1, a, d]
79
- v (torch.Tensor): Value tensor with shape [b, s1, a, d]
80
- mode (str): Attention mode. Choose from 'self_flash', 'cross_flash', 'torch', and 'vanilla'.
81
- drop_rate (float): Dropout rate in attention map. (default: 0)
82
- attn_mask (torch.Tensor): Attention mask with shape [b, s1] (cross_attn), or [b, a, s, s1] (torch or vanilla).
83
- (default: None)
84
- causal (bool): Whether to use causal attention. (default: False)
85
- cu_seqlens_q (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
86
- used to index into q.
87
- cu_seqlens_kv (torch.Tensor): dtype torch.int32. The cumulative sequence lengths of the sequences in the batch,
88
- used to index into kv.
89
- max_seqlen_q (int): The maximum sequence length in the batch of q.
90
- max_seqlen_kv (int): The maximum sequence length in the batch of k and v.
91
-
92
- Returns:
93
- torch.Tensor: Output tensor after self attention with shape [b, s, ad]
94
- """
95
- pre_attn_layout, post_attn_layout = MEMORY_LAYOUT[mode]
96
- q = pre_attn_layout(q)
97
- k = pre_attn_layout(k)
98
- v = pre_attn_layout(v)
99
-
100
- if mode == "torch":
101
- if attn_mask is not None and attn_mask.dtype != torch.bool:
102
- attn_mask = attn_mask.to(q.dtype)
103
- x = F.scaled_dot_product_attention(
104
- q, k, v, attn_mask=attn_mask, dropout_p=drop_rate, is_causal=causal
105
- )
106
- elif mode == "flash":
107
- assert flash_attn_varlen_func is not None
108
- x: torch.Tensor = flash_attn_varlen_func(
109
- q,
110
- k,
111
- v,
112
- cu_seqlens_q,
113
- cu_seqlens_kv,
114
- max_seqlen_q,
115
- max_seqlen_kv,
116
- ) # type: ignore
117
- # x with shape [(bxs), a, d]
118
- x = x.view(batch_size, max_seqlen_q, x.shape[-2], x.shape[-1]) # type: ignore # reshape x to [b, s, a, d]
119
- elif mode == "vanilla":
120
- scale_factor = 1 / math.sqrt(q.size(-1))
121
-
122
- b, a, s, _ = q.shape
123
- s1 = k.size(2)
124
- attn_bias = torch.zeros(b, a, s, s1, dtype=q.dtype, device=q.device)
125
- if causal:
126
- # Only applied to self attention
127
- assert attn_mask is None, (
128
- "Causal mask and attn_mask cannot be used together"
129
- )
130
- temp_mask = torch.ones(b, a, s, s, dtype=torch.bool, device=q.device).tril(
131
- diagonal=0
132
- )
133
- attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
134
- attn_bias = attn_bias.to(q.dtype)
135
-
136
- if attn_mask is not None:
137
- if attn_mask.dtype == torch.bool:
138
- attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
139
- else:
140
- attn_bias += attn_mask
141
-
142
- # TODO: Maybe force q and k to be float32 to avoid numerical overflow
143
- attn = (q @ k.transpose(-2, -1)) * scale_factor
144
- attn += attn_bias
145
- attn = attn.softmax(dim=-1)
146
- attn = torch.dropout(attn, p=drop_rate, train=True)
147
- x = attn @ v
148
- else:
149
- raise NotImplementedError(f"Unsupported attention mode: {mode}")
150
-
151
- x = post_attn_layout(x)
152
- b, s, a, d = x.shape
153
- out = x.reshape(b, s, -1)
154
- return out
155
-
156
-
157
- def apply_gate(x, gate=None, tanh=False):
158
- """Apply an optional gating tensor to the input.
159
-
160
- Args:
161
- x (torch.Tensor): input tensor.
162
- gate (torch.Tensor, optional): gate tensor. Defaults to None.
163
- tanh (bool, optional): whether to use tanh function. Defaults to False.
164
-
165
- Returns:
166
- torch.Tensor: the output tensor after applying the gate.
167
- """
168
- if gate is None:
169
- return x
170
- if tanh:
171
- return x * gate.unsqueeze(1).tanh()
172
- else:
173
- return x * gate.unsqueeze(1)
174
-
175
-
176
- class MLP(nn.Module):
177
- """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
178
-
179
- def __init__(
180
- self,
181
- in_channels,
182
- hidden_channels=None,
183
- out_features=None,
184
- act_layer=nn.GELU,
185
- norm_layer=None,
186
- bias=True,
187
- drop=0.0,
188
- use_conv=False,
189
- device=None,
190
- dtype=None,
191
- ):
192
- super().__init__()
193
- out_features = out_features or in_channels
194
- hidden_channels = hidden_channels or in_channels
195
- bias = (bias, bias)
196
- drop_probs = (drop, drop)
197
- linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
198
-
199
- self.fc1 = linear_layer(
200
- in_channels, hidden_channels, bias=bias[0], device=device, dtype=dtype
201
- )
202
- self.act = act_layer()
203
- self.drop1 = nn.Dropout(drop_probs[0])
204
- self.norm = (
205
- norm_layer(hidden_channels, device=device, dtype=dtype)
206
- if norm_layer is not None
207
- else nn.Identity()
208
- )
209
- self.fc2 = linear_layer(
210
- hidden_channels, out_features, bias=bias[1], device=device, dtype=dtype
211
- )
212
- self.drop2 = nn.Dropout(drop_probs[1])
213
-
214
- def forward(self, x):
215
- x = self.fc1(x)
216
- x = self.act(x)
217
- x = self.drop1(x)
218
- x = self.norm(x)
219
- x = self.fc2(x)
220
- x = self.drop2(x)
221
- return x
222
-
223
-
224
- class TextProjection(nn.Module):
225
- """
226
- Projects text embeddings. Also handles dropout for classifier-free guidance.
227
-
228
- Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
229
- """
230
-
231
- def __init__(self, in_channels, hidden_size, act_layer, dtype=None, device=None):
232
- factory_kwargs = {"dtype": dtype, "device": device}
233
- super().__init__()
234
- self.linear_1 = nn.Linear(
235
- in_features=in_channels,
236
- out_features=hidden_size,
237
- bias=True,
238
- **factory_kwargs,
239
- )
240
- self.act_1 = act_layer()
241
- self.linear_2 = nn.Linear(
242
- in_features=hidden_size,
243
- out_features=hidden_size,
244
- bias=True,
245
- **factory_kwargs,
246
- )
247
-
248
- def forward(self, caption):
249
- hidden_states = self.linear_1(caption)
250
- hidden_states = self.act_1(hidden_states)
251
- hidden_states = self.linear_2(hidden_states)
252
- return hidden_states
253
-
254
-
255
- class TimestepEmbedder(nn.Module):
256
- """
257
- Embeds scalar timesteps into vector representations.
258
- """
259
-
260
- def __init__(
261
- self,
262
- hidden_size,
263
- act_layer,
264
- frequency_embedding_size=256,
265
- max_period=10000,
266
- out_size=None,
267
- dtype=None,
268
- device=None,
269
- ):
270
- factory_kwargs = {"dtype": dtype, "device": device}
271
- super().__init__()
272
- self.frequency_embedding_size = frequency_embedding_size
273
- self.max_period = max_period
274
- if out_size is None:
275
- out_size = hidden_size
276
-
277
- self.mlp = nn.Sequential(
278
- nn.Linear(
279
- frequency_embedding_size, hidden_size, bias=True, **factory_kwargs
280
- ),
281
- act_layer(),
282
- nn.Linear(hidden_size, out_size, bias=True, **factory_kwargs),
283
- )
284
- nn.init.normal_(self.mlp[0].weight, std=0.02) # type: ignore
285
- nn.init.normal_(self.mlp[2].weight, std=0.02) # type: ignore
286
-
287
- @staticmethod
288
- def timestep_embedding(t, dim, max_period=10000):
289
- """
290
- Create sinusoidal timestep embeddings.
291
-
292
- Args:
293
- t (torch.Tensor): a 1-D Tensor of N indices, one per batch element. These may be fractional.
294
- dim (int): the dimension of the output.
295
- max_period (int): controls the minimum frequency of the embeddings.
296
-
297
- Returns:
298
- embedding (torch.Tensor): An (N, D) Tensor of positional embeddings.
299
-
300
- .. ref_link: https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
301
- """
302
- half = dim // 2
303
- freqs = torch.exp(
304
- -math.log(max_period)
305
- * torch.arange(start=0, end=half, dtype=torch.float32)
306
- / half
307
- ).to(device=t.device)
308
- args = t[:, None].float() * freqs[None]
309
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
310
- if dim % 2:
311
- embedding = torch.cat(
312
- [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
313
- )
314
- return embedding
315
-
316
- def forward(self, t):
317
- t_freq = self.timestep_embedding(
318
- t, self.frequency_embedding_size, self.max_period
319
- ).type(self.mlp[0].weight.dtype) # type: ignore
320
- t_emb = self.mlp(t_freq)
321
- return t_emb
322
-
323
-
324
- class EmbedND(nn.Module):
325
- def __init__(self, dim: int, theta: int, axes_dim: list[int]):
326
- super().__init__()
327
- self.dim = dim
328
- self.theta = theta
329
- self.axes_dim = axes_dim
330
-
331
- def forward(self, ids: Tensor) -> Tensor:
332
- n_axes = ids.shape[-1]
333
- emb = torch.cat(
334
- [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
335
- dim=-3,
336
- )
337
-
338
- return emb.unsqueeze(1)
339
-
340
-
341
- class MLPEmbedder(nn.Module):
342
- def __init__(self, in_dim: int, hidden_dim: int):
343
- super().__init__()
344
- self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
345
- self.silu = nn.SiLU()
346
- self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
347
-
348
- def forward(self, x: Tensor) -> Tensor:
349
- return self.out_layer(self.silu(self.in_layer(x)))
350
-
351
-
352
- def rope(pos, dim: int, theta: int):
353
- assert dim % 2 == 0
354
- scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
355
- omega = 1.0 / (theta**scale)
356
- out = torch.einsum("...n,d->...nd", pos, omega)
357
- out = torch.stack(
358
- [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1
359
- )
360
- out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
361
- return out.float()
362
-
363
-
364
- def attention_after_rope(q, k, v, pe):
365
- q, k = apply_rope(q, k, pe)
366
-
367
- from .attention import attention
368
-
369
- x = attention(q, k, v, mode="torch")
370
- return x
371
-
372
-
373
- @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
374
- def apply_rope(xq, xk, freqs_cis):
375
- # swap the num_heads and seq_len dimensions back to the order the original function expects
376
- xq = xq.transpose(1, 2) # [batch, num_heads, seq_len, head_dim]
377
- xk = xk.transpose(1, 2)
378
-
379
- # split head_dim into complex components (real and imaginary parts)
380
- xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
381
- xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
382
-
383
- # apply the rotary position embedding (complex multiplication)
384
- xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
385
- xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
386
-
387
- # restore the tensor shape and transpose back to the target dimension order
388
- xq_out = xq_out.reshape(*xq.shape).type_as(xq).transpose(1, 2)
389
- xk_out = xk_out.reshape(*xk.shape).type_as(xk).transpose(1, 2)
390
-
391
- return xq_out, xk_out
392
-
393
-
394
- @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
395
- def scale_add_residual(
396
- x: torch.Tensor, scale: torch.Tensor, residual: torch.Tensor
397
- ) -> torch.Tensor:
398
- return x * scale + residual
399
-
400
-
401
- @torch.compile(mode="max-autotune-no-cudagraphs", dynamic=True)
402
- def layernorm_and_scale_shift(
403
- x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor
404
- ) -> torch.Tensor:
405
- return torch.nn.functional.layer_norm(x, (x.size(-1),)) * (scale + 1) + shift
406
-
407
-
408
- class SelfAttention(nn.Module):
409
- def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
410
- super().__init__()
411
- self.num_heads = num_heads
412
- head_dim = dim // num_heads
413
-
414
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
415
- self.norm = QKNorm(head_dim)
416
- self.proj = nn.Linear(dim, dim)
417
-
418
- def forward(self, x: Tensor, pe: Tensor) -> Tensor:
419
- qkv = self.qkv(x)
420
- q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
421
- q, k = self.norm(q, k, v)
422
- x = attention_after_rope(q, k, v, pe=pe)
423
- x = self.proj(x)
424
- return x
425
-
426
-
427
- @dataclass
428
- class ModulationOut:
429
- shift: Tensor
430
- scale: Tensor
431
- gate: Tensor
432
-
433
-
434
- class RMSNorm(torch.nn.Module):
435
- def __init__(self, dim: int):
436
- super().__init__()
437
- self.scale = nn.Parameter(torch.ones(dim))
438
-
439
- # @staticmethod
440
- # def rms_norm_fast(x, weight, eps):
441
- # return LigerRMSNormFunction.apply(
442
- # x,
443
- # weight,
444
- # eps,
445
- # 0.0,
446
- # "gemma",
447
- # True,
448
- # )
449
-
450
- @staticmethod
451
- def rms_norm(x, weight, eps):
452
- x_dtype = x.dtype
453
- x = x.float()
454
- rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
455
- return (x * rrms).to(dtype=x_dtype) * weight
456
-
457
- def forward(self, x: Tensor):
458
- # return self.rms_norm_fast(x, self.scale, 1e-6)
459
- return self.rms_norm(x, self.scale, 1e-6)
460
-
461
-
462
- class QKNorm(torch.nn.Module):
463
- def __init__(self, dim: int):
464
- super().__init__()
465
- self.query_norm = RMSNorm(dim)
466
- self.key_norm = RMSNorm(dim)
467
-
468
- def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
469
- q = self.query_norm(q)
470
- k = self.key_norm(k)
471
- return q.to(v), k.to(v)
472
-
473
-
474
- class Modulation(nn.Module):
475
- def __init__(self, dim: int, double: bool):
476
- super().__init__()
477
- self.is_double = double
478
- self.multiplier = 6 if double else 3
479
- self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
480
-
481
- def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
482
- out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(
483
- self.multiplier, dim=-1
484
- )
485
-
486
- return (
487
- ModulationOut(*out[:3]),
488
- ModulationOut(*out[3:]) if self.is_double else None,
489
- )
490
-
491
-
492
- class DoubleStreamBlock(nn.Module):
493
- def __init__(
494
- self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False
495
- ):
496
- super().__init__()
497
-
498
- mlp_hidden_dim = int(hidden_size * mlp_ratio)
499
- self.num_heads = num_heads
500
- self.hidden_size = hidden_size
501
- self.img_mod = Modulation(hidden_size, double=True)
502
- self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
503
- self.img_attn = SelfAttention(
504
- dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
505
- )
506
-
507
- self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
508
- self.img_mlp = nn.Sequential(
509
- nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
510
- nn.GELU(approximate="tanh"),
511
- nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
512
- )
513
-
514
- self.txt_mod = Modulation(hidden_size, double=True)
515
- self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
516
- self.txt_attn = SelfAttention(
517
- dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
518
- )
519
-
520
- self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
521
- self.txt_mlp = nn.Sequential(
522
- nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
523
- nn.GELU(approximate="tanh"),
524
- nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
525
- )
526
-
527
- def forward(
528
- self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor
529
- ) -> tuple[Tensor, Tensor]:
530
- img_mod1, img_mod2 = self.img_mod(vec)
531
- txt_mod1, txt_mod2 = self.txt_mod(vec)
532
-
533
- # prepare image for attention
534
- img_modulated = self.img_norm1(img)
535
- img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
536
- img_qkv = self.img_attn.qkv(img_modulated)
537
- img_q, img_k, img_v = rearrange(
538
- img_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads
539
- )
540
- img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
541
-
542
- # prepare txt for attention
543
- txt_modulated = self.txt_norm1(txt)
544
- txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
545
- txt_qkv = self.txt_attn.qkv(txt_modulated)
546
- txt_q, txt_k, txt_v = rearrange(
547
- txt_qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads
548
- )
549
- txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
550
-
551
- # run actual attention
552
- q = torch.cat((txt_q, img_q), dim=1)
553
- k = torch.cat((txt_k, img_k), dim=1)
554
- v = torch.cat((txt_v, img_v), dim=1)
555
-
556
- attn = attention_after_rope(q, k, v, pe=pe)
557
- txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
558
-
559
- # calculate the img blocks
560
- img = img + img_mod1.gate * self.img_attn.proj(img_attn)
561
- img_mlp = self.img_mlp(
562
- (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
563
- )
564
- img = scale_add_residual(img_mlp, img_mod2.gate, img)
565
-
566
- # calculate the txt blocks
567
- txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
568
- txt_mlp = self.txt_mlp(
569
- (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
570
- )
571
- txt = scale_add_residual(txt_mlp, txt_mod2.gate, txt)
572
- return img, txt
573
-
574
-
575
- class SingleStreamBlock(nn.Module):
576
- """
577
- A DiT block with parallel linear layers as described in
578
- https://arxiv.org/abs/2302.05442, with an adapted modulation interface.
579
- """
580
-
581
- def __init__(
582
- self,
583
- hidden_size: int,
584
- num_heads: int,
585
- mlp_ratio: float = 4.0,
586
- qk_scale: float | None = None,
587
- ):
588
- super().__init__()
589
- self.hidden_dim = hidden_size
590
- self.num_heads = num_heads
591
- head_dim = hidden_size // num_heads
592
- self.scale = qk_scale or head_dim**-0.5
593
-
594
- self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
595
- # qkv and mlp_in
596
- self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
597
- # proj and mlp_out
598
- self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
599
-
600
- self.norm = QKNorm(head_dim)
601
-
602
- self.hidden_size = hidden_size
603
- self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
604
-
605
- self.mlp_act = nn.GELU(approximate="tanh")
606
- self.modulation = Modulation(hidden_size, double=False)
607
-
608
- def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
609
- mod, _ = self.modulation(vec)
610
- x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
611
- qkv, mlp = torch.split(
612
- self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1
613
- )
614
-
615
- q, k, v = rearrange(qkv, "B L (K H D) -> K B L H D", K=3, H=self.num_heads)
616
- q, k = self.norm(q, k, v)
617
-
618
- # compute attention
619
- attn = attention_after_rope(q, k, v, pe=pe)
620
- # compute activation in mlp stream, cat again and run second linear layer
621
- output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
622
- return scale_add_residual(output, mod.gate, x)
623
-
624
-
625
- class LastLayer(nn.Module):
626
- def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
627
- super().__init__()
628
- self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
629
- self.linear = nn.Linear(
630
- hidden_size, patch_size * patch_size * out_channels, bias=True
631
- )
632
- self.adaLN_modulation = nn.Sequential(
633
- nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
634
- )
635
-
636
- def forward(self, x: Tensor, vec: Tensor) -> Tensor:
637
- shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
638
- x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
639
- x = self.linear(x)
640
- return x
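
Note on the deleted helpers above: the snippet below is not part of the repository. It is a minimal, self-contained sketch (toy tensor sizes, a single positional axis, and the "torch" attention path are assumptions chosen for illustration) of how rope(), apply_rope() and the memory-layout handling in this file fit together.

import torch
import torch.nn.functional as F
from einops import rearrange

def rope(pos, dim, theta):
    # same construction as the deleted rope(): per-position 2x2 rotation matrices
    scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
    omega = 1.0 / (theta ** scale)
    out = torch.einsum("...n,d->...nd", pos, omega)
    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
    return rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2).float()

b, seq, heads, head_dim = 1, 16, 4, 32                      # toy sizes (assumption)
q, k, v = (torch.randn(b, seq, heads, head_dim) for _ in range(3))

pos = torch.arange(seq, dtype=torch.float64)[None]          # [b, n] positions, single axis
pe = rope(pos, head_dim, theta=10_000).unsqueeze(1)         # [b, 1, n, head_dim/2, 2, 2], as EmbedND would emit for one axis

# apply_rope(): rotate each (even, odd) channel pair of q and k as a complex number
xq = q.transpose(1, 2).float().reshape(b, heads, seq, -1, 1, 2)
xk = k.transpose(1, 2).float().reshape(b, heads, seq, -1, 1, 2)
q_rot = (pe[..., 0] * xq[..., 0] + pe[..., 1] * xq[..., 1]).reshape(b, heads, seq, head_dim).transpose(1, 2)
k_rot = (pe[..., 0] * xk[..., 0] + pe[..., 1] * xk[..., 1]).reshape(b, heads, seq, head_dim).transpose(1, 2)

# "torch" mode from MEMORY_LAYOUT: [b, s, a, d] -> [b, a, s, d] for SDPA, then back to [b, s, a*d]
out = F.scaled_dot_product_attention(q_rot.transpose(1, 2), k_rot.transpose(1, 2), v.transpose(1, 2))
out = out.transpose(1, 2).reshape(b, seq, heads * head_dim)
print(out.shape)                                            # torch.Size([1, 16, 128])
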
 
modules/model_edit.py DELETED
@@ -1,143 +0,0 @@
1
- import math
2
- from dataclasses import dataclass
3
-
4
- import numpy as np
5
- import torch
6
- from torch import Tensor, nn
7
-
8
- from .connector_edit import Qwen2Connector
9
- from .layers import DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder, SingleStreamBlock
10
-
11
-
12
- @dataclass
13
- class Step1XParams:
14
- in_channels: int
15
- out_channels: int
16
- vec_in_dim: int
17
- context_in_dim: int
18
- hidden_size: int
19
- mlp_ratio: float
20
- num_heads: int
21
- depth: int
22
- depth_single_blocks: int
23
- axes_dim: list[int]
24
- theta: int
25
- qkv_bias: bool
26
-
27
-
28
- class Step1XEdit(nn.Module):
29
- """
30
- Transformer model for flow matching on sequences.
31
- """
32
-
33
- def __init__(self, params: Step1XParams):
34
- super().__init__()
35
-
36
- self.params = params
37
- self.in_channels = params.in_channels
38
- self.out_channels = params.out_channels
39
- if params.hidden_size % params.num_heads != 0:
40
- raise ValueError(
41
- f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
42
- )
43
- pe_dim = params.hidden_size // params.num_heads
44
- if sum(params.axes_dim) != pe_dim:
45
- raise ValueError(
46
- f"Got {params.axes_dim} but expected positional dim {pe_dim}"
47
- )
48
- self.hidden_size = params.hidden_size
49
- self.num_heads = params.num_heads
50
- self.pe_embedder = EmbedND(
51
- dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim
52
- )
53
- self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
54
- self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
55
- self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
56
- self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
57
-
58
- self.double_blocks = nn.ModuleList(
59
- [
60
- DoubleStreamBlock(
61
- self.hidden_size,
62
- self.num_heads,
63
- mlp_ratio=params.mlp_ratio,
64
- qkv_bias=params.qkv_bias,
65
- )
66
- for _ in range(params.depth)
67
- ]
68
- )
69
-
70
- self.single_blocks = nn.ModuleList(
71
- [
72
- SingleStreamBlock(
73
- self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio
74
- )
75
- for _ in range(params.depth_single_blocks)
76
- ]
77
- )
78
-
79
- self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
80
-
81
- self.connector = Qwen2Connector()
82
-
83
- @staticmethod
84
- def timestep_embedding(
85
- t: Tensor, dim, max_period=10000, time_factor: float = 1000.0
86
- ):
87
- """
88
- Create sinusoidal timestep embeddings.
89
- :param t: a 1-D Tensor of N indices, one per batch element.
90
- These may be fractional.
91
- :param dim: the dimension of the output.
92
- :param max_period: controls the minimum frequency of the embeddings.
93
- :return: an (N, D) Tensor of positional embeddings.
94
- """
95
- t = time_factor * t
96
- half = dim // 2
97
- freqs = torch.exp(
98
- -math.log(max_period)
99
- * torch.arange(start=0, end=half, dtype=torch.float32)
100
- / half
101
- ).to(t.device)
102
-
103
- args = t[:, None].float() * freqs[None]
104
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
105
- if dim % 2:
106
- embedding = torch.cat(
107
- [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
108
- )
109
- if torch.is_floating_point(t):
110
- embedding = embedding.to(t)
111
- return embedding
112
-
113
- def forward(
114
- self,
115
- img: Tensor,
116
- img_ids: Tensor,
117
- txt: Tensor,
118
- txt_ids: Tensor,
119
- timesteps: Tensor,
120
- y: Tensor,
121
- ) -> Tensor:
122
- if img.ndim != 3 or txt.ndim != 3:
123
- raise ValueError("Input img and txt tensors must have 3 dimensions.")
124
-
125
- img = self.img_in(img)
126
- vec = self.time_in(self.timestep_embedding(timesteps, 256))
127
-
128
- vec = vec + self.vector_in(y)
129
- txt = self.txt_in(txt)
130
-
131
- ids = torch.cat((txt_ids, img_ids), dim=1)
132
- pe = self.pe_embedder(ids)
133
-
134
- for block in self.double_blocks:
135
- img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
136
-
137
- img = torch.cat((txt, img), 1)
138
- for block in self.single_blocks:
139
- img = block(img, vec=vec, pe=pe)
140
- img = img[:, txt.shape[1] :, ...]
141
-
142
- img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
143
- return img
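
A side note on the removed forward pass: timesteps enter the model through a 256-dimensional sinusoidal embedding before time_in. The standalone sketch below is not taken from the repository; it reproduces that embedding with illustrative timestep values (the [0, 1] flow-matching range is an assumption, time_factor=1000.0 is the default shown above) so the removed conditioning path is easy to inspect.

import math
import torch

def timestep_embedding(t, dim, max_period=10000, time_factor=1000.0):
    # mirrors Step1XEdit.timestep_embedding above: cos/sin sinusoids over half the channels each
    t = time_factor * t
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32) / half).to(t.device)
    args = t[:, None].float() * freqs[None]
    emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=-1)
    return emb

timesteps = torch.tensor([0.0, 0.25, 1.0])       # illustrative flow-matching timesteps
emb = timestep_embedding(timesteps, dim=256)     # matches the in_dim=256 of time_in
print(emb.shape)                                 # torch.Size([3, 256])
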
 
no_cookie.png DELETED

Git LFS Details

  • SHA256: 4ee90a1e41774e2dae54ca436874341e750f2c7a6196b8360aee1952e98066f8
  • Pointer size: 131 Bytes
  • Size of remote file: 162 kB
poster.jpg DELETED
Binary file (65.4 kB)
 
poster_orig.jpg DELETED

Git LFS Details

  • SHA256: 92a4178a56e7fefd7dfd418c675c1ab6b6b2e00e17b45a778a1100ab62f9bfba
  • Pointer size: 131 Bytes
  • Size of remote file: 458 kB