Haowei Chen committed on
Commit
1fdf2ca
·
1 Parent(s): e5245e2

Sketch of text encoder merger

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. pyproject.toml +8 -0
  3. text_encoder_merger.py +90 -0
.gitignore CHANGED
@@ -1 +1,2 @@
1
- *.lock
 
 
1
+ *.lock
2
+ debug.ipynb
pyproject.toml CHANGED
@@ -10,9 +10,17 @@ package-mode = false
10
  python = "^3.12"
11
  transformers = "^4.43.4"
12
  accelerate = "^0.33.0"
 
 
 
 
 
 
 
13
 
14
  [tool.poetry.group.dev.dependencies]
15
  jupyter = "^1.0.0"
 
16
 
17
  [build-system]
18
  requires = ["poetry-core"]
 
10
  python = "^3.12"
11
  transformers = "^4.43.4"
12
  accelerate = "^0.33.0"
13
+ protobuf = "^5.27.3"
14
+ torchvision = "^0.19.0"
15
+ datasets = "^2.21.0"
16
+ safetensors = "^0.4.4"
17
+ evaluate = "^0.4.2"
18
+ diffusers = "^0.30.0"
19
+ torch = "^2.4.0"
20
 
21
  [tool.poetry.group.dev.dependencies]
22
  jupyter = "^1.0.0"
23
+ autopep8 = "^2.3.1"
24
 
25
  [build-system]
26
  requires = ["poetry-core"]
text_encoder_merger.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig, PreTrainedModel
2
+ from torch import nn, tensor, concat
3
+ from diffusers.models.embeddings import get_timestep_embedding
4
+ import torch
5
+
6
class T5DiffusionXLTextEncoderMergerConfig(PretrainedConfig):
    """Configuration for the T5 / SDXL text-encoder merger.

    Holds the dimensions used when fusing T5 sequence embeddings with the
    SDXL text-encoder embeddings; all values are persisted on the config
    instance so they round-trip through `PretrainedConfig` serialization.
    """

    def __init__(
        self,
        num_layers: int = 4,
        dim_timestep_embeds: int = 16,
        seq_len: int = 77,
        channels_sdxl: int = 2048,
        channels_t5: int = 4096,
        channels_pooled: int = 1280,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Store every merger hyperparameter as a plain attribute.
        self.num_layers = num_layers
        self.dim_timestep_embeds = dim_timestep_embeds
        self.seq_len = seq_len
        self.channels_sdxl = channels_sdxl
        self.channels_t5 = channels_t5
        self.channels_pooled = channels_pooled
25
class T5DiffusionXLTextEncoderMerger(PreTrainedModel, nn.Module):
    """Merge T5 and SDXL text-encoder sequence embeddings into a single
    SDXL-shaped embedding.

    The concatenated (T5 + SDXL) embeddings are passed through two MLP
    stages whose activations are modulated FiLM-style (gamma/beta/zeta)
    by the pooled SDXL embedding and by an embedding of the current
    diffusion timestep, with a residual connection onto the SDXL input.
    """

    def __init__(self, config: T5DiffusionXLTextEncoderMergerConfig):
        super().__init__(config)
        # Timestep consumed by forward(); updated via set_timestep().
        self._last_timestep = 0
        channels_concat = config.channels_sdxl + config.channels_t5

        # Stage 1: project the concatenated embeddings, then normalize over
        # (seq_len, channels) without learned affine parameters.
        self.block_forward1 = nn.Sequential(
            nn.Linear(channels_concat, channels_concat),
            nn.LayerNorm([config.seq_len, channels_concat],
                         elementwise_affine=False))

        # Stage 2: (num_layers - 1) Linear+SiLU blocks, then a projection
        # down to the SDXL channel count squashed with Tanh.
        layers = []
        for _ in range(config.num_layers - 1):
            layers.append(nn.Linear(channels_concat, channels_concat))
            layers.append(nn.SiLU())
        layers.append(nn.Linear(channels_concat, config.channels_sdxl))
        layers.append(nn.Tanh())
        self.block_forward2 = nn.Sequential(*layers)

        # Each modulation branch emits, per sequence position, gamma and
        # beta (channels_concat each) plus zeta (channels_sdxl).
        dim_modulation = config.seq_len * (channels_concat * 2 +
                                           config.channels_sdxl)
        self.block_modulate_by_pooled = nn.Sequential(
            nn.Linear(config.channels_pooled, 512, bias=False), nn.SiLU(),
            nn.Linear(512, dim_modulation, bias=False))

        self.block_modulate_by_timestep = nn.Sequential(
            nn.Linear(config.dim_timestep_embeds, 512, bias=False), nn.SiLU(),
            nn.Linear(512, dim_modulation, bias=False))

    def _init_weights(self, module):
        """Initialize Linear layers with N(0, 0.1) weights and zero biases."""
        if isinstance(module, nn.Linear):
            module.weight.normal_(0, 0.1)
            if module.bias is not None:
                module.bias.zero_()

    def forward(self, embeds_t5, embeds_sdxl, pooled_embeds_sdxl):
        """Fuse the three encoder outputs into one SDXL-shaped embedding.

        Args:
            embeds_t5: T5 sequence embeddings; assumed
                (batch, seq_len, channels_t5) — TODO confirm with caller.
            embeds_sdxl: SDXL sequence embeddings; assumed
                (batch, seq_len, channels_sdxl).
            pooled_embeds_sdxl: pooled SDXL embedding,
                (batch, channels_pooled).

        Returns:
            dict with key "output": merged embeddings shaped like
            `embeds_sdxl`, including a residual add of `embeds_sdxl`.
        """
        batch_size = embeds_sdxl.size(0)
        # BUG FIX: the original asserted embeds_sdxl against itself; the
        # check must validate that all *three* inputs share a batch size.
        assert batch_size == embeds_t5.size(0) == pooled_embeds_sdxl.size(0)
        channels_sdxl = self.config.channels_sdxl
        channels_concat = self.config.channels_t5 + channels_sdxl
        seq_len = self.config.seq_len
        # Embed the scalar timestep and broadcast it across the batch.
        # FIX: build the tensor on the inputs' device so the module also
        # works when the model lives on GPU (the original always used CPU).
        timestep_embeds = get_timestep_embedding(
            tensor([self._last_timestep], device=pooled_embeds_sdxl.device),
            embedding_dim=self.config.dim_timestep_embeds).repeat(
                batch_size, 1)
        # Additive combination of the two modulation sources.
        modulation = self.block_modulate_by_timestep(
            timestep_embeds) + self.block_modulate_by_pooled(pooled_embeds_sdxl)
        # Split the flat modulation vector into per-position gamma/beta/zeta.
        gamma, beta, zeta = [
            m.view(batch_size, seq_len, -1) for m in modulation.split([
                seq_len * channels_concat, seq_len * channels_concat,
                seq_len * channels_sdxl
            ], dim=1)
        ]
        output = (gamma + 1) * self.block_forward1(
            concat((embeds_t5, embeds_sdxl), dim=2)) + beta
        output = (zeta + 1) * self.block_forward2(output)
        # Residual connection onto the original SDXL embeddings.
        output += embeds_sdxl
        return {"output": output}

    def set_timestep(self, timestep: int):
        """Record the diffusion timestep used by the modulation branch."""
        self._last_timestep = timestep