Upload 9 files
- .gitattributes +4 -0
- 1.png +3 -0
- 2.png +3 -0
- 3.png +3 -0
- 4.png +3 -0
- models/__init__.py +3 -0
- models/uvit.py +219 -0
- step_1799.safetensors +3 -0
- test_sample.py +81 -0
- train.py +236 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+1.png filter=lfs diff=lfs merge=lfs -text
+2.png filter=lfs diff=lfs merge=lfs -text
+3.png filter=lfs diff=lfs merge=lfs -text
+4.png filter=lfs diff=lfs merge=lfs -text
1.png
ADDED
Git LFS Details

2.png
ADDED
Git LFS Details

3.png
ADDED
Git LFS Details

4.png
ADDED
Git LFS Details
models/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .uvit import AsymmetricResidualUDiT

__all__ = ['AsymmetricResidualUDiT']
models/uvit.py
ADDED
@@ -0,0 +1,219 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

# This architecture was my attempt at the following Simple Diffusion paper, with some modifications:
# https://arxiv.org/pdf/2410.19324v1

# Very similar to GeGLU or SwiGLU: there's a learned gate FN, using arctan as the activation fn.
class xATGLU(nn.Module):
    def __init__(self, input_dim, output_dim, bias=True):
        super().__init__()
        # GATE path | VALUE path
        self.proj = nn.Linear(input_dim, output_dim * 2, bias=bias)
        nn.init.kaiming_normal_(self.proj.weight, nonlinearity='linear')

        self.alpha = nn.Parameter(torch.zeros(1))
        self.half_pi = torch.pi / 2
        self.inv_pi = 1 / torch.pi

    def forward(self, x):
        projected = self.proj(x)
        gate_path, value_path = projected.chunk(2, dim=-1)

        # Apply arctan gating with expanded range via learned alpha -- https://arxiv.org/pdf/2405.20768
        gate = (torch.arctan(gate_path) + self.half_pi) * self.inv_pi
        expanded_gate = gate * (1 + 2 * self.alpha) - self.alpha

        return expanded_gate * value_path  # g(x) × y

class ResBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.norm1 = nn.GroupNorm(32, channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.norm2 = nn.GroupNorm(32, channels)

    def forward(self, x):
        h = self.conv1(F.silu(self.norm1(x)))
        h = self.conv2(F.silu(self.norm2(h)))
        return x + h

class TransformerBlock(nn.Module):
    def __init__(self, channels, num_heads=8):
        super().__init__()
        self.norm1 = nn.LayerNorm(channels)
        self.attn = nn.MultiheadAttention(channels, num_heads)
        self.norm2 = nn.LayerNorm(channels)
        self.mlp = nn.Sequential(
            xATGLU(channels, 4 * channels),
            nn.Linear(4 * channels, channels)
        )

    def forward(self, x):
        # Reshape for attention [B, C, H, W] -> [H*W, B, C]
        b, c, h, w = x.shape
        spatial_size = h * w
        x = x.flatten(2).permute(2, 0, 1)

        # Self attention
        h_attn = self.norm1(x)
        h_attn, _ = self.attn(h_attn, h_attn, h_attn)
        x = x + h_attn

        # MLP
        h_mlp = self.norm2(x)
        h_mlp = self.mlp(h_mlp)
        x = x + h_mlp

        # Reshape back [H*W, B, C] -> [B, C, H, W]
        return x.permute(1, 2, 0).reshape(b, c, h, w)

class LevelBlock(nn.Module):
    def __init__(self, channels, num_blocks, block_type='res'):
        super().__init__()
        self.blocks = nn.ModuleList()
        for _ in range(num_blocks):
            if block_type == 'transformer':
                self.blocks.append(TransformerBlock(channels))
            else:
                self.blocks.append(ResBlock(channels))

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        return x

class AsymmetricResidualUDiT(nn.Module):
    def __init__(self,
                 in_channels=3,                 # Input color channels
                 base_channels=128,             # Initial feature size; dramatically increases the parameter count of the network.
                 patch_size=2,                  # Smaller patches dramatically increase FLOPs and compute expense. Recommend >=4 unless you have real compute.
                 num_levels=3,                  # Feature downsample depth, essentially the U-Net depth -- so we down/upsample three times. Dramatically increases parameters as you increase it.
                 encoder_blocks=3,              # Can be a different number of blocks than decoder_blocks
                 decoder_blocks=7,              # Can be a different number of blocks than encoder_blocks
                 encoder_transformer_thresh=2,  # When to start using transformer blocks instead of res blocks in the encoder. (>=)
                 decoder_transformer_thresh=4,  # When to stop using transformer blocks instead of res blocks in the decoder. (<=)
                 mid_blocks=16                  # Number of middle transformer blocks. Relatively cheap, as this is at the bottom of the U-Net feature bottleneck.
                 ):
        super().__init__()

        # Initial projection from image space
        self.patch_embed = nn.Conv2d(in_channels, base_channels,
                                     kernel_size=patch_size, stride=patch_size)

        # Create encoder levels
        self.encoders = nn.ModuleList()
        curr_channels = base_channels

        for level in range(num_levels):
            # Create the main processing blocks for this level
            use_transformer = level >= encoder_transformer_thresh  # Use transformers for latter levels

            # Encoder blocks -- encoder_blocks
            self.encoders.append(
                LevelBlock(curr_channels, encoder_blocks, use_transformer)
            )

            # Add channel scaling for the next level
            # Doubles the size of the feature space for each step, except for the last level.
            if level < num_levels - 1:
                self.encoders.append(
                    nn.Conv2d(curr_channels, curr_channels * 2, 1)
                )
                curr_channels *= 2

        # Middle transformer blocks -- mid_blocks
        self.middle = nn.ModuleList([
            TransformerBlock(curr_channels) for _ in range(mid_blocks)
        ])

        # Create decoder levels
        self.decoders = nn.ModuleList()

        for level in range(num_levels):
            # Create the main processing blocks for this level
            use_transformer = level <= decoder_transformer_thresh  # Use transformers for early levels (inverse of encoder)

            # Decoder blocks -- decoder_blocks
            self.decoders.append(
                LevelBlock(curr_channels, decoder_blocks, use_transformer)
            )

            # Add channel scaling for the next level
            # Halves the size of the feature space for each step, except for the last level.
            if level < num_levels - 1:
                self.decoders.append(
                    nn.Conv2d(curr_channels, curr_channels // 2, 1)
                )
                curr_channels //= 2

        # Final projection back to image space
        self.final_proj = nn.ConvTranspose2d(base_channels, in_channels,
                                             kernel_size=patch_size, stride=patch_size)

    def downsample(self, x):
        return F.avg_pool2d(x, kernel_size=2)

    def upsample(self, x):
        return F.interpolate(x, scale_factor=2, mode='nearest')

    def forward(self, x, t=None):
        # Start by patch embedding the inputs.
        x = self.patch_embed(x)

        # Track the residual path and features at each spatial level.
        # The paper was very specific about the residual flow path; I tried my best to copy how they described it.

        # *Per resolution, e.g. per num_level resolution block, more or less:
        # f(x) = fu( U(fm(D(h)) - D(h)) + h ) where h = fd(x)
        #
        # Where
        # 1. h = fd(x)      : Encoder path processes input
        # 2. D(h)           : Downsample the encoded features
        # 3. fm(D(h))       : Middle transformer blocks process downsampled features
        # 4. fm(D(h))-D(h)  : Subtract original downsampled features (residual connection)
        # 5. U(...)         : Upsample the processed features
        # 6. ... + h        : Add back original encoder features (skip connection)
        # 7. fu(...)        : Decoder path processes the combined features

        residuals = []
        curr_res = x

        # Encoder path (computing h = fd(x))
        h = x
        for i, blocks in enumerate(self.encoders):
            if isinstance(blocks, LevelBlock):
                h = blocks(h)
            else:
                # Save residual before downsampling
                residuals.append(curr_res)
                # Downsample and update current residual
                h = self.downsample(blocks(h))
                curr_res = h

        # Middle blocks (fm)
        x = h
        for block in self.middle:
            x = block(x)

        # Subtract the residual at this level (D(h))
        x = x - curr_res

        # Decoder path (fu)
        for i, blocks in enumerate(self.decoders):
            if isinstance(blocks, LevelBlock):
                x = blocks(x)
            else:
                # Channel reduction
                x = blocks(x)
                # Upsample
                x = self.upsample(x)
                # Add the residual from the encoder at this level. LIFO: the last residual added is the first we want, since it's this U-shape.
                curr_res = residuals.pop()
                x = x + curr_res

        # Final projection
        x = self.final_proj(x)

        return x
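Not part of the upload, but a minimal forward-pass sketch of the module above may help as a shape check. It assumes a 256x256 RGB input and leaves every constructor argument except patch_size at its default; the t argument is accepted but currently unused by forward.

# Hypothetical shape check for AsymmetricResidualUDiT (not part of the repository).
import torch
from models import AsymmetricResidualUDiT

model = AsymmetricResidualUDiT(patch_size=4).eval()
x = torch.randn(1, 3, 256, 256)   # [B, C, H, W] image batch
t = torch.rand(1, 1, 1, 1)        # timestep input (currently ignored by forward)
with torch.no_grad():
    out = model(x, t)
assert out.shape == x.shape       # patchify/unpatchify round-trips the spatial size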
step_1799.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:74718eb5a40f7e9576182888828dbc717050987f6be58dcc6a28b58e6591f013
size 383841508
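At 4 bytes per weight (the model is trained and saved in fp32, per train.py below), 383841508 bytes works out to roughly 96M parameters.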
test_sample.py
ADDED
@@ -0,0 +1,81 @@
import torch
import torch.nn as nn
import torchvision.utils as vutils
from models import AsymmetricResidualUDiT
from safetensors.torch import load_file
import os
import argparse
from typing import Optional

def load_checkpoint(model: nn.Module, checkpoint_path: str) -> None:
    state_dict = load_file(checkpoint_path)

    # The training was done via torch.compile, which prefixes the model keys with this for whatever reason.
    # Handle compiled model state dict by removing the '_orig_mod.' prefix
    if all(k.startswith('_orig_mod.') for k in state_dict.keys()):
        state_dict = {k[10:]: v for k, v in state_dict.items()}

    model.load_state_dict(state_dict)

def sample(model, n_samples=16, n_steps=50, image_size=256, device="cuda", sigma_min=0.001, dtype=torch.float32):
    with torch.amp.autocast('cuda', dtype=dtype):
        x = torch.randn(n_samples, 3, image_size, image_size, device=device)
        ts = torch.linspace(0, 1, n_steps, device=device)
        dt = 1 / n_steps

        # Forward Euler integration from t=0 to t=1
        with torch.no_grad():
            for i in range(len(ts)):
                t = ts[i]
                t_input = t.repeat(n_samples, 1, 1, 1)

                v_t = model(x, t_input)
                x = x + v_t * dt

        return x.float()

def main():
    parser = argparse.ArgumentParser(description="Generate samples from a trained UDiT model")
    parser.add_argument("checkpoint", type=str, help="Path to the model checkpoint (.safetensors)")
    parser.add_argument("--samples", type=int, default=16, help="Number of samples to generate")
    parser.add_argument("--steps", type=int, default=50, help="Number of sampling steps")
    parser.add_argument("--output", type=str, default="output.png", help="Output filename")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device to run inference on (cuda/cpu)")
    args = parser.parse_args()
    device = args.device

    model = AsymmetricResidualUDiT(
        in_channels=3,
        base_channels=128,
        num_levels=3,
        patch_size=4,
        encoder_blocks=3,
        decoder_blocks=7,
        encoder_transformer_thresh=2,
        decoder_transformer_thresh=4,
        mid_blocks=8
    ).to(device)

    # Load state dict into model
    load_checkpoint(model, args.checkpoint)
    model.eval()

    # Generate samples
    print(f"Generating {args.samples} samples with {args.steps} steps...")
    with torch.no_grad():
        samples = sample(
            model,
            n_samples=args.samples,
            n_steps=args.steps,
            device=args.device,
            dtype=torch.float32
        )

    # Save samples
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
    vutils.save_image(samples, args.output, nrow=4, padding=2)
    print(f"Samples saved to {args.output}")

if __name__ == "__main__":
    main()
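For reference, the script takes the checkpoint path as its only positional argument, e.g. python test_sample.py step_1799.safetensors --samples 16 --steps 50 --output output.png (the flag values shown are the defaults), run from the repository root so the models package resolves. The hard-coded constructor arguments above must match the architecture the checkpoint was trained with.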
train.py
ADDED
@@ -0,0 +1,236 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.utils as vutils
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from schedulefree import AdamWScheduleFree
from torch.utils.tensorboard import SummaryWriter
from safetensors.torch import save_file, load_file
import os, time
from models import AsymmetricResidualUDiT
from torch.cuda.amp import autocast

def preload_dataset(image_size=256, device="cuda"):
    """Preload and cache the entire dataset in GPU memory"""
    print("Loading and preprocessing dataset...")
    #dataset = load_dataset("jiovine/pixel-art-nouns-2k", split="train")
    dataset = load_dataset("reach-vb/pokemon-blip-captions", split="train")

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.Lambda(lambda x: (x * 2) - 1)  # Scale to [-1, 1]
    ])

    all_images = []
    for example in dataset:
        img_tensor = transform(example['image'])
        all_images.append(img_tensor)

    # Stack the entire dataset onto the GPU
    images_tensor = torch.stack(all_images).to(device)
    print(f"Dataset loaded: {images_tensor.shape} ({images_tensor.element_size() * images_tensor.nelement() / 1024/1024:.2f} MB)")

    return TensorDataset(images_tensor)

def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f'Total parameters: {total_params:,} ({total_params/1e6:.2f}M)')

def save_checkpoint(model, optimizer, filename="checkpoint.safetensors"):
    model_state = model.state_dict()
    save_file(model_state, filename)

def load_checkpoint(model, optimizer, filename="checkpoint.safetensors"):
    model_state = load_file(filename)
    model.load_state_dict(model_state)

# https://arxiv.org/abs/2210.02747
class OptimalTransportLinearFlowGenerator():
    def __init__(self, sigma_min=0.001):
        self.sigma_min = sigma_min

    def loss(self, model, x1, device):
        batch_size = x1.shape[0]

        # Sample t uniformly in [0, 1]
        t = torch.rand(batch_size, 1, 1, 1, device=device)

        # Sample noise
        x0 = torch.randn_like(x1)
        x1 = x1

        # Compute OT path interpolation (equation 22)
        sigma_t = 1 - (1 - self.sigma_min) * t
        mu_t = t * x1
        x_t = sigma_t * x0 + mu_t

        # Compute target (equation 23)
        target = x1 - (1 - self.sigma_min) * x0

        v_t = model(x_t, t)
        loss = F.mse_loss(v_t, target)

        return loss

def write_logs(writer, model, loss, batch_idx, epoch, epoch_time, batch_size, lr, log_gradients=True):
    """
    TensorBoard logging

    Args:
        writer: torch.utils.tensorboard.SummaryWriter instance
        model: torch.nn.Module - the model being trained
        loss: float or torch.Tensor - the loss value to log
        batch_idx: int - current batch index
        epoch: int - current epoch
        epoch_time: float - time taken for the epoch
        batch_size: int - current batch size
        lr: float - current learning rate
        log_gradients: bool - whether to log gradient norms
    """
    total_steps = epoch * batch_idx

    writer.add_scalar('Loss/batch', loss, total_steps)
    writer.add_scalar('Time/epoch', epoch_time, epoch)
    writer.add_scalar('Training/batch_size', batch_size, epoch)
    writer.add_scalar('Training/learning_rate', lr, epoch)

    if log_gradients:
        total_norm = 0.0
        for p in model.parameters():
            if p.grad is not None:
                param_norm = p.grad.detach().data.norm(2)
                total_norm += param_norm.item() ** 2
        total_norm = total_norm ** 0.5
        writer.add_scalar('Gradients/total_norm', total_norm, total_steps)

def train_udit_flow(num_epochs=5000, initial_batch_sizes=[8, 16, 32, 64, 128], epoch_batch_drop_at=40, device="cuda", dtype=torch.float32):
    dataset = preload_dataset(device=device)
    temp_loader = DataLoader(dataset, batch_size=initial_batch_sizes[0], shuffle=True)
    first_batch = next(iter(temp_loader))
    image_shape = first_batch[0].shape[1:]

    writer = SummaryWriter('logs/current_run')

    model = AsymmetricResidualUDiT(
        in_channels=3,
        base_channels=128,
        num_levels=3,
        patch_size=4,
        encoder_blocks=3,
        decoder_blocks=7,
        encoder_transformer_thresh=2,
        decoder_transformer_thresh=4,
        mid_blocks=8
    ).to(device).to(dtype)
    model.train()

    count_parameters(model)
    optimizer = AdamWScheduleFree(
        model.parameters(),
        lr=1e-4,
        warmup_steps=100
    )
    optimizer.train()

    current_batch_sizes = initial_batch_sizes.copy()
    next_drop_epoch = epoch_batch_drop_at
    interval_multiplier = 2

    torch.set_float32_matmul_precision('high')
    model = torch.compile(
        model,
        backend='inductor',
        mode='max-autotune',
        fullgraph=True,
    )

    flow_transport = OptimalTransportLinearFlowGenerator(sigma_min=0.001)

    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        total_loss = 0

        # Batch size decay logic
        # Geometric growth: every X*N + (X-1)*N + ... epochs, move to the next batch size in the list.
        if epoch > 0 and epoch == next_drop_epoch and len(current_batch_sizes) > 1:
            current_batch_sizes.pop()
            next_interval = epoch_batch_drop_at * interval_multiplier
            next_drop_epoch += next_interval
            interval_multiplier += 1
            print(f"\nEpoch {epoch}: Reducing batch size to {current_batch_sizes[-1]}")
            print(f"Next drop will occur at epoch {next_drop_epoch} (interval: {next_interval})")

        current_batch_size = current_batch_sizes[-1]
        dataloader = DataLoader(dataset, batch_size=current_batch_size, shuffle=True)
        curr_lr = optimizer.param_groups[0]['lr']

        with torch.amp.autocast('cuda', dtype=dtype):
            for batch_idx, batch in enumerate(dataloader):
                x1 = batch[0]
                batch_size = x1.shape[0]

                loss = flow_transport.loss(model, x1, device)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)

        epoch_time = time.time() - epoch_start_time
        print(f"Epoch {epoch}, Took: {epoch_time:.2f}s, Batch Size: {current_batch_size}, "
              f"Average Loss: {avg_loss:.4f}, Learning Rate: {curr_lr:.6f}")

        write_logs(writer, model, avg_loss, batch_idx, epoch, epoch_time, current_batch_size, curr_lr)
        if (epoch + 1) % 50 == 0:
            with torch.amp.autocast('cuda', dtype=dtype):
                sampling_start_time = time.time()
                samples = sample(model, device=device, dtype=dtype)
                os.makedirs("samples", exist_ok=True)
                vutils.save_image(samples, f"samples/epoch_{epoch}.png", nrow=4, padding=2)

                sample_time = time.time() - sampling_start_time
                print(f"Sampling took: {sample_time:.2f}s")

        if (epoch + 1) % 200 == 0:
            save_checkpoint(model, optimizer, f"step_{epoch}.safetensors")

    return model

def sample(model, n_samples=16, n_steps=50, image_size=256, device="cuda", sigma_min=0.001, dtype=torch.float32):
    with torch.amp.autocast('cuda', dtype=dtype):
        x = torch.randn(n_samples, 3, image_size, image_size, device=device)
        ts = torch.linspace(0, 1, n_steps, device=device)
        dt = 1 / n_steps

        # Forward Euler integration from t=0 to t=1
        with torch.no_grad():
            for i in range(len(ts)):
                t = ts[i]
                t_input = t.repeat(n_samples, 1, 1, 1)

                v_t = model(x, t_input)

                x = x + v_t * dt

        return x.float()

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    model = train_udit_flow(
        device=device,
        initial_batch_sizes=[8, 16],
        epoch_batch_drop_at=600,
        dtype=torch.float32
    )

    print("Training complete! Samples saved in 'samples' directory")
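For reference, the objective implemented by OptimalTransportLinearFlowGenerator.loss above is the conditional optimal-transport flow-matching loss of the linked paper (its equations 22 and 23): with noise x0 ~ N(0, I), data x1, and t ~ U[0, 1], the interpolant and regression target are

x_t = (1 - (1 - \sigma_{\min}) t) \, x_0 + t \, x_1, \qquad u_t = x_1 - (1 - \sigma_{\min}) \, x_0,

and the model v(x_t, t) is trained with an MSE loss toward u_t. The sample() function then integrates dx/dt = v(x, t) from t = 0 to t = 1 with forward Euler steps of size 1/n_steps.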