Spaces:

lazerkat
/

RandomDiffusion

Sleeping

App Files Files Community

lazerkat committed 9 days ago

Commit

3192df2

verified ·

1 Parent(s): 1748b4f

Update app.py

Browse files

Files changed (1) hide show

app.py +190 -78

app.py CHANGED Viewed

@@ -6,11 +6,137 @@ import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image
 import numpy as np
 # ============================================================================
-# DIFFUSION Model Architecture
 # ============================================================================
 class Diffusion:
     def __init__(self, timesteps=1000, beta_start=1e-4, beta_end=0.02, device='cuda'):
         self.timesteps = timesteps
@@ -20,14 +146,16 @@ class Diffusion:
         self.alpha_bars = torch.cumprod(self.alphas, dim=0)
     @torch.no_grad()
-    def sample(self, model, x, steps=None):
         model.eval()
         if steps is None:
             steps = self.timesteps
         for t in reversed(range(steps)):
             t_batch = torch.full((x.shape[0],), t, device=self.device, dtype=torch.long)
-            predicted_noise = model(x, t_batch)
             alpha = self.alphas[t]
             alpha_bar = self.alpha_bars[t]
@@ -45,103 +173,77 @@ class Diffusion:
         return x
-class UNet(nn.Module):
-    def __init__(self, in_channels=3, out_channels=3):
-        super().__init__()
-        # Encoder
-        self.enc1 = self.conv_block(in_channels, 64)
-        self.enc2 = self.conv_block(64, 128)
-        self.enc3 = self.conv_block(128, 256)
-        # Bottleneck
-        self.bottleneck = self.conv_block(256, 512)
-        # Decoder
-        self.dec3 = self.conv_block(512 + 256, 256)
-        self.dec2 = self.conv_block(256 + 128, 128)
-        self.dec1 = self.conv_block(128 + 64, 64)
-        # Time embedding
-        self.time_embed = nn.Sequential(
-            nn.Linear(1, 128),
-            nn.ReLU(),
-            nn.Linear(128, 128)
-        )
-        self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
-        self.final = nn.Conv2d(64, out_channels, 1)
-        self.pool = nn.MaxPool2d(2)
-    def conv_block(self, in_ch, out_ch):
-        return nn.Sequential(
-            nn.Conv2d(in_ch, out_ch, 3, padding=1),
-            nn.BatchNorm2d(out_ch),
-            nn.ReLU(inplace=True),
-            nn.Conv2d(out_ch, out_ch, 3, padding=1),
-            nn.BatchNorm2d(out_ch),
-            nn.ReLU(inplace=True)
-        )
-    def forward(self, x, t):
-        # Time embedding
-        t_embed = self.time_embed(t.float().unsqueeze(-1))
-        t_embed = t_embed.unsqueeze(-1).unsqueeze(-1)
-        # Encoder
-        e1 = self.enc1(x)
-        e2 = self.enc2(self.pool(e1))
-        e3 = self.enc3(self.pool(e2))
-        # Bottleneck
-        b = self.bottleneck(self.pool(e3))
-        b = b + t_embed.repeat(1, 1, b.shape[2], b.shape[3]) if b.shape[1] == t_embed.shape[1] else b
-        # Decoder
-        d3 = self.dec3(torch.cat([self.up(b), e3], dim=1))
-        d2 = self.dec2(torch.cat([self.up(d3), e2], dim=1))
-        d1 = self.dec1(torch.cat([self.up(d2), e1], dim=1))
-        return self.final(d1)
 # Global variables
 model = None
 device = None
 # Download and load model
 def initialize_model():
-    global model, device
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     model_url = "https://huggingface.co/lazerkat/randomdiffusion/resolve/main/newest.pth"
     model_path = "newest.pth"
-    if not os.path.exists(model_path):
-        urllib.request.urlretrieve(model_url, model_path)
     checkpoint = torch.load(model_path, map_location=device)
-    model = UNet().to(device)
     model.load_state_dict(checkpoint['model_state_dict'])
     model.eval()
-    return "✅ Model loaded successfully!"
 # Generate image
-def generate_image():
-    global model, device
-    if model is None:
         return None
-    diffusion = Diffusion(timesteps=1000, device=device)
     with torch.no_grad():
-        noise = torch.randn(1, 3, 64, 64).to(device)
-        generated = diffusion.sample(model, noise, steps=100)
     # Convert to image
     image = generated.cpu().squeeze(0)
@@ -153,24 +255,34 @@ def generate_image():
     return Image.fromarray(image)
 # Create interface
-with gr.Blocks(title="RandomDiffusion") as demo:
     gr.Markdown("# 🎨 RandomDiffusion")
-    gr.Markdown("Random image generation using diffusion")
     status = gr.Textbox(label="Status", value="Loading model...", interactive=False)
     with gr.Row():
-        generate_btn = gr.Button("Generate Random Image", variant="primary")
     output_image = gr.Image(label="Generated Image", type="pil")
     demo.load(
         lambda: initialize_model(),
         outputs=[status]
     )
     generate_btn.click(
         generate_image,
         outputs=[output_image]
     )

 import torch.nn.functional as F
 from PIL import Image
 import numpy as np
+import json
 # ============================================================================
+# DIFFUSION Model Architecture (copied from the training code)
 # ============================================================================
+class TextEncoder(nn.Module):
+    """Encode a batch of token-id sequences into one vector per sequence.
+
+    Pipeline: embedding lookup -> bidirectional LSTM -> linear projection of
+    the two final hidden states. The output has ``hidden_dim`` features.
+    """
+    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512):
+        super().__init__()
+        # Token id 0 is declared as the padding index of the embedding table.
+        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
+        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
+        # Both directions are concatenated (2 * hidden_dim) and projected back
+        # down to hidden_dim.
+        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
+    def forward(self, x):
+        # assumes x is an integer tensor of token ids, (batch, seq_len) -- TODO confirm
+        embedded = self.embedding(x)
+        # Only the final hidden state is used; per-step outputs and the cell
+        # state are discarded.
+        # NOTE(review): padded positions are fed through the LSTM as-is (no
+        # packing/masking) -- confirm this matches training.
+        lstm_out, (hidden, _) = self.lstm(embedded)
+        # With a single bidirectional layer the last two entries of `hidden`
+        # are the forward and backward directions respectively.
+        hidden_forward = hidden[-2, :, :]
+        hidden_backward = hidden[-1, :, :]
+        combined = torch.cat([hidden_forward, hidden_backward], dim=1)
+        return self.fc(combined)
+class DownBlock(nn.Module):
+    """Encoder stage: two 3x3 conv layers conditioned on time and text, then 2x max-pool.
+
+    ``forward`` returns a pair ``(features, pooled_features)``; the un-pooled
+    features are what the decoder later consumes as a skip connection.
+    """
+    def __init__(self, in_channels, out_channels, time_emb_dim=256, text_emb_dim=512):
+        super().__init__()
+        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, padding=1)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, padding=1)
+        self.norm1 = nn.BatchNorm2d(out_channels)
+        self.norm2 = nn.BatchNorm2d(out_channels)
+        # Project the time embedding to one bias value per output channel.
+        self.time_mlp = nn.Sequential(
+            nn.Linear(time_emb_dim, out_channels), nn.SiLU(),
+            nn.Linear(out_channels, out_channels)
+        )
+        # Project the text embedding to one bias value per output channel.
+        self.text_mlp = nn.Sequential(
+            nn.Linear(text_emb_dim, out_channels), nn.SiLU(),
+            nn.Linear(out_channels, out_channels)
+        )
+        self.pool = nn.MaxPool2d(2)
+    def forward(self, x, t_emb, text_emb):
+        # assumes t_emb is (batch, time_emb_dim) and text_emb is
+        # (batch, text_emb_dim) -- TODO confirm against caller
+        h = self.conv1(x)
+        h = self.norm1(h)
+        # Two trailing singleton dims let the per-channel conditioning
+        # broadcast over the spatial dimensions of h.
+        t = self.time_mlp(t_emb).unsqueeze(-1).unsqueeze(-1)
+        txt = self.text_mlp(text_emb).unsqueeze(-1).unsqueeze(-1)
+        # Conditioning is injected additively, after the first norm and
+        # before the first activation.
+        h = h + t + txt
+        h = F.relu(h)
+        h = self.conv2(h)
+        h = self.norm2(h)
+        h = F.relu(h)
+        # Full-resolution features first, then the 2x-downsampled copy.
+        return h, self.pool(h)
+class UpBlock(nn.Module):
+    """Decoder stage: 2x bilinear upsample, concat with a skip tensor, two conditioned convs.
+
+    ``in_channels`` is the channel count of the tensor being upsampled and
+    ``skip_channels`` that of the skip tensor; the first conv sees their sum.
+    """
+    def __init__(self, in_channels, skip_channels, out_channels, time_emb_dim=256, text_emb_dim=512):
+        super().__init__()
+        self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
+        # Input width is in_channels + skip_channels because forward()
+        # concatenates the upsampled tensor with the skip along dim 1.
+        self.conv1 = nn.Conv2d(in_channels + skip_channels, out_channels, 3, padding=1)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, padding=1)
+        self.norm1 = nn.BatchNorm2d(out_channels)
+        self.norm2 = nn.BatchNorm2d(out_channels)
+        # Project the time embedding to one bias value per output channel.
+        self.time_mlp = nn.Sequential(
+            nn.Linear(time_emb_dim, out_channels), nn.SiLU(),
+            nn.Linear(out_channels, out_channels)
+        )
+        # Project the text embedding to one bias value per output channel.
+        self.text_mlp = nn.Sequential(
+            nn.Linear(text_emb_dim, out_channels), nn.SiLU(),
+            nn.Linear(out_channels, out_channels)
+        )
+    def forward(self, x, skip, t_emb, text_emb):
+        # NOTE(review): after upsampling, x must have the same spatial size
+        # as skip for the concatenation to succeed -- confirm callers
+        # guarantee this.
+        x = self.up(x)
+        x = torch.cat([x, skip], dim=1)
+        h = self.conv1(x)
+        h = self.norm1(h)
+        # Two trailing singleton dims let the per-channel conditioning
+        # broadcast over the spatial dimensions of h.
+        t = self.time_mlp(t_emb).unsqueeze(-1).unsqueeze(-1)
+        txt = self.text_mlp(text_emb).unsqueeze(-1).unsqueeze(-1)
+        h = h + t + txt
+        h = F.relu(h)
+        h = self.conv2(h)
+        h = self.norm2(h)
+        return F.relu(h)
+class DiffusionUNet(nn.Module):
+    """Text- and time-conditioned U-Net with two down stages, a bottleneck and two up stages.
+
+    ``forward(x, timesteps, text_tokens)`` returns a tensor with
+    ``image_channels`` channels produced by a final 1x1 convolution.
+    Submodule attribute names determine the ``state_dict`` keys, so they must
+    stay as they are for existing checkpoints to load.
+    """
+    def __init__(self, vocab_size, image_channels=3, base_channels=64, time_emb_dim=256, text_emb_dim=512):
+        super().__init__()
+        # The text encoder's output width is set to text_emb_dim so it can be
+        # fed directly to every block's text_mlp.
+        self.text_encoder = TextEncoder(vocab_size, embed_dim=256, hidden_dim=text_emb_dim)
+        # The timestep enters as a single scalar feature (Linear(1, ...)) and
+        # is lifted to time_emb_dim by this MLP.
+        self.time_mlp = nn.Sequential(
+            nn.Linear(1, time_emb_dim), nn.SiLU(),
+            nn.Linear(time_emb_dim, time_emb_dim), nn.SiLU(),
+            nn.Linear(time_emb_dim, time_emb_dim)
+        )
+        self.init_conv = nn.Conv2d(image_channels, base_channels, 3, padding=1)
+        self.down1 = DownBlock(base_channels, base_channels, time_emb_dim, text_emb_dim)
+        self.down2 = DownBlock(base_channels, base_channels * 2, time_emb_dim, text_emb_dim)
+        # Bottleneck: same conv/norm/conditioning pattern as the blocks, but
+        # without pooling or upsampling.
+        self.bottleneck_conv1 = nn.Conv2d(base_channels * 2, base_channels * 2, 3, padding=1)
+        self.bottleneck_conv2 = nn.Conv2d(base_channels * 2, base_channels * 2, 3, padding=1)
+        self.bottleneck_norm1 = nn.BatchNorm2d(base_channels * 2)
+        self.bottleneck_norm2 = nn.BatchNorm2d(base_channels * 2)
+        self.bottleneck_time_mlp = nn.Sequential(
+            nn.Linear(time_emb_dim, base_channels * 2), nn.SiLU(),
+            nn.Linear(base_channels * 2, base_channels * 2)
+        )
+        self.bottleneck_text_mlp = nn.Sequential(
+            nn.Linear(text_emb_dim, base_channels * 2), nn.SiLU(),
+            nn.Linear(base_channels * 2, base_channels * 2)
+        )
+        # up1 pairs the bottleneck with down2's skip, up2 pairs up1's output
+        # with down1's skip (see forward()).
+        self.up1 = UpBlock(base_channels * 2, base_channels * 2, base_channels, time_emb_dim, text_emb_dim)
+        self.up2 = UpBlock(base_channels, base_channels, base_channels, time_emb_dim, text_emb_dim)
+        self.out_conv = nn.Conv2d(base_channels, image_channels, 1)
+    def forward(self, x, timesteps, text_tokens):
+        # assumes timesteps is a 1-D tensor with one entry per batch item and
+        # text_tokens is (batch, seq_len) token ids -- TODO confirm
+        # NOTE(review): two 2x pools followed by two 2x upsamples presumably
+        # require the input height/width to be divisible by 4 so the skip
+        # concatenations line up -- confirm.
+        text_emb = self.text_encoder(text_tokens)
+        # Raw timestep value, cast to float and given a trailing feature dim.
+        t_emb = self.time_mlp(timesteps.unsqueeze(-1).float())
+        x1 = self.init_conv(x)
+        # Each down block yields (skip features, pooled features).
+        x2, x2_pooled = self.down1(x1, t_emb, text_emb)
+        x3, x3_pooled = self.down2(x2_pooled, t_emb, text_emb)
+        h = self.bottleneck_conv1(x3_pooled)
+        h = self.bottleneck_norm1(h)
+        # Per-channel conditioning, broadcast over the spatial dimensions.
+        t = self.bottleneck_time_mlp(t_emb).unsqueeze(-1).unsqueeze(-1)
+        txt = self.bottleneck_text_mlp(text_emb).unsqueeze(-1).unsqueeze(-1)
+        h = h + t + txt
+        h = F.relu(h)
+        h = self.bottleneck_conv2(h)
+        h = self.bottleneck_norm2(h)
+        bottleneck = F.relu(h)
+        # Skips are consumed deepest-first: x3 (from down2), then x2 (from down1).
+        d1 = self.up1(bottleneck, x3, t_emb, text_emb)
+        d2 = self.up2(d1, x2, t_emb, text_emb)
+        return self.out_conv(d2)
 class Diffusion:
     def __init__(self, timesteps=1000, beta_start=1e-4, beta_end=0.02, device='cuda'):
         self.timesteps = timesteps
         self.alpha_bars = torch.cumprod(self.alphas, dim=0)
     @torch.no_grad()
+    def sample(self, model, text_tokens, image_size=64, steps=None):
         model.eval()
         if steps is None:
             steps = self.timesteps
+        x = torch.randn(1, 3, image_size, image_size).to(self.device)
         for t in reversed(range(steps)):
             t_batch = torch.full((x.shape[0],), t, device=self.device, dtype=torch.long)
+            predicted_noise = model(x, t_batch, text_tokens)
             alpha = self.alphas[t]
             alpha_bar = self.alpha_bars[t]
         return x
 # Global variables
+# Module-level state filled in by initialize_model(); generate_image()
+# returns None while model or vocab_data is still None.
 model = None
+# torch.device chosen in initialize_model() (CUDA when available, else CPU).
 device = None
+# Dict with the keys 'vocab', 'word_to_idx' and 'vocab_size', read from the checkpoint.
+vocab_data = None
+def download_file(url, filename):
+    """Download with progress tracking"""
+    if not os.path.exists(filename):
+        print(f"Downloading {filename}...")
+        urllib.request.urlretrieve(url, filename)
+        print(f"Downloaded {filename}")
+    else:
+        print(f"{filename} already exists")
 # Download and load model
 def initialize_model():
+    global model, device, vocab_data
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    # Download model and vocab
     model_url = "https://huggingface.co/lazerkat/randomdiffusion/resolve/main/newest.pth"
     model_path = "newest.pth"
+    download_file(model_url, model_path)
+    # Load checkpoint
     checkpoint = torch.load(model_path, map_location=device)
+    # Get vocab info from checkpoint
+    vocab_data = {
+        'vocab': checkpoint['vocab'],
+        'word_to_idx': checkpoint['word_to_idx'],
+        'vocab_size': checkpoint['vocab_size']
+    }
+    # Create model with correct vocab size
+    model = DiffusionUNet(
+        vocab_size=vocab_data['vocab_size'],
+        image_channels=3,
+        base_channels=64
+    ).to(device)
+    # Load state dict
     model.load_state_dict(checkpoint['model_state_dict'])
     model.eval()
+    print(f"Model loaded successfully! Vocab size: {vocab_data['vocab_size']}")
+    return "✅ Model loaded successfully! You can now generate images."
+def tokenize_text(text, max_len=20):
+    """Tokenize text input for the model"""
+    words = [w.strip('.,!?"\'') for w in text.lower().split()]
+    tokens = words[:max_len]
+    indices = [vocab_data['word_to_idx'].get(token, vocab_data['word_to_idx'].get('<UNK>', 1)) for token in tokens]
+    while len(indices) < max_len:
+        indices.append(0)  # PAD token
+    return torch.tensor(indices).unsqueeze(0).to(device)
 # Generate image
+def generate_image(prompt):
+    global model, device, vocab_data
+    if model is None or vocab_data is None:
         return None
+    diffusion = Diffusion(timesteps=500, device=device)  # Use 500 timesteps like training
     with torch.no_grad():
+        text_tokens = tokenize_text(prompt)
+        generated = diffusion.sample(model, text_tokens, image_size=64, steps=500)
     # Convert to image
     image = generated.cpu().squeeze(0)
     return Image.fromarray(image)
 # Create interface
+with gr.Blocks(title="RandomDiffusion Text-to-Image") as demo:
     gr.Markdown("# 🎨 RandomDiffusion")
+    gr.Markdown("Text-to-Image generation using diffusion model")
     status = gr.Textbox(label="Status", value="Loading model...", interactive=False)
     with gr.Row():
+        prompt_input = gr.Textbox(
+            label="Prompt",
+            value="a beautiful landscape",
+            placeholder="Enter your text prompt here..."
+        )
+    with gr.Row():
+        generate_btn = gr.Button("Generate Image", variant="primary")
     output_image = gr.Image(label="Generated Image", type="pil")
+    # Load model on startup
     demo.load(
         lambda: initialize_model(),
         outputs=[status]
     )
+    # Generate on button click
     generate_btn.click(
         generate_image,
+        inputs=[prompt_input],
         outputs=[output_image]
     )