Spaces:

Felixstro-dev
/

SKINAI

Sleeping

App Files Files Community

Felixstro-dev committed 22 days ago

Commit

3722d89

verified ·

1 Parent(s): 6176678

Update app.py

Browse files

Files changed (1) hide show

app.py +215 -182

app.py CHANGED Viewed

@@ -3,13 +3,18 @@ Minecraft Skin Generator – HuggingFace Spaces Demo
 ====================================================
 Lädt model.pt (EMA-Gewichte) aus dem Repo und generiert Skins per Prompt.
 Benötigte Dateien im Space-Repo:
-  app.py          ← diese Datei
-  model.pt        ← dein exportiertes EMA-Modell
   requirements.txt
 """
 import math
-import copy
 import random
 import numpy as np
 import gradio as gr
@@ -18,22 +23,28 @@ import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image
-# ─── Konstanten (müssen exakt mit train_diffusion.py übereinstimmen) ──────────
-IMG_SIZE  = 64
-CHANNELS  = 4
-EMBED_DIM = 256
-T_STEPS   = 500
 BETA_START = 1e-4
 BETA_END   = 0.02
 # ─── Tags (identisch mit Training) ────────────────────────────────────────────
 BEREICHE    = ["head","body","arm_l","arm_r","leg_l","leg_r"]
-FARBEN      = ["orange","red","blue","green","cyan","yellow","pink","purple","black","white","gray","brown","beige"]
 HELL        = ["bright","medium","dark"]
-KLEIDUNG    = ["hoodie","shirt","tshirt","jacket","coat","armor","robe","suit","dress","cape","vest","sweater","uniform","casual","formal","jeans","pants","shorts","skirt"]
-STIL        = ["player_skin","mob_skin","zombie","enderman","skeleton_like","custom","unknown","fantasy","modern","medieval","sci_fi","ninja","pirate","wizard","knight","archer","mage"]
 HAUTTONE    = ["skin_light","skin_medium","skin_dark","skin_pale","skin_tan"]
-ACCESSOIRES = ["hat","helmet","crown","glasses","beard","hair_long","hair_short","wings","tail","horns","mask"]
 ALL_TAGS = []
 for b in BEREICHE:
@@ -55,29 +66,32 @@ PROMPT_KEYWORDS = {
     "red":"red","blue":"blue","green":"green","yellow":"yellow","cyan":"cyan",
     "pink":"pink","purple":"purple","black":"black","white":"white",
     "gray":"gray","grey":"gray","brown":"brown",
-    "hell":"bright","bright":"bright","dunkel":"dark","dark":"dark","mittel":"medium","medium":"medium",
-    "zombie":"zombie","enderman":"enderman","skelett":"skeleton_like","skeleton":"skeleton_like",
-    "armor":"armor","player":"player_skin","custom":"custom",
-    "hoodie":"hoodie","hemd":"shirt","shirt":"shirt",
-    "tshirt":"tshirt","jacke":"jacket","jacket":"jacket",
-    "mantel":"coat","coat":"coat","robe":"robe","anzug":"suit","suit":"suit",
-    "kleid":"dress","dress":"dress","umhang":"cape","cape":"cape",
-    "weste":"vest","vest":"vest","pullover":"sweater","sweater":"sweater",
-    "uniform":"uniform","casual":"casual","formal":"formal",
-    "jeans":"jeans","hose":"pants","pants":"pants","shorts":"shorts","skirt":"skirt",
     "fantasy":"fantasy","modern":"modern","medieval":"medieval",
     "scifi":"sci_fi","ninja":"ninja","pirate":"pirate",
     "wizard":"wizard","knight":"knight","archer":"archer","mage":"mage",
     "pale":"skin_pale","tan":"skin_tan",
-    "hat":"hat","helmet":"helmet","crown":"crown",
-    "glasses":"glasses","beard":"beard",
-    "wings":"wings","horns":"horns","mask":"mask",
 }
 _COLOR_BODY_PARTS = {
-    "hoodie":["body","arm_l","arm_r"],"shirt":["body"],"tshirt":["body"],
-    "jacket":["body","arm_l","arm_r"],"coat":["body","arm_l","arm_r"],
-    "jeans":["leg_l","leg_r"],"pants":["leg_l","leg_r"],"shorts":["leg_l","leg_r"],"skirt":["leg_l","leg_r"],
-    "default":["head","body","arm_l","arm_r","leg_l","leg_r"],
 }
 def parse_prompt(prompt: str) -> list:
@@ -88,7 +102,8 @@ def parse_prompt(prompt: str) -> list:
         if resolved in FARBEN:
             pending_color = resolved
             if pending_garment is None:
-                for b in _COLOR_BODY_PARTS["default"]: tags.add(f"{b}_{resolved}")
         elif resolved in KLEIDUNG:
             pending_garment = resolved
             tags.add(resolved)
@@ -111,15 +126,22 @@ def tags_to_vector(tags: list) -> torch.Tensor:
         if t in TAG2IDX: vec[TAG2IDX[t]] = 1.0
     return vec
-# ─── UV-Masken ────────────────────────────────────────────────────────────────
 SKIN_REGIONS = {
-    "head":(0,0,32,16),"body":(16,16,40,32),"arm_r":(40,16,56,32),
-    "leg_r":(0,16,16,32),"arm_l":(32,48,48,64),"leg_l":(16,48,32,64),
 }
 OVERLAY_REGIONS = {
-    "head_overlay":(32,0,64,16),"body_overlay":(16,32,40,48),
-    "arm_r_overlay":(40,32,56,48),"leg_r_overlay":(0,32,16,48),
-    "arm_l_overlay":(48,48,64,64),"leg_l_overlay":(0,48,16,64),
 }
 def _build_base_mask(device):
@@ -138,20 +160,22 @@ def force_alpha_mask(img: torch.Tensor) -> torch.Tensor:
     base    = _build_base_mask(img.device)
     overlay = _build_overlay_mask(img.device)
     outside = (1.0 - base - overlay).clamp(0,1)
-    alpha   = (base * torch.ones_like(img[:,3:4])
-               + overlay * img[:,3:4]
-               + outside * torch.full_like(img[:,3:4], -1.0))
     return torch.cat([img[:,:3], alpha], dim=1)
-# ─── UNet (identisch mit Training) ────────────────────────────────────────────
 class SinusoidalPE(nn.Module):
     def __init__(self, dim):
         super().__init__()
         self.dim = dim
     def forward(self, t):
-        half  = self.dim // 2
-        freqs = torch.exp(-math.log(10000) * torch.arange(half, device=t.device) / half)
-        args  = t[:,None].float() * freqs[None]
         return torch.cat([args.sin(), args.cos()], dim=-1)
 class CondEmbed(nn.Module):
@@ -167,33 +191,45 @@ class RMSNorm(nn.Module):
     def __init__(self, num_channels, eps=1e-8):
         super().__init__()
         self.eps   = eps
-        self.scale = nn.Parameter(torch.ones(num_channels,1,1))
     def forward(self, x):
-        return x / (x.pow(2).mean(dim=1,keepdim=True).sqrt() + self.eps) * self.scale
 class ResBlock(nn.Module):
-    def __init__(self, in_ch, out_ch, emb_dim, dropout=0.0):
         super().__init__()
-        self.norm1  = RMSNorm(in_ch)
-        self.conv1  = nn.Conv2d(in_ch,  out_ch, 3, padding=1)
-        self.norm2  = RMSNorm(out_ch)
-        self.conv2  = nn.Conv2d(out_ch, out_ch, 3, padding=1)
-        self.emb_proj = nn.Linear(emb_dim, out_ch*2)
-        self.drop   = nn.Dropout(dropout)
-        self.skip   = nn.Conv2d(in_ch, out_ch, 1) if in_ch != out_ch else nn.Identity()
-    def forward(self, x, emb):
-        h   = self.conv1(F.silu(self.norm1(x)))
-        s,b = self.emb_proj(F.silu(emb)).chunk(2, dim=-1)
-        h   = h * (1 + s[:,:,None,None]) + b[:,:,None,None]
-        h   = self.conv2(self.drop(F.silu(self.norm2(h))))
         return h + self.skip(x)
 class AttentionBlock(nn.Module):
-    def __init__(self, ch, num_heads=4):
         super().__init__()
         self.norm = RMSNorm(ch)
-        self.attn = nn.MultiheadAttention(ch, num_heads, batch_first=True)
         self.proj = nn.Conv2d(ch, ch, 1)
     def forward(self, x):
         B,C,H,W = x.shape
         h = self.norm(x).view(B,C,H*W).permute(0,2,1)
@@ -207,90 +243,107 @@ class UNet(nn.Module):
         time_dim = embed_dim * 2
         cond_dim = embed_dim
         self.time_pe  = SinusoidalPE(embed_dim)
-        self.time_mlp = nn.Sequential(nn.Linear(embed_dim,time_dim), nn.SiLU(), nn.Linear(time_dim,time_dim))
         self.cond_emb = CondEmbed(NUM_TAGS, cond_dim)
-        self.cond_mlp = nn.Linear(cond_dim, time_dim)
         ch = base_ch
-        self.enc_in = nn.Conv2d(channels, ch, 3, padding=1)
-        self.enc1   = ResBlock(ch,   ch,    time_dim, dropout=0.05)
-        self.enc1b  = ResBlock(ch,   ch,    time_dim, dropout=0.05)
-        self.down1  = nn.Conv2d(ch,  ch,   4, stride=2, padding=1)
-        self.enc2   = ResBlock(ch,   ch*2,  time_dim, dropout=0.05)
-        self.enc2b  = ResBlock(ch*2, ch*2,  time_dim, dropout=0.05)
-        self.down2  = nn.Conv2d(ch*2, ch*2, 4, stride=2, padding=1)
-        self.enc3   = ResBlock(ch*2, ch*4,  time_dim, dropout=0.05)
-        self.enc3b  = ResBlock(ch*4, ch*4,  time_dim, dropout=0.05)
-        self.attn3  = AttentionBlock(ch*4)
-        self.down3  = nn.Conv2d(ch*4, ch*4, 4, stride=2, padding=1)
-        self.mid1   = ResBlock(ch*4, ch*4,  time_dim)
-        self.mid_att= AttentionBlock(ch*4)
-        self.mid2   = ResBlock(ch*4, ch*4,  time_dim)
-        self.up3    = nn.ConvTranspose2d(ch*4, ch*4, 4, stride=2, padding=1)
-        self.dec3   = ResBlock(ch*8, ch*4,  time_dim, dropout=0.15)
-        self.dec3b  = ResBlock(ch*4, ch*4,  time_dim, dropout=0.15)
-        self.attn_d3= AttentionBlock(ch*4)
-        self.up2    = nn.ConvTranspose2d(ch*4, ch*2, 4, stride=2, padding=1)
-        self.dec2   = ResBlock(ch*4, ch*2,  time_dim, dropout=0.15)
-        self.dec2b  = ResBlock(ch*2, ch*2,  time_dim, dropout=0.15)
-        self.up1    = nn.ConvTranspose2d(ch*2, ch, 4, stride=2, padding=1)
-        self.dec1   = ResBlock(ch*2, ch,    time_dim)
-        self.dec1b  = ResBlock(ch,   ch,    time_dim)
-        self.out    = nn.Sequential(nn.GroupNorm(min(8,ch),ch), nn.SiLU(), nn.Conv2d(ch,channels,3,padding=1))
     def forward(self, x, t, cond):
         t_emb = self.time_mlp(self.time_pe(t))
-        c_emb = self.cond_mlp(self.cond_emb(cond))
-        emb   = t_emb + c_emb
         h0 = self.enc_in(x)
-        h1 = self.enc1b(self.enc1(h0, emb), emb)
-        h2 = self.enc2b(self.enc2(self.down1(h1), emb), emb)
-        h3 = self.attn3(self.enc3b(self.enc3(self.down2(h2), emb), emb))
-        h  = self.mid2(self.mid_att(self.mid1(self.down3(h3), emb)), emb)
-        h  = self.attn_d3(self.dec3b(self.dec3(torch.cat([self.up3(h), h3], 1), emb), emb))
-        h  = self.dec2b(self.dec2(torch.cat([self.up2(h), h2], 1), emb), emb)
-        h  = self.dec1b(self.dec1(torch.cat([self.up1(h), h1], 1), emb), emb)
         return self.out(h)
-# ─── Diffusion Schedule ───────────────────────────────────────────────────────
 class DiffusionSchedule:
     def __init__(self, T=T_STEPS, device="cpu"):
         self.T      = T
         self.device = device
         steps  = T + 1
         x      = torch.linspace(0, T, steps)
-        alphas = torch.cos(((x/T)+0.008)/1.008*math.pi/2)**2
         alphas = alphas / alphas[0]
-        betas  = (1 - alphas[1:]/alphas[:-1]).clamp(0,0.999)
         self.betas               = betas.to(device)
         self.alphas              = 1.0 - self.betas
         self.alphas_cumprod      = torch.cumprod(self.alphas, dim=0)
-        self.alphas_cumprod_prev = F.pad(self.alphas_cumprod[:-1],(1,0),value=1.0)
-        self.posterior_variance  = (self.betas*(1-self.alphas_cumprod_prev)/(1-self.alphas_cumprod))
     @torch.no_grad()
-    def p_sample(self, model, x, t_idx, cond, guidance_scale):
         t_tensor  = torch.full((x.shape[0],), t_idx, device=self.device, dtype=torch.long)
         null_cond = torch.zeros_like(cond)
-        x2        = torch.cat([x,x])
-        t2        = torch.cat([t_tensor,t_tensor])
-        c2        = torch.cat([cond,null_cond])
-        out       = model(x2,t2,c2)
-        n_cond, n_uncond = out.chunk(2)
-        noise_pred = n_uncond + guidance_scale*(n_cond - n_uncond)
-        alpha     = self.alphas[t_idx]
-        alpha_bar = self.alphas_cumprod[t_idx]
-        beta      = self.betas[t_idx]
-        mean = (1/alpha.sqrt())*(x - beta/(1-alpha_bar).sqrt()*noise_pred)
-        if t_idx > 0:
-            return mean + self.posterior_variance[t_idx].sqrt()*torch.randn_like(x)
-        return mean
     @torch.no_grad()
-    def sample(self, model, cond, n=1, steps=50, guidance_scale=6.0):
         model.eval()
         x = torch.randn(n, CHANNELS, IMG_SIZE, IMG_SIZE, device=self.device)
-        for t_idx in torch.linspace(self.T-1, 0, steps, dtype=torch.long, device=self.device):
-            x = self.p_sample(model, x, t_idx.item(), cond, guidance_scale)
-        return force_alpha_mask(x).clamp(-1,1)
 # ─── Modell laden ─────────────────────────────────────────────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -298,104 +351,84 @@ print(f"Device: {device}")
 ckpt    = torch.load("model.pt", map_location=device, weights_only=False)
 base_ch = ckpt.get("base_ch", 96)
-# base_ch aus Gewichten lesen falls nicht im Checkpoint
-if base_ch is None:
-    for key in ("enc_in.weight", "_orig_mod.enc_in.weight"):
-        if key in ckpt.get("model", ckpt):
-            base_ch = ckpt.get("model", ckpt)[key].shape[0]
-            break
-    base_ch = base_ch or 96
-model = UNet(base_ch=base_ch).to(device)
-sd    = ckpt.get("model", ckpt)
-model.load_state_dict(sd, strict=False)
-model.eval()
-try: torch._dynamo.disable(model)
-except Exception: pass
 schedule = DiffusionSchedule(device=device)
-print(f"Modell geladen: base_ch={base_ch}, {sum(p.numel() for p in model.parameters())/1e6:.1f}M Parameter")
 # ─── Generierungs-Funktion ────────────────────────────────────────────────────
 def generate(prompt, num_skins, steps, guidance_scale, seed, randomize_seed):
     if randomize_seed:
         seed = random.randint(0, 2**31)
     torch.manual_seed(seed)
-    tags = parse_prompt(prompt)
-    tag_str = ", ".join(tags) if tags else "–"
-    cond = tags_to_vector(tags).to(device).unsqueeze(0).expand(num_skins, -1)
     with torch.inference_mode():
-        imgs = schedule.sample(model, cond, n=num_skins, steps=steps, guidance_scale=guidance_scale)
     results = []
     for img_t in imgs:
         arr = ((img_t.cpu().permute(1,2,0).numpy() + 1) * 127.5).clip(0,255).astype(np.uint8)
-        # 8x upscale für bessere Sichtbarkeit (nearest-neighbor – kein Blur)
         pil = Image.fromarray(arr, "RGBA").resize((512, 512), Image.NEAREST)
         results.append(pil)
-    return results, f"Tags erkannt: {tag_str}", seed
 # ─── Gradio UI ────────────────────────────────────────────────────────────────
 EXAMPLES = [
-    ["roter hoodie blaue jeans", 4, 50, 6.0],
-    ["zombie", 4, 50, 7.0],
-    ["wizard fantasy purple", 4, 50, 6.5],
-    ["knight medieval armor", 4, 50, 6.0],
-    ["ninja black dark", 4, 50, 7.0],
-    ["enderman", 2, 50, 6.0],
 ]
-css = """
-#gallery { min-height: 300px; }
-"""
-with gr.Blocks(css=css, title="Minecraft Skin Generator") as demo:
     gr.Markdown("""
 # 🎮 Minecraft Skin Generator
-Generiert 64×64 Minecraft Skins aus einem Text-Prompt. Trainiert mit DDPM auf ~41k Skins.
-**Beispiel-Prompts:** `roter hoodie blaue jeans` · `zombie` · `knight medieval armor` · `wizard fantasy purple`
 """)
     with gr.Row():
         with gr.Column(scale=2):
-            prompt = gr.Text(
-                label="Prompt",
-                placeholder="z.B. roter hoodie blaue jeans",
-                lines=1,
-            )
-            run_btn = gr.Button("Generieren", variant="primary", size="lg")
-            with gr.Accordion("Einstellungen", open=False):
-                num_skins = gr.Slider(label="Anzahl Skins", minimum=1, maximum=8, step=1, value=4)
-                steps     = gr.Slider(label="Diffusion-Schritte", minimum=10, maximum=100, step=5, value=50)
-                guidance  = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=15.0, step=0.5, value=6.0)
-                seed      = gr.Slider(label="Seed", minimum=0, maximum=2**31, step=1, value=42)
                 rand_seed = gr.Checkbox(label="Seed zufällig", value=True)
             tag_info  = gr.Text(label="Erkannte Tags", interactive=False)
             seed_out  = gr.Number(label="Verwendeter Seed", interactive=False)
         with gr.Column(scale=3):
             gallery = gr.Gallery(
-                label="Generierte Skins (512×512 hochskaliert)",
-                elem_id="gallery",
-                columns=4,
-                rows=2,
-                object_fit="contain",
-                height=400,
             )
-    gr.Examples(
-        examples=EXAMPLES,
-        inputs=[prompt, num_skins, steps, guidance],
-        label="Beispiele",
-    )
     gr.on(
         triggers=[run_btn.click, prompt.submit],
         fn=generate,

 ====================================================
 Lädt model.pt (EMA-Gewichte) aus dem Repo und generiert Skins per Prompt.
 Benötigte Dateien im Space-Repo:
+  app.py            ← diese Datei
+  model.pt          ← exportiertes Modell (via Option 6 → "Modell exportieren")
   requirements.txt
+requirements.txt Inhalt:
+  torch
+  gradio
+  Pillow
+  numpy
 """
 import math
 import random
 import numpy as np
 import gradio as gr
 import torch.nn.functional as F
 from PIL import Image
+# ─── Konstanten (MÜSSEN exakt mit train_diffusion.py übereinstimmen) ──────────
+IMG_SIZE   = 64
+CHANNELS   = 4
+EMBED_DIM  = 256
+T_STEPS    = 500
 BETA_START = 1e-4
 BETA_END   = 0.02
 # ─── Tags (identisch mit Training) ────────────────────────────────────────────
 BEREICHE    = ["head","body","arm_l","arm_r","leg_l","leg_r"]
+FARBEN      = ["orange","red","blue","green","cyan","yellow","pink","purple",
+               "black","white","gray","brown","beige"]
 HELL        = ["bright","medium","dark"]
+KLEIDUNG    = ["hoodie","shirt","tshirt","jacket","coat","armor","robe","suit",
+               "dress","cape","vest","sweater","uniform","casual","formal",
+               "jeans","pants","shorts","skirt"]
+STIL        = ["player_skin","mob_skin","zombie","enderman","skeleton_like",
+               "custom","unknown","fantasy","modern","medieval","sci_fi",
+               "ninja","pirate","wizard","knight","archer","mage"]
 HAUTTONE    = ["skin_light","skin_medium","skin_dark","skin_pale","skin_tan"]
+ACCESSOIRES = ["hat","helmet","crown","glasses","beard","hair_long","hair_short",
+               "wings","tail","horns","mask"]
 ALL_TAGS = []
 for b in BEREICHE:
     "red":"red","blue":"blue","green":"green","yellow":"yellow","cyan":"cyan",
     "pink":"pink","purple":"purple","black":"black","white":"white",
     "gray":"gray","grey":"gray","brown":"brown",
+    "hell":"bright","bright":"bright","dunkel":"dark","dark":"dark",
+    "mittel":"medium","medium":"medium",
+    "zombie":"zombie","enderman":"enderman","skelett":"skeleton_like",
+    "skeleton":"skeleton_like","armor":"armor","player":"player_skin","custom":"custom",
+    "hoodie":"hoodie","hemd":"shirt","shirt":"shirt","tshirt":"tshirt",
+    "jacke":"jacket","jacket":"jacket","mantel":"coat","coat":"coat",
+    "robe":"robe","anzug":"suit","suit":"suit","kleid":"dress","dress":"dress",
+    "umhang":"cape","cape":"cape","weste":"vest","vest":"vest",
+    "pullover":"sweater","sweater":"sweater","uniform":"uniform",
+    "casual":"casual","formal":"formal","jeans":"jeans","hose":"pants",
+    "pants":"pants","shorts":"shorts","skirt":"skirt",
     "fantasy":"fantasy","modern":"modern","medieval":"medieval",
     "scifi":"sci_fi","ninja":"ninja","pirate":"pirate",
     "wizard":"wizard","knight":"knight","archer":"archer","mage":"mage",
     "pale":"skin_pale","tan":"skin_tan",
+    "hat":"hat","helmet":"helmet","crown":"crown","glasses":"glasses",
+    "beard":"beard","wings":"wings","horns":"horns","mask":"mask",
 }
 _COLOR_BODY_PARTS = {
+    "hoodie":  ["body","arm_l","arm_r"],
+    "shirt":   ["body"], "tshirt": ["body"],
+    "jacket":  ["body","arm_l","arm_r"],
+    "coat":    ["body","arm_l","arm_r"],
+    "jeans":   ["leg_l","leg_r"], "pants": ["leg_l","leg_r"],
+    "shorts":  ["leg_l","leg_r"], "skirt": ["leg_l","leg_r"],
+    "default": ["head","body","arm_l","arm_r","leg_l","leg_r"],
 }
 def parse_prompt(prompt: str) -> list:
         if resolved in FARBEN:
             pending_color = resolved
             if pending_garment is None:
+                for b in _COLOR_BODY_PARTS["default"]:
+                    tags.add(f"{b}_{resolved}")
         elif resolved in KLEIDUNG:
             pending_garment = resolved
             tags.add(resolved)
         if t in TAG2IDX: vec[TAG2IDX[t]] = 1.0
     return vec
+# ─── UV-Masken (identisch mit Training) ───────────────────────────────────────
 SKIN_REGIONS = {
+    "head":  (0,  0,  32, 16),
+    "body":  (16, 16, 40, 32),
+    "arm_r": (40, 16, 56, 32),
+    "leg_r": (0,  16, 16, 32),
+    "arm_l": (32, 48, 48, 64),
+    "leg_l": (16, 48, 32, 64),
 }
 OVERLAY_REGIONS = {
+    "head_overlay":  (32, 0,  64, 16),
+    "body_overlay":  (16, 32, 40, 48),
+    "arm_r_overlay": (40, 32, 56, 48),
+    "leg_r_overlay": (0,  32, 16, 48),
+    "arm_l_overlay": (48, 48, 64, 64),
+    "leg_l_overlay": (0,  48, 16, 64),
 }
 def _build_base_mask(device):
     base    = _build_base_mask(img.device)
     overlay = _build_overlay_mask(img.device)
     outside = (1.0 - base - overlay).clamp(0,1)
+    alpha   = (base    * torch.ones_like(img[:,3:4])
+             + overlay * img[:,3:4]
+             + outside * torch.full_like(img[:,3:4], -1.0))
     return torch.cat([img[:,:3], alpha], dim=1)
+# ─── Architektur (EXAKT identisch mit train_diffusion.py) ─────────────────────
 class SinusoidalPE(nn.Module):
     """Sinusoidal embedding of timesteps: output is [sin(t*f), cos(t*f)] along the last axis."""
     def __init__(self, dim):
         super().__init__()
         # Total embedding width; forward() emits dim // 2 sine and dim // 2 cosine features.
         self.dim = dim
     def forward(self, t):
         """Embed `t` (indexed as a 1-D batch of timesteps) into 2 * (dim // 2) features."""
         n_freqs = self.dim // 2
         # Geometric frequency ladder from 1 down towards 1/10000.
         scaled = -math.log(10000) * torch.arange(n_freqs, device=t.device)
         freqs = torch.exp(scaled / n_freqs)
         phase = t[:, None].float() * freqs[None]
         return torch.cat([phase.sin(), phase.cos()], dim=-1)
 class CondEmbed(nn.Module):
     def __init__(self, num_channels, eps=1e-8):
         super().__init__()
         self.eps   = eps
+        # WICHTIG: Shape (1, num_channels, 1, 1) – identisch mit Training
+        self.scale = nn.Parameter(torch.ones(1, num_channels, 1, 1))
     def forward(self, x):
+        rms = x.pow(2).mean(dim=1, keepdim=True).add(self.eps).sqrt()
+        return x / rms * self.scale
 class ResBlock(nn.Module):
     """Residual conv block modulated by a time embedding and, optionally, a condition embedding.
     Time and condition use separate projection MLPs; per the original author's note this
     layout has to match the training script so the checkpoint keys line up.
     """
     def __init__(self, in_ch, out_ch, time_dim, dropout=0.1, cond_dim=None):
         super().__init__()
         # Without an explicit cond_dim the condition projection expects time_dim inputs.
         cond_in = cond_dim or time_dim
         self.norm1    = RMSNorm(in_ch)
         self.conv1    = nn.Conv2d(in_ch, out_ch, 3, padding=1)
         self.norm2    = RMSNorm(out_ch)
         self.conv2    = nn.Conv2d(out_ch, out_ch, 3, padding=1)
         # Each projection yields 2 * out_ch values: a scale half and a shift half.
         self.time_mlp = nn.Sequential(nn.SiLU(), nn.Linear(time_dim, out_ch*2))
         self.cond_mlp = nn.Sequential(nn.SiLU(), nn.Linear(cond_in, out_ch*2))
         # 1x1 conv on the residual path only when the channel count changes.
         self.skip     = nn.Conv2d(in_ch, out_ch, 1) if in_ch != out_ch else nn.Identity()
         self.dropout  = nn.Dropout(dropout)
         self.act      = nn.SiLU()
     def forward(self, x, t_emb, c_emb=None):
         """Return conv(x) modulated by `t_emb` (and `c_emb` if given) plus the skip path."""
         hidden = self.conv1(self.act(self.norm1(x)))
         # Broadcast the per-channel modulation over the two spatial axes.
         time_scale, time_shift = self.time_mlp(t_emb)[:, :, None, None].chunk(2, dim=1)
         hidden = self.norm2(hidden) * (1 + time_scale) + time_shift
         if c_emb is not None:
             cond_scale, cond_shift = self.cond_mlp(c_emb)[:, :, None, None].chunk(2, dim=1)
             hidden = hidden * (1 + cond_scale) + cond_shift
         hidden = self.conv2(self.dropout(self.act(hidden)))
         return hidden + self.skip(x)
 class AttentionBlock(nn.Module):
+    def __init__(self, ch, heads=4):
         super().__init__()
         self.norm = RMSNorm(ch)
+        self.attn = nn.MultiheadAttention(ch, heads, batch_first=True)
         self.proj = nn.Conv2d(ch, ch, 1)
+        nn.init.zeros_(self.proj.weight)
+        nn.init.zeros_(self.proj.bias)
     def forward(self, x):
         B,C,H,W = x.shape
         h = self.norm(x).view(B,C,H*W).permute(0,2,1)
         time_dim = embed_dim * 2
         cond_dim = embed_dim
         self.time_pe  = SinusoidalPE(embed_dim)
+        self.time_mlp = nn.Sequential(
+            nn.Linear(embed_dim, time_dim), nn.SiLU(),
+            nn.Linear(time_dim, time_dim),
+        )
         self.cond_emb = CondEmbed(NUM_TAGS, cond_dim)
+        self.cond_mlp = nn.Linear(cond_dim, time_dim)  # ungenutzt in forward, aber im state_dict
         ch = base_ch
+        self.enc_in  = nn.Conv2d(channels, ch, 3, padding=1)
+        self.enc1    = ResBlock(ch,   ch,    time_dim, dropout=0.05, cond_dim=cond_dim)
+        self.enc1b   = ResBlock(ch,   ch,    time_dim, dropout=0.05, cond_dim=cond_dim)
+        self.down1   = nn.Conv2d(ch,  ch,   4, stride=2, padding=1)
+        self.enc2    = ResBlock(ch,   ch*2,  time_dim, dropout=0.05, cond_dim=cond_dim)
+        self.enc2b   = ResBlock(ch*2, ch*2,  time_dim, dropout=0.05, cond_dim=cond_dim)
+        self.down2   = nn.Conv2d(ch*2, ch*2, 4, stride=2, padding=1)
+        self.enc3    = ResBlock(ch*2, ch*4,  time_dim, dropout=0.05, cond_dim=cond_dim)
+        self.enc3b   = ResBlock(ch*4, ch*4,  time_dim, dropout=0.05, cond_dim=cond_dim)
+        self.attn3   = AttentionBlock(ch*4)
+        self.down3   = nn.Conv2d(ch*4, ch*4, 4, stride=2, padding=1)
+        self.mid1    = ResBlock(ch*4, ch*4,  time_dim, cond_dim=cond_dim)
+        self.mid_att = AttentionBlock(ch*4)
+        self.mid2    = ResBlock(ch*4, ch*4,  time_dim, cond_dim=cond_dim)
+        self.up3     = nn.ConvTranspose2d(ch*4, ch*4, 4, stride=2, padding=1)
+        self.dec3    = ResBlock(ch*8, ch*4,  time_dim, dropout=0.15, cond_dim=cond_dim)
+        self.dec3b   = ResBlock(ch*4, ch*4,  time_dim, dropout=0.15, cond_dim=cond_dim)
+        self.attn_d3 = AttentionBlock(ch*4)
+        self.up2     = nn.ConvTranspose2d(ch*4, ch*2, 4, stride=2, padding=1)
+        self.dec2    = ResBlock(ch*4, ch*2,  time_dim, dropout=0.15, cond_dim=cond_dim)
+        self.dec2b   = ResBlock(ch*2, ch*2,  time_dim, dropout=0.15, cond_dim=cond_dim)
+        self.up1     = nn.ConvTranspose2d(ch*2, ch,   4, stride=2, padding=1)
+        self.dec1    = ResBlock(ch*2, ch,    time_dim, cond_dim=cond_dim)
+        self.dec1b   = ResBlock(ch,   ch,    time_dim, cond_dim=cond_dim)
+        self.out     = nn.Sequential(
+            nn.GroupNorm(min(8,ch), ch), nn.SiLU(),
+            nn.Conv2d(ch, channels, 3, padding=1),
+        )
+        nn.init.zeros_(self.out[-1].bias)
     def forward(self, x, t, cond):
         t_emb = self.time_mlp(self.time_pe(t))
+        c_emb = self.cond_emb(cond)  # cond_dim=embed_dim, direkt an ResBlocks
         h0 = self.enc_in(x)
+        h1 = self.enc1b(self.enc1(h0, t_emb, c_emb), t_emb, c_emb)
+        h2 = self.enc2b(self.enc2(self.down1(h1), t_emb, c_emb), t_emb, c_emb)
+        h3 = self.attn3(self.enc3b(self.enc3(self.down2(h2), t_emb, c_emb), t_emb, c_emb))
+        h  = self.mid2(self.mid_att(self.mid1(self.down3(h3), t_emb, c_emb)), t_emb, c_emb)
+        h  = self.attn_d3(self.dec3b(self.dec3(torch.cat([self.up3(h), h3], 1), t_emb, c_emb), t_emb, c_emb))
+        h  = self.dec2b(self.dec2(torch.cat([self.up2(h), h2], 1), t_emb, c_emb), t_emb, c_emb)
+        h  = self.dec1b(self.dec1(torch.cat([self.up1(h), h1], 1), t_emb, c_emb), t_emb, c_emb)
         return self.out(h)
+# ─── Diffusion Schedule (EXAKT identisch mit train_diffusion.py) ───────────────
 class DiffusionSchedule:
     """Cosine noise schedule plus a DDIM sampler with classifier-free guidance.
     The original author notes the schedule must match train_diffusion.py exactly.
     """
     def __init__(self, T=T_STEPS, device="cpu"):
         """Precompute betas, alphas and cumulative alpha products for `T` timesteps."""
         self.T      = T
         self.device = device
         # T + 1 grid points give T betas from the ratios of consecutive curve values.
         steps  = T + 1
         x      = torch.linspace(0, T, steps)
         alphas = torch.cos(((x / T) + 0.008) / 1.008 * math.pi / 2) ** 2
         alphas = alphas / alphas[0]
         betas  = (1 - alphas[1:] / alphas[:-1]).clamp(0, 0.999)
         self.betas               = betas.to(device)
         self.alphas              = 1.0 - self.betas
         self.alphas_cumprod      = torch.cumprod(self.alphas, dim=0)
         self.alphas_cumprod_prev = F.pad(self.alphas_cumprod[:-1], (1,0), value=1.0)
     @torch.no_grad()
     def _predict_noise(self, model, x, t_idx, cond, guidance_scale):
         """Return the guided noise estimate: uncond + guidance_scale * (cond - uncond)."""
         t_tensor  = torch.full((x.shape[0],), t_idx, device=self.device, dtype=torch.long)
         # An all-zero condition vector serves as the unconditional branch.
         null_cond = torch.zeros_like(cond)
         # Run both branches in a single forward pass by doubling the batch.
         x2   = torch.cat([x, x])
         t2   = torch.cat([t_tensor, t_tensor])
         c2   = torch.cat([cond, null_cond])
         out  = model(x2, t2, c2)
         noise_cond, noise_uncond = out.chunk(2)
         return noise_uncond + guidance_scale * (noise_cond - noise_uncond)
     @torch.no_grad()
     def ddim_step(self, model, x, t_idx, t_prev_idx, cond, guidance_scale=6.0, eta=0.0):
         """Advance `x` from timestep `t_idx` to `t_prev_idx` (-1 means the clean image).
         `eta` scales the stochastic term; with the default 0.0 the step is deterministic.
         """
         noise_pred     = self._predict_noise(model, x, t_idx, cond, guidance_scale)
         alpha_bar      = self.alphas_cumprod[t_idx]
         alpha_bar_prev = self.alphas_cumprod[t_prev_idx] if t_prev_idx >= 0 else torch.ones(1, device=self.device)
         x0_pred = (x - (1 - alpha_bar).sqrt() * noise_pred) / alpha_bar.sqrt()
         # Keep the clean-image estimate inside the [-1, 1] data range.
         x0_pred = x0_pred.clamp(-1.0, 1.0)
         sigma   = eta * ((1 - alpha_bar_prev)/(1 - alpha_bar)).sqrt() * (1 - alpha_bar/alpha_bar_prev).sqrt()
         dir_xt  = (1 - alpha_bar_prev - sigma**2).clamp(min=0).sqrt() * noise_pred
         x_prev  = alpha_bar_prev.sqrt() * x0_pred + dir_xt
         # NOTE(review): for t_prev_idx == 0 sigma is already subtracted in dir_xt but no
         # noise is added here; only matters when eta > 0 - confirm this is intended.
         if eta > 0 and t_prev_idx > 0:
             x_prev = x_prev + sigma * torch.randn_like(x)
         return x_prev
     @torch.no_grad()
     def sample(self, model, cond, n=1, steps=80, guidance_scale=6.0):
         """Draw `n` samples with at most `steps` DDIM steps and return them clamped to [-1, 1].
         `cond` is concatenated with the batch in `_predict_noise`, so it needs `n` rows.
         """
         # Callers such as UI sliders may pass floats; torch.randn / torch.linspace need ints.
         n     = int(n)
         # At least one step; more than T steps cannot yield additional distinct timesteps.
         steps = max(1, min(int(steps), self.T))
         model.eval()
         x = torch.randn(n, CHANNELS, IMG_SIZE, IMG_SIZE, device=self.device)
         timesteps = torch.linspace(self.T - 1, 0, steps, device=self.device).round().long()
         # Rounding may map neighbouring grid points to the same index; drop repeats.
         # One tolist() replaces a device sync per loop iteration.
         schedule_idx = torch.unique_consecutive(timesteps).tolist()
         # -1 marks the target of the final step (fully denoised image).
         for t_idx, t_prev_idx in zip(schedule_idx, schedule_idx[1:] + [-1]):
             x = self.ddim_step(model, x, t_idx, t_prev_idx, cond, guidance_scale)
         return force_alpha_mask(x).clamp(-1, 1)
 # ─── Modell laden ─────────────────────────────────────────────────────────────
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # NOTE(review): weights_only=False unpickles arbitrary Python objects. That is only
 # acceptable because model.pt ships with this repo - never load an untrusted file this way.
 ckpt    = torch.load("model.pt", map_location=device, weights_only=False)
 base_ch = ckpt.get("base_ch", 96)
 # Prefer the EMA weights, fall back to "model", then to a bare state_dict.
 sd = ckpt.get("ema") or ckpt.get("model") or ckpt
 # Strip the "_orig_mod." prefix (torch.compile artefact) from parameter names.
 cleaned_sd = {k.replace("_orig_mod.", ""): v for k, v in sd.items()}
 model_obj = UNet(base_ch=base_ch).to(device)
 # strict=False tolerates key mismatches, so report both directions explicitly.
 missing, unexpected = model_obj.load_state_dict(cleaned_sd, strict=False)
 if missing:
     print(f"⚠️  {len(missing)} Keys nicht geladen: {missing[:3]}")
 if unexpected:
     print(f"⚠️  {len(unexpected)} unerwartete Keys ignoriert: {unexpected[:3]}")
 model_obj.eval()
 # Best effort: a failure here is deliberately ignored.
 # NOTE(review): the return value is discarded - confirm this call has the intended effect.
 try:
     torch._dynamo.disable(model_obj)
 except Exception:
     pass
 schedule = DiffusionSchedule(device=device)
 n_params = sum(p.numel() for p in model_obj.parameters()) / 1e6
 # Checkpoint keys actually consumed = all checkpoint keys minus the ones the model rejected.
 n_loaded = len(cleaned_sd) - len(unexpected)
 print(f"✅ Modell geladen: base_ch={base_ch}, {n_params:.1f}M Parameter, {n_loaded}/{len(cleaned_sd)} Keys")
 # ─── Generierungs-Funktion ────────────────────────────────────────────────────
 def generate(prompt, num_skins, steps, guidance_scale, seed, randomize_seed):
     """Generate skins for a text prompt.
     Returns a tuple (list of 512x512 RGBA PIL images, "Tags: ..." summary string, seed used).
     When `randomize_seed` is truthy the passed `seed` is replaced by a random one.
     """
     # UI sliders may deliver floats; batch size and step count are used as tensor sizes.
     num_skins = int(num_skins)
     steps     = int(steps)
     if randomize_seed:
         seed = random.randint(0, 2**31)
     seed = int(seed)
     torch.manual_seed(seed)
     # Treat a missing prompt like an empty one instead of handing None to the parser.
     tags    = parse_prompt(prompt or "")
     tag_str = ", ".join(sorted(tags)) if tags else "–"
     # One condition row per requested skin.
     cond    = tags_to_vector(tags).to(device).unsqueeze(0).repeat(num_skins, 1)
     with torch.inference_mode():
         imgs = schedule.sample(model_obj, cond, n=num_skins,
                                steps=steps, guidance_scale=guidance_scale)
     results = []
     # Move the whole batch to the CPU once rather than image by image.
     for img_t in imgs.cpu():
         # Map [-1, 1] channel values to 0..255 and reorder CHW -> HWC for PIL.
         arr = ((img_t.permute(1,2,0).numpy() + 1) * 127.5).clip(0,255).astype(np.uint8)
         # 8x upscale (nearest-neighbour, no blur) for visibility.
         pil = Image.fromarray(arr, "RGBA").resize((512, 512), Image.NEAREST)
         results.append(pil)
     return results, f"Tags: {tag_str}", seed
 # ─── Gradio UI ────────────────────────────────────────────────────────────────
 EXAMPLES = [
+    ["red hoodie blue jeans", 4, 80, 6.0],
+    ["zombie",                4, 80, 7.0],
+    ["wizard fantasy purple", 4, 80, 6.5],
+    ["knight medieval armor", 4, 80, 6.0],
+    ["ninja black dark",      4, 80, 7.0],
+    ["enderman",              2, 80, 6.0],
 ]
+with gr.Blocks(title="Minecraft Skin Generator") as demo:
     gr.Markdown("""
 # 🎮 Minecraft Skin Generator
+Generiert **64×64 Minecraft Skins** aus einem Text-Prompt via DDPM Diffusion Model (~35M Parameter, trainiert auf ~44k Skins).
+**Prompts:** `red hoodie blue jeans` · `zombie` · `knight medieval armor` · `wizard fantasy purple` · `ninja black dark`
 """)
     with gr.Row():
         with gr.Column(scale=2):
+            prompt  = gr.Text(label="Prompt", placeholder="z.B. red hoodie blue jeans", lines=1)
+            run_btn = gr.Button("🎨 Generieren", variant="primary", size="lg")
+            with gr.Accordion("⚙️ Einstellungen", open=False):
+                num_skins = gr.Slider(label="Anzahl Skins",        minimum=1,   maximum=8,    step=1,   value=4)
+                steps     = gr.Slider(label="Diffusion-Schritte",  minimum=20,  maximum=200,  step=10,  value=80)
+                guidance  = gr.Slider(label="Guidance Scale",      minimum=1.0, maximum=15.0, step=0.5, value=6.0)
+                seed      = gr.Slider(label="Seed",                minimum=0,   maximum=2**31,step=1,   value=42)
                 rand_seed = gr.Checkbox(label="Seed zufällig", value=True)
             tag_info  = gr.Text(label="Erkannte Tags", interactive=False)
             seed_out  = gr.Number(label="Verwendeter Seed", interactive=False)
         with gr.Column(scale=3):
             gallery = gr.Gallery(
+                label="Generierte Skins (512×512 hochskaliert, nearest-neighbor)",
+                columns=4, rows=2, object_fit="contain", height=420,
             )
+    gr.Examples(examples=EXAMPLES, inputs=[prompt, num_skins, steps, guidance], label="Beispiele")
     gr.on(
         triggers=[run_btn.click, prompt.submit],
         fn=generate,