Upload BitPixelLM model artifacts
- README.md +76 -0
- app.py +310 -0
- best.pt +3 -0
- config.json +7 -0
- generate.py +196 -0
- model/__init__.py +17 -0
- model/bit_pixel_decoder.py +577 -0
- model/bitlinear.py +239 -0
- model/text_encoder.py +122 -0
- model/tokenizer.py +106 -0
README.md
ADDED
@@ -0,0 +1,76 @@

# BitPixelLM

BitPixelLM is a text-to-pixel-art language model that generates 32x32 images from prompts like `a red pixel art sword`.

It uses a BitNet b1.58-style ternary decoder (weights in `{-1, 0, +1}`) with a lightweight text encoder.

## Current Model Snapshot

- Model name: **BitPixelLM**
- Architecture: 3-layer text encoder + 6-layer BitPixelLM decoder
- Parameters: ~7.3M
- Dataset (v3): 23,648 synthetic pixel-art samples
- Vocab: 222 words
- Best validation loss (v3): ~0.4015

## Project Layout

- `model/bit_pixel_decoder.py` — BitPixelLM model
- `train_bitnet.py` — training pipeline
- `generate.py` — CLI generation
- `app.py` — Gradio app
- `data/generate_v3.py` — v3 dataset generator
- `PixelArtGen_Colab.ipynb` — Colab training notebook

## Run Locally

1. Ensure Python 3.9 and a CUDA-enabled PyTorch install.
2. Place data in `D:\PixelArtGen_Data\processed`:
   - `tokens.npy`, `labels.json`, `vocab.json`, `palette_256.npy`
3. Train:

```bash
python train_bitnet.py --epochs 60 --batch-size 32 --lr 5e-4
```

4. Launch the app:

```bash
python app.py
```

## Publish to Hugging Face

This repo includes `publish_hf.py` for one-step upload.
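For orientation, a minimal sketch of what such an upload script can look like with `huggingface_hub`. This is not the actual `publish_hf.py` (its source is not part of this commit); the file list mirrors the upload summary below, and the repo id is a placeholder:

```python
# Hypothetical upload sketch — not the actual publish_hf.py from this repo.
import os
from huggingface_hub import HfApi, create_repo

def publish(repo_id: str, token: str):
    create_repo(repo_id, token=token, exist_ok=True)  # no-op if the repo already exists
    api = HfApi(token=token)
    # Checkpoint (large file, tracked via LFS automatically)
    api.upload_file(
        path_or_fileobj="checkpoints_bit/best.pt",
        path_in_repo="best.pt",
        repo_id=repo_id,
    )
    # Model code and entry points
    api.upload_folder(folder_path="model", path_in_repo="model", repo_id=repo_id)
    for f in ("generate.py", "app.py", "README.md", "config.json"):
        api.upload_file(path_or_fileobj=f, path_in_repo=f, repo_id=repo_id)

if __name__ == "__main__":
    publish("YOUR_USERNAME/BitPixelLM", os.environ["HF_TOKEN"])
```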
### Required

- Hugging Face token with write access (`HF_TOKEN`)
- `huggingface_hub` installed

### Command

```bash
pip install huggingface_hub
python publish_hf.py --repo-id YOUR_USERNAME/BitPixelLM --token $HF_TOKEN
```

On Windows PowerShell:

```powershell
$env:HF_TOKEN = "hf_xxx"
python publish_hf.py --repo-id YOUR_USERNAME/BitPixelLM --token $env:HF_TOKEN
```

This uploads:

- `checkpoints_bit/best.pt`
- `model/` Python files
- `generate.py`
- `app.py`
- `README.md` (model card / usage overview)

## Notes

- The active production model is **BitPixelLM**.
- Legacy FP32 `PixelLM` artifacts remain in the repo only for historical reference.
app.py
ADDED
@@ -0,0 +1,310 @@

"""
PixelArtGen — Gradio Web UI

Interactive UI to generate pixel art from text prompts using
BitPixelLM — a 1.58-bit ternary transformer (BitNet b1.58).

Launch:
    python app.py
Then open http://localhost:7860 in your browser.
"""

import sys
import json
import torch
import gradio as gr
from pathlib import Path
from PIL import Image

sys.path.insert(0, str(Path(__file__).parent))

from model.tokenizer import PaletteTokenizer
from model.text_encoder import TextTokenizer, TextEncoder
from model.bit_pixel_decoder import BitPixelLMDecoder, BitPixelLM

# ─── Config ──────────────────────────────────────────────────────
DATA_DIR = Path(r"D:\PixelArtGen_Data\processed")
CHECKPOINT_PATH = Path("checkpoints_bit/best.pt")

# ─── Global state (loaded once) ─────────────────────────────────
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = None
palette_tok = None
text_tok = None


def load_tokenizers():
    """Load shared tokenizers."""
    global palette_tok, text_tok
    palette_tok = PaletteTokenizer(palette_path=str(DATA_DIR / "palette_256.npy"))
    with open(DATA_DIR / "vocab.json") as f:
        vocab = json.load(f)
    text_tok = TextTokenizer(vocab)


def load_model():
    """Load the BitPixelLM model from checkpoint."""
    global model
    if model is not None:
        return model

    if not CHECKPOINT_PATH.exists():
        raise FileNotFoundError(
            f"Checkpoint not found: {CHECKPOINT_PATH}\n"
            "BitPixelLM is still training — check back once training completes."
        )

    checkpoint = torch.load(str(CHECKPOINT_PATH), map_location=device, weights_only=False)
    model_args = checkpoint.get("args", {})

    d_model = model_args.get("d_model", 256)
    nhead = model_args.get("nhead", 8)
    text_layers = model_args.get("text_layers", 3)
    pixel_layers = model_args.get("pixel_layers", 6)
    dim_ff = model_args.get("dim_ff", 512)
    dropout = model_args.get("dropout", 0.1)
    max_text_len = model_args.get("max_text_len", 32)

    text_encoder = TextEncoder(
        vocab_size=text_tok.vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_layers=text_layers,
        dim_feedforward=dim_ff,
        max_seq_len=max_text_len,
        dropout=dropout,
    )

    pixel_decoder = BitPixelLMDecoder(
        vocab_size=palette_tok.vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_layers=pixel_layers,
        dim_feedforward=dim_ff,
        img_size=32,
        dropout=dropout,
    )
    m = BitPixelLM(text_encoder, pixel_decoder).to(device)

    m.load_state_dict(checkpoint["model_state_dict"])
    m.eval()
    model = m
    return model


def generate(
    prompt: str,
    temperature: float,
    top_k: int,
    top_p: float,
    num_samples: int,
    scale: int,
):
    """Generate pixel art from a text prompt."""
    if not prompt.strip():
        raise gr.Error("Please enter a prompt.")

    if model is None:
        raise gr.Error(
            "BitPixelLM is not loaded yet. "
            "It may still be training — check back once training completes."
        )

    text_tokens = text_tok.encode(prompt).unsqueeze(0).to(device)

    # Warn about unknown words (generation still runs, but quality may suffer)
    words = prompt.lower().strip().split()
    unknown = [w for w in words if w not in text_tok.word2idx and w not in ("<pad>", "<sos>", "<eos>", "<unk>")]

    images = []
    try:
        for _ in range(int(num_samples)):
            with torch.no_grad():
                generated_tokens = model.generate(
                    text_tokens,
                    sos_token=palette_tok.sos_token,
                    eos_token=palette_tok.eos_token,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p,
                )

            token_list = generated_tokens[0].cpu().tolist()
            img_array = palette_tok.decode_tokens(token_list)
            img = Image.fromarray(img_array, "RGB")

            # Upscale with nearest-neighbor for crisp pixels
            s = int(scale)
            if s > 1:
                img = img.resize((32 * s, 32 * s), Image.NEAREST)

            images.append(img)
    except Exception as e:
        raise gr.Error(f"Generation failed: {e}")

    if unknown:
        gr.Warning(
            f"Unknown words treated as <unk>: {', '.join(unknown)}. "
            "Try using words from the vocabulary list below."
        )

    return images


# ─── Build UI ────────────────────────────────────────────────────

# Load vocabulary dynamically from processed data
def _load_vocab_words():
    try:
        with open(DATA_DIR / "vocab.json") as f:
            vocab = json.load(f)
        return sorted([w for w in vocab if not w.startswith("<")])
    except Exception:
        return ["pixel", "art", "sword", "red", "blue", "green"]

VOCAB_WORDS = _load_vocab_words()

EXAMPLE_PROMPTS = [
    "a red pixel art sword",
    "a green pixel art dragon",
    "a purple pixel art crystal",
    "a blue pixel art knight",
    "a gold pixel art castle",
    "a red pixel art phoenix",
    "a dark pixel art skeleton",
    "a teal pixel art wizard",
    "a silver pixel art robot",
    "a orange pixel art fox",
]


def build_ui():
    with gr.Blocks(
        title="PixelArtGen",
        theme=gr.themes.Soft(primary_hue="purple"),
        css="""
        .gallery-item img { image-rendering: pixelated !important; }
        .output-gallery img { image-rendering: pixelated !important; }
        #gallery img { image-rendering: pixelated !important; }
        """,
    ) as app:
        gr.Markdown(
            """
            # PixelArtGen
            ### Generate 32x32 pixel art from text prompts

            Powered by **BitPixelLM** — a custom 1.58-bit ternary transformer built from scratch
            using BitNet b1.58 with RMSNorm, SwiGLU, and 2D positional encoding.
            7.3M parameters (75% ternary weights at 1.58 bits per weight).
            """
        )

        with gr.Row():
            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="a red pixel art sword",
                    lines=2,
                )
                with gr.Row():
                    generate_btn = gr.Button("Generate", variant="primary", scale=2)
                    num_samples = gr.Slider(1, 8, value=4, step=1, label="Samples")

                with gr.Accordion("Advanced Settings", open=False):
                    temperature = gr.Slider(
                        0.1, 2.0, value=0.8, step=0.05,
                        label="Temperature",
                        info="Lower = more deterministic, higher = more creative"
                    )
                    top_k = gr.Slider(
                        0, 256, value=40, step=1,
                        label="Top-K",
                        info="0 = disabled. Limits sampling to the top K tokens."
                    )
                    top_p = gr.Slider(
                        0.1, 1.0, value=0.9, step=0.05,
                        label="Top-P (Nucleus)",
                        info="Cumulative probability threshold for sampling."
                    )
                    scale = gr.Slider(
                        1, 16, value=8, step=1,
                        label="Upscale Factor",
                        info="8x = 256x256, 16x = 512x512"
                    )

                gr.Markdown(
                    f"**Known vocabulary:** {', '.join(VOCAB_WORDS)}"
                )

            with gr.Column(scale=2):
                gallery = gr.Gallery(
                    label="Generated Pixel Art",
                    columns=4,
                    rows=2,
                    height=520,
                    object_fit="contain",
                    elem_id="gallery",
                )

                gr.Markdown("### Examples")
                gr.Examples(
                    examples=EXAMPLE_PROMPTS,
                    inputs=[prompt],
                    label="Click to try",
                )

        gr.Markdown(
            """
            ---
            **Architecture:**
            BitPixelLM treats pixel art generation as language modeling — each pixel is a token from a 256-color palette,
            generated left-to-right, top-to-bottom via a causal transformer with 2D positional encoding and cross-attention to text.
            Uses 1.58-bit ternary weights (BitNet b1.58) with RMSNorm and SwiGLU for extreme parameter efficiency.
            """
        )

        # Wire up the generate button
        generate_btn.click(
            fn=generate,
            inputs=[prompt, temperature, top_k, top_p, num_samples, scale],
            outputs=gallery,
        )

        # Also generate on Enter
        prompt.submit(
            fn=generate,
            inputs=[prompt, temperature, top_k, top_p, num_samples, scale],
            outputs=gallery,
        )

    return app


# ─── Main ────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("Loading tokenizers...")
    load_tokenizers()
    print(f"  Palette: {palette_tok.vocab_size} tokens")
    print(f"  Text: {text_tok.vocab_size} words")
    print(f"  Device: {device}")

    # Load BitPixelLM
    print(f"Loading BitPixelLM from {CHECKPOINT_PATH}...")
    try:
        load_model()
        print("  BitPixelLM loaded successfully.")
    except FileNotFoundError as e:
        print(f"  {e}")
        print("  UI will launch but generation will be unavailable until training completes.")
    except Exception as e:
        print(f"  Failed to load BitPixelLM: {e}")

    print("\nLaunching UI...")
    app = build_ui()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        inbrowser=True,
    )
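Once the app is running, it can also be driven programmatically. A hedged sketch with `gradio_client` follows; the endpoint name `/generate` (Gradio typically names endpoints after the wired function) and the positional argument order mirroring `generate` above are both assumptions, not confirmed by this commit:

```python
# Hypothetical client-side call — endpoint name and argument order are assumptions.
from gradio_client import Client

client = Client("http://localhost:7860")
result = client.predict(
    "a red pixel art sword",  # prompt
    0.8,                      # temperature
    40,                       # top_k
    0.9,                      # top_p
    4,                        # num_samples
    8,                        # scale
    api_name="/generate",
)
print(result)  # local paths to the generated gallery images
```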
best.pt
ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:37ceef8a7d844445be4bc5730bcd683d1512aff084a6e872634b3184c58f2464
size 88732053
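`best.pt` is stored via Git LFS (88,732,053 bytes, about 85 MB); only this pointer file lives in Git. A minimal sketch for fetching the real checkpoint from the Hub, assuming the placeholder repo id from the README:

```python
# Sketch: download the LFS-backed checkpoint from the Hub (repo id is a placeholder).
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="YOUR_USERNAME/BitPixelLM", filename="best.pt")
print(ckpt_path)  # local cache path to the checkpoint file
```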
config.json
ADDED
@@ -0,0 +1,7 @@

{
  "model_type": "BitPixelLM",
  "architecture": "BitNet-b1.58-style autoregressive decoder",
  "image_size": 32,
  "task": "text-to-image (pixel art)",
  "checkpoint_file": "best.pt"
}
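This is lightweight repo metadata rather than a `transformers`-style config; nothing in the code above reads it. A small sketch of how a consumer might use it to locate the checkpoint (the field names match the JSON above; the usage itself is an assumption):

```python
# Sketch: read the repo metadata to find the checkpoint file.
import json

with open("config.json") as f:
    cfg = json.load(f)

assert cfg["model_type"] == "BitPixelLM"
print(f"Checkpoint: {cfg['checkpoint_file']} "
      f"({cfg['image_size']}x{cfg['image_size']} {cfg['task']})")
```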
generate.py
ADDED
@@ -0,0 +1,196 @@

"""
PixelArtGen — Generate pixel art from text prompts.

Usage:
    python generate.py --prompt "a red pixel art sword" --output output.png
    python generate.py --prompt "a blue pixel art heart" --output heart.png --temperature 0.7
    python generate.py --batch-prompts prompts.txt --output-dir outputs/
"""

import sys
import json
import argparse
import torch
from pathlib import Path
from PIL import Image

sys.path.insert(0, str(Path(__file__).parent))

from model.tokenizer import PaletteTokenizer
from model.text_encoder import TextTokenizer, TextEncoder
# NOTE: this CLI targets the legacy FP32 PixelLM (see README "Notes");
# model/pixel_decoder.py is not part of this upload.
from model.pixel_decoder import PixelLMDecoder, PixelLM


def load_model(checkpoint_path: str, data_dir: str, device: torch.device):
    """Load a trained PixelLM model from checkpoint."""
    data_dir = Path(data_dir)

    # Load checkpoint
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
    model_args = checkpoint.get("args", {})

    # Load tokenizers
    palette_tok = PaletteTokenizer(palette_path=str(data_dir / "palette_256.npy"))

    with open(data_dir / "vocab.json") as f:
        vocab = json.load(f)
    text_tok = TextTokenizer(vocab)

    # Rebuild model
    d_model = model_args.get("d_model", 256)
    nhead = model_args.get("nhead", 8)
    text_layers = model_args.get("text_layers", 3)
    pixel_layers = model_args.get("pixel_layers", 6)
    dim_ff = model_args.get("dim_ff", 512)
    dropout = model_args.get("dropout", 0.1)
    max_text_len = model_args.get("max_text_len", 32)

    text_encoder = TextEncoder(
        vocab_size=text_tok.vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_layers=text_layers,
        dim_feedforward=dim_ff,
        max_seq_len=max_text_len,
        dropout=dropout,
    )

    pixel_decoder = PixelLMDecoder(
        vocab_size=palette_tok.vocab_size,
        d_model=d_model,
        nhead=nhead,
        num_layers=pixel_layers,
        dim_feedforward=dim_ff,
        img_size=32,
        dropout=dropout,
    )

    model = PixelLM(text_encoder, pixel_decoder).to(device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()

    return model, palette_tok, text_tok


def generate_pixel_art(
    model: PixelLM,
    palette_tok: PaletteTokenizer,
    text_tok: TextTokenizer,
    prompt: str,
    device: torch.device,
    temperature: float = 0.8,
    top_k: int = 40,
    top_p: float = 0.9,
    scale: int = 8,
) -> Image.Image:
    """
    Generate a 32×32 pixel art image from a text prompt.

    Args:
        model: Trained PixelLM model
        palette_tok: Color palette tokenizer
        text_tok: Text tokenizer
        prompt: Text description
        device: torch device
        temperature: Sampling temperature (lower = more deterministic)
        top_k: Top-k filtering
        top_p: Nucleus sampling threshold
        scale: Upscale factor for display (8 = 256×256 output)
    Returns:
        PIL Image (32*scale × 32*scale)
    """
    # Tokenize prompt
    text_tokens = text_tok.encode(prompt).unsqueeze(0).to(device)

    # Generate
    with torch.no_grad():
        generated_tokens = model.generate(
            text_tokens,
            sos_token=palette_tok.sos_token,
            eos_token=palette_tok.eos_token,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
        )

    # Decode to image
    token_list = generated_tokens[0].cpu().tolist()
    img_array = palette_tok.decode_tokens(token_list)
    img = Image.fromarray(img_array, "RGB")

    # Upscale with nearest-neighbor (pixel art style)
    if scale > 1:
        img = img.resize((32 * scale, 32 * scale), Image.NEAREST)

    return img


def main():
    parser = argparse.ArgumentParser(description="Generate pixel art from text")
    parser.add_argument("--prompt", type=str, help="Text prompt")
    parser.add_argument("--output", type=str, default="output.png", help="Output file")
    parser.add_argument("--checkpoint", type=str, default="checkpoints/best.pt")
    parser.add_argument("--data-dir", type=str, default=r"D:\PixelArtGen_Data\processed")
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top-k", type=int, default=40)
    parser.add_argument("--top-p", type=float, default=0.9)
    parser.add_argument("--scale", type=int, default=8, help="Upscale factor")
    parser.add_argument("--num-samples", type=int, default=1, help="Number of images to generate")
    parser.add_argument("--batch-prompts", type=str, help="File with prompts (one per line)")
    parser.add_argument("--output-dir", type=str, default="outputs")

    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Load model
    print(f"Loading model from {args.checkpoint}...")
    model, palette_tok, text_tok = load_model(args.checkpoint, args.data_dir, device)
    print(f"  Model: {model.count_parameters():,} parameters")

    # Collect prompts
    if args.batch_prompts:
        with open(args.batch_prompts) as f:
            prompts = [line.strip() for line in f if line.strip()]
    elif args.prompt:
        prompts = [args.prompt]
    else:
        prompts = [
            "a red pixel art sword",
            "a blue pixel art heart",
            "a green pixel art tree",
            "a purple pixel art gem",
        ]

    # Generate
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for prompt in prompts:
        print(f"\nGenerating: \"{prompt}\"")
        for j in range(args.num_samples):
            img = generate_pixel_art(
                model, palette_tok, text_tok, prompt, device,
                temperature=args.temperature,
                top_k=args.top_k,
                top_p=args.top_p,
                scale=args.scale,
            )

            if len(prompts) == 1 and args.num_samples == 1:
                out_path = args.output
            else:
                safe_name = prompt.replace(" ", "_")[:30]
                out_path = output_dir / f"{safe_name}_{j}.png"

            img.save(str(out_path))
            print(f"  Saved: {out_path}")

    print("\nDone!")


if __name__ == "__main__":
    main()
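Since `load_model` and `generate_pixel_art` are plain functions, the module can also be used as a library rather than a CLI. A short sketch, assuming the default checkpoint and data paths above exist on your machine:

```python
# Sketch: use generate.py as a library instead of a CLI (paths are assumptions).
import torch
from generate import load_model, generate_pixel_art

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, palette_tok, text_tok = load_model(
    "checkpoints/best.pt", r"D:\PixelArtGen_Data\processed", device
)

img = generate_pixel_art(model, palette_tok, text_tok,
                         "a red pixel art sword", device, temperature=0.7)
img.save("sword.png")  # 256x256 PNG (32x32 upscaled 8x by default)
```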
model/__init__.py
ADDED
@@ -0,0 +1,17 @@

"""PixelArtGen model package."""

from .tokenizer import PaletteTokenizer
from .text_encoder import TextTokenizer, TextEncoder
from .bitlinear import BitLinear158, RMSNorm, SwiGLU
from .bit_pixel_decoder import BitPixelLMDecoder, BitPixelLM

__all__ = [
    "PaletteTokenizer",
    "TextTokenizer",
    "TextEncoder",
    "BitLinear158",
    "RMSNorm",
    "SwiGLU",
    "BitPixelLMDecoder",
    "BitPixelLM",
]
model/bit_pixel_decoder.py
ADDED
@@ -0,0 +1,577 @@

"""
PixelArtGen — BitPixelLM Decoder (1.58-bit)

A ternary-weight variant of our PixelLM decoder, implementing BitNet b1.58.
Replaces nn.Linear layers with BitLinear158 (ternary weights {-1, 0, +1})
and uses modern LLaMA-style components (RMSNorm, SwiGLU, no biases).

Key differences from the standard PixelLM decoder:
- BitLinear158 layers with built-in RMSNorm (replaces nn.Linear + LayerNorm)
- SwiGLU FFN activation (replaces GELU)
- No biases anywhere
- Token embeddings and output head remain in full precision
- 2D positional encoding preserved (our unique contribution)

References:
- "The Era of 1-bit LLMs" (Ma et al., 2024) — arXiv:2402.17764
- "BitNet" (Wang et al., 2023) — arXiv:2310.11453
- "GLU Variants Improve Transformer" (Shazeer, 2020) — arXiv:2002.05202
- "RMSNorm" (Zhang & Sennrich, 2019) — arXiv:1910.07467
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional

try:
    from model.bitlinear import BitLinear158, RMSNorm, SwiGLU
except ImportError:
    # Allow running this file directly (`python model/bit_pixel_decoder.py`):
    # put the repo root on sys.path so the `model` package resolves.
    import sys
    from pathlib import Path
    sys.path.insert(0, str(Path(__file__).parent.parent))
    from model.bitlinear import BitLinear158, RMSNorm, SwiGLU


# ── Shared components (self-contained, no dependency on pixel_decoder.py) ──

class PixelPositionalEncoding2D(nn.Module):
    """
    2D positional encoding for pixel sequences.

    Instead of treating pixel positions as flat indices 0..1023,
    we encode them as (row, col) pairs with separate learned embeddings.
    This gives the model explicit 2D spatial structure.

    Also includes a special position embedding for <sos> and <eos> tokens.
    """

    def __init__(self, d_model: int, img_size: int = 32):
        super().__init__()
        self.img_size = img_size
        self.d_model = d_model

        # Separate row and column embeddings
        self.row_embed = nn.Embedding(img_size, d_model // 2)
        self.col_embed = nn.Embedding(img_size, d_model // 2)

        # Special position for sos/eos tokens
        self.special_pos = nn.Embedding(2, d_model)  # 0=sos, 1=eos

        # Learnable scale
        self.scale = nn.Parameter(torch.ones(1))

    def forward(self, seq_len: int, device: torch.device) -> torch.Tensor:
        """
        Generate positional encodings for a sequence of length seq_len.
        Sequence layout: [sos, pixel_0, pixel_1, ..., pixel_1023, eos]

        Returns: (1, seq_len, d_model)
        """
        positions = torch.zeros(1, seq_len, self.d_model, device=device)

        # SOS position
        positions[:, 0, :] = self.special_pos(torch.tensor([0], device=device))

        # Pixel positions (indices 1..1024)
        num_pixels = min(seq_len - 1, self.img_size * self.img_size)
        if num_pixels > 0:
            pixel_indices = torch.arange(num_pixels, device=device)
            rows = pixel_indices // self.img_size
            cols = pixel_indices % self.img_size

            row_emb = self.row_embed(rows)  # (num_pixels, d_model//2)
            col_emb = self.col_embed(cols)  # (num_pixels, d_model//2)
            pixel_pos = torch.cat([row_emb, col_emb], dim=-1)  # (num_pixels, d_model)
            positions[:, 1:1 + num_pixels, :] = pixel_pos.unsqueeze(0)

        # EOS position (if present)
        if seq_len > self.img_size * self.img_size + 1:
            positions[:, -1, :] = self.special_pos(torch.tensor([1], device=device))

        return positions * self.scale


class PaletteOutputHead(nn.Module):
    """
    Palette-aware output prediction.

    Instead of a flat linear(d_model -> vocab_size) layer, we compute
    output logits via scaled dot-product attention between the decoder
    hidden states and a set of learned palette key vectors.

    Each palette color has a key embedding initialized from its RGB values.
    This gives the model an inductive bias toward understanding color relationships.
    """

    def __init__(self, d_model: int, palette_size: int, num_special_tokens: int = 3):
        super().__init__()
        self.total_vocab = palette_size + num_special_tokens
        self.d_model = d_model

        # Learned palette keys (will be initialized from RGB values)
        self.palette_keys = nn.Parameter(torch.randn(self.total_vocab, d_model))

        # Query projection for hidden states
        self.query_proj = nn.Linear(d_model, d_model)

        # Temperature parameter for controlling sharpness
        self.temperature = nn.Parameter(torch.tensor(math.sqrt(d_model), dtype=torch.float32))

    def init_from_palette(self, palette_rgb: torch.Tensor):
        """
        Initialize palette key embeddings from RGB values.
        palette_rgb: (palette_size, 3) tensor of RGB values [0, 255]
        """
        with torch.no_grad():
            palette_size = palette_rgb.shape[0]
            # Normalize RGB to [-1, 1] and project to d_model
            rgb_norm = palette_rgb.float() / 127.5 - 1.0  # (palette_size, 3)
            # Repeat/tile to fill d_model dimensions
            repeats = self.d_model // 3 + 1
            expanded = rgb_norm.repeat(1, repeats)[:, :self.d_model]
            # Mix in some noise for diversity
            self.palette_keys.data[:palette_size] = expanded + 0.1 * torch.randn_like(expanded)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states: (batch, seq_len, d_model)
        Returns:
            logits: (batch, seq_len, total_vocab)
        """
        queries = self.query_proj(hidden_states)  # (batch, seq_len, d_model)
        # Scaled dot-product attention with palette keys
        logits = torch.matmul(queries, self.palette_keys.T) / self.temperature
        return logits


class BitMultiheadAttention(nn.Module):
    """
    Multi-head attention with BitLinear158 projections.

    Q, K, V projections and the output projection all use 1.58-bit weights.
    The attention computation itself remains in full precision.

    Following BitNet b1.58: the RMSNorm that normally precedes attention
    is absorbed into the BitLinear158 layers (they have built-in RMSNorm).
    """

    def __init__(self, d_model: int, nhead: int, dropout: float = 0.0):
        super().__init__()
        assert d_model % nhead == 0, f"d_model ({d_model}) must be divisible by nhead ({nhead})"

        self.d_model = d_model
        self.nhead = nhead
        self.head_dim = d_model // nhead

        # QKV projections — all 1.58-bit
        self.q_proj = BitLinear158(d_model, d_model)
        self.k_proj = BitLinear158(d_model, d_model)
        self.v_proj = BitLinear158(d_model, d_model)

        # Output projection — 1.58-bit
        self.out_proj = BitLinear158(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.head_dim)

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
        key_padding_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            query: (batch, q_len, d_model)
            key: (batch, kv_len, d_model)
            value: (batch, kv_len, d_model)
            attn_mask: (q_len, kv_len) or (batch*nhead, q_len, kv_len)
            key_padding_mask: (batch, kv_len)
        Returns:
            (batch, q_len, d_model)
        """
        batch_size = query.size(0)

        # Project Q, K, V through 1.58-bit linear layers
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)

        # Reshape for multi-head: (batch, seq, d_model) -> (batch, nhead, seq, head_dim)
        q = q.view(batch_size, -1, self.nhead, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, -1, self.nhead, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, -1, self.nhead, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / self.scale

        # Apply causal mask
        if attn_mask is not None:
            if attn_mask.dim() == 2:
                attn_weights = attn_weights + attn_mask.unsqueeze(0).unsqueeze(0)
            else:
                attn_weights = attn_weights + attn_mask

        # Apply padding mask
        if key_padding_mask is not None:
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf')
            )

        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Apply attention to values
        attn_output = torch.matmul(attn_weights, v)

        # Reshape back: (batch, nhead, seq, head_dim) -> (batch, seq, d_model)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        # Output projection (1.58-bit)
        return self.out_proj(attn_output)


class BitPixelLMDecoderLayer(nn.Module):
    """
    Single decoder layer with 1.58-bit weights.

    Structure (per BitNet b1.58 / LLaMA convention):
    1. Self-attention with BitLinear158 projections (RMSNorm built into BitLinear)
    2. Cross-attention to text encoder output (BitLinear158 projections)
    3. SwiGLU feed-forward network (BitLinear158 projections)

    Pre-norm architecture, but the norm is absorbed into BitLinear158.
    Residual connections use a separate RMSNorm for gradient stability.
    """

    def __init__(self, d_model: int, nhead: int, dim_ff: int, dropout: float = 0.0):
        super().__init__()

        # Self-attention (masked, causal)
        self.self_attn = BitMultiheadAttention(d_model, nhead, dropout=dropout)
        self.norm1 = RMSNorm(d_model)

        # Cross-attention to text
        self.cross_attn = BitMultiheadAttention(d_model, nhead, dropout=dropout)
        self.norm2 = RMSNorm(d_model)

        # SwiGLU feed-forward (replaces GELU FFN)
        self.ff = SwiGLU(d_model, hidden_features=dim_ff, use_bitlinear=True)
        self.norm3 = RMSNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(
        self,
        x: torch.Tensor,
        text_enc: torch.Tensor,
        causal_mask: torch.Tensor,
        text_pad_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Args:
            x: (batch, seq_len, d_model)
            text_enc: (batch, text_len, d_model)
            causal_mask: (seq_len, seq_len) causal attention mask
            text_pad_mask: (batch, text_len) padding mask for text
        Returns:
            (batch, seq_len, d_model)
        """
        # Pre-norm self-attention with residual
        residual = x
        x = self.norm1(x)
        x = self.self_attn(x, x, x, attn_mask=causal_mask)
        x = self.dropout(x) + residual

        # Pre-norm cross-attention with residual
        residual = x
        x = self.norm2(x)
        x = self.cross_attn(x, text_enc, text_enc, key_padding_mask=text_pad_mask)
        x = self.dropout(x) + residual

        # Pre-norm SwiGLU FFN with residual
        residual = x
        x = self.norm3(x)
        x = self.ff(x)
        x = self.dropout(x) + residual

        return x


class BitPixelLMDecoder(nn.Module):
    """
    1.58-bit PixelLM Decoder.

    Same architecture as PixelLMDecoder but with:
    - BitLinear158 replacing all nn.Linear in attention and FFN
    - RMSNorm replacing LayerNorm (absorbed into BitLinear + residual norms)
    - SwiGLU replacing GELU FFN
    - No biases

    Full precision components (NOT quantized):
    - Token embeddings (need full precision for gradient flow to embeddings)
    - 2D positional encoding (our unique spatial encoding)
    - Palette output head (needs high-precision logits for sampling)
    """

    def __init__(
        self,
        vocab_size: int,
        d_model: int = 256,
        nhead: int = 8,
        num_layers: int = 6,
        dim_feedforward: int = 512,
        img_size: int = 32,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.img_size = img_size
        self.max_seq_len = img_size * img_size + 2

        # ── Full precision components ─────────────────────────────
        # Token embedding (kept in FP32)
        self.token_embed = nn.Embedding(vocab_size, d_model)

        # 2D positional encoding (our unique contribution — kept FP32)
        self.pos_encoding = PixelPositionalEncoding2D(d_model, img_size)

        # Palette-aware output head (kept FP32 for sampling precision)
        self.output_head = PaletteOutputHead(d_model, vocab_size - 3, num_special_tokens=3)

        # ── 1.58-bit components ───────────────────────────────────
        # Decoder layers with BitLinear158
        self.layers = nn.ModuleList([
            BitPixelLMDecoderLayer(d_model, nhead, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])

        # Final norm (full precision RMSNorm)
        self.final_norm = RMSNorm(d_model)

        # Dropout
        self.dropout = nn.Dropout(dropout)

        # Cache for causal mask
        self._causal_mask_cache = {}

    def _get_causal_mask(self, seq_len: int, device: torch.device) -> torch.Tensor:
        """Generate or retrieve cached causal attention mask."""
        if seq_len not in self._causal_mask_cache:
            mask = torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1).bool()
            float_mask = torch.zeros(seq_len, seq_len, device=device)
            float_mask.masked_fill_(mask, float('-inf'))
            self._causal_mask_cache[seq_len] = float_mask
        return self._causal_mask_cache[seq_len]

    def forward(
        self,
        pixel_tokens: torch.Tensor,
        text_enc: torch.Tensor,
        text_pad_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Forward pass for training (teacher-forced).

        Args:
            pixel_tokens: (batch, seq_len) long tensor of pixel token indices
            text_enc: (batch, text_len, d_model) text encoder output
            text_pad_mask: (batch, text_len) True where text is padded
        Returns:
            logits: (batch, seq_len, vocab_size)
        """
        batch_size, seq_len = pixel_tokens.shape
        device = pixel_tokens.device

        # Token embeddings (full precision)
        x = self.token_embed(pixel_tokens) * math.sqrt(self.d_model)

        # 2D positional encoding (full precision)
        pos = self.pos_encoding(seq_len, device)
        x = x + pos
        x = self.dropout(x)

        # Causal mask
        causal_mask = self._get_causal_mask(seq_len, device)

        # 1.58-bit decoder layers
        for layer in self.layers:
            x = layer(x, text_enc, causal_mask, text_pad_mask)

        # Final norm
        x = self.final_norm(x)

        # Output logits via palette-aware head (full precision)
        logits = self.output_head(x)

        return logits

    @torch.no_grad()
    def generate(
        self,
        text_enc: torch.Tensor,
        sos_token: int,
        eos_token: int,
        max_len: int = 1026,
        temperature: float = 0.8,
        top_k: int = 40,
        top_p: float = 0.9,
        text_pad_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Autoregressive generation (same interface as PixelLMDecoder).
        """
        device = text_enc.device
        tokens = torch.tensor([[sos_token]], dtype=torch.long, device=device)

        for _ in range(max_len - 1):
            logits = self.forward(tokens, text_enc, text_pad_mask)
            next_logits = logits[:, -1, :] / temperature

            # Top-k filtering
            if top_k > 0:
                topk_vals, _ = torch.topk(next_logits, top_k)
                next_logits[next_logits < topk_vals[:, -1:]] = float('-inf')

            # Top-p (nucleus) filtering
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
                probs_sorted = F.softmax(sorted_logits, dim=-1)
                cumulative_probs = torch.cumsum(probs_sorted, dim=-1)
                # Mask tokens whose cumulative probability (excluding themselves) reaches top_p
                sorted_mask = cumulative_probs - probs_sorted >= top_p
                sorted_logits[sorted_mask] = float('-inf')
                # Scatter the filtered logits back to their original positions
                next_logits = next_logits.scatter(1, sorted_indices, sorted_logits)

            probs = F.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, 1)
            tokens = torch.cat([tokens, next_token], dim=1)

            if next_token.item() == eos_token:
                break

        return tokens


class BitPixelLM(nn.Module):
    """
    Complete 1.58-bit PixelLM: Text Encoder (FP32) + Pixel Decoder (1.58-bit).

    The text encoder remains in full precision because:
    1. It's small (3 layers) — quantization overhead would negate the benefits
    2. Text understanding needs full precision for a small vocabulary

    The pixel decoder uses 1.58-bit weights for:
    1. All self-attention projections (Q, K, V, O)
    2. All cross-attention projections
    3. All FFN projections (SwiGLU)
    """

    def __init__(self, text_encoder: nn.Module, pixel_decoder: BitPixelLMDecoder):
        super().__init__()
        self.text_encoder = text_encoder
        self.pixel_decoder = pixel_decoder

    def forward(
        self,
        text_tokens: torch.Tensor,
        pixel_tokens: torch.Tensor,
    ) -> torch.Tensor:
        text_pad_mask = (text_tokens == 0)
        text_enc = self.text_encoder(text_tokens)
        logits = self.pixel_decoder(pixel_tokens, text_enc, text_pad_mask)
        return logits

    @torch.no_grad()
    def generate(
        self,
        text_tokens: torch.Tensor,
        sos_token: int,
        eos_token: int,
        **kwargs,
    ) -> torch.Tensor:
        text_pad_mask = (text_tokens == 0)
        text_enc = self.text_encoder(text_tokens)
        return self.pixel_decoder.generate(
            text_enc, sos_token, eos_token,
            text_pad_mask=text_pad_mask, **kwargs
        )

    def count_parameters(self) -> int:
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def count_bit_parameters(self) -> dict:
        """Count parameters by precision level."""
        bit_params = 0
        fp_params = 0
        for name, p in self.named_parameters():
            if not p.requires_grad:
                continue
            if 'pixel_decoder.layers' in name and '.weight' in name and 'norm' not in name and 'rms_norm' not in name:
                bit_params += p.numel()
            else:
                fp_params += p.numel()
        return {
            'ternary_params': bit_params,
            'fp32_params': fp_params,
            'total': bit_params + fp_params,
            'ternary_pct': bit_params / (bit_params + fp_params) * 100,
            'effective_bits': (bit_params * 1.58 + fp_params * 32) / (bit_params + fp_params),
        }


# ──── Testing ────────────────────────────────────────────────────

if __name__ == "__main__":
    from model.text_encoder import TextEncoder

    print("Building BitPixelLM...")

    # Build text encoder (full precision)
    text_encoder = TextEncoder(
        vocab_size=66,  # 62 words + 4 special
        d_model=256,
        nhead=4,
        num_layers=3,
        dim_feedforward=512,
        max_seq_len=32,
    )

    # Build 1.58-bit pixel decoder
    pixel_decoder = BitPixelLMDecoder(
        vocab_size=259,
        d_model=256,
        nhead=8,
        num_layers=6,
        dim_feedforward=512,
        img_size=32,
    )

    model = BitPixelLM(text_encoder, pixel_decoder)

    # Parameter count
    total = model.count_parameters()
    breakdown = model.count_bit_parameters()
    print(f"\nBitPixelLM: {total:,} total parameters")
    print(f"  Ternary (1.58-bit): {breakdown['ternary_params']:,} ({breakdown['ternary_pct']:.1f}%)")
    print(f"  Full precision: {breakdown['fp32_params']:,} ({100-breakdown['ternary_pct']:.1f}%)")
    print(f"  Effective bits/param: {breakdown['effective_bits']:.2f}")

    # Forward pass test
    text = torch.randint(0, 66, (2, 32))
    pixels = torch.randint(0, 259, (2, 1025))

    print("\nForward pass test...")
    logits = model(text, pixels)
    print(f"  Input: text={text.shape}, pixels={pixels.shape}")
    print(f"  Output: logits={logits.shape}")

    # Gradient test
    loss = logits[:, :, :259].sum()
    loss.backward()
    grad_ok = all(p.grad is not None for p in model.parameters() if p.requires_grad)
    print(f"  Gradient flow: {'OK' if grad_ok else 'FAILED'}")

    print("\nAll tests passed! ✓")
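The filtering logic in `generate` is easiest to see on a toy distribution. Below is a self-contained sketch applying the same top-k and top-p steps to hand-picked logits; the numbers are illustrative, not from the model:

```python
# Toy walk-through of the top-k / top-p filtering used in generate() above.
import torch
import torch.nn.functional as F

next_logits = torch.tensor([[2.0, 1.5, 1.0, 0.2, -1.0]])  # (batch=1, vocab=5), illustrative
temperature, top_k, top_p = 1.0, 4, 0.9
next_logits = next_logits / temperature

# Top-k: keep only the 4 highest logits (the -1.0 token is dropped)
topk_vals, _ = torch.topk(next_logits, top_k)
next_logits[next_logits < topk_vals[:, -1:]] = float('-inf')

# Top-p: drop tokens once the cumulative probability before them reaches 0.9
sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
probs_sorted = F.softmax(sorted_logits, dim=-1)
sorted_mask = torch.cumsum(probs_sorted, dim=-1) - probs_sorted >= top_p
sorted_logits[sorted_mask] = float('-inf')
next_logits = next_logits.scatter(1, sorted_indices, sorted_logits)

probs = F.softmax(next_logits, dim=-1)
print(probs)  # only the 3-token nucleus (~0.47, ~0.28, ~0.17 pre-filter) survives
next_token = torch.multinomial(probs, 1)
```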
model/bitlinear.py
ADDED
@@ -0,0 +1,239 @@

"""
PixelArtGen — BitLinear 1.58-bit Layer & RMSNorm

Implementation of the core BitNet b1.58 components:
- RMSNorm: Root Mean Square Layer Normalization (Zhang & Sennrich, 2019)
- BitLinear158: 1.58-bit linear layer with ternary weights {-1, 0, +1}

References:
- "The Era of 1-bit LLMs" (Ma et al., 2024) — arXiv:2402.17764
- "BitNet: Scaling 1-bit Transformers" (Wang et al., 2023) — arXiv:2310.11453
- "RMSNorm" (Zhang & Sennrich, 2019) — arXiv:1910.07467
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class RMSNorm(nn.Module):
    """
    Root Mean Square Layer Normalization.

    Simpler and faster than LayerNorm — removes mean centering,
    keeps only the re-scaling by root mean square:

        RMSNorm(x) = x / RMS(x) * g,  where RMS(x) = sqrt(mean(x^2))

    Reference: arXiv:1910.07467
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


def activation_quant(x: torch.Tensor) -> torch.Tensor:
    """
    Per-token 8-bit activation quantization from BitNet b1.58.

    Quantizes activations to [-127, 127] per-token using absmax scaling.
    Symmetric quantization (no zero-point) as specified in the paper.

    Args:
        x: (..., d_model) float tensor
    Returns:
        Fake-quantized tensor (still float, dequantized back to the input
        scale, with gradients flowing via the straight-through estimator)
    """
    Qb = 127  # 8-bit signed: 2^(8-1) - 1
    scale = x.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
    x_quant = (x * Qb / scale).clamp(-Qb, Qb).round()
    # STE: detach the rounding, keep gradients flowing
    x_quant = x + (x_quant * scale / Qb - x).detach()
    return x_quant


def weight_quant(w: torch.Tensor) -> tuple:
    """
    Absmean ternary weight quantization from BitNet b1.58.

    Quantizes weights to {-1, 0, +1} using absmean scaling:
    1. Compute gamma = mean(|W|)
    2. Scale: W_scaled = W / gamma
    3. Round to nearest in {-1, 0, +1}

    Args:
        w: (out_features, in_features) weight matrix
    Returns:
        (quantized_weights, scale_factor)
    """
    gamma = w.abs().mean().clamp(min=1e-5)
    w_scaled = w / gamma
    w_quant = w_scaled.clamp(-1, 1).round()
    # STE: detach the rounding, keep gradients on the latent weights
    w_quant = w + (w_quant * gamma - w).detach()
    return w_quant, gamma
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class BitLinear158(nn.Module):
|
| 89 |
+
"""
|
| 90 |
+
1.58-bit Linear Layer from BitNet b1.58.
|
| 91 |
+
|
| 92 |
+
Drop-in replacement for nn.Linear with:
|
| 93 |
+
- Ternary weights {-1, 0, +1} via absmean quantization
|
| 94 |
+
- 8-bit per-token activation quantization
|
| 95 |
+
- Built-in RMSNorm (absorbs the preceding LayerNorm)
|
| 96 |
+
- No bias (following BitNet b1.58 / LLaMA convention)
|
| 97 |
+
- Full-precision latent weights maintained for training (STE)
|
| 98 |
+
|
| 99 |
+
Forward pass:
|
| 100 |
+
1. RMSNorm the input
|
| 101 |
+
2. Quantize activations to 8-bit
|
| 102 |
+
3. Quantize weights to ternary
|
| 103 |
+
4. Matrix multiply (effectively integer addition)
|
| 104 |
+
5. Rescale output
|
| 105 |
+
|
| 106 |
+
During training, gradients flow through quantization via the
|
| 107 |
+
Straight-Through Estimator (STE) — the gradient of round()
|
| 108 |
+
is treated as the identity function.
|
| 109 |
+
|
| 110 |
+
Reference: arXiv:2402.17764
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
def __init__(self, in_features: int, out_features: int):
|
| 114 |
+
super().__init__()
|
| 115 |
+
self.in_features = in_features
|
| 116 |
+
self.out_features = out_features
|
| 117 |
+
|
| 118 |
+
# Full-precision latent weight (master copy for training)
|
| 119 |
+
self.weight = nn.Parameter(torch.empty(out_features, in_features))
|
| 120 |
+
|
| 121 |
+
# Built-in RMSNorm (replaces the preceding LayerNorm)
|
| 122 |
+
self.rms_norm = RMSNorm(in_features)
|
| 123 |
+
|
| 124 |
+
# Initialize weights
|
| 125 |
+
self._init_weights()
|
| 126 |
+
|
| 127 |
+
def _init_weights(self):
|
| 128 |
+
"""Kaiming uniform initialization, same as nn.Linear."""
|
| 129 |
+
nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
|
| 130 |
+
|
| 131 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 132 |
+
"""
|
| 133 |
+
Args:
|
| 134 |
+
x: (batch, seq_len, in_features)
|
| 135 |
+
Returns:
|
| 136 |
+
(batch, seq_len, out_features)
|
| 137 |
+
"""
|
| 138 |
+
# 1. Normalize input (built-in RMSNorm)
|
| 139 |
+
x = self.rms_norm(x)
|
| 140 |
+
|
| 141 |
+
# 2. Quantize activations to 8-bit per-token
|
| 142 |
+
x_q = activation_quant(x)
|
| 143 |
+
|
| 144 |
+
# 3. Quantize weights to ternary {-1, 0, +1}
|
| 145 |
+
w_q, w_scale = weight_quant(self.weight)
|
| 146 |
+
|
| 147 |
+
# 4. Matrix multiply with quantized weights and activations
|
| 148 |
+
# In theory this is integer addition; in practice we use float
|
| 149 |
+
# for autograd compatibility during training
|
| 150 |
+
output = F.linear(x_q, w_q)
|
| 151 |
+
|
| 152 |
+
return output
|
| 153 |
+
|
| 154 |
+
def extra_repr(self) -> str:
|
| 155 |
+
return f"in={self.in_features}, out={self.out_features}, bits=1.58"
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
class SwiGLU(nn.Module):
|
| 159 |
+
"""
|
| 160 |
+
SwiGLU activation for Feed-Forward Networks.
|
| 161 |
+
|
| 162 |
+
SwiGLU(x) = (Swish(xW1) ⊙ xV) W2
|
| 163 |
+
|
| 164 |
+
Uses 3 linear projections instead of 2, but the hidden dim
|
| 165 |
+
is typically reduced by 2/3 to keep parameter count similar.
|
| 166 |
+
|
| 167 |
+
When used with BitLinear158, all three projections are ternary.
|
| 168 |
+
|
| 169 |
+
Reference: arXiv:2002.05202 (Shazeer, 2020)
|
| 170 |
+
"""
|
| 171 |
+
|
| 172 |
+
def __init__(self, in_features: int, hidden_features: int = None, use_bitlinear: bool = True):
|
| 173 |
+
super().__init__()
|
| 174 |
+
hidden_features = hidden_features or int(in_features * 8 / 3) # 2/3 of 4x expansion
|
| 175 |
+
# Round to nearest multiple of 8 for efficiency
|
| 176 |
+
hidden_features = ((hidden_features + 7) // 8) * 8
|
| 177 |
+
|
| 178 |
+
Linear = BitLinear158 if use_bitlinear else nn.Linear
|
| 179 |
+
|
| 180 |
+
self.w1 = Linear(in_features, hidden_features) # gate projection
|
| 181 |
+
self.v = Linear(in_features, hidden_features) # value projection
|
| 182 |
+
self.w2 = Linear(hidden_features, in_features) # output projection
|
| 183 |
+
|
| 184 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 185 |
+
return self.w2(F.silu(self.w1(x)) * self.v(x))
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
# ──── Testing ────────────────────────────────────────────────────
|
| 189 |
+
|
| 190 |
+
if __name__ == "__main__":
|
| 191 |
+
print("Testing BitLinear158 components...")
|
| 192 |
+
|
| 193 |
+
# Test RMSNorm
|
| 194 |
+
norm = RMSNorm(256)
|
| 195 |
+
x = torch.randn(2, 10, 256)
|
| 196 |
+
y = norm(x)
|
| 197 |
+
print(f"RMSNorm: {x.shape} -> {y.shape}, mean={y.mean():.4f}, std={y.std():.4f}")
|
| 198 |
+
|
| 199 |
+
# Test weight quantization
|
| 200 |
+
w = torch.randn(512, 256)
|
| 201 |
+
w_q, scale = weight_quant(w)
|
| 202 |
+
unique = torch.unique(w_q.detach())
|
| 203 |
+
print(f"Weight quant: {w.shape}, unique values: {len(unique)}, scale: {scale:.4f}")
|
| 204 |
+
print(f" Ternary distribution: -1={((w_q.detach().round() == -1).sum().item())}, "
|
| 205 |
+
f"0={((w_q.detach().round() == 0).sum().item())}, "
|
| 206 |
+
f"+1={((w_q.detach().round() == 1).sum().item())}")
|
| 207 |
+
|
| 208 |
+
# Test activation quantization
|
| 209 |
+
a = torch.randn(2, 10, 256)
|
| 210 |
+
a_q = activation_quant(a)
|
| 211 |
+
print(f"Activation quant: range [{a_q.min():.2f}, {a_q.max():.2f}]")
|
| 212 |
+
|
| 213 |
+
# Test BitLinear158
|
| 214 |
+
layer = BitLinear158(256, 512)
|
| 215 |
+
x = torch.randn(2, 10, 256)
|
| 216 |
+
y = layer(x)
|
| 217 |
+
print(f"BitLinear158: {x.shape} -> {y.shape}")
|
| 218 |
+
|
| 219 |
+
# Test gradient flow (STE)
|
| 220 |
+
loss = y.sum()
|
| 221 |
+
loss.backward()
|
| 222 |
+
assert layer.weight.grad is not None, "Gradient did not flow through STE!"
|
| 223 |
+
print(f"STE gradient flow: OK (grad norm: {layer.weight.grad.norm():.4f})")
|
| 224 |
+
|
| 225 |
+
# Test SwiGLU
|
| 226 |
+
swiglu = SwiGLU(256, use_bitlinear=True)
|
| 227 |
+
x = torch.randn(2, 10, 256)
|
| 228 |
+
y = swiglu(x)
|
| 229 |
+
print(f"SwiGLU (BitLinear): {x.shape} -> {y.shape}")
|
| 230 |
+
total = sum(p.numel() for p in swiglu.parameters())
|
| 231 |
+
print(f" SwiGLU params: {total:,}")
|
| 232 |
+
|
| 233 |
+
# Parameter comparison
|
| 234 |
+
ff_standard = nn.Sequential(nn.Linear(256, 512), nn.GELU(), nn.Linear(512, 256))
|
| 235 |
+
ff_params = sum(p.numel() for p in ff_standard.parameters())
|
| 236 |
+
print(f" Standard FFN params: {ff_params:,}")
|
| 237 |
+
print(f" Ratio: {total / ff_params:.2f}x")
|
| 238 |
+
|
| 239 |
+
print("\nAll tests passed! ✓")
|
model/text_encoder.py
ADDED
|
@@ -0,0 +1,122 @@
"""
PixelArtGen — Text Encoder

A small transformer encoder that converts text prompts into
contextual embeddings for conditioning the pixel art decoder.
"""

import math
from typing import List

import torch
import torch.nn as nn


class TextTokenizer:
    """Simple word-level tokenizer for text prompts."""

    def __init__(self, vocab: List[str]):
        self.word2idx = {w: i for i, w in enumerate(vocab)}
        self.idx2word = {i: w for i, w in enumerate(vocab)}
        self.pad_idx = self.word2idx.get("<pad>", 0)
        self.sos_idx = self.word2idx.get("<sos>", 1)
        self.eos_idx = self.word2idx.get("<eos>", 2)
        self.unk_idx = self.word2idx.get("<unk>", 3)
        self.vocab_size = len(vocab)

    def encode(self, text: str, max_len: int = 32) -> torch.Tensor:
        """Tokenize and pad a text prompt."""
        words = text.lower().strip().split()
        tokens = [self.sos_idx]
        for w in words:
            tokens.append(self.word2idx.get(w, self.unk_idx))
        tokens.append(self.eos_idx)

        # Pad or truncate
        if len(tokens) > max_len:
            tokens = tokens[:max_len]
        else:
            tokens += [self.pad_idx] * (max_len - len(tokens))

        return torch.tensor(tokens, dtype=torch.long)

    def encode_batch(self, texts: List[str], max_len: int = 32) -> torch.Tensor:
        """Encode a batch of text prompts."""
        return torch.stack([self.encode(t, max_len) for t in texts])


class TextEncoder(nn.Module):
    """
    Small transformer encoder for text prompts.

    Architecture:
    - Word embeddings + sinusoidal positional encoding
    - N transformer encoder layers with multi-head attention
    - Output: sequence of contextual embeddings (batch, seq_len, d_model)
    """

    def __init__(
        self,
        vocab_size: int,
        d_model: int = 256,
        nhead: int = 4,
        num_layers: int = 3,
        dim_feedforward: int = 512,
        max_seq_len: int = 32,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_encoding = SinusoidalPositionalEncoding(d_model, max_seq_len)
        self.dropout = nn.Dropout(dropout)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
            norm_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, text_tokens: torch.Tensor) -> torch.Tensor:
        """
        Args:
            text_tokens: (batch, seq_len) long tensor of word indices
        Returns:
            (batch, seq_len, d_model) contextual embeddings
        """
        # Create padding mask (True = ignore)
        pad_mask = (text_tokens == 0)  # pad_idx = 0

        # Embed + positional encode
        x = self.embedding(text_tokens) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        x = self.dropout(x)

        # Transformer encode
        x = self.transformer(x, src_key_padding_mask=pad_mask)
        x = self.norm(x)

        return x


class SinusoidalPositionalEncoding(nn.Module):
    """Standard sinusoidal positional encoding."""

    def __init__(self, d_model: int, max_len: int = 512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.pe[:, :x.size(1)]
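A quick usage sketch of the tokenizer and encoder above. The toy vocabulary is an illustrative stand-in for the repo's `vocab.json` (222 words); with the default `d_model=256`, the encoder emits one 256-dim embedding per token position.

```python
import torch
from model.text_encoder import TextTokenizer, TextEncoder

# Toy vocabulary for illustration; the real one is loaded from vocab.json.
vocab = ["<pad>", "<sos>", "<eos>", "<unk>", "a", "red", "pixel", "art", "sword"]
tok = TextTokenizer(vocab)
enc = TextEncoder(vocab_size=tok.vocab_size)

tokens = tok.encode_batch(["a red pixel art sword"])  # (1, 32) long tensor
with torch.no_grad():
    emb = enc(tokens)                                  # (1, 32, 256)
print(emb.shape)
```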
|
model/tokenizer.py
ADDED
|
@@ -0,0 +1,106 @@
"""
PixelArtGen — Color Palette Tokenizer

Converts 32×32 RGB pixel art images into sequences of palette indices
and back. This is the "vocabulary" for the pixel language model.
"""

import numpy as np
import torch
from pathlib import Path


class PaletteTokenizer:
    """
    Maps RGB pixels to/from a fixed palette of N colors.
    Each pixel becomes a token index ∈ [0, palette_size).

    Special tokens:
        palette_size     = <sos> (start of sequence)
        palette_size + 1 = <eos> (end of sequence)
        palette_size + 2 = <pad> (padding)
    """

    def __init__(self, palette_path: str = None, palette: np.ndarray = None, palette_size: int = 256):
        if palette is not None:
            self.palette = palette.astype(np.float32)
        elif palette_path is not None:
            self.palette = np.load(palette_path).astype(np.float32)
        else:
            raise ValueError("Must provide palette_path or palette array")

        self.palette_size = len(self.palette)
        self.sos_token = self.palette_size
        self.eos_token = self.palette_size + 1
        self.pad_token = self.palette_size + 2
        self.vocab_size = self.palette_size + 3  # colors + sos + eos + pad

    def rgb_to_index(self, rgb: np.ndarray) -> int:
        """Find the closest palette color for an RGB value."""
        distances = np.sum((self.palette - rgb.astype(np.float32)) ** 2, axis=1)
        return int(np.argmin(distances))

    def encode_image(self, img_array: np.ndarray) -> list:
        """
        Encode a 32×32×3 RGB image into a flat sequence of palette indices.
        Returns: [sos, p0, p1, ..., p1023, eos] (1026 tokens)
        """
        h, w, c = img_array.shape
        assert h == 32 and w == 32 and c == 3, f"Expected 32×32×3, got {img_array.shape}"

        tokens = [self.sos_token]
        for y in range(h):
            for x in range(w):
                pixel = img_array[y, x]
                idx = self.rgb_to_index(pixel)
                tokens.append(idx)
        tokens.append(self.eos_token)
        return tokens

    def encode_image_fast(self, img_array: np.ndarray) -> list:
        """Vectorized encoding — much faster than pixel-by-pixel."""
        h, w, c = img_array.shape
        pixels = img_array.reshape(-1, 3).astype(np.float32)  # (1024, 3)

        # Compute distances to all palette colors at once
        # pixels: (1024, 3), palette: (N, 3)
        diff = pixels[:, None, :] - self.palette[None, :, :]  # (1024, N, 3)
        distances = np.sum(diff ** 2, axis=2)                 # (1024, N)
        indices = np.argmin(distances, axis=1)                # (1024,)

        tokens = [self.sos_token] + indices.tolist() + [self.eos_token]
        return tokens

    def decode_tokens(self, tokens: list) -> np.ndarray:
        """
        Decode a sequence of palette indices back to a 32×32×3 RGB image.
        Strips sos/eos/pad tokens.
        """
        # Filter special tokens
        pixel_tokens = [t for t in tokens if t < self.palette_size]

        # Pad or truncate to exactly 1024 pixels
        if len(pixel_tokens) < 1024:
            pixel_tokens += [0] * (1024 - len(pixel_tokens))
        pixel_tokens = pixel_tokens[:1024]

        img = np.zeros((1024, 3), dtype=np.uint8)
        for i, idx in enumerate(pixel_tokens):
            idx = min(idx, self.palette_size - 1)
            img[i] = self.palette[idx].astype(np.uint8)

        return img.reshape(32, 32, 3)

    def tokens_to_tensor(self, tokens: list, max_len: int = 1026) -> torch.Tensor:
        """Convert token list to padded tensor."""
        if len(tokens) > max_len:
            tokens = tokens[:max_len]
        else:
            tokens = tokens + [self.pad_token] * (max_len - len(tokens))
        return torch.tensor(tokens, dtype=torch.long)

    def get_palette_tensor(self) -> torch.Tensor:
        """Return the palette as a (palette_size, 3) float32 tensor."""
        return torch.tensor(self.palette, dtype=torch.float32)
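A round-trip sketch of the palette tokenizer. The random palette here stands in for the repo's `palette_256.npy`; decoding re-quantizes each pixel to its nearest palette color, so `recon` matches `img` only up to palette error.

```python
import numpy as np
from model.tokenizer import PaletteTokenizer

# Random stand-in palette; in the repo this comes from palette_256.npy.
palette = np.random.randint(0, 256, size=(256, 3)).astype(np.uint8)
tok = PaletteTokenizer(palette=palette)

img = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
tokens = tok.encode_image_fast(img)   # [sos, 1024 palette indices, eos]
assert len(tokens) == 1026
recon = tok.decode_tokens(tokens)     # (32, 32, 3), palette-quantized
print(recon.shape, tok.vocab_size)    # (32, 32, 3) 259
```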
|