SeaWolf-AI committed on
Commit
ca19627
ยท
verified ยท
1 Parent(s): 1fe984b

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +320 -0
  2. config.py +149 -0
  3. layers.py +449 -0
  4. model.py +228 -0
  5. oheng_moe.py +292 -0
  6. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
AETHER-Net 0.8B — Inference Test Space

Loads the private model and tests text generation.
HF Space: T4 GPU, requires the HF_TOKEN secret.

Deploy: FINAL-Bench/aether-net-test
"""
import os
import sys
import time
import json
import torch
import torch.nn.functional as F
import gradio as gr
from pathlib import Path
from huggingface_hub import hf_hub_download, snapshot_download

# ── Config ──
MODEL_REPO = "FINAL-Bench/AETHER-Net-0.8B"
DONOR_REPO = "Qwen/Qwen3.5-0.8B"  # For tokenizer (model repo ships no tokenizer files)
HF_TOKEN = os.getenv("HF_TOKEN")  # required for the private model repo
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Startup diagnostics — visible in the Space logs.
print(f"Device: {DEVICE}")
print(f"HF_TOKEN: {'set' if HF_TOKEN else 'NOT SET'}")

# ── Download model weights from private repo ──
# Done at import time so the first request does not pay the download cost.
print(f"Downloading AETHER-Net weights from {MODEL_REPO}...")

model_dir = None  # stays None on failure; load_model() falls back to defaults/random init
try:
    model_dir = snapshot_download(
        MODEL_REPO, token=HF_TOKEN,
        allow_patterns=["model.safetensors", "config.json"],
    )
    print(f" Model downloaded to: {model_dir}")
except Exception as e:
    # Best-effort: the app still starts so the UI can show the error state.
    print(f" Download failed: {e}")

# Source files (config.py, model.py, ...) are co-located in the same directory
APP_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, APP_DIR)

# ── Load model ──
# Module-level singletons populated lazily by load_model().
MODEL = None
TOKENIZER = None
49
+
50
+
51
def load_model():
    """Lazily load the donor tokenizer and the AETHER-Net model into globals.

    Populates the module-level MODEL and TOKENIZER singletons. Safe to call
    repeatedly — returns immediately once MODEL is set.

    Returns:
        bool: True when both tokenizer and model are ready, False on any failure.
    """
    global MODEL, TOKENIZER

    # Already loaded — nothing to do.
    if MODEL is not None:
        return True

    # Load tokenizer from donor repo (model repo has no tokenizer files)
    print("Loading tokenizer...")
    from transformers import AutoTokenizer
    try:
        TOKENIZER = AutoTokenizer.from_pretrained(
            DONOR_REPO, trust_remote_code=True, token=HF_TOKEN
        )
        print(f" Tokenizer loaded: vocab_size={TOKENIZER.vocab_size}")
    except Exception as e:
        print(f" Tokenizer failed: {e}")
        return False

    # Load AETHER-Net
    print("Loading AETHER-Net model...")
    try:
        # Local modules (APP_DIR was pushed onto sys.path at import time).
        from config import AetherNetConfig
        from model import AetherNetModel

        # Load config from the downloaded snapshot when available.
        config_path = Path(model_dir) / "config.json" if model_dir else None
        if config_path and config_path.exists():
            with open(config_path) as f:
                cfg_dict = json.load(f)
            # Filter to valid dataclass fields so unknown keys in the
            # checkpoint config don't crash the constructor.
            valid_fields = {k for k in AetherNetConfig.__dataclass_fields__}
            filtered = {k: v for k, v in cfg_dict.items() if k in valid_fields}
            config = AetherNetConfig(**filtered)
            print(f" Config loaded: hidden={config.hidden_size}, layers={config.num_layers}")
        else:
            # Fallback defaults sized for the 0.8B variant.
            print(" No config.json, using defaults")
            config = AetherNetConfig(
                hidden_size=1024, intermediate_size=3584,
                num_layers=25, num_attention_heads=16, num_kv_heads=2,
                head_dim=64, vocab_size=248320,
                max_position_embeddings=4096,
                expert_intermediate_size=716,
                overcome_gate_hidden=64,
                sliding_window_size=1024,
                gdn_state_size=64, mamba2_state_size=64,
                tie_word_embeddings=True,
            )

        model = AetherNetModel(config)

        # Load weights; strict=False tolerates partial/extra tensors.
        weights_path = Path(model_dir) / "model.safetensors" if model_dir else None
        if weights_path and weights_path.exists():
            from safetensors.torch import load_file
            state = load_file(str(weights_path), device="cpu")
            model.load_state_dict(state, strict=False)
            print(f" Weights loaded: {len(state)} tensors")
        else:
            # Degraded mode: UI still works but outputs are meaningless.
            print(" ⚠️ No weights found, using random init")

        model = model.to(DEVICE).eval()
        MODEL = model

        # Rough memory estimate assuming 2 bytes/param (BF16).
        params = sum(p.numel() for p in model.parameters())
        mem = params * 2 / 1e9  # BF16 estimate
        print(f" Model ready: {params:,} params (~{mem:.1f}GB)")
        return True

    except Exception as e:
        import traceback
        print(f" Model load failed: {e}")
        traceback.print_exc()
        return False
124
+
125
+
126
+ # โ”€โ”€ Generation โ”€โ”€
127
+ @torch.no_grad()
128
+ def generate(prompt, max_tokens=128, temperature=0.8, top_k=50, top_p=0.9):
129
+ """Generate text from prompt."""
130
+ if MODEL is None:
131
+ success = load_model()
132
+ if not success:
133
+ return "โŒ Model failed to load. Check logs."
134
+
135
+ # Tokenize
136
+ input_ids = TOKENIZER.encode(prompt, return_tensors="pt").to(DEVICE)
137
+ generated = input_ids.clone()
138
+
139
+ t0 = time.time()
140
+
141
+ for i in range(max_tokens):
142
+ # Truncate to max position
143
+ if generated.shape[1] > 4096:
144
+ generated = generated[:, -4096:]
145
+
146
+ outputs = MODEL(input_ids=generated)
147
+ logits = outputs["logits"][:, -1, :]
148
+
149
+ # Temperature
150
+ if temperature > 0:
151
+ logits = logits / temperature
152
+
153
+ # Top-k
154
+ if top_k > 0:
155
+ values, _ = torch.topk(logits, top_k)
156
+ min_val = values[:, -1].unsqueeze(-1)
157
+ logits = torch.where(logits < min_val, torch.full_like(logits, -float('inf')), logits)
158
+
159
+ # Top-p (nucleus)
160
+ if top_p < 1.0:
161
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
162
+ cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
163
+ mask = cum_probs - F.softmax(sorted_logits, dim=-1) > top_p
164
+ sorted_logits[mask] = -float('inf')
165
+ logits = sorted_logits.scatter(1, sorted_indices, sorted_logits)
166
+
167
+ probs = F.softmax(logits, dim=-1)
168
+ next_token = torch.multinomial(probs, num_samples=1)
169
+ else:
170
+ next_token = logits.argmax(dim=-1, keepdim=True)
171
+
172
+ generated = torch.cat([generated, next_token], dim=-1)
173
+
174
+ # EOS check
175
+ if next_token.item() == TOKENIZER.eos_token_id:
176
+ break
177
+
178
+ elapsed = time.time() - t0
179
+ tokens_generated = generated.shape[1] - input_ids.shape[1]
180
+ tps = tokens_generated / elapsed if elapsed > 0 else 0
181
+
182
+ output_text = TOKENIZER.decode(generated[0], skip_special_tokens=True)
183
+ stats = f"\n\n---\n๐Ÿ“Š {tokens_generated} tokens | {tps:.1f} tok/s | {elapsed:.2f}s"
184
+
185
+ return output_text + stats
186
+
187
+
188
def get_model_info():
    """Return a Markdown report of the loaded model's architecture.

    Loads the model on first call. Reports parameter counts, the per-layer
    attention-type/element map, and the average learned generate-boost α
    per element.
    """
    if MODEL is None:
        load_model()

    if MODEL is None:
        return "Model not loaded"

    # Fix: use the actual layer count everywhere (the Oheng section below
    # previously hard-coded range(25)).
    num_layers = len(MODEL.layers)

    info = "## AETHER-Net 0.8B — Architecture Info\n\n"
    info += "| Item | Value |\n|---|---|\n"
    info += f"| Device | {DEVICE} |\n"
    info += f"| Parameters | {sum(p.numel() for p in MODEL.parameters()):,} |\n"
    info += f"| Layers | {num_layers} |\n"
    info += f"| Vocab | {MODEL.config.vocab_size:,} |\n"
    info += f"| Hidden | {MODEL.config.hidden_size} |\n"

    # Layer types
    from config import LAYER_TYPES, LAYER_TO_ELEMENT, ELEMENTS
    info += "\n### Layer Map\n\n"
    info += "| Layer | Type | Element |\n|---|---|---|\n"
    for i in range(num_layers):
        info += f"| {i} | {LAYER_TYPES[i].upper()} | {LAYER_TO_ELEMENT[i]} |\n"

    # Oheng status: average sigmoid(α) of each element's generate-boost.
    info += "\n### Oheng Status\n\n"
    for elem in ELEMENTS:
        layers = [i for i in range(num_layers) if LAYER_TO_ELEMENT[i] == elem]
        alphas = []
        for li in layers:
            gb = MODEL.layers[li].moe.generate_boost
            if gb is not None:
                a = torch.sigmoid(gb.alpha).detach()
                eidx = ELEMENTS.index(elem)
                if eidx < a.shape[0]:
                    alphas.append(a[eidx].item())
        avg = sum(alphas) / len(alphas) if alphas else 0
        info += f"- {elem}: α={avg:.4f}\n"

    return info
229
+
230
+
231
+ # โ”€โ”€ Gradio UI โ”€โ”€
232
+ TITLE = """
233
+ <div style="text-align:center; padding:15px 0;">
234
+ <h1>๐ŸŒŒ AETHER-Net 0.8B โ€” Inference Test</h1>
235
+ <p style="color:#666;">Cross-Architecture Knowledge Distillation from Qwen3.5-0.8B</p>
236
+ <p style="color:#999; font-size:0.9em;">5ร—5 Magic Square | Oheng MoE | 5 Attention Types</p>
237
+ </div>
238
+ """
239
+
240
+ with gr.Blocks(title="AETHER-Net Test") as app:
241
+ gr.HTML(TITLE)
242
+
243
+ with gr.Tabs():
244
+ with gr.Tab("๐Ÿ’ฌ Generate"):
245
+ gr.Markdown("ํ”„๋กฌํ”„ํŠธ๋ฅผ ์ž…๋ ฅํ•˜๋ฉด AETHER-Net์ด ํ…์ŠคํŠธ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.")
246
+
247
+ with gr.Row():
248
+ with gr.Column(scale=3):
249
+ prompt = gr.Textbox(
250
+ label="Prompt",
251
+ placeholder="Enter your prompt here...",
252
+ lines=3,
253
+ value="The theory of relativity explains that"
254
+ )
255
+ with gr.Column(scale=1):
256
+ max_tokens = gr.Slider(16, 512, value=128, step=16, label="Max Tokens")
257
+ temperature = gr.Slider(0.0, 2.0, value=0.8, step=0.1, label="Temperature")
258
+ top_k = gr.Slider(0, 100, value=50, step=5, label="Top-K")
259
+ top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
260
+
261
+ gen_btn = gr.Button("๐Ÿš€ Generate", variant="primary", size="lg")
262
+ output = gr.Textbox(label="Output", lines=12, interactive=False)
263
+
264
+ gen_btn.click(
265
+ fn=generate,
266
+ inputs=[prompt, max_tokens, temperature, top_k, top_p],
267
+ outputs=output,
268
+ )
269
+
270
+ gr.Markdown("### Quick Prompts")
271
+ examples = gr.Examples(
272
+ examples=[
273
+ ["The theory of relativity explains that"],
274
+ ["In Python, the most efficient way to sort a list is"],
275
+ ["The five elements of nature are"],
276
+ ["Artificial general intelligence requires"],
277
+ ["ํ•œ๊ตญ์˜ ์ˆ˜๋„๋Š”"],
278
+ ["def fibonacci(n):"],
279
+ ],
280
+ inputs=prompt,
281
+ )
282
+
283
+ with gr.Tab("๐Ÿ” Model Info"):
284
+ info_btn = gr.Button("Load Model Info", variant="primary")
285
+ info_output = gr.Markdown()
286
+ info_btn.click(fn=get_model_info, outputs=info_output)
287
+
288
+ with gr.Tab("โ„น๏ธ About"):
289
+ gr.Markdown("""
290
+ ## AETHER-Net 0.8B
291
+
292
+ **Cross-Architecture Knowledge Distillation from Qwen3.5-0.8B**
293
+
294
+ ### Method
295
+ - **Weight Transplant**: Qwen3.5-0.8B โ†’ AETHER-Net (5ร—5 Magic Square layout)
296
+ - **3-Stage MOHAWK Distillation**: KLD โ†’ Hidden Alignment โ†’ Oheng Regularization
297
+ - **Cost**: ~$0 (CPU-only, 100 steps demo)
298
+
299
+ ### Architecture
300
+ - 25 Layers: 5 attention types ร— 5 elements
301
+ - GDN, Full, Mamba2, Sliding Window, Cross Attention
302
+ - Oheng MoE: 25 experts, ์ƒ์ƒ(Generate) + ์ƒ๊ทน(Overcome)
303
+
304
+ ### Source
305
+ - Model: [FINAL-Bench/AETHER-Net-0.8B](https://huggingface.co/FINAL-Bench/AETHER-Net-0.8B) (private)
306
+ - Space: [FINAL-Bench/agi-model-gen](https://huggingface.co/spaces/FINAL-Bench/agi-model-gen)
307
+
308
+ ---
309
+ ยฉ 2026 VIDRAFT / Ginigen AI
310
+ """)
311
+
312
+
313
+ # โ”€โ”€ Preload model on startup โ”€โ”€
314
+ print("\n=== Pre-loading model ===")
315
+ load_model()
316
+ print("=== Ready ===\n")
317
+
318
+
319
+ if __name__ == "__main__":
320
+ app.launch()
config.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AETHER-Net Configuration
3
+ Adaptive Elemental Transformer-Hybrid Efficient Recurrent Network
4
+
5
+ 5ร—5 Latin Orthogonal Magic Square Layout + Oheng(ไบ”่กŒ) MoE Routing
6
+ """
7
+ from dataclasses import dataclass, field
8
+ from typing import List, Tuple
9
+
10
+ # โ”€โ”€ 5ร—5 Latin Orthogonal Magic Square โ”€โ”€
11
+ # Each row (element group) and each column (phase) contains
12
+ # exactly one of each attention type โ†’ zero carry-over bias.
13
+ MAGIC_SQUARE = [
14
+ # Phase1 Phase2 Phase3 Phase4 Phase5
15
+ ["gdn", "full", "mamba2", "slide", "cross"], # ๆœจ Wood
16
+ ["slide", "gdn", "full", "cross", "mamba2"], # ็ซ Fire
17
+ ["full", "cross", "slide", "mamba2", "gdn"], # ๅœŸ Earth
18
+ ["mamba2", "slide", "cross", "gdn", "full"], # ้‡‘ Metal
19
+ ["cross", "mamba2", "gdn", "full", "slide"], # ๆฐด Water
20
+ ]
21
+
22
+ # Flatten to 25-layer sequence (row-major)
23
+ LAYER_TYPES = [t for row in MAGIC_SQUARE for t in row]
24
+
25
+ # โ”€โ”€ Oheng (ไบ”่กŒ) Element System โ”€โ”€
26
+ ELEMENTS = ["wood", "fire", "earth", "metal", "water"]
27
+
28
+ # ์ƒ์ƒ (Generate): ๆœจโ†’็ซโ†’ๅœŸโ†’้‡‘โ†’ๆฐดโ†’ๆœจ
29
+ GENERATE = {"wood": "fire", "fire": "earth", "earth": "metal", "metal": "water", "water": "wood"}
30
+ GENERATE_REVERSE = {v: k for k, v in GENERATE.items()}
31
+
32
+ # ์ƒ๊ทน (Overcome): ๆœจโŠฃๅœŸ, ๅœŸโŠฃๆฐด, ๆฐดโŠฃ็ซ, ็ซโŠฃ้‡‘, ้‡‘โŠฃๆœจ
33
+ OVERCOME = {"wood": "earth", "earth": "water", "water": "fire", "fire": "metal", "metal": "wood"}
34
+ OVERCOME_REVERSE = {v: k for k, v in OVERCOME.items()}
35
+
36
+ # Element โ†’ Layer indices (0-based)
37
+ ELEMENT_LAYERS = {
38
+ "wood": [0, 1, 2, 3, 4],
39
+ "fire": [5, 6, 7, 8, 9],
40
+ "earth": [10, 11, 12, 13, 14],
41
+ "metal": [15, 16, 17, 18, 19],
42
+ "water": [20, 21, 22, 23, 24],
43
+ }
44
+
45
+ # Element โ†’ Expert indices (0-based, 5 experts per element)
46
+ ELEMENT_EXPERTS = {
47
+ "wood": [0, 1, 2, 3, 4],
48
+ "fire": [5, 6, 7, 8, 9],
49
+ "earth": [10, 11, 12, 13, 14],
50
+ "metal": [15, 16, 17, 18, 19],
51
+ "water": [20, 21, 22, 23, 24],
52
+ }
53
+
54
+ # Layer index โ†’ element name
55
+ LAYER_TO_ELEMENT = {}
56
+ for elem, indices in ELEMENT_LAYERS.items():
57
+ for idx in indices:
58
+ LAYER_TO_ELEMENT[idx] = elem
59
+
60
+
61
@dataclass
class AetherNetConfig:
    """Configuration for AETHER-Net model.

    Defaults describe a large (~several-B parameter) variant; the 0.8B demo
    overrides most dimensions at construction time (see app.py).
    """

    # ── Model dimensions ──
    hidden_size: int = 4096
    intermediate_size: int = 11008  # FFN intermediate (SwiGLU)
    num_layers: int = 25
    num_attention_heads: int = 32
    num_kv_heads: int = 8  # GQA for Full Attention layers
    head_dim: int = 128  # hidden_size // num_attention_heads
    vocab_size: int = 151936  # Qwen tokenizer
    max_position_embeddings: int = 262144
    rope_theta: float = 10000000.0

    # ── Layer schedule (from magic square) ──
    # NOTE(review): default_factory returns the shared module-level list, so
    # mutating one instance's layer_types mutates all defaults — confirm this
    # is intended (use `lambda: list(LAYER_TYPES)` for per-instance copies).
    layer_types: List[str] = field(default_factory=lambda: LAYER_TYPES)

    # ── MoE Configuration ──
    num_experts: int = 25
    num_experts_per_group: int = 5
    num_element_groups: int = 5
    top_k: int = 2
    num_shared_experts: int = 1
    expert_intermediate_size: int = 2752  # intermediate_size // 4 (per expert)
    moe_jitter_eps: float = 0.01

    # ── Oheng (五行) routing ──
    use_generate_boost: bool = True
    use_overcome_gate: bool = True
    generate_alpha_init: float = 0.1  # learnable soft scalar
    overcome_gate_hidden: int = 256  # critic head hidden dim

    # ── Attention-specific ──
    sliding_window_size: int = 4096
    gdn_state_size: int = 128  # Gated DeltaNet state dimension
    mamba2_state_size: int = 128
    mamba2_conv_size: int = 4
    mamba2_expand: int = 2

    # ── Training / Inference ──
    rms_norm_eps: float = 1e-6
    initializer_range: float = 0.02
    tie_word_embeddings: bool = False
    use_cache: bool = True
    torch_dtype: str = "bfloat16"

    # ── Donor transplant info (metadata) ──
    # NOTE(review): app.py uses "Qwen/Qwen3.5-0.8B" as the tokenizer donor
    # while this default names the 27B — presumably intentional metadata
    # about weight transplant provenance; confirm.
    primary_donor: str = "Qwen/Qwen3.5-27B"
    secondary_donor: str = "meta-llama/Llama-3.1-8B"

    def get_layer_type(self, layer_idx: int) -> str:
        """Return the attention type ("gdn"/"full"/...) of layer *layer_idx*."""
        return self.layer_types[layer_idx]

    def get_layer_element(self, layer_idx: int) -> str:
        """Return the Oheng element name owning layer *layer_idx*."""
        return LAYER_TO_ELEMENT[layer_idx]

    def get_element_expert_range(self, element: str) -> Tuple[int, int]:
        """Return the half-open [start, end) expert-index range for *element*.

        Assumes the element's expert indices are contiguous (they are, per
        ELEMENT_EXPERTS above).
        """
        indices = ELEMENT_EXPERTS[element]
        return (indices[0], indices[-1] + 1)

    def summary(self) -> str:
        """Return a human-readable multi-line architecture summary."""
        # Count how many layers use each attention type.
        type_counts = {}
        for t in self.layer_types:
            type_counts[t] = type_counts.get(t, 0) + 1
        # Back-of-envelope parameter estimate (billions); not exact.
        total_params_b = (
            self.num_experts * self.expert_intermediate_size * self.hidden_size * 3 * 2  # experts
            + self.num_layers * self.hidden_size * self.hidden_size * 4  # attention projections
            + self.vocab_size * self.hidden_size * 2  # embeddings
        ) / 1e9
        # Active fraction: routed top-k plus shared experts out of a group.
        active_params_b = total_params_b * (self.top_k + self.num_shared_experts) / self.num_experts_per_group
        lines = [
            "═" * 60,
            " AETHER-Net Architecture Summary",
            "═" * 60,
            f" Layers: {self.num_layers} (5×5 magic square)",
            f" Hidden dim: {self.hidden_size}",
            f" Attention mix: {type_counts}",
            f" MoE: {self.num_experts} experts / {self.num_element_groups} groups / top-{self.top_k}",
            f" Est. total: ~{total_params_b:.1f}B params",
            f" Est. active: ~{active_params_b:.1f}B params",
            f" Context: {self.max_position_embeddings:,} tokens",
            f" Oheng generate: {self.use_generate_boost} (α={self.generate_alpha_init})",
            f" Oheng overcome: {self.use_overcome_gate}",
            f" Primary donor: {self.primary_donor}",
            f" Secondary donor:{self.secondary_donor}",
            "═" * 60,
        ]
        return "\n".join(lines)
layers.py ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AETHER-Net Attention Layers
3
+ 5 types: GDN, Full, Mamba2, Sliding Window, Cross Attention
4
+
5
+ Each layer follows the same interface:
6
+ forward(hidden_states, attention_mask=None, position_ids=None, **kwargs) -> hidden_states
7
+ """
8
+ import math
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from typing import Optional, Tuple
13
+
14
+
15
class RMSNorm(nn.Module):
    """Root-mean-square LayerNorm (no mean subtraction, no bias).

    Statistics are computed in float32 for numerical stability and the
    result is cast back to the input dtype.
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))  # learnable per-channel scale
        self.eps = eps  # added to the variance before rsqrt to avoid div-by-zero

    def forward(self, x):
        # Fix: the original rebound `x` to the (float32-promoted) normalized
        # product before calling `.to(x.dtype)`, making the cast a no-op —
        # half-precision inputs were silently returned as float32.
        input_dtype = x.dtype
        x = x.float()
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.eps)
        return (self.weight * x).to(input_dtype)
25
+
26
+
27
def rotate_half(x):
    """Map the (a, b) halves of the last dim to (-b, a) — the RoPE rotation helper."""
    lo, hi = torch.chunk(x, 2, dim=-1)
    return torch.cat([-hi, lo], dim=-1)
30
+
31
+
32
def apply_rotary_pos_emb(q, k, cos, sin):
    """Apply rotary position embeddings to query and key tensors.

    Both tensors receive the same rotation: t → t·cos + rotate_half(t)·sin,
    with cos/sin broadcast over the head dimension.
    """
    rotated_q = q * cos + rotate_half(q) * sin
    rotated_k = k * cos + rotate_half(k) * sin
    return rotated_q, rotated_k
36
+
37
+
38
class RotaryEmbedding(nn.Module):
    """Rotary position embedding table.

    Precomputes inverse frequencies once; forward() emits [1, L, dim]
    cos/sin tables for the requested positions. The first argument of
    forward() is accepted only for call-site convention — the output
    depends solely on `position_ids`.
    """

    def __init__(self, dim: int, max_seq_len: int = 262144, theta: float = 10000000.0):
        super().__init__()
        exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
        inv_freq = 1.0 / theta ** exponents
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.max_seq_len = max_seq_len

    def forward(self, x, position_ids):
        # A 2-D [B, L] position tensor is assumed identical across the
        # batch, so only its first row is used.
        if position_ids.dim() == 2:
            position_ids = position_ids[0]
        angles = torch.outer(position_ids.float(), self.inv_freq.to(position_ids.device))
        table = torch.cat([angles, angles], dim=-1)
        return table.cos().unsqueeze(0), table.sin().unsqueeze(0)
51
+
52
+
53
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
54
+ # 1. FULL ATTENTION (Softmax, GQA, RoPE) โ€” O(nยฒ)
55
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
56
class FullAttention(nn.Module):
    """Standard grouped-query attention with RoPE and output gating.

    Kept for 5 layers — provides precise token-to-token reasoning.
    NOTE(review): the original docstring claimed these layers maintain a
    KV cache, but this forward recomputes full attention over the whole
    sequence each call — confirm against the training/serving code.
    """

    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_kv_heads
        self.head_dim = config.head_dim
        # How many query heads share each KV head (GQA group size).
        self.num_kv_groups = self.num_heads // self.num_kv_heads

        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)

        # Output gate (Qwen3.5 style gated attention)
        self.gate = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)

        self.rotary_emb = RotaryEmbedding(self.head_dim, config.max_position_embeddings, config.rope_theta)

    def forward(self, hidden_states, attention_mask=None, position_ids=None, **kwargs):
        """Compute causal GQA over the full sequence.

        hidden_states: [B, L, hidden]; attention_mask (if given) is added
        to the attention scores, so it must be in additive (-inf) form.
        """
        B, L, _ = hidden_states.shape

        # Project and reshape to [B, heads, L, head_dim].
        q = self.q_proj(hidden_states).view(B, L, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(hidden_states).view(B, L, self.num_kv_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(hidden_states).view(B, L, self.num_kv_heads, self.head_dim).transpose(1, 2)

        # RoPE — cos/sin broadcast over the head dimension.
        cos, sin = self.rotary_emb(hidden_states, position_ids)
        cos = cos.unsqueeze(1)
        sin = sin.unsqueeze(1)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        # GQA: expand KV heads to match the query head count.
        if self.num_kv_groups > 1:
            k = k.repeat_interleave(self.num_kv_groups, dim=1)
            v = v.repeat_interleave(self.num_kv_groups, dim=1)

        # Scaled dot-product attention — materializes the full [L, L] score
        # matrix (O(L²) memory; no flash/SDPA kernel used here).
        attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Causal mask (rebuilt every call; upper triangle → -inf).
        causal = torch.triu(torch.full((L, L), float('-inf'), device=attn.device), diagonal=1)
        attn = attn + causal.unsqueeze(0).unsqueeze(0)
        if attention_mask is not None:
            attn = attn + attention_mask

        # Softmax in float32 for stability, then back to the working dtype.
        attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(B, L, -1)

        # Output gating: sigmoid gate computed from the layer input.
        gate = torch.sigmoid(self.gate(hidden_states))
        out = out * gate

        return self.o_proj(out)
114
+
115
+
116
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
117
+ # 2. GATED DELTANET (GDN) โ€” O(n) linear time
118
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
119
class GatedDeltaNet(nn.Module):
    """Gated DeltaNet: Mamba-style gating + DeltaNet fast-weight update.

    Core linear attention mechanism — 10 layers (40% of model).

    Documented recurrence: M_t = α_t · M_{t-1} · (I - k_t q_tᵀ) + k_t v_tᵀ
    with SiLU output gating for gradient-flow stability.

    Weight transplant: Q,K,V projections map directly from Qwen3.5 GDN layers.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = config.head_dim
        # NOTE(review): gdn_state_size is stored but never used below — the
        # recurrent state is [head_dim, head_dim] per head; confirm.
        self.state_size = config.gdn_state_size

        # Input projections (transplantable from Qwen3.5 GDN)
        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)

        # Decay gate (α): controls memory decay speed (one scalar per head)
        self.decay_proj = nn.Linear(config.hidden_size, self.num_heads, bias=True)

        # Update gate (β): controls state update strength
        self.beta_proj = nn.Linear(config.hidden_size, self.num_heads, bias=True)

        # Output gate (SiLU activation for gradient stability)
        self.gate = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)

        # Short depthwise causal convolution for local context
        # (replaces positional encoding; padding=3 then trimmed to L).
        self.conv1d = nn.Conv1d(
            in_channels=config.hidden_size,
            out_channels=config.hidden_size,
            kernel_size=4, padding=3, groups=config.hidden_size, bias=True
        )

    def forward(self, hidden_states, attention_mask=None, position_ids=None, **kwargs):
        """Sequential O(L) scan over the sequence; returns [B, L, hidden]."""
        B, L, D = hidden_states.shape

        # Local context mixing via causal conv1d (right side trimmed to keep causality)
        conv_out = self.conv1d(hidden_states.transpose(1, 2))[..., :L].transpose(1, 2)

        # Q and K see the conv-mixed input; V sees the raw input.
        q = self.q_proj(conv_out).view(B, L, self.num_heads, self.head_dim)
        k = self.k_proj(conv_out).view(B, L, self.num_heads, self.head_dim)
        v = self.v_proj(hidden_states).view(B, L, self.num_heads, self.head_dim)

        # L2 normalize Q, K (replaces softmax normalization)
        q = F.normalize(q, p=2, dim=-1)
        k = F.normalize(k, p=2, dim=-1)

        # Per-head decay and update gates in (0, 1).
        alpha = torch.sigmoid(self.decay_proj(hidden_states)).unsqueeze(-1)  # [B, L, H, 1]
        beta = torch.sigmoid(self.beta_proj(hidden_states)).unsqueeze(-1)

        # Recurrent scan with delta rule.
        # Python-level loop over L — correct but slow; a fused kernel would
        # be the production path.
        outputs = []
        state = torch.zeros(B, self.num_heads, self.head_dim, self.head_dim,
                            device=hidden_states.device, dtype=hidden_states.dtype)

        for t in range(L):
            q_t = q[:, t]  # [B, H, D]
            k_t = k[:, t]
            v_t = v[:, t]
            a_t = alpha[:, t]  # [B, H, 1]
            b_t = beta[:, t]

            # Erase/write terms as outer products [B, H, D, D].
            erase = torch.einsum('bhd,bhe->bhde', k_t * b_t, q_t)
            write = torch.einsum('bhd,bhe->bhde', k_t * b_t, v_t)
            # NOTE(review): `state * erase` is ELEMENTWISE, which differs
            # from the documented matrix form M·(I - β k qᵀ) (that would be
            # a matmul). Presumably this is what the weights were trained
            # with — do not "fix" without retraining; confirm against the
            # training code.
            state = a_t.unsqueeze(-1) * (state - state * erase) + write

            # Read: o_t = qᵀ @ state
            o_t = torch.einsum('bhd,bhde->bhe', q_t, state)
            outputs.append(o_t)

        out = torch.stack(outputs, dim=1)  # [B, L, H, D]
        out = out.reshape(B, L, -1)

        # Output gating with SiLU
        gate = F.silu(self.gate(hidden_states))
        out = out * gate

        return self.o_proj(out)
209
+
210
+
211
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
212
+ # 3. MAMBA2 โ€” O(n) with SSM state-space duality
213
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
214
class Mamba2Block(nn.Module):
    """Mamba-2 block with Structured State Space Duality.

    5 layers — provides state compression for memory efficiency.

    Weight transplant: via MOHAWK SSD duality from Llama-3.1 Q,K,V → C,B,X.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        expand = config.mamba2_expand
        self.inner_size = config.hidden_size * expand
        self.state_size = config.mamba2_state_size
        self.conv_size = config.mamba2_conv_size
        self.num_heads = config.num_attention_heads

        # Input projection: x → (z, x_ssm) split
        self.in_proj = nn.Linear(config.hidden_size, self.inner_size * 2, bias=False)

        # Depthwise causal conv1d (left-padded, trimmed to L in forward)
        self.conv1d = nn.Conv1d(
            self.inner_size, self.inner_size,
            kernel_size=self.conv_size, padding=self.conv_size - 1,
            groups=self.inner_size, bias=True
        )

        # SSM parameters: per-head timestep, log-decay A, and skip scale D
        self.dt_proj = nn.Linear(self.inner_size, self.num_heads, bias=True)
        self.A_log = nn.Parameter(torch.log(torch.arange(1, self.num_heads + 1, dtype=torch.float32)))
        self.D = nn.Parameter(torch.ones(self.num_heads))

        # B, C projections (state-space input/output maps)
        # NOTE(review): head_dim_ssm is computed but never used — dead code.
        head_dim_ssm = self.inner_size // self.num_heads
        self.B_proj = nn.Linear(self.inner_size, self.state_size * self.num_heads, bias=False)
        self.C_proj = nn.Linear(self.inner_size, self.state_size * self.num_heads, bias=False)

        # Output
        self.out_proj = nn.Linear(self.inner_size, config.hidden_size, bias=False)
        self.norm = RMSNorm(self.inner_size)

    def forward(self, hidden_states, attention_mask=None, position_ids=None, **kwargs):
        """Selective-scan forward; returns [B, L, hidden]."""
        B, L, _ = hidden_states.shape

        # Input split: z gates the output, x feeds the SSM path.
        zx = self.in_proj(hidden_states)
        z, x = zx.chunk(2, dim=-1)

        # Causal conv (trim right padding to keep causality), then SiLU
        x = self.conv1d(x.transpose(1, 2))[..., :L].transpose(1, 2)
        x = F.silu(x)

        # SSM parameters
        A = -torch.exp(self.A_log)  # [H], negative → decaying dynamics
        dt = F.softplus(self.dt_proj(x))  # [B, L, H], positive timestep

        B_state = self.B_proj(x).view(B, L, self.num_heads, self.state_size)
        C_state = self.C_proj(x).view(B, L, self.num_heads, self.state_size)

        # Discretize: A_bar = exp(dt * A), B_bar = dt * B
        dt_A = dt.unsqueeze(-1) * A.view(1, 1, -1, 1)  # [B, L, H, 1]
        A_bar = torch.exp(dt_A)
        B_bar = dt.unsqueeze(-1) * B_state  # [B, L, H, N]

        # Selective scan (sequential for correctness; replace with FLA parallel kernel)
        head_dim = self.inner_size // self.num_heads
        x_heads = x.view(B, L, self.num_heads, head_dim)

        outputs = []
        state = torch.zeros(B, self.num_heads, self.state_size, device=x.device, dtype=x.dtype)

        for t in range(L):
            # NOTE(review): only the FIRST channel of each head
            # (x_heads[..., :1]) drives the state update, and the readout
            # below is a single scalar per head — this collapses the head
            # dimension to rank 1. Presumably intentional for this demo
            # architecture; confirm against the training code.
            state = A_bar[:, t] * state + B_bar[:, t] * x_heads[:, t, :, :1].expand_as(B_bar[:, t])
            y_t = torch.sum(state * C_state[:, t], dim=-1)  # [B, H]
            outputs.append(y_t)

        y = torch.stack(outputs, dim=1)  # [B, L, H]

        # Skip connection with D (per-head scale on the mean of the head input)
        y = y + self.D.view(1, 1, -1) * x.view(B, L, self.num_heads, head_dim).mean(-1)

        # Broadcast the per-head scalar back over head_dim, then gate with z
        y = y.unsqueeze(-1).expand(-1, -1, -1, head_dim).reshape(B, L, self.inner_size)
        y = self.norm(y)
        y = y * F.silu(z)

        return self.out_proj(y)
300
+
301
+
302
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
303
+ # 4. SLIDING WINDOW ATTENTION โ€” O(n * w)
304
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
305
class SlidingWindowAttention(nn.Module):
    """Local attention restricted to a fixed-size causal window, O(n * w).

    Each query attends to itself plus the ``window_size - 1`` most recent
    positions.  5 layers of these complement GDN's global view with
    fine-grained local context.  The attended context is modulated by a
    learned per-channel sigmoid gate before the output projection.
    """

    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.num_kv_heads = config.num_kv_heads
        self.head_dim = config.head_dim
        self.window_size = config.sliding_window_size
        self.num_kv_groups = self.num_heads // self.num_kv_heads

        # Submodule creation order kept stable so random init is reproducible.
        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)
        self.gate = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)

        self.rotary_emb = RotaryEmbedding(self.head_dim, config.max_position_embeddings, config.rope_theta)

    def forward(self, hidden_states, attention_mask=None, position_ids=None, **kwargs):
        bsz, seq_len, _ = hidden_states.shape

        queries = self.q_proj(hidden_states).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        keys = self.k_proj(hidden_states).view(bsz, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)
        values = self.v_proj(hidden_states).view(bsz, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2)

        cos, sin = self.rotary_emb(hidden_states, position_ids)
        queries, keys = apply_rotary_pos_emb(queries, keys, cos.unsqueeze(1), sin.unsqueeze(1))

        # Grouped-query attention: replicate KV heads up to the query head count.
        if self.num_kv_groups > 1:
            keys = keys.repeat_interleave(self.num_kv_groups, dim=1)
            values = values.repeat_interleave(self.num_kv_groups, dim=1)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Position j is visible from query i iff 0 <= i - j < window_size,
        # i.e. causal AND inside the sliding window (self position included).
        pos = torch.arange(seq_len, device=scores.device)
        rel = pos.view(-1, 1) - pos.view(1, -1)  # rel[i, j] = i - j
        banned = (rel < 0) | (rel >= self.window_size)
        scores = scores.masked_fill(banned.view(1, 1, seq_len, seq_len), float('-inf'))

        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(queries.dtype)
        context = torch.matmul(probs, values)
        context = context.transpose(1, 2).contiguous().view(bsz, seq_len, -1)

        # Learned per-channel sigmoid gate on the attended context.
        context = context * torch.sigmoid(self.gate(hidden_states))

        return self.o_proj(context)
357
+
358
+
359
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
360
+ # 5. CROSS ATTENTION โ€” for multimodal / tool bridging
361
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
362
class CrossAttention(nn.Module):
    """Cross attention bridging AETHER-Net to external context streams
    (PROMETHEUS world model / HEPHAESTUS embodiment connection).

    When ``encoder_hidden_states`` is supplied, a learned modality gate
    blends cross-attention over the external context with causal
    self-attention; without it, the layer degrades to pure causal
    self-attention.  A final per-channel sigmoid gate modulates the output.
    """

    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_attention_heads
        self.head_dim = config.head_dim

        # Self-attention path (default when no external context).
        # Creation order kept stable for reproducible initialization.
        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=False)

        # Cross-attention path (used only when external context arrives).
        self.cross_k_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.cross_v_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)

        # Modality gate: per-token scalar blend between cross and self paths.
        self.modality_gate = nn.Linear(config.hidden_size, 1, bias=True)
        nn.init.constant_(self.modality_gate.bias, -2.0)  # start mostly on the self path

        self.gate = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)

    def forward(self, hidden_states, attention_mask=None, position_ids=None,
                encoder_hidden_states=None, **kwargs):
        bsz, seq_len, _ = hidden_states.shape

        queries = self.q_proj(hidden_states).view(
            bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        if encoder_hidden_states is not None:
            # Cross path: attend over the external context (no causal mask).
            keys_x = self.cross_k_proj(encoder_hidden_states).view(
                bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            vals_x = self.cross_v_proj(encoder_hidden_states).view(
                bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            scores_x = torch.matmul(queries, keys_x.transpose(-2, -1)) / math.sqrt(self.head_dim)
            probs_x = F.softmax(scores_x, dim=-1, dtype=torch.float32).to(queries.dtype)
            cross_out = torch.matmul(probs_x, vals_x).transpose(1, 2).contiguous().view(bsz, seq_len, -1)

            # Self path always runs alongside the cross path.
            self_out = self._causal_self_attention(hidden_states, queries, bsz, seq_len)

            # Per-token blend; negative bias init keeps this near the self path early on.
            mix = torch.sigmoid(self.modality_gate(hidden_states))
            out = mix * cross_out + (1 - mix) * self_out
        else:
            # Pure self-attention fallback.
            out = self._causal_self_attention(hidden_states, queries, bsz, seq_len)

        # Channel-wise sigmoid output gate.
        out = out * torch.sigmoid(self.gate(hidden_states))

        return self.o_proj(out)

    def _causal_self_attention(self, hidden_states, queries, bsz, seq_len):
        """Standard causal self-attention using precomputed query heads."""
        keys = self.k_proj(hidden_states).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        vals = self.v_proj(hidden_states).view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)
        causal = torch.triu(torch.full((seq_len, seq_len), float('-inf'), device=scores.device), diagonal=1)
        scores = scores + causal.unsqueeze(0).unsqueeze(0)
        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(queries.dtype)
        return torch.matmul(probs, vals).transpose(1, 2).contiguous().view(bsz, seq_len, -1)
432
+
433
+
434
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
435
+ # Factory
436
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
437
# Registry mapping layer-type keys (from the magic-square layout) to mixer classes.
ATTENTION_CLASSES = {
    "gdn": GatedDeltaNet,
    "full": FullAttention,
    "mamba2": Mamba2Block,
    "slide": SlidingWindowAttention,
    "cross": CrossAttention,
}


def build_attention(layer_type: str, config):
    """Instantiate the attention/mixer module registered under ``layer_type``.

    Raises:
        ValueError: if ``layer_type`` is not a registered key.
    """
    if layer_type not in ATTENTION_CLASSES:
        raise ValueError(f"Unknown attention type: {layer_type}. Choose from {list(ATTENTION_CLASSES.keys())}")
    return ATTENTION_CLASSES[layer_type](config)
model.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AETHER-Net: Main Model
3
+ Adaptive Elemental Transformer-Hybrid Efficient Recurrent Network
4
+
5
+ 25-layer hybrid LLM with 5ร—5 Latin orthogonal magic square layout
6
+ and Oheng (ไบ”่กŒ) MoE routing.
7
+ """
8
+ import torch
9
+ import torch.nn as nn
10
+ from typing import Dict, List, Optional, Tuple
11
+
12
+ from config import AetherNetConfig, ELEMENTS, LAYER_TO_ELEMENT, ELEMENT_LAYERS
13
+ from layers import RMSNorm, build_attention
14
+ from oheng_moe import OhengMoE
15
+
16
+
17
class AetherNetBlock(nn.Module):
    """Single AETHER-Net transformer block.

    Pre-norm layout:
        x -> RMSNorm -> Attention -> +residual -> RMSNorm -> OhengMoE -> +residual
    """

    def __init__(self, config: AetherNetConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        # Attention flavor and element group come from the magic-square layout.
        self.layer_type = config.get_layer_type(layer_idx)
        self.element = config.get_layer_element(layer_idx)

        # Pre-norms (creation order kept stable for reproducible init).
        self.input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps)

        # Mixer selected by layer type ("gdn" / "full" / "mamba2" / "slide" / "cross").
        self.attention = build_attention(self.layer_type, config)

        # Oheng-routed MoE feed-forward.
        self.moe = OhengMoE(config, layer_idx)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        element_states: Optional[Dict[str, torch.Tensor]] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Apply attention then MoE, each wrapped in a pre-norm residual."""
        attn_out = self.attention(
            self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
        )
        hidden_states = hidden_states + attn_out

        moe_out = self.moe(
            self.post_attention_layernorm(hidden_states),
            element_states=element_states,
        )
        return hidden_states + moe_out
66
+
67
+
68
class AetherNetModel(nn.Module):
    """AETHER-Net Language Model.

    Architecture:
        - Embedding -> 25 x AetherNetBlock -> RMSNorm -> LM Head
        - Blocks arranged in a 5x5 Latin orthogonal magic square
        - Oheng MoE with generate (sangsaeng) and overcome (sangkeuk) connections
        - Element states flow between element groups for structural self-verification
    """

    def __init__(self, config: AetherNetConfig):
        super().__init__()
        self.config = config

        # Token embedding
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)

        # config.num_layers transformer blocks (25 in the reference layout)
        self.layers = nn.ModuleList([
            AetherNetBlock(config, layer_idx=i)
            for i in range(config.num_layers)
        ])

        # Final norm
        self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps)

        # LM Head
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Weight tying: lm_head and embedding share one tensor object
        if config.tie_word_embeddings:
            self.lm_head.weight = self.embed_tokens.weight

        # Initialize all Linear / Embedding weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        # Plain normal init with the configured std; biases zeroed.
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """Run the full forward pass.

        Args:
            input_ids: [B, L] token ids.  NOTE(review): declared Optional but
                used unconditionally below, so passing None raises — confirm intent.
            attention_mask: optional mask forwarded to every block.
            position_ids: optional [B, L] positions; defaults to 0..L-1.
            labels: optional [B, L] targets; enables shifted cross-entropy loss.
            encoder_hidden_states: optional external context for cross layers.

        Returns:
            dict with "loss" (None when no labels), "logits" [B, L, vocab_size],
            and the per-element "element_states" accumulated during the pass.
        """
        B, L = input_ids.shape

        # Position IDs
        if position_ids is None:
            position_ids = torch.arange(L, device=input_ids.device).unsqueeze(0).expand(B, -1)

        # Embed
        hidden_states = self.embed_tokens(input_ids)

        # -- Element state tracking for Oheng connections --
        # Each element group accumulates its output for generate/overcome routing
        element_states: Dict[str, torch.Tensor] = {}
        element_layer_counts: Dict[str, int] = {e: 0 for e in ELEMENTS}

        # -- Forward through the layer stack --
        for i, layer in enumerate(self.layers):
            element = LAYER_TO_ELEMENT[i]

            hidden_states = layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                element_states=element_states,
                encoder_hidden_states=encoder_hidden_states,
            )

            # Update element state.  detach() stops gradients from flowing
            # through the cross-element Oheng connections.
            element_layer_counts[element] += 1
            count = element_layer_counts[element]
            if element in element_states:
                # Cumulative (running) mean of this element's layer outputs.
                # NOTE(review): despite earlier naming, this is NOT an
                # exponential moving average — each layer gets weight 1/count.
                element_states[element] = (
                    element_states[element] * (count - 1) / count
                    + hidden_states.detach() / count
                )
            else:
                element_states[element] = hidden_states.detach()

        # Final norm
        hidden_states = self.norm(hidden_states)

        # LM Head
        logits = self.lm_head(hidden_states)

        # Next-token loss: shift logits/labels by one position
        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = nn.functional.cross_entropy(
                shift_logits.view(-1, self.config.vocab_size),
                shift_labels.view(-1),
                ignore_index=-100,  # conventional padding/ignore label
            )

        return {
            "loss": loss,
            "logits": logits,
            "element_states": element_states,
        }

    def count_parameters(self) -> Dict[str, int]:
        """Count parameters by component.

        NOTE(review): with tie_word_embeddings the lm_head shares the embedding
        tensor, so "embedding" and "lm_head" report the same weights and
        "total" double-counts them — confirm whether that is intended.
        """
        counts = {
            "embedding": sum(p.numel() for p in self.embed_tokens.parameters()),
            "lm_head": sum(p.numel() for p in self.lm_head.parameters()),
            "norm": sum(p.numel() for p in self.norm.parameters()),
        }

        attn_total = 0
        moe_total = 0
        generate_total = 0
        overcome_total = 0

        for layer in self.layers:
            # Attention side: the mixer itself plus both pre-norms.
            attn_total += sum(p.numel() for p in layer.attention.parameters())
            attn_total += sum(p.numel() for p in layer.input_layernorm.parameters())
            attn_total += sum(p.numel() for p in layer.post_attention_layernorm.parameters())

            # MoE side: routed experts, shared expert, and router.
            moe_total += sum(p.numel() for p in layer.moe.experts.parameters())
            moe_total += sum(p.numel() for p in layer.moe.shared_expert.parameters())
            moe_total += sum(p.numel() for p in layer.moe.router.parameters())

            # Optional Oheng connection modules.
            if layer.moe.generate_boost is not None:
                generate_total += sum(p.numel() for p in layer.moe.generate_boost.parameters())
            if layer.moe.overcome_gate is not None:
                overcome_total += sum(p.numel() for p in layer.moe.overcome_gate.parameters())

        counts["attention_layers"] = attn_total
        counts["moe_experts"] = moe_total
        counts["oheng_generate"] = generate_total
        counts["oheng_overcome"] = overcome_total
        counts["total"] = sum(counts.values())

        return counts

    def get_layer_map(self) -> List[Dict]:
        """Return human-readable layer map for diagnostics."""
        result = []
        for i, layer in enumerate(self.layers):
            result.append({
                "layer": i,
                "type": layer.layer_type,          # mixer key, e.g. "gdn" / "slide"
                "element": layer.element,          # oheng element name
                "element_idx": ELEMENTS.index(layer.element),
                "phase": i % 5,                    # column within the 5x5 layout
                "attn_class": layer.attention.__class__.__name__,
            })
        return result
oheng_moe.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Oheng (ไบ”่กŒ) Mixture-of-Experts Router
3
+
4
+ Core innovation: 25 experts organized in 5 element groups with:
5
+ - ์ƒ์ƒ (Generate) cycle: Woodโ†’Fireโ†’Earthโ†’Metalโ†’Waterโ†’Wood
6
+ Previous element's output provides residual boost to next element.
7
+ - ์ƒ๊ทน (Overcome) cycle: WoodโŠฃEarth, EarthโŠฃWater, WaterโŠฃFire, FireโŠฃMetal, MetalโŠฃWood
8
+ Opposing element provides critic gating to suppress hallucinations.
9
+ - Loss-Free Balancing via dynamic expert bias (DeepSeek-style)
10
+ """
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from typing import Dict, Optional, Tuple
15
+
16
+ from config import (
17
+ ELEMENTS, GENERATE, GENERATE_REVERSE, OVERCOME, OVERCOME_REVERSE,
18
+ ELEMENT_EXPERTS, LAYER_TO_ELEMENT,
19
+ )
20
+
21
+
22
class Expert(nn.Module):
    """Single routed SwiGLU feed-forward expert (split from the donor MLP).

    Computes ``down(silu(gate(x)) * up(x))``.
    """

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        # Creation order (gate, up, down) kept stable for reproducible init.
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x):
        gated = F.silu(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
33
+
34
+
35
class SharedExpert(nn.Module):
    """Always-active SwiGLU expert applied to every token alongside the routed ones."""

    def __init__(self, hidden_size: int, intermediate_size: int):
        super().__init__()
        # Same SwiGLU shape as a routed Expert; kept as a separate class so it
        # can be sized or specialized independently later.
        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def forward(self, x):
        activation = F.silu(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(activation)
46
+
47
+
48
class GenerateBoost(nn.Module):
    """Generate (sangsaeng) residual lift between element groups.

    Wood -> Fire -> Earth -> Metal -> Water -> Wood: the generating
    element's pooled state is projected and added to the current
    element's output, scaled by a learnable per-element sigmoid(alpha).
    The projection is zero-initialized, so the boost starts at exactly 0.
    """

    def __init__(self, hidden_size: int, num_elements: int = 5):
        super().__init__()
        # Per-element mixing logit (passed through sigmoid at use time).
        self.alpha = nn.Parameter(torch.full((num_elements,), 0.1))
        # Source -> target mapping; zeroed so training starts boost-free.
        self.proj = nn.Linear(hidden_size, hidden_size, bias=False)
        nn.init.zeros_(self.proj.weight)

    def forward(self, hidden: torch.Tensor, source_state: Optional[torch.Tensor],
                element_idx: int) -> torch.Tensor:
        """Add the gated, projected source state to ``hidden``.

        Args:
            hidden: current hidden states [B, L, D].
            source_state: generating element's pooled output [B, L, D], or None.
            element_idx: current element index (0=wood, 1=fire, ...).
        Returns:
            ``hidden`` unchanged when ``source_state`` is None, otherwise
            ``hidden + sigmoid(alpha[element_idx]) * proj(source_state)``.
        """
        if source_state is None:
            return hidden

        mix = torch.sigmoid(self.alpha[element_idx])
        return hidden + mix * self.proj(source_state)
80
+
81
+
82
class OvercomeGate(nn.Module):
    """Overcome (sangkeuk) critic gating between opposing element groups.

    Wood-|Earth, Earth-|Water, Water-|Fire, Fire-|Metal, Metal-|Wood

    A small per-element critic MLP reads the opposing element group's
    output and emits a sigmoid gate that damps suspect activations —
    the structural self-verification path intended to reduce hallucination.

    NOTE(review): the final critic layer is zero-initialized, so every
    gate starts at sigmoid(0) = 0.5 (uniform half-damping), not at an
    identity pass-through.
    """

    def __init__(self, hidden_size: int, critic_hidden: int = 256, num_elements: int = 5):
        super().__init__()
        # One two-layer SiLU critic per element.
        self.critics = nn.ModuleList([
            nn.Sequential(
                nn.Linear(hidden_size, critic_hidden, bias=False),
                nn.SiLU(),
                nn.Linear(critic_hidden, hidden_size, bias=False),
            )
            for _ in range(num_elements)
        ])
        # Zero the last projection so every gate begins at exactly 0.5.
        for critic in self.critics:
            nn.init.zeros_(critic[-1].weight)

    def forward(self, hidden: torch.Tensor, critic_source: Optional[torch.Tensor],
                element_idx: int) -> torch.Tensor:
        """Damp ``hidden`` by the critic gate derived from ``critic_source``.

        Args:
            hidden: current hidden states [B, L, D].
            critic_source: opposing element's output [B, L, D], or None.
            element_idx: current element index.
        Returns:
            ``hidden`` unchanged when ``critic_source`` is None, otherwise
            ``hidden * sigmoid(critic(critic_source))``.
        """
        if critic_source is None:
            return hidden

        damping = torch.sigmoid(self.critics[element_idx](critic_source))
        return hidden * damping
122
+
123
+
124
class OhengRouter(nn.Module):
    """Top-K token router with DeepSeek-style Loss-Free Balancing.

    Expert selection uses bias-adjusted scores so chronically underloaded
    experts become more likely to be picked, while the mixing weights are
    computed from the raw (unbiased) scores so gradients stay clean.
    """

    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.top_k
        self.jitter_eps = config.moe_jitter_eps

        # Linear scoring head: hidden -> per-expert logit.
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)

        # Balancing bias: updated out-of-band (no gradient), saved with the model.
        self.register_buffer(
            "expert_bias",
            torch.zeros(config.num_experts),
            persistent=True
        )

        # EMA of per-expert load; transient bookkeeping, not checkpointed.
        self.register_buffer(
            "expert_load_ema",
            torch.ones(config.num_experts) / config.num_experts,
            persistent=False
        )

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Route flattened tokens to experts.

        Args:
            hidden_states: [B*L, D] token states.
        Returns:
            expert_indices: [B*L, top_k] selected expert ids.
            expert_weights: [B*L, top_k] softmax weights from unbiased scores.
            router_logits: [B*L, num_experts] raw logits for auxiliary logging.
        """
        router_logits = self.gate(hidden_states)

        # Multiplicative jitter encourages exploration during training.
        if self.training and self.jitter_eps > 0:
            lo, hi = 1.0 - self.jitter_eps, 1.0 + self.jitter_eps
            router_logits = router_logits * torch.empty_like(router_logits).uniform_(lo, hi)

        # Select on biased scores (Loss-Free Balancing), weight on raw scores.
        _, expert_indices = torch.topk(
            router_logits + self.expert_bias.unsqueeze(0), self.top_k, dim=-1)
        selected = torch.gather(router_logits, 1, expert_indices)
        expert_weights = F.softmax(selected, dim=-1, dtype=torch.float32).to(hidden_states.dtype)

        # Bias adjustment happens outside the autograd graph, once per batch.
        if self.training:
            self._update_bias(expert_indices)

        return expert_indices, expert_weights, router_logits

    @torch.no_grad()
    def _update_bias(self, indices: torch.Tensor, momentum: float = 0.99, step: float = 0.001):
        """Nudge expert_bias toward uniform load based on this batch's routing."""
        counts = torch.bincount(indices.view(-1), minlength=self.num_experts).float()
        batch_load = counts / max(counts.sum().item(), 1.0)

        self.expert_load_ema.mul_(momentum).add_(batch_load, alpha=1 - momentum)

        # Raise bias for underloaded experts, lower it for overloaded ones.
        uniform = 1.0 / self.num_experts
        self.expert_bias.add_((uniform - self.expert_load_ema) * step)
199
+
200
+
201
class OhengMoE(nn.Module):
    """Complete Oheng MoE layer with Generate, Overcome, and expert computation.

    Architecture per layer:
        1. Router selects top-K experts
        2. Selected experts process tokens
        3. Shared expert processes all tokens
        4. Generate boost from previous element group
        5. Overcome gate from opposing element group
        6. Sum all outputs
    """

    def __init__(self, config, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        # Element assignment comes from the static magic-square layout.
        self.element = LAYER_TO_ELEMENT[layer_idx]
        self.element_idx = ELEMENTS.index(self.element)
        self.hidden_size = config.hidden_size
        self.top_k = config.top_k

        # Routed experts (config.num_experts of them; 25 in the reference layout)
        self.experts = nn.ModuleList([
            Expert(config.hidden_size, config.expert_intermediate_size)
            for _ in range(config.num_experts)
        ])

        # Shared expert (always active)
        self.shared_expert = SharedExpert(config.hidden_size, config.expert_intermediate_size)

        # Router
        self.router = OhengRouter(config)

        # Generate (sangsaeng) boost: residual lift from the preceding element
        if config.use_generate_boost:
            self.generate_boost = GenerateBoost(config.hidden_size)
        else:
            self.generate_boost = None

        # Overcome (sangkeuk) gate: critic damping from the opposing element
        if config.use_overcome_gate:
            self.overcome_gate = OvercomeGate(config.hidden_size, config.overcome_gate_hidden)
        else:
            self.overcome_gate = None

    def forward(self, hidden_states: torch.Tensor,
                element_states: Optional[Dict[str, torch.Tensor]] = None) -> torch.Tensor:
        """
        Args:
            hidden_states: [B, L, D]
            element_states: dict mapping element names to their latest output
        Returns:
            output: [B, L, D]
        """
        B, L, D = hidden_states.shape
        flat = hidden_states.view(-1, D)  # [B*L, D]

        # Route
        indices, weights, _ = self.router(flat)  # [B*L, K], [B*L, K]

        # Expert computation.
        # NOTE(review): this nested loop runs top_k * num_experts mask passes;
        # correct but slow — a single pass per expert would be cheaper.
        expert_out = torch.zeros_like(flat)
        for k in range(self.top_k):
            expert_idx = indices[:, k]  # [B*L]
            expert_w = weights[:, k].unsqueeze(-1)  # [B*L, 1]

            for e_id in range(len(self.experts)):
                mask = (expert_idx == e_id)
                if mask.any():
                    token_input = flat[mask]
                    token_output = self.experts[e_id](token_input)
                    expert_out[mask] += expert_w[mask] * token_output

        # Shared expert (always active)
        shared_out = self.shared_expert(flat)

        output = (expert_out + shared_out).view(B, L, D)

        # Apply Oheng connections if element states available
        if element_states is not None:
            # Generate (sangsaeng) boost from this element's generator.
            # The source may be absent early in the pass; the boost then no-ops.
            if self.generate_boost is not None:
                gen_source_elem = GENERATE_REVERSE.get(self.element)
                gen_source = element_states.get(gen_source_elem)
                output = self.generate_boost(output, gen_source, self.element_idx)

            # Overcome (sangkeuk) gate from this element's overcomer.
            if self.overcome_gate is not None:
                overcome_source_elem = OVERCOME_REVERSE.get(self.element)
                overcome_source = element_states.get(overcome_source_elem)
                output = self.overcome_gate(output, overcome_source, self.element_idx)

        return output
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch>=2.4.0
2
+ safetensors>=0.4.0
3
+ gradio>=5.0.0
4
+ transformers>=4.45.0
5
+ huggingface-hub>=0.25.0