Spaces:
Paused
Paused
| """ | |
| AETHER-Net Configuration | |
| Adaptive Elemental Transformer-Hybrid Efficient Recurrent Network | |
| 5Γ5 Latin Orthogonal Magic Square Layout + Oheng(δΊθ‘) MoE Routing | |
| """ | |
| from dataclasses import dataclass, field | |
| from typing import List, Tuple | |
# ── 5×5 Latin Orthogonal Magic Square ──
# Every row (element group) and every column (phase) contains each
# attention type exactly once, so no type carries over a positional bias.
MAGIC_SQUARE = [
    #  Phase1     Phase2     Phase3     Phase4     Phase5
    ["gdn",     "full",    "mamba2",  "slide",   "cross"],   # 木 Wood
    ["slide",   "gdn",     "full",    "cross",   "mamba2"],  # 火 Fire
    ["full",    "cross",   "slide",   "mamba2",  "gdn"],     # 土 Earth
    ["mamba2",  "slide",   "cross",   "gdn",     "full"],    # 金 Metal
    ["cross",   "mamba2",  "gdn",     "full",    "slide"],   # 水 Water
]

# Row-major flattening of the square into the 25-entry per-layer schedule.
LAYER_TYPES = [attn_type for element_row in MAGIC_SQUARE for attn_type in element_row]
# ── Oheng (Five Elements) system ──
ELEMENTS = ["wood", "fire", "earth", "metal", "water"]

# Generating cycle: wood → fire → earth → metal → water → wood
GENERATE = {
    "wood": "fire",
    "fire": "earth",
    "earth": "metal",
    "metal": "water",
    "water": "wood",
}
GENERATE_REVERSE = {child: parent for parent, child in GENERATE.items()}

# Overcoming cycle: wood ⊣ earth, earth ⊣ water, water ⊣ fire,
# fire ⊣ metal, metal ⊣ wood
OVERCOME = {
    "wood": "earth",
    "earth": "water",
    "water": "fire",
    "fire": "metal",
    "metal": "wood",
}
OVERCOME_REVERSE = {loser: winner for winner, loser in OVERCOME.items()}

# Each element owns a contiguous run of five 0-based layer indices,
# in ELEMENTS order: wood=[0..4], fire=[5..9], ...
ELEMENT_LAYERS = {
    element: list(range(5 * group, 5 * group + 5))
    for group, element in enumerate(ELEMENTS)
}

# Experts are partitioned the same way (5 experts per element, 0-based).
ELEMENT_EXPERTS = {
    element: list(range(5 * group, 5 * group + 5))
    for group, element in enumerate(ELEMENTS)
}

# Inverse lookup: layer index → owning element name.
LAYER_TO_ELEMENT = {
    layer_idx: element
    for element, layer_indices in ELEMENT_LAYERS.items()
    for layer_idx in layer_indices
}
@dataclass
class AetherNetConfig:
    """Configuration for AETHER-Net model.

    Fix: the original class declared annotated fields with
    ``field(default_factory=...)`` but was missing the ``@dataclass``
    decorator, so no ``__init__`` was generated and ``layer_types`` was a
    raw ``dataclasses.Field`` object on instances. The decorator restores
    the intended keyword-argument constructor and default materialization.
    """

    # ── Model dimensions ──
    hidden_size: int = 4096
    intermediate_size: int = 11008  # FFN intermediate (SwiGLU)
    num_layers: int = 25
    num_attention_heads: int = 32
    num_kv_heads: int = 8  # GQA for Full Attention layers
    head_dim: int = 128  # hidden_size // num_attention_heads (4096 // 32)
    vocab_size: int = 151936  # Qwen tokenizer
    max_position_embeddings: int = 262144
    rope_theta: float = 10000000.0

    # ── Layer schedule (from magic square) ──
    # default_factory so each instance gets its own list copy reference;
    # evaluated lazily at construction time.
    layer_types: List[str] = field(default_factory=lambda: LAYER_TYPES)

    # ── MoE Configuration ──
    num_experts: int = 25
    num_experts_per_group: int = 5
    num_element_groups: int = 5
    top_k: int = 2
    num_shared_experts: int = 1
    expert_intermediate_size: int = 2752  # intermediate_size // 4 (per expert)
    moe_jitter_eps: float = 0.01

    # ── Oheng (five-element) routing ──
    use_generate_boost: bool = True
    use_overcome_gate: bool = True
    generate_alpha_init: float = 0.1  # learnable soft scalar
    overcome_gate_hidden: int = 256  # critic head hidden dim

    # ── Attention-specific ──
    sliding_window_size: int = 4096
    gdn_state_size: int = 128  # Gated DeltaNet state dimension
    mamba2_state_size: int = 128
    mamba2_conv_size: int = 4
    mamba2_expand: int = 2

    # ── Training / Inference ──
    rms_norm_eps: float = 1e-6
    initializer_range: float = 0.02
    tie_word_embeddings: bool = False
    use_cache: bool = True
    torch_dtype: str = "bfloat16"

    # ── Donor transplant info (metadata) ──
    primary_donor: str = "Qwen/Qwen3.5-27B"
    secondary_donor: str = "meta-llama/Llama-3.1-8B"

    def get_layer_type(self, layer_idx: int) -> str:
        """Return the attention type name for 0-based layer *layer_idx*.

        Raises IndexError if *layer_idx* is outside the schedule.
        """
        return self.layer_types[layer_idx]

    def get_layer_element(self, layer_idx: int) -> str:
        """Return the element name ("wood"/"fire"/...) owning *layer_idx*.

        Raises KeyError for indices outside the module-level table.
        """
        return LAYER_TO_ELEMENT[layer_idx]

    def get_element_expert_range(self, element: str) -> Tuple[int, int]:
        """Return the half-open [start, stop) expert-index range for *element*.

        Relies on each element's expert indices being contiguous, which
        holds for the module-level ELEMENT_EXPERTS table.
        """
        indices = ELEMENT_EXPERTS[element]
        return (indices[0], indices[-1] + 1)

    def summary(self) -> str:
        """Return a multi-line, human-readable architecture summary string."""
        # Count how many layers use each attention type.
        type_counts = {}
        for t in self.layer_types:
            type_counts[t] = type_counts.get(t, 0) + 1
        # Rough parameter estimate in billions: expert FFN matrices,
        # per-layer attention projections (q/k/v/o), and embeddings.
        # NOTE(review): the trailing *2 factors on experts/embeddings are
        # unexplained in the original — confirm the intended accounting.
        total_params_b = (
            self.num_experts * self.expert_intermediate_size * self.hidden_size * 3 * 2  # experts
            + self.num_layers * self.hidden_size * self.hidden_size * 4  # attention projections
            + self.vocab_size * self.hidden_size * 2  # embeddings
        ) / 1e9
        # Active share scales with routed top-k plus always-on shared experts.
        active_params_b = total_params_b * (self.top_k + self.num_shared_experts) / self.num_experts_per_group
        lines = [
            "β" * 60,
            " AETHER-Net Architecture Summary",
            "β" * 60,
            f" Layers: {self.num_layers} (5Γ5 magic square)",
            f" Hidden dim: {self.hidden_size}",
            f" Attention mix: {type_counts}",
            f" MoE: {self.num_experts} experts / {self.num_element_groups} groups / top-{self.top_k}",
            f" Est. total: ~{total_params_b:.1f}B params",
            f" Est. active: ~{active_params_b:.1f}B params",
            f" Context: {self.max_position_embeddings:,} tokens",
            f" Oheng generate: {self.use_generate_boost} (Ξ±={self.generate_alpha_init})",
            f" Oheng overcome: {self.use_overcome_gate}",
            f" Primary donor: {self.primary_donor}",
            f" Secondary donor:{self.secondary_donor}",
            "β" * 60,
        ]
        return "\n".join(lines)