ggunio committed on
Commit ff85374 · verified · 1 Parent(s): 8f9e907

Upload folder using huggingface_hub

core/__pycache__/decoder.cpython-310.pyc ADDED
Binary file (11.8 kB)

core/__pycache__/decoder.cpython-313.pyc ADDED
Binary file (20.8 kB)

core/__pycache__/encoder.cpython-310.pyc ADDED
Binary file (13.4 kB)

core/__pycache__/encoder.cpython-313.pyc ADDED
Binary file (24.8 kB)

core/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (12.3 kB)

core/__pycache__/tokenizer.cpython-313.pyc ADDED
Binary file (19.8 kB)

core/__pycache__/unified_model.cpython-310.pyc ADDED
Binary file (12.2 kB)
core/decoder.py ADDED
@@ -0,0 +1,485 @@
1
+ """
2
+ Intelligent Tokenizer v6.2.0 - 6-Layer Decoder with Multi-Level Cross-Attention
3
+ Incorporates GPT-5 suggestions for KV cache optimization
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from typing import Dict, List, Optional, Tuple
10
+ import math
11
+
12
+
13
+ class KVCacheOptimizedAttention(nn.Module):
14
+ """
15
+ KV Cache Optimized Attention - GPT-5 suggestion
16
+ 16Q → 2K/V for 8x memory reduction
17
+ """
18
+
19
+ def __init__(self, hidden_dim: int = 1280, num_heads: int = 16, kv_compression: int = 8):
20
+ super().__init__()
21
+ self.hidden_dim = hidden_dim
22
+ self.num_heads = num_heads
23
+ self.kv_heads = max(2, num_heads // kv_compression) # 16/8 = 2 KV heads
24
+ self.head_dim = hidden_dim // num_heads # 80
25
+
26
+ # Query uses all heads
27
+ self.q_proj = nn.Linear(hidden_dim, hidden_dim) # 16 heads
28
+
29
+ # Key/Value use fewer heads (GPT-5 suggestion)
30
+ self.k_proj = nn.Linear(hidden_dim, self.kv_heads * self.head_dim) # 2 heads
31
+ self.v_proj = nn.Linear(hidden_dim, self.kv_heads * self.head_dim) # 2 heads
32
+
33
+ # Output projection
34
+ self.o_proj = nn.Linear(hidden_dim, hidden_dim)
35
+
36
+ # KV cache for inference
37
+ self.register_buffer('cached_keys', None)
38
+ self.register_buffer('cached_values', None)
39
+
40
+ def forward(self,
41
+ hidden_states: torch.Tensor,
42
+ encoder_hidden: Optional[torch.Tensor] = None,
43
+ attention_mask: Optional[torch.Tensor] = None,
44
+ use_cache: bool = False) -> Tuple[torch.Tensor, Optional[Tuple]]:
45
+ """
46
+ Forward pass with KV cache optimization
47
+ """
48
+ batch_size, seq_len = hidden_states.shape[:2]
49
+
50
+ # Query projection (all heads)
51
+ Q = self.q_proj(hidden_states).view(batch_size, seq_len, self.num_heads, self.head_dim)
52
+ Q = Q.transpose(1, 2) # [batch, heads, seq, dim]
53
+
54
+ # Key/Value source (self or cross)
55
+ kv_source = encoder_hidden if encoder_hidden is not None else hidden_states
56
+
57
+ # Key/Value projection (fewer heads)
58
+ K = self.k_proj(kv_source).view(batch_size, -1, self.kv_heads, self.head_dim)
59
+ V = self.v_proj(kv_source).view(batch_size, -1, self.kv_heads, self.head_dim)
60
+ K = K.transpose(1, 2) # [batch, kv_heads, seq, dim]
61
+ V = V.transpose(1, 2)
62
+
63
+ # Repeat KV heads to match Q heads (broadcast)
64
+ K = K.repeat_interleave(self.num_heads // self.kv_heads, dim=1)
65
+ V = V.repeat_interleave(self.num_heads // self.kv_heads, dim=1)
66
+
67
+ # Cache management for incremental generation (GPT suggestion)
68
+ if use_cache:
69
+ # For incremental generation, only process new token
70
+ if self.cached_keys is not None and hidden_states.size(1) == 1:
71
+ # Append new K/V to cache
72
+ K = torch.cat([self.cached_keys, K], dim=2)
73
+ V = torch.cat([self.cached_values, V], dim=2)
74
+ # Update cache
75
+ self.cached_keys = K
76
+ self.cached_values = V
77
+
78
+ # Scaled dot-product attention
79
+ scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
80
+
81
+ # Use additive mask (GPT suggestion)
82
+ if attention_mask is not None:
83
+ scores = scores + attention_mask # additive mask: -inf where masked, 0 elsewhere
84
+
85
+ attn_weights = F.softmax(scores, dim=-1)
86
+ attn_output = torch.matmul(attn_weights, V)
87
+
88
+ # Reshape and project
89
+ attn_output = attn_output.transpose(1, 2).contiguous()
90
+ attn_output = attn_output.view(batch_size, seq_len, self.hidden_dim)
91
+ output = self.o_proj(attn_output)
92
+
93
+ return output, (K, V) if use_cache else None
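For reference, a minimal standalone sketch (not part of the committed file) of the grouped K/V sharing used above: with 16 query heads and 2 cached K/V heads, repeat_interleave broadcasts the shared heads to every query-head group, and the cached tensors shrink by a factor of 8.

import torch

batch, seq, num_heads, kv_heads, head_dim = 2, 48, 16, 2, 80

k_small = torch.randn(batch, kv_heads, seq, head_dim)             # what actually gets cached
k_full = k_small.repeat_interleave(num_heads // kv_heads, dim=1)  # broadcast to all 16 query heads
print(k_full.shape)                                               # torch.Size([2, 16, 48, 80])

cache_mha = batch * num_heads * seq * head_dim                    # full multi-head cache size
cache_mqa = batch * kv_heads * seq * head_dim                     # compressed cache size
print(cache_mha / cache_mqa)                                      # 8.0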
94
+
95
+
96
+ class SelectiveCrossAttention(nn.Module):
97
+ """
98
+ Selective cross-attention - only attend to relevant encoder layers
99
+ Reduces 24 → 8 cross-attentions for efficiency
100
+ """
101
+
102
+ def __init__(self, hidden_dim: int = 1280, layer_id: int = 0):
103
+ super().__init__()
104
+ self.hidden_dim = hidden_dim
105
+ self.layer_id = layer_id
106
+
107
+ # Define which encoder layers this decoder layer should attend to
108
+ self.encoder_connections = {
109
+ 0: [0], # Decoder L0 → Encoder L0 (byte info)
110
+ 1: [0], # Decoder L1 → Encoder L0 (byte info)
111
+ 2: [1, 2], # Decoder L2 → Encoder L1,2 (language info)
112
+ 3: [1, 2], # Decoder L3 → Encoder L1,2 (language info)
113
+ 4: [3], # Decoder L4 → Encoder L3 (semantic info)
114
+ 5: [3], # Decoder L5 → Encoder L3 (semantic info)
115
+ }
116
+
117
+ # Get connections for this layer
118
+ self.connected_layers = self.encoder_connections.get(layer_id, [0])
119
+
120
+ # Create attention modules only for connected layers
121
+ self.cross_attentions = nn.ModuleList([
122
+ KVCacheOptimizedAttention(hidden_dim, num_heads=16, kv_compression=8)
123
+ for _ in self.connected_layers
124
+ ])
125
+
126
+ # Lightweight fusion with weighted sum (GPT suggestion)
127
+ self.fusion = nn.Sequential(
128
+ nn.Linear(hidden_dim, hidden_dim),
129
+ nn.LayerNorm(hidden_dim),
130
+ nn.SiLU(),
131
+ nn.Dropout(0.1)
132
+ )
133
+
134
+ # Learnable weights for connected layers only
135
+ self.layer_weights = nn.Parameter(torch.ones(len(self.connected_layers)) / len(self.connected_layers))
136
+
137
+ def forward(self,
138
+ decoder_hidden: torch.Tensor,
139
+ encoder_all_hidden: List[torch.Tensor],
140
+ attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
141
+ """
142
+ Selectively attend to relevant encoder layers only
143
+ """
144
+ # Only attend to connected encoder layers
145
+ cross_outputs = []
146
+ for i, layer_idx in enumerate(self.connected_layers):
147
+ if layer_idx < len(encoder_all_hidden):
148
+ encoder_hidden = encoder_all_hidden[layer_idx]
149
+ cross_out, _ = self.cross_attentions[i](
150
+ hidden_states=decoder_hidden,
151
+ encoder_hidden=encoder_hidden,
152
+ attention_mask=attention_mask
153
+ )
154
+ cross_outputs.append(cross_out)
155
+
156
+ # Weighted sum fusion for connected layers only
157
+ if len(cross_outputs) > 1:
158
+ weighted_outputs = torch.stack(cross_outputs, dim=0) # [N, batch, seq, hidden]
159
+ weights = F.softmax(self.layer_weights, dim=0).view(-1, 1, 1, 1)
160
+ fused = (weighted_outputs * weights).sum(dim=0) # [batch, seq, hidden]
161
+ else:
162
+ # Single connection - no fusion needed
163
+ fused = cross_outputs[0] if cross_outputs else decoder_hidden
164
+
165
+ # Apply lightweight fusion layer
166
+ fused = self.fusion(fused)
167
+
168
+ return fused
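As a standalone illustration of the weighted-sum fusion above (shapes are hypothetical; two connected encoder layers, as for decoder layers 2-3):

import torch
import torch.nn.functional as F

batch, seq, hidden = 2, 6, 1280
cross_outputs = [torch.randn(batch, seq, hidden) for _ in range(2)]  # one output per connected encoder layer
layer_weights = torch.zeros(2)                                       # a learnable nn.Parameter in the module

stacked = torch.stack(cross_outputs, dim=0)                          # [N, batch, seq, hidden]
weights = F.softmax(layer_weights, dim=0).view(-1, 1, 1, 1)          # normalized mixing weights
fused = (stacked * weights).sum(dim=0)                               # [batch, seq, hidden]
print(fused.shape)                                                   # torch.Size([2, 6, 1280])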
169
+
170
+
171
+ class SwiGLU(nn.Module):
172
+ """SwiGLU activation for better convergence (GPT suggestion)"""
173
+ def __init__(self, dim: int, mult: float = 2.66):
174
+ super().__init__()
175
+ inner = int(round(dim * mult / 2)) * 2 # Even alignment
176
+ self.w1 = nn.Linear(dim, inner // 2)
177
+ self.w2 = nn.Linear(dim, inner // 2)
178
+ self.w3 = nn.Linear(inner // 2, dim)
179
+
180
+ def forward(self, x):
181
+ return self.w3(F.silu(self.w1(x)) * self.w2(x))
182
+
183
+
184
+ class DecoderLayer(nn.Module):
185
+ """
186
+ Single decoder layer with self-attention and selective cross-attention
187
+ """
188
+
189
+ def __init__(self, hidden_dim: int = 1280, num_heads: int = 16, layer_id: int = 0):
190
+ super().__init__()
191
+ self.hidden_dim = hidden_dim
192
+ self.layer_id = layer_id
193
+
194
+ # Self-attention (with KV cache optimization)
195
+ self.self_attn = KVCacheOptimizedAttention(hidden_dim, num_heads, kv_compression=8)
196
+ self.self_attn_norm = nn.LayerNorm(hidden_dim)
197
+
198
+ # Selective cross-attention to specific encoder layers
199
+ self.cross_attn = SelectiveCrossAttention(hidden_dim, layer_id=layer_id)
200
+ self.cross_attn_norm = nn.LayerNorm(hidden_dim)
201
+
202
+ # Feed-forward network with SwiGLU (GPT suggestion)
203
+ self.ffn = SwiGLU(hidden_dim, mult=2.66)
204
+ self.ffn_norm = nn.LayerNorm(hidden_dim)
205
+
206
+ # Dropout for residual connections
207
+ self.dropout = nn.Dropout(0.1)
208
+
209
+ def forward(self,
210
+ hidden_states: torch.Tensor,
211
+ encoder_all_hidden: List[torch.Tensor],
212
+ self_attention_mask: Optional[torch.Tensor] = None,
213
+ cross_attention_mask: Optional[torch.Tensor] = None,
214
+ use_cache: bool = False) -> Tuple[torch.Tensor, Optional[Tuple]]:
215
+ """
216
+ Forward pass through decoder layer
217
+ """
218
+ # Self-attention with residual
219
+ residual = hidden_states
220
+ hidden_states = self.self_attn_norm(hidden_states)
221
+ self_attn_out, cache = self.self_attn(
222
+ hidden_states,
223
+ attention_mask=self_attention_mask,
224
+ use_cache=use_cache
225
+ )
226
+ hidden_states = residual + self.dropout(self_attn_out)
227
+
228
+ # Cross-attention with residual
229
+ residual = hidden_states
230
+ hidden_states = self.cross_attn_norm(hidden_states)
231
+ cross_attn_out = self.cross_attn(
232
+ hidden_states,
233
+ encoder_all_hidden,
234
+ attention_mask=cross_attention_mask
235
+ )
236
+ hidden_states = residual + self.dropout(cross_attn_out)
237
+
238
+ # FFN with residual
239
+ residual = hidden_states
240
+ hidden_states = self.ffn_norm(hidden_states)
241
+ ffn_out = self.ffn(hidden_states)
242
+ hidden_states = residual + self.dropout(ffn_out)
243
+
244
+ return hidden_states, cache
245
+
246
+
247
+ class DecoderV62(nn.Module):
248
+ """
249
+ 6-Layer Decoder with Multi-Level Cross-Attention
250
+ Reduced from 8 layers but compensated with better cross-attention
251
+ """
252
+
253
+ def __init__(self, config: Optional[Dict] = None):
254
+ super().__init__()
255
+
256
+ # Configuration
257
+ self.hidden_dim = 1280
258
+ self.num_heads = 16
259
+ self.num_layers = 6 # Reduced from 8
260
+ self.vocab_size = 260 # 256 bytes + special tokens
261
+ self.max_seq_len = 48
262
+
263
+ # Token constants (GPT suggestion - explicit constants)
264
+ self.PAD = 256
265
+ self.BOS = 257
266
+ self.EOS = 258
267
+ self.MASK = 259
268
+
269
+ # Token embedding and position encoding
270
+ self.token_embedding = nn.Embedding(self.vocab_size, self.hidden_dim)
271
+ self.position_embedding = nn.Embedding(self.max_seq_len, self.hidden_dim)
272
+
273
+ # 6 decoder layers with layer-specific cross-attention
274
+ self.layers = nn.ModuleList([
275
+ DecoderLayer(self.hidden_dim, self.num_heads, layer_id=i)
276
+ for i in range(self.num_layers)
277
+ ])
278
+
279
+ # Output projection
280
+ self.output_norm = nn.LayerNorm(self.hidden_dim)
281
+ self.output_projection = nn.Linear(self.hidden_dim, self.vocab_size)
282
+
283
+ # Monitoring (GPT-5 suggestion)
284
+ # Track importance of ENCODER layers (4) used by decoder
285
+ self.register_buffer('layer_importance', torch.zeros(4)) # Track importance of 4 encoder layers
286
+
287
+ def forward(self,
288
+ encoder_all_hidden: List[torch.Tensor],
289
+ decoder_input_ids: Optional[torch.Tensor] = None,
290
+ attention_mask: Optional[torch.Tensor] = None,
291
+ use_cache: bool = False,
292
+ past_key_values: Optional[List] = None) -> Dict[str, torch.Tensor]:
293
+ """
294
+ Forward pass through decoder
295
+
296
+ Args:
297
+ encoder_all_hidden: All encoder layer outputs (4 layers)
298
+ decoder_input_ids: Input token IDs for teacher forcing
299
+ attention_mask: Attention mask
300
+ use_cache: Whether to cache KV for inference
301
+ past_key_values: Cached KV from previous steps
302
+ """
303
+ batch_size = encoder_all_hidden[0].size(0)
304
+ device = encoder_all_hidden[0].device
305
+
306
+ # If no decoder input, start with compressed representation
307
+ if decoder_input_ids is None:
308
+ # Use encoder's final compressed output as starting point
309
+ hidden_states = encoder_all_hidden[-1] # [batch, M tokens, 1280]
310
+ seq_len = hidden_states.size(1)
311
+ else:
312
+ # Teacher forcing mode: use provided tokens
313
+ seq_len = decoder_input_ids.size(1)
314
+
315
+ # Embeddings
316
+ token_embeds = self.token_embedding(decoder_input_ids)
317
+ position_ids = torch.arange(seq_len, device=device).expand(batch_size, -1)
318
+ position_embeds = self.position_embedding(position_ids)
319
+
320
+ hidden_states = token_embeds + position_embeds
321
+
322
+ # Create causal mask for self-attention (additive mask - GPT suggestion)
323
+ causal_mask = torch.full((1, 1, seq_len, seq_len), float('-inf'), device=device)
324
+ causal_mask = torch.triu(causal_mask, diagonal=1) # [1, 1, seq, seq]
325
+
326
+ # Pass through decoder layers
327
+ all_hidden_states = []
328
+ all_caches = [] if use_cache else None
329
+
330
+ for i, layer in enumerate(self.layers):
331
+ # GPT final check: Create proper cross-attention mask for encoder hidden states
332
+ if encoder_all_hidden is not None and len(encoder_all_hidden) > 0:
333
+ S_enc = encoder_all_hidden[0].size(1) # Encoder sequence length
334
+ # Create additive mask (0 = attend, -inf = mask)
335
+ cross_mask = torch.zeros((batch_size, 1, 1, S_enc), device=hidden_states.device)
336
+ else:
337
+ cross_mask = None
338
+
339
+ hidden_states, cache = layer(
340
+ hidden_states,
341
+ encoder_all_hidden,
342
+ self_attention_mask=causal_mask,
343
+ cross_attention_mask=cross_mask, # Use proper cross mask
344
+ use_cache=use_cache
345
+ )
346
+
347
+ all_hidden_states.append(hidden_states)
348
+ if use_cache:
349
+ all_caches.append(cache)
350
+
351
+ # Final output projection
352
+ hidden_states = self.output_norm(hidden_states)
353
+ logits = self.output_projection(hidden_states)
354
+
355
+ # Update monitoring: track encoder layer importance
356
+ # (This would be computed based on cross-attention weights in practice)
357
+ with torch.no_grad():
358
+ # Simplified: assume equal importance for now
359
+ self.layer_importance = torch.full_like(self.layer_importance, 0.25)  # keep the buffer's device/dtype
360
+
361
+ outputs = {
362
+ 'logits': logits,
363
+ 'last_hidden_state': hidden_states,
364
+ 'all_hidden_states': all_hidden_states,
365
+ 'encoder_layer_importance': self.layer_importance
366
+ }
367
+
368
+ if use_cache:
369
+ outputs['past_key_values'] = all_caches
370
+
371
+ return outputs
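A small self-contained check of the additive causal mask convention used above (0 where attention is allowed, -inf strictly above the diagonal):

import torch

seq_len = 4
causal_mask = torch.full((1, 1, seq_len, seq_len), float('-inf'))
causal_mask = torch.triu(causal_mask, diagonal=1)

scores = torch.zeros(1, 1, seq_len, seq_len)        # stand-in attention scores
attn = torch.softmax(scores + causal_mask, dim=-1)  # each row attends only to positions <= its own
print(attn[0, 0])                                   # row 0 = [1, 0, 0, 0], row 1 = [0.5, 0.5, 0, 0], ...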
372
+
373
+ def generate(self,
374
+ encoder_all_hidden: List[torch.Tensor],
375
+ max_length: int = 48,
376
+ temperature: float = 1.0,
377
+ top_k: int = 50,
378
+ top_p: float = 0.95) -> torch.Tensor:
379
+ """
380
+ Autoregressive generation
381
+ """
382
+ batch_size = encoder_all_hidden[0].size(0)
383
+ device = encoder_all_hidden[0].device
384
+
385
+ # Start with BOS token
386
+ generated = torch.full((batch_size, 1), self.BOS, device=device)
387
+
388
+ # Generate tokens one by one
389
+ past_key_values = None
390
+ for _ in range(max_length - 1):
391
+ # GPT optimization: Only pass last token for O(T) complexity
392
+ if past_key_values is not None:
393
+ decoder_input = generated[:, -1:] # Last token only
394
+ else:
395
+ decoder_input = generated # Full sequence for first step
396
+
397
+ outputs = self.forward(
398
+ encoder_all_hidden,
399
+ decoder_input_ids=decoder_input,
400
+ use_cache=True,
401
+ past_key_values=past_key_values
402
+ )
403
+
404
+ logits = outputs['logits'][:, -1, :] / temperature
405
+
406
+ # Top-k filtering
407
+ if top_k > 0:
408
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
409
+ logits[indices_to_remove] = float('-inf')
410
+
411
+ # Top-p (nucleus) filtering
412
+ if top_p < 1.0:
413
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
414
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
415
+
416
+ # Remove tokens with cumulative probability above threshold
417
+ sorted_indices_to_remove = cumulative_probs > top_p
418
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
419
+ sorted_indices_to_remove[..., 0] = 0
420
+
421
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
422
+ logits[indices_to_remove] = float('-inf')
423
+
424
+ # Sample
425
+ probs = F.softmax(logits, dim=-1)
426
+ next_token = torch.multinomial(probs, num_samples=1)
427
+
428
+ # Append to generated sequence
429
+ generated = torch.cat([generated, next_token], dim=1)
430
+
431
+ # Check for EOS
432
+ if (next_token == self.EOS).all():
433
+ break
434
+
435
+ past_key_values = outputs.get('past_key_values')
436
+
437
+ return generated
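A minimal standalone version of the top-k / top-p filtering performed inside generate() (toy 6-way distribution; the thresholds are illustrative):

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.5, 0.5, 0.1, -1.0, -2.0]])
top_k, top_p = 3, 0.9

# Top-k: drop everything below the k-th largest logit
kth = torch.topk(logits, top_k)[0][..., -1, None]
logits = logits.masked_fill(logits < kth, float('-inf'))

# Top-p: drop the tail once cumulative probability exceeds the threshold
sorted_logits, sorted_idx = torch.sort(logits, descending=True)
cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
remove = cum_probs > top_p
remove[..., 1:] = remove[..., :-1].clone()   # always keep the token that crosses the threshold
remove[..., 0] = False
logits = logits.masked_fill(remove.scatter(1, sorted_idx, remove), float('-inf'))

print(F.softmax(logits, dim=-1))             # probability mass only on the surviving tokens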
438
+
439
+ def get_memory_usage(self) -> Dict[str, float]:
440
+ """
441
+ Calculate memory usage with KV cache optimization (GPT-5 metric)
442
+ """
443
+ # Standard attention: 16 heads for K and V
444
+ standard_kv_memory = 2 * 16 * self.max_seq_len * 80 * 4 # bytes
445
+
446
+ # Optimized: 2 heads for K and V
447
+ optimized_kv_memory = 2 * 2 * self.max_seq_len * 80 * 4 # bytes
448
+
449
+ return {
450
+ 'standard_kv_mb': standard_kv_memory / (1024 * 1024),
451
+ 'optimized_kv_mb': optimized_kv_memory / (1024 * 1024),
452
+ 'reduction_ratio': standard_kv_memory / optimized_kv_memory,
453
+ 'total_params_m': sum(p.numel() for p in self.parameters()) / 1e6
454
+ }
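The reported reduction can be checked by hand: with head_dim = 80, max_seq_len = 48 and fp32 (4 bytes per value),

standard = 2 * 16 * 48 * 80 * 4    # K and V, 16 heads -> 491,520 bytes (~0.47 MB)
optimized = 2 * 2 * 48 * 80 * 4    # K and V,  2 heads ->  61,440 bytes (~0.06 MB)
print(standard / optimized)        # 8.0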
455
+
456
+
457
+ if __name__ == "__main__":
458
+ # Test the decoder
459
+ decoder = DecoderV62()
460
+
461
+ # Simulate encoder outputs (4 layers, 6 tokens each)
462
+ batch_size = 2
463
+ num_tokens = 6 # After progressive splitting
464
+ hidden_dim = 1280
465
+
466
+ encoder_outputs = [
467
+ torch.randn(batch_size, num_tokens, hidden_dim)
468
+ for _ in range(4)
469
+ ]
470
+
471
+ # Test with teacher forcing
472
+ decoder_input = torch.randint(0, 256, (batch_size, 48))
473
+ output = decoder(encoder_outputs, decoder_input_ids=decoder_input)
474
+
475
+ print(f"Decoder output shape: {output['logits'].shape}")
476
+ print(f"Encoder layer importance: {output['encoder_layer_importance']}")
477
+
478
+ # Test generation
479
+ generated = decoder.generate(encoder_outputs, max_length=48)
480
+ print(f"Generated shape: {generated.shape}")
481
+
482
+ # Memory usage
483
+ memory_stats = decoder.get_memory_usage()
484
+ print(f"Memory optimization: {memory_stats['reduction_ratio']:.1f}x reduction")
485
+ print(f"Total parameters: {memory_stats['total_params_m']:.1f}M")
core/encoder.py ADDED
@@ -0,0 +1,588 @@
1
+ """
2
+ Intelligent Tokenizer v6.2.0 - Progressive Splitting Encoder
3
+ With GPT-5 suggested improvements
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from typing import Dict, List, Optional, Tuple
10
+ import math
11
+
12
+
13
+ class RoPEPositionalEncoding(nn.Module):
14
+ """
15
+ Rotary Position Embedding (RoPE) - GPT-5 suggestion
16
+ Better for handling chunk boundaries and variable sequence lengths
17
+ """
18
+
19
+ def __init__(self, dim: int, max_seq_len: int = 48, base: int = 10000):
20
+ super().__init__()
21
+ self.dim = dim
22
+ self.max_seq_len = max_seq_len
23
+ self.base = base
24
+
25
+ # Precompute sinusoidal frequencies
26
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
27
+ self.register_buffer('inv_freq', inv_freq)
28
+
29
+ # Precompute positional encodings
30
+ t = torch.arange(max_seq_len).type_as(self.inv_freq)
31
+ freqs = torch.outer(t, self.inv_freq)
32
+ self.register_buffer('cos_cached', freqs.cos())
33
+ self.register_buffer('sin_cached', freqs.sin())
34
+
35
+ def forward(self, x: torch.Tensor, seq_len: int = None) -> torch.Tensor:
36
+ """
37
+ Apply RoPE to input tensor
38
+ Handles chunk boundary corrections as suggested by GPT-5
39
+ """
40
+ if seq_len is None:
41
+ seq_len = x.shape[1]
42
+
43
+ # Get cached cos/sin values
44
+ cos = self.cos_cached[:seq_len]
45
+ sin = self.sin_cached[:seq_len]
46
+
47
+ # Apply rotary embedding
48
+ x_rot = self._apply_rotary_emb(x, cos, sin)
49
+
50
+ return x_rot
51
+
52
+ def _apply_rotary_emb(self, x, cos, sin):
53
+ """Apply rotary embedding to input"""
54
+ x1, x2 = x[..., ::2], x[..., 1::2]
55
+ x_rot = torch.stack([
56
+ x1 * cos - x2 * sin,
57
+ x1 * sin + x2 * cos
58
+ ], dim=-1).flatten(-2)
59
+ return x_rot
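A self-contained sketch of the same pairwise rotation (even/odd feature pairs rotated by position-dependent angles); since it is a pure rotation, vector norms are preserved:

import torch

dim, max_seq_len, base = 8, 16, 10000
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))    # [dim/2]
freqs = torch.outer(torch.arange(max_seq_len).float(), inv_freq)      # [seq, dim/2]
cos, sin = freqs.cos(), freqs.sin()

x = torch.randn(1, max_seq_len, dim)
x1, x2 = x[..., ::2], x[..., 1::2]
x_rot = torch.stack([x1 * cos - x2 * sin,
                     x1 * sin + x2 * cos], dim=-1).flatten(-2)

print(torch.allclose(x.norm(dim=-1), x_rot.norm(dim=-1), atol=1e-5))  # True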
60
+
61
+
62
+ class GatedCrossAttention(nn.Module):
63
+ """
64
+ Gated Cross-Attention with MQA - GPT-5 suggestion
65
+ Monitor gate values for quality assessment
66
+ 16Q → 2K/V for 8x memory reduction
67
+ """
68
+
69
+ def __init__(self, hidden_dim: int = 1280, num_heads: int = 16, kv_heads: int = 2):
70
+ super().__init__()
71
+ self.hidden_dim = hidden_dim
72
+ self.num_heads = num_heads
73
+ self.kv_heads = kv_heads # Reduced KV heads (GPT suggestion)
74
+ self.head_dim = hidden_dim // num_heads # 80
75
+
76
+ # Multi-Query Attention projections
77
+ self.q_proj = nn.Linear(hidden_dim, hidden_dim) # 16 heads
78
+ self.k_proj = nn.Linear(hidden_dim, kv_heads * self.head_dim) # 2 heads
79
+ self.v_proj = nn.Linear(hidden_dim, kv_heads * self.head_dim) # 2 heads
80
+ self.o_proj = nn.Linear(hidden_dim, hidden_dim)
81
+
82
+ # Gating mechanism (GPT-5 suggestion)
83
+ self.gate = nn.Sequential(
84
+ nn.Linear(hidden_dim * 2, hidden_dim),
85
+ nn.Sigmoid()
86
+ )
87
+
88
+ # Gate monitoring (for analysis)
89
+ self.register_buffer('gate_values', torch.zeros(1))
90
+
91
+ # Warmup factor (GPT suggestion)
92
+ self.register_buffer('warmup_alpha', torch.tensor(1.0))
93
+
94
+ def forward(self,
95
+ query: torch.Tensor,
96
+ key: torch.Tensor,
97
+ value: torch.Tensor,
98
+ mask: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
99
+ """
100
+ Forward pass with gate monitoring
101
+ Returns: (output, gate_values)
102
+ """
103
+ batch_size, seq_len = query.shape[:2]
104
+
105
+ # Multi-head attention projections
106
+ Q = self.q_proj(query).view(batch_size, seq_len, self.num_heads, self.head_dim)
107
+ K = self.k_proj(key).view(batch_size, -1, self.kv_heads, self.head_dim)
108
+ V = self.v_proj(value).view(batch_size, -1, self.kv_heads, self.head_dim)
109
+
110
+ # Transpose for attention computation
111
+ Q = Q.transpose(1, 2) # [batch, heads, seq, dim]
112
+ K = K.transpose(1, 2) # [batch, kv_heads, seq, dim]
113
+ V = V.transpose(1, 2)
114
+
115
+ # Repeat KV heads to match Q heads if necessary
116
+ if self.kv_heads < self.num_heads:
117
+ repeat_factor = self.num_heads // self.kv_heads
118
+ K = K.repeat_interleave(repeat_factor, dim=1)
119
+ V = V.repeat_interleave(repeat_factor, dim=1)
120
+
121
+ # Scaled dot-product attention
122
+ scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
123
+
124
+ if mask is not None:
125
+ scores = scores.masked_fill(mask == 0, -1e9)
126
+
127
+ attn_weights = F.softmax(scores, dim=-1)
128
+ attn_output = torch.matmul(attn_weights, V)
129
+
130
+ # Reshape back
131
+ attn_output = attn_output.transpose(1, 2).contiguous()
132
+ attn_output = attn_output.view(batch_size, seq_len, self.hidden_dim)
133
+ attn_output = self.o_proj(attn_output)
134
+
135
+ # Gating mechanism
136
+ gate_input = torch.cat([query, attn_output], dim=-1)
137
+ gate_values = self.gate(gate_input)
138
+
139
+ # Store gate values for monitoring (keep tensor shape consistent)
140
+ self.gate_values[0] = gate_values.mean().detach()
141
+
142
+ # Apply gate with warmup factor (GPT suggestion)
143
+ gate_values = gate_values * self.warmup_alpha
144
+ output = gate_values * attn_output + (1 - gate_values) * query
145
+
146
+ return output, gate_values
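The gate is a learned blend between the cross-attention output and the original query; a toy version of the same mix, including the warmup scaling:

import torch

query = torch.randn(2, 6, 1280)
attn_output = torch.randn(2, 6, 1280)
gate = torch.sigmoid(torch.randn(2, 6, 1280))   # in the module this comes from Linear(2*hidden, hidden) + Sigmoid
warmup_alpha = 0.1                              # early in training the gate is mostly closed

gate = gate * warmup_alpha
output = gate * attn_output + (1 - gate) * query
print(output.shape)                             # torch.Size([2, 6, 1280]); stays close to query while alpha is small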
147
+
148
+
149
+
150
+ class ProgressiveSplittingLayer(nn.Module):
151
+ """
152
+ Core innovation: 48 bytes → 1 token → N tokens → M tokens
153
+ """
154
+
155
+ def __init__(self, hidden_dim: int = 1280, config: Optional[Dict] = None):
156
+ super().__init__()
157
+ self.hidden_dim = hidden_dim
158
+ self.config = config or {}
159
+
160
+ # Dynamic splitting: 1~4 tokens for efficiency
161
+ # 48 bytes / 4 tokens = 12:1 compression (still beats BPE's 4:1)
162
+ self.min_tokens = 1 # 48:1 compression
163
+ self.max_tokens = 4 # 12:1 compression (still 3x better than BPE)
164
+
165
+ # Initial compression: 48 bytes → 1 super token
166
+ self.byte_embed = nn.Embedding(260, 64) # Small embedding
167
+ self.initial_compressor = nn.Sequential(
168
+ nn.Linear(48 * 64, 2048),
169
+ nn.LayerNorm(2048),
170
+ nn.ReLU(),
171
+ nn.Dropout(0.1),
172
+ nn.Linear(2048, hidden_dim),
173
+ nn.LayerNorm(hidden_dim)
174
+ )
175
+
176
+ # Language-aware splitting: 1 → N tokens (config-based)
177
+ self.language_splitter = nn.ModuleDict({
178
+ 'analyzer': nn.Sequential(
179
+ nn.Linear(hidden_dim, 512),
180
+ nn.ReLU(),
181
+ nn.Linear(512, 256) # Language features
182
+ ),
183
+ 'split_predictor': nn.Linear(256, self.max_tokens), # Predict 1~4 tokens
184
+ # Single unified expander that can produce any number of tokens
185
+ 'dynamic_expander': nn.Sequential(
186
+ nn.Linear(hidden_dim, hidden_dim * 2),
187
+ nn.LayerNorm(hidden_dim * 2),
188
+ nn.GELU(), # Better than ReLU for transformers
189
+ nn.Linear(hidden_dim * 2, hidden_dim * self.max_tokens) # Can produce up to 4 tokens
190
+ ),
191
+ # Token-wise importance predictor
192
+ 'importance_predictor': nn.Sequential(
193
+ nn.Linear(hidden_dim, 256),
194
+ nn.ReLU(),
195
+ nn.Linear(256, self.max_tokens), # Importance for each potential token
196
+ nn.Softmax(dim=-1)
197
+ )
198
+ })
199
+
200
+ # Boundary refinement: N → M tokens with linguistic awareness
201
+ self.boundary_refiner = nn.ModuleDict({
202
+ 'scorer': nn.Sequential(
203
+ nn.Linear(hidden_dim, 512),
204
+ nn.ReLU(),
205
+ nn.Linear(512, 1)
206
+ ),
207
+ 'morpheme_detector': nn.Conv1d(256, 64, 3), # morpheme-level boundaries
208
+ 'word_detector': nn.Conv1d(256, 64, 5), # word-level boundaries
209
+ 'phrase_detector': nn.Conv1d(256, 64, 7), # phrase-level boundaries
210
+ 'adjuster': nn.TransformerEncoderLayer(
211
+ d_model=hidden_dim,
212
+ nhead=16,
213
+ dim_feedforward=4 * hidden_dim,
214
+ dropout=0.1,
215
+ batch_first=True
216
+ )
217
+ })
218
+
219
+ # Initialize split_predictor bias to prefer 1 token initially
220
+ # This ensures untrained model starts with maximum compression
221
+ with torch.no_grad():
222
+ self.language_splitter['split_predictor'].bias.data = torch.tensor([2.0, -1.0, -1.0, -1.0])
223
+ # High bias for 1 token, negative for others
224
+
225
+ def forward(self, input_ids: torch.Tensor, temperature: float = 1.0) -> Dict[str, torch.Tensor]:
226
+ """
227
+ Progressive splitting forward pass
228
+
229
+ Args:
230
+ input_ids: Input byte sequence [batch, seq_len]
231
+ temperature: Gumbel-Softmax temperature for annealing
232
+ """
233
+ batch_size = input_ids.size(0)
234
+
235
+ # Step 1: 48 bytes → 1 super token
236
+ byte_embeddings = self.byte_embed(input_ids) # [batch, 48, 64]
237
+ flattened = byte_embeddings.view(batch_size, -1) # [batch, 3072]
238
+ super_token = self.initial_compressor(flattened) # [batch, 1280]
239
+ super_token = super_token.unsqueeze(1) # [batch, 1, 1280]
240
+
241
+ # Step 2: Language analysis and splitting (1 → N)
242
+ lang_features = self.language_splitter['analyzer'](super_token)
243
+ split_logits = self.language_splitter['split_predictor'](lang_features)
244
+ split_weights = F.softmax(split_logits, dim=-1) # [batch, 1, 4]
245
+
246
+ # Direct transformation from super token to initial representation
247
+ # No hardcoded splits - let the model learn everything
248
+ lang_tokens = super_token # Start with compressed representation
249
+
250
+ # TRUE Adaptive expansion - Model learns optimal split (1~4 tokens)
251
+ # Analyze content to decide how many tokens needed
252
+ expansion_features = self.language_splitter['analyzer'](lang_tokens) # [batch, 1, 256]
253
+
254
+ # Dynamic expansion: generate up to 4 tokens from super token
255
+ expanded = self.language_splitter['dynamic_expander'](lang_tokens.squeeze(1)) # [batch, hidden_dim*4]
256
+ expanded = expanded.reshape(batch_size, self.max_tokens, self.hidden_dim) # [batch, 4, hidden_dim]
257
+
258
+ # Predict how many tokens we actually need (1~4)
259
+ split_logits = self.language_splitter['split_predictor'](expansion_features.squeeze(1)) # [batch, 4]
260
+ # Clamp logits to prevent extreme values that cause NaN
261
+ split_logits = torch.clamp(split_logits, min=-10, max=10)
262
+ # Ensure minimum temperature to prevent instability
263
+ safe_temperature = max(temperature, 0.5)
264
+ split_weights = F.gumbel_softmax(split_logits, tau=safe_temperature, hard=False, dim=-1) # [batch, 4]
265
+
266
+ # Predict importance for each potential token position
267
+ importance = self.language_splitter['importance_predictor'](lang_tokens.squeeze(1)) # [batch, 4]
268
+
269
+ # Dynamic token selection with importance-weighted allocation
270
+ # Create cumulative mask for progressive token usage
271
+ # If split_weights = [0.1, 0.2, 0.6, 0.1], we mainly use 3 tokens
272
+
273
+ # Create progressive masks for 1, 2, 3, 4 tokens
274
+ masks = []
275
+ for n in range(1, self.max_tokens + 1):
276
+ mask = torch.zeros(batch_size, self.max_tokens, 1, device=expanded.device)
277
+ mask[:, :n, :] = 1.0
278
+ masks.append(mask)
279
+
280
+ # Apply importance-weighted masking
281
+ # Important parts get more tokens, less important parts get fewer
282
+ weighted_outputs = []
283
+ for i, mask in enumerate(masks):
284
+ num_tokens = i + 1
285
+ # Weight by both split decision and importance
286
+ token_weight = split_weights[:, i:i+1].unsqueeze(-1) # [batch, 1, 1]
287
+
288
+ # Apply importance modulation for asymmetric splits
289
+ if num_tokens > 1:
290
+ # Redistribute tokens based on importance
291
+ importance_adjusted = importance[:, :num_tokens].unsqueeze(-1) # [batch, n, 1]
292
+ masked = expanded[:, :num_tokens] * importance_adjusted
293
+ else:
294
+ masked = expanded[:, :num_tokens]
295
+
296
+ # Pad to max length
297
+ if num_tokens < self.max_tokens:
298
+ padding = torch.zeros(batch_size, self.max_tokens - num_tokens, self.hidden_dim,
299
+ device=expanded.device)
300
+ masked = torch.cat([masked, padding], dim=1)
301
+
302
+ weighted_outputs.append(masked * token_weight)
303
+
304
+ # Sum all weighted possibilities (differentiable selection)
305
+ lang_tokens = sum(weighted_outputs)
306
+
307
+ # Determine effective number of tokens (for monitoring)
308
+ # Weighted average of token counts
309
+ token_counts = torch.arange(1, self.max_tokens + 1, device=split_weights.device, dtype=torch.float32)
310
+ avg_tokens = (split_weights * token_counts).sum(dim=-1).mean().item()
311
+
312
+ k = lang_tokens.size(1)
313
+
314
+ # Step 3: Boundary refinement (N → M)
315
+ # Calculate boundary scores for each token position
316
+ boundary_scores = self.boundary_refiner['scorer'](lang_tokens) # [batch, N, 1]
317
+
318
+ # Detect linguistic boundaries (morpheme, word, phrase)
319
+ # Extract features for boundary detection
320
+ if hasattr(lang_tokens, 'shape') and len(lang_tokens.shape) == 3:
321
+ batch_size, num_tokens, hidden_dim = lang_tokens.shape
322
+
323
+ # For boundary detection, we need to consider the original byte sequence
324
+ # But we're working with compressed tokens here
325
+ # So we detect boundaries based on learned representations
326
+
327
+ # Apply boundary adjustment with TransformerEncoderLayer
328
+ # This learns to adjust token boundaries based on context
329
+ refined_tokens = self.boundary_refiner['adjuster'](lang_tokens)
330
+
331
+ # The adjuster should learn to:
332
+ # 1. Respect UTF-8 boundaries (learned during training)
333
+ # 2. Align with word/phrase boundaries (learned from language patterns)
334
+ # 3. Maintain semantic coherence within each token
335
+ else:
336
+ refined_tokens = lang_tokens
337
+
338
+ # Determine actual number of tokens based on highest probability
339
+ # During inference, use argmax. During training, use weighted average.
340
+ if self.training:
341
+ # During training, use weighted average for differentiability
342
+ actual_num_tokens = avg_tokens
343
+ else:
344
+ # During inference, select the split with highest probability
345
+ split_decision = torch.argmax(split_weights, dim=-1) # [batch]
346
+ actual_num_tokens = (split_decision.float().mean() + 1).item() # +1 because indices are 0-3
347
+
348
+ # Calculate compression ratio based on actual tokens used
349
+ compression_ratio = 48.0 / max(1, actual_num_tokens)
350
+
351
+ return {
352
+ 'tokens': refined_tokens,
353
+ 'num_tokens': actual_num_tokens,
354
+ 'compression_ratio': torch.tensor(compression_ratio, device=refined_tokens.device),
355
+ 'gate_values': None, # Will be filled by cross-attention
356
+ 'language_features': lang_features,
357
+ 'split_weights': split_weights,
358
+ 'avg_tokens': avg_tokens,
359
+ 'split_distribution': split_weights.mean(dim=0)
360
+ }
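A standalone sketch of the soft token-count selection above: Gumbel-Softmax over the four split options, a differentiable expected token count, and the implied compression ratio for a 48-byte chunk.

import torch
import torch.nn.functional as F

split_logits = torch.tensor([[2.0, -1.0, -1.0, -1.0]])             # biased toward 1 token, as initialized above
split_weights = F.gumbel_softmax(split_logits, tau=1.0, hard=False, dim=-1)

token_counts = torch.arange(1, 5, dtype=torch.float32)             # the four options: 1..4 tokens
avg_tokens = (split_weights * token_counts).sum(dim=-1).mean()     # soft "how many tokens" estimate
compression_ratio = 48.0 / avg_tokens.clamp(min=1.0)

print(split_weights, avg_tokens.item(), compression_ratio.item())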
361
+
362
+
363
+ class EncoderV62(nn.Module):
364
+ """
365
+ 4-Layer Progressive Splitting Encoder with Cross-Attention
366
+ All layers: 1280 dimensions
367
+ """
368
+
369
+ def __init__(self, config: Optional[Dict] = None):
370
+ super().__init__()
371
+
372
+ # Store config for later use
373
+ self.config = config or {}
374
+
375
+ # Configuration
376
+ self.hidden_dim = 1280
377
+ self.num_heads = 16
378
+ self.num_layers = 4
379
+ self.max_seq_len = 48
380
+ self.dropout = 0.1
381
+
382
+ # RoPE positional encoding (GPT-5 suggestion)
383
+ self.rope = RoPEPositionalEncoding(self.hidden_dim, self.max_seq_len)
384
+
385
+ # Layer 0: Progressive Splitting (48→1→N→M) - Pass config
386
+ self.progressive_splitter = ProgressiveSplittingLayer(self.hidden_dim, config)
387
+
388
+ # Layers 1-3: Transformer encoders with cross-attention
389
+ self.encoder_layers = nn.ModuleList([
390
+ nn.TransformerEncoderLayer(
391
+ d_model=self.hidden_dim,
392
+ nhead=self.num_heads,
393
+ dim_feedforward=4 * self.hidden_dim, # 5120
394
+ dropout=self.dropout,
395
+ batch_first=True
396
+ ) for _ in range(3)
397
+ ])
398
+
399
+ # Cross-attention between layers with MQA (GPT-5 suggestion)
400
+ self.cross_attentions = nn.ModuleList([
401
+ GatedCrossAttention(self.hidden_dim, self.num_heads, kv_heads=2) # 8x memory reduction
402
+ for _ in range(3)
403
+ ])
404
+
405
+ # Output heads for different tasks
406
+ self.boundary_head = nn.Linear(self.hidden_dim, 4)
407
+ self.language_head = nn.Linear(self.hidden_dim, 128) # Reduced from 512 (GPT suggestion)
408
+ self.compression_head = nn.Linear(self.hidden_dim, self.hidden_dim)
409
+
410
+ # Monitoring metrics (GPT-5 suggestion)
411
+ self.register_buffer('compression_ratios', torch.zeros(1))
412
+ self.register_buffer('gate_averages', torch.zeros(3))
413
+
414
+ def forward(self,
415
+ input_ids: torch.Tensor,
416
+ attention_mask: Optional[torch.Tensor] = None,
417
+ temperature: float = 1.0) -> Dict[str, torch.Tensor]:
418
+ """
419
+ Forward pass through the encoder
420
+
421
+ Args:
422
+ input_ids: Input byte sequence
423
+ attention_mask: Optional attention mask
424
+ temperature: Gumbel-Softmax temperature for annealing
425
+ """
426
+ # Layer 0: Progressive splitting with temperature
427
+ split_output = self.progressive_splitter(input_ids, temperature)
428
+ x = split_output['tokens'] # [batch, M, 1280]
429
+
430
+ # Apply RoPE
431
+ x = self.rope(x, x.size(1))
432
+
433
+ # Store all hidden states for decoder
434
+ all_hidden_states = [x]
435
+ gate_values_list = []
436
+
437
+ # Layers 1-3 with cross-attention
438
+ for i, (encoder_layer, cross_attn) in enumerate(
439
+ zip(self.encoder_layers, self.cross_attentions)
440
+ ):
441
+ # Self-attention through transformer layer
442
+ # GPT final check: Don't pass mask after progressive splitting changes sequence length
443
+ x = encoder_layer(x) # No mask needed (no padding after compression)
444
+
445
+ # Cross-attention with previous layer
446
+ if i > 0:
447
+ # Cross-attention with previous layer
448
+ x, gate_values = cross_attn(
449
+ query=x,
450
+ key=all_hidden_states[-1],
451
+ value=all_hidden_states[-1],
452
+ mask=None # Mask not applicable after compression
453
+ )
454
+ gate_values_list.append(gate_values)
455
+ # Keep tensor shape consistent - store in existing buffer element
456
+ self.gate_averages[i-1] = gate_values.mean().detach().item() # Fix indexing
457
+
458
+ all_hidden_states.append(x)
459
+
460
+ # Output projections
461
+ boundaries = self.boundary_head(x)
462
+ language_clusters = self.language_head(x)
463
+ compressed = self.compression_head(x)
464
+
465
+ # Update monitoring metrics
466
+ # Ensure tensor is 1-dimensional for buffer assignment
467
+ compression_ratio = split_output['compression_ratio']
468
+ if compression_ratio.dim() == 0: # Scalar tensor
469
+ self.compression_ratios[0] = compression_ratio
470
+ else:
471
+ self.compression_ratios = compression_ratio
472
+
473
+ return {
474
+ 'last_hidden_state': x,
475
+ 'all_hidden_states': all_hidden_states,
476
+ 'boundaries': boundaries,
477
+ 'language_clusters': language_clusters,
478
+ 'compressed': compressed,
479
+ 'compression_ratio': split_output['compression_ratio'],
480
+ 'num_tokens': split_output['num_tokens'],
481
+ 'splitting_probs': split_output.get('split_weights', None), # Add for diagnostics
482
+ 'gate_values': gate_values_list,
483
+ 'gate_averages': self.gate_averages,
484
+ 'split_info': {
485
+ 'language_features': split_output['language_features'],
486
+ 'split_weights': split_output['split_weights']
487
+ }
488
+ }
489
+
490
+ def get_monitoring_stats(self) -> Dict[str, float]:
491
+ """
492
+ Get monitoring statistics (GPT-5 suggestion)
493
+ """
494
+ return {
495
+ 'avg_compression_ratio': self.compression_ratios.item(),
496
+ 'gate_layer1': self.gate_averages[0].item(),
497
+ 'gate_layer2': self.gate_averages[1].item(),
498
+ 'gate_layer3': self.gate_averages[2].item(),
499
+ }
500
+
501
+ def set_warmup_step(self, step: int, total_warmup: int = 1000):
502
+ """
503
+ Set warmup alpha for all gates (GPT suggestion)
504
+ Gradually increase gate influence from 0 to 1
505
+ """
506
+ alpha = min(1.0, step / total_warmup)
507
+ for cross_attn in self.cross_attentions:
508
+ cross_attn.warmup_alpha = torch.tensor(alpha, device=cross_attn.warmup_alpha.device)
509
+
510
+ def adaptive_compression_control(self, reconstruction_loss: float):
511
+ """
512
+ Adaptive compression based on reconstruction quality
513
+ No fixed phases - model learns optimal compression
514
+ """
515
+ # If reconstruction is poor, model will learn to use more tokens
516
+ # This happens automatically through gradient descent
517
+ # No manual phase control needed
518
+ pass # Let gradients handle it
519
+
520
+
521
+ class DualSlidingWindowEncoder(EncoderV62):
522
+ """
523
+ Extension with dual sliding window system
524
+ Handles both chunk-level and token-level boundaries
525
+ """
526
+
527
+ def __init__(self, config: Optional[Dict] = None):
528
+ super().__init__(config)
529
+
530
+ # Chunk-level sliding window
531
+ self.chunk_window = nn.Conv1d(
532
+ in_channels=1,
533
+ out_channels=1,
534
+ kernel_size=8, # 8-byte overlap
535
+ stride=40, # 48-8=40 stride
536
+ padding=4
537
+ )
538
+
539
+ # Token-level sliding window
540
+ self.token_window = nn.MultiheadAttention(
541
+ embed_dim=self.hidden_dim,
542
+ num_heads=self.num_heads,
543
+ batch_first=True
544
+ )
545
+
546
+ def process_long_sequence(self, input_ids: torch.Tensor) -> Dict[str, torch.Tensor]:
547
+ """
548
+ Handle sequences longer than 48 bytes with sliding windows
549
+ """
550
+ batch_size, seq_len = input_ids.shape
551
+
552
+ if seq_len <= 48:
553
+ return super().forward(input_ids)
554
+
555
+ # Process in chunks with overlap
556
+ chunks = []
557
+ for i in range(0, seq_len - 48 + 1, 40): # 8-byte overlap
558
+ chunk = input_ids[:, i:i+48]
559
+ chunk_output = super().forward(chunk)
560
+ chunks.append(chunk_output['last_hidden_state'])
561
+
562
+ # Combine chunks with attention
563
+ combined = torch.cat(chunks, dim=1)
564
+ attended, _ = self.token_window(combined, combined, combined)
565
+
566
+ return {
567
+ 'last_hidden_state': attended,
568
+ 'num_chunks': len(chunks),
569
+ 'total_compression': seq_len / attended.size(1)
570
+ }
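The chunking loop above uses a 48-byte window with a 40-byte stride (8-byte overlap); a quick way to list the spans that actually get encoded for a given input length:

def chunk_spans(seq_len: int, window: int = 48, stride: int = 40):
    # Start/end byte offsets of each encoder window; a trailing partial window
    # is not covered, matching the loop above.
    return [(i, i + window) for i in range(0, seq_len - window + 1, stride)]

print(chunk_spans(48))    # [(0, 48)]
print(chunk_spans(128))   # [(0, 48), (40, 88), (80, 128)]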
571
+
572
+
573
+ if __name__ == "__main__":
574
+ # Test the encoder
575
+ encoder = EncoderV62()
576
+
577
+ # Test input
578
+ batch_size = 2
579
+ input_ids = torch.randint(0, 256, (batch_size, 48))
580
+
581
+ # Forward pass
582
+ output = encoder(input_ids)
583
+
584
+ print(f"Input shape: {input_ids.shape}")
585
+ print(f"Output tokens: {output['num_tokens']}")
586
+ print(f"Compression ratio: {output['compression_ratio']:.2f}:1")
587
+ print(f"Gate averages: {output['gate_averages']}")
588
+ print(f"Monitoring stats: {encoder.get_monitoring_stats()}")
core/intelligent_loss.py ADDED
@@ -0,0 +1,589 @@
1
+ """
2
+ Intelligent Loss Functions for v6.2.0
3
+ Multi-objective loss with GPT-5 suggested improvements
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from typing import Dict, Optional, Tuple
10
+ import math
11
+
12
+
13
+ class IntelligentLoss(nn.Module):
14
+ """
15
+ Comprehensive loss function for progressive splitting tokenizer
16
+ Combines multiple objectives with dynamic weighting
17
+ """
18
+
19
+ def __init__(self, config: Optional[Dict] = None):
20
+ super().__init__()
21
+
22
+ # Default configuration
23
+ self.config = config or {}
24
+
25
+ # Special tokens (must match tokenizer)
26
+ self.PAD = 256
27
+ self.BOS = 257
28
+ self.EOS = 258
29
+ self.MASK = 259
30
+
31
+ # Loss components
32
+ self.reconstruction_loss = ReconstructionLoss(self.PAD)
33
+ self.compression_loss = CompressionLoss()
34
+ self.boundary_loss = BoundaryLoss()
35
+ self.language_loss = LanguageLoss()
36
+ self.consistency_loss = ConsistencyLoss()
37
+
38
+ # Dynamic weight adjustment
39
+ self.use_dynamic_weights = True
40
+ self.weight_history = {
41
+ 'reconstruction': [],
42
+ 'compression': [],
43
+ 'boundary': [],
44
+ 'language': [],
45
+ 'consistency': []
46
+ }
47
+
48
+ def estimate_language_difficulty(self, targets: Dict) -> float:
49
+ """Estimate language difficulty based on input characteristics"""
50
+ if 'input_ids' not in targets:
51
+ return 1.0
52
+
53
+ input_ids = targets['input_ids']
54
+ if input_ids.numel() == 0:
55
+ return 1.0
56
+
57
+ # Higher entropy = more complex language
58
+ unique_tokens = input_ids.unique().numel()
59
+ total_tokens = input_ids.numel()
60
+ diversity = min(1.0, (unique_tokens / total_tokens) * 2)
61
+
62
+ return diversity
63
+
64
+ def forward(self,
65
+ outputs: Dict[str, torch.Tensor],
66
+ targets: Dict[str, torch.Tensor],
67
+ weights: Optional[Dict[str, float]] = None) -> Dict[str, torch.Tensor]:
68
+ """
69
+ Compute combined loss with all objectives
70
+
71
+ Args:
72
+ outputs: Model outputs dictionary
73
+ targets: Target values dictionary
74
+ weights: Optional weight overrides
75
+
76
+ Returns:
77
+ Dictionary with total loss and individual components
78
+ """
79
+ losses = {}
80
+
81
+ # 1. Reconstruction loss (primary objective)
82
+ if 'logits' in outputs and 'input_ids' in targets:
83
+ losses['reconstruction'] = self.reconstruction_loss(
84
+ outputs['logits'],
85
+ targets['input_ids'],
86
+ targets.get('attention_mask')
87
+ )
88
+
89
+ # 2. Compression loss (encourage optimal compression)
90
+ if 'compression_ratio' in outputs:
91
+ losses['compression'] = self.compression_loss(
92
+ outputs['compression_ratio'],
93
+ outputs.get('num_tokens')
94
+ )
95
+
96
+ # 3. Boundary loss (learn meaningful boundaries)
97
+ if 'boundaries' in outputs and 'boundary_targets' in targets:
98
+ losses['boundary'] = self.boundary_loss(
99
+ outputs['boundaries'],
100
+ targets['boundary_targets'],
101
+ targets.get('boundary_mask')
102
+ )
103
+
104
+ # 4. Language loss (language identification/clustering)
105
+ if 'language_clusters' in outputs and 'language_targets' in targets:
106
+ losses['language'] = self.language_loss(
107
+ outputs['language_clusters'],
108
+ targets['language_targets']
109
+ )
110
+
111
+ # 5. Consistency loss (encoder-decoder consistency)
112
+ if 'encoder_hidden' in outputs and 'decoder_hidden' in outputs:
113
+ losses['consistency'] = self.consistency_loss(
114
+ outputs['encoder_hidden'],
115
+ outputs['decoder_hidden']
116
+ )
117
+
118
+ # Apply weights (either provided or dynamic)
119
+ if weights is None and self.use_dynamic_weights:
120
+ weights = self.compute_dynamic_weights(losses)
121
+ elif weights is None:
122
+ weights = {
123
+ 'reconstruction': 1.0,
124
+ 'compression': 1.0,
125
+ 'boundary': 1.0,
126
+ 'language': 0.5,
127
+ 'consistency': 0.5
128
+ }
129
+
130
+ # Weighted sum
131
+ total_loss = torch.tensor(0.0, device=next(iter(losses.values())).device)
132
+ for key, loss in losses.items():
133
+ weight = weights.get(key, 1.0)
134
+ total_loss = total_loss + weight * loss
135
+ losses[f'{key}_weighted'] = weight * loss
136
+
137
+ losses['total'] = total_loss
138
+
139
+ # Update weight history
140
+ for key in self.weight_history:
141
+ if key in losses:
142
+ self.weight_history[key].append(losses[key].item())
143
+
144
+ return losses
145
+
146
+ def compute_dynamic_weights(self, losses: Dict[str, torch.Tensor]) -> Dict[str, float]:
147
+ """
148
+ Dynamically adjust weights based on loss magnitudes and progress
149
+ GPT-5 suggestion: balance loss magnitudes for stable training
150
+ """
151
+ weights = {}
152
+ eps = 1e-8 # GPT fix: prevent division by zero
153
+
154
+ # Get loss magnitudes with NaN protection
155
+ magnitudes = {}
156
+ for k, v in losses.items():
157
+ if torch.isnan(v) or torch.isinf(v):
158
+ magnitudes[k] = 1.0 # Default safe value
159
+ else:
160
+ magnitudes[k] = v.item()
161
+
162
+ # Compute relative scales (GPT fix: add epsilon)
163
+ avg_magnitude = max(eps, sum(magnitudes.values()) / len(magnitudes))
164
+
165
+ for key, magnitude in magnitudes.items():
166
+ # Inverse scaling to balance magnitudes (GPT fix: add epsilon)
167
+ weights[key] = avg_magnitude / max(eps, magnitude)
168
+
169
+ # Dynamic adjustment based on loss ratios
170
+ if 'reconstruction' in magnitudes and 'compression' in magnitudes:
171
+ recon_loss = magnitudes['reconstruction']
172
+ comp_loss = magnitudes['compression']
173
+
174
+ # If reconstruction loss is too high relative to compression
175
+ if recon_loss > comp_loss * 10:
176
+ # Drastically reduce compression pressure
177
+ weights['compression'] *= 0.1
178
+ weights['reconstruction'] *= 5.0
179
+ elif recon_loss > comp_loss * 5:
180
+ # Moderate adjustment
181
+ weights['compression'] *= 0.5
182
+ weights['reconstruction'] *= 2.0
183
+ elif recon_loss < comp_loss * 0.5:
184
+ # Good reconstruction, can push compression
185
+ weights['compression'] *= 2.0
186
+ weights['reconstruction'] *= 0.5
187
+
188
+ # Normalize weights to prevent explosion
189
+ total_weight = sum(weights.values())
190
+ if total_weight > 0:
191
+ weights = {k: min(10.0, v / total_weight * len(weights)) for k, v in weights.items()}
192
+
193
+ return weights
194
+
195
+
196
+ class ReconstructionLoss(nn.Module):
197
+ """
198
+ Cross-entropy loss for sequence reconstruction
199
+ With label smoothing and focal loss options
200
+ """
201
+
202
+ def __init__(self, pad_token: int = 256, label_smoothing: float = 0.1):
203
+ super().__init__()
204
+ self.pad_token = pad_token
205
+ self.label_smoothing = label_smoothing
206
+ self.focal_alpha = 0.25
207
+ self.focal_gamma = 2.0
208
+ self.use_focal = False
209
+
210
+ def forward(self,
211
+ logits: torch.Tensor,
212
+ targets: torch.Tensor,
213
+ mask: Optional[torch.Tensor] = None) -> torch.Tensor:
214
+ """
215
+ Compute reconstruction loss
216
+
217
+ Args:
218
+ logits: [batch, seq_len, vocab_size]
219
+ targets: [batch, seq_len]
220
+ mask: [batch, seq_len] attention mask
221
+ """
222
+ batch_size, seq_len, vocab_size = logits.shape
223
+
224
+ # Reshape for loss computation
225
+ logits_flat = logits.reshape(-1, vocab_size)
226
+ targets_flat = targets.reshape(-1)
227
+
228
+ if self.use_focal:
229
+ # Focal loss for hard examples
230
+ ce_loss = F.cross_entropy(logits_flat, targets_flat, reduction='none')
231
+ pt = torch.exp(-ce_loss)
232
+ focal_loss = self.focal_alpha * (1 - pt) ** self.focal_gamma * ce_loss
233
+
234
+ if mask is not None:
235
+ mask_flat = mask.reshape(-1)
236
+ focal_loss = focal_loss * mask_flat
237
+ loss = focal_loss.sum() / mask_flat.sum()
238
+ else:
239
+ loss = focal_loss.mean()
240
+ else:
241
+ # Standard cross-entropy with label smoothing
242
+ if mask is not None:
243
+ mask_flat = mask.reshape(-1).bool() # GPT fix: ensure bool dtype
244
+ loss = F.cross_entropy(
245
+ logits_flat[mask_flat],
246
+ targets_flat[mask_flat],
247
+ ignore_index=self.pad_token,
248
+ label_smoothing=self.label_smoothing
249
+ )
250
+ else:
251
+ loss = F.cross_entropy(
252
+ logits_flat,
253
+ targets_flat,
254
+ ignore_index=self.pad_token,
255
+ label_smoothing=self.label_smoothing
256
+ )
257
+
258
+ return loss
259
+
260
+
261
+ class CompressionLoss(nn.Module):
262
+ """
263
+ Aggressive compression loss - push for high compression
264
+ Must beat existing tokenizers (4 bytes/token = 4:1)
265
+ """
266
+
267
+ def __init__(self):
268
+ super().__init__()
269
+ # Dynamic compression based on token count
270
+ # 1 token = 48:1, 2 = 24:1, 3 = 16:1, 4 = 12:1
271
+ self.min_ratio = 12.0 # 4 tokens (worst case, still 3x better than BPE)
272
+ self.target_ratio = 24.0 # 2 tokens (optimal balance)
273
+ self.max_ratio = 48.0 # 1 token (best compression)
274
+
275
+ def forward(self,
276
+ compression_ratio: torch.Tensor,
277
+ num_tokens: Optional[torch.Tensor] = None) -> torch.Tensor:
278
+ """
279
+ Compute compression loss (GPT fix: fully vectorized)
280
+
281
+ Args:
282
+ compression_ratio: Current compression ratio (scalar or batch)
283
+ num_tokens: Number of tokens used (for additional penalty)
284
+ """
285
+ # Ensure tensor (GPT fix: handle device properly)
286
+ if not torch.is_tensor(compression_ratio):
287
+ device = num_tokens.device if torch.is_tensor(num_tokens) else torch.device('cpu')
288
+ compression_ratio = torch.tensor(compression_ratio, dtype=torch.float32, device=device)
289
+
290
+ # Aggressive compression enforcement
291
+ # MUST achieve at least 16:1 to be viable
292
+ if compression_ratio < self.min_ratio:
293
+ # Moderate penalty for falling below minimum (reduced for stability)
294
+ under_loss = ((self.min_ratio - compression_ratio) / self.min_ratio) * 0.5
295
+ else:
296
+ under_loss = torch.tensor(0.0, dtype=compression_ratio.dtype, device=compression_ratio.device)
297
+
298
+ # Reward getting close to target (24:1)
299
+ if self.min_ratio <= compression_ratio < self.target_ratio:
300
+ # Encourage reaching target
301
+ target_loss = ((self.target_ratio - compression_ratio) / self.target_ratio) * 0.5
302
+ elif compression_ratio >= self.target_ratio:
303
+ # Excellent compression - small reward for going higher
304
+ target_loss = -0.1 * torch.log(compression_ratio / self.target_ratio + 1.0)
305
+ else:
306
+ target_loss = torch.tensor(0.0, dtype=compression_ratio.dtype, device=compression_ratio.device)
307
+
308
+ # Only mild penalty for extreme compression (>48:1)
309
+ if compression_ratio > self.max_ratio:
310
+ over_loss = ((compression_ratio - self.max_ratio) / self.max_ratio) * 0.2
311
+ else:
312
+ over_loss = torch.tensor(0.0, dtype=compression_ratio.dtype, device=compression_ratio.device)
313
+
314
+ loss = under_loss + target_loss + over_loss
315
+
316
+ # Additional penalty based on token count (GPT fix: vectorized)
317
+ if num_tokens is not None:
318
+ if not torch.is_tensor(num_tokens):
319
+ num_tokens = torch.tensor(num_tokens, dtype=torch.float32, device=compression_ratio.device)
320
+ token_penalty = 0.1 * torch.clamp(num_tokens - 8, min=0.0) ** 2
321
+ loss = loss + token_penalty
322
+
323
+ return loss.mean() if loss.dim() > 0 else loss
324
+
325
+
326
+ class BoundaryLoss(nn.Module):
327
+ """
328
+ Learn meaningful chunk boundaries
329
+ Combines multiple boundary objectives
330
+ """
331
+
332
+ def __init__(self):
333
+ super().__init__()
334
+ self.bce_loss = nn.BCEWithLogitsLoss(reduction='none')
335
+
336
+ def forward(self,
337
+ predicted: torch.Tensor,
338
+ target: torch.Tensor,
339
+ mask: Optional[torch.Tensor] = None) -> torch.Tensor:
340
+ """
341
+ Compute boundary loss
342
+
343
+ Args:
344
+ predicted: [batch, seq_len, boundary_classes] predicted boundaries
345
+ target: [batch, seq_len, boundary_classes] target boundaries
346
+ mask: [batch, seq_len] valid positions mask
347
+ """
348
+ # Binary cross-entropy for boundary prediction
349
+ loss = self.bce_loss(predicted, target.float())
350
+
351
+ if mask is not None:
352
+ # Apply mask
353
+ mask_expanded = mask.unsqueeze(-1).expand_as(loss)
354
+ loss = loss * mask_expanded
355
+ loss = loss.sum() / mask_expanded.sum()
356
+ else:
357
+ loss = loss.mean()
358
+
359
+ # Add regularization for boundary sparsity
360
+ # (boundaries should be relatively rare)
361
+ boundary_probs = torch.sigmoid(predicted)
362
+ sparsity_loss = 0.01 * boundary_probs.mean()
363
+
364
+ # Add smoothness regularization
365
+ # (boundaries should be somewhat smooth/continuous)
366
+ if predicted.size(1) > 1:
367
+ diff = predicted[:, 1:] - predicted[:, :-1]
368
+ smoothness_loss = 0.01 * (diff ** 2).mean()
369
+ else:
370
+ smoothness_loss = 0.0
371
+
372
+ total_loss = loss + sparsity_loss + smoothness_loss
373
+
374
+ return total_loss
375
+
376
+
377
+ class LanguageLoss(nn.Module):
378
+ """
379
+ Language identification/clustering loss
380
+ Supports both classification and clustering objectives
381
+ """
382
+
383
+ def __init__(self, num_languages: int = 128, temperature: float = 0.07):
384
+ super().__init__()
385
+ self.num_languages = num_languages
386
+ self.temperature = temperature
387
+
388
+ # For supervised language classification
389
+ self.ce_loss = nn.CrossEntropyLoss()
390
+
391
+ def forward(self,
392
+ predicted: torch.Tensor,
393
+ target: torch.Tensor,
394
+ mode: str = 'classification') -> torch.Tensor:
395
+ """
396
+ Compute language loss
397
+
398
+ Args:
399
+ predicted: [batch, seq_len, num_languages] or [batch, num_languages]
400
+ target: Language labels or cluster assignments
401
+ mode: 'classification' or 'clustering'
402
+ """
403
+ if mode == 'classification':
404
+ # Standard classification loss
405
+ if predicted.dim() == 3:
406
+ # Sequence-level predictions
407
+ batch_size, seq_len, _ = predicted.shape
408
+ predicted = predicted.reshape(-1, self.num_languages)
409
+ target = target.reshape(-1)
410
+
411
+ loss = self.ce_loss(predicted, target)
412
+
413
+ elif mode == 'clustering':
414
+ # Contrastive clustering loss (similar to SimCLR)
415
+ # Normalize embeddings
416
+ predicted = F.normalize(predicted, dim=-1)
417
+
418
+ # Compute similarity matrix
419
+ sim_matrix = torch.matmul(predicted, predicted.t()) / self.temperature
420
+
421
+ # Create labels (assuming batch contains similar samples)
422
+ batch_size = predicted.size(0)
423
+ labels = torch.arange(batch_size, device=predicted.device)
424
+
425
+ # Contrastive loss
426
+ loss = F.cross_entropy(sim_matrix, labels)
427
+
428
+ else:
429
+ raise ValueError(f"Unknown mode: {mode}")
430
+
431
+ return loss
432
+
433
+
434
+ class ConsistencyLoss(nn.Module):
435
+ """
436
+ Ensure consistency between encoder and decoder representations
437
+ GPT-5 suggestion: helps with training stability
438
+ """
439
+
440
+ def __init__(self, margin: float = 0.5):
441
+ super().__init__()
442
+ self.margin = margin
443
+
444
+ def forward(self,
445
+ encoder_hidden: torch.Tensor,
446
+ decoder_hidden: torch.Tensor) -> torch.Tensor:
447
+ """
448
+ Compute consistency loss between encoder and decoder
449
+
450
+ Args:
451
+ encoder_hidden: [batch, seq_len, hidden_dim]
452
+ decoder_hidden: [batch, seq_len, hidden_dim]
453
+ """
454
+ # Ensure same shape
455
+ if encoder_hidden.shape != decoder_hidden.shape:
456
+ # Align sequence lengths if different
457
+ min_len = min(encoder_hidden.size(1), decoder_hidden.size(1))
458
+ encoder_hidden = encoder_hidden[:, :min_len]
459
+ decoder_hidden = decoder_hidden[:, :min_len]
460
+
461
+ # L2 distance
462
+ l2_loss = F.mse_loss(encoder_hidden, decoder_hidden)
463
+
464
+ # Cosine similarity loss
465
+ encoder_norm = F.normalize(encoder_hidden, dim=-1)
466
+ decoder_norm = F.normalize(decoder_hidden, dim=-1)
467
+ cosine_sim = (encoder_norm * decoder_norm).sum(dim=-1)
468
+ cosine_loss = 1.0 - cosine_sim.mean()
469
+
470
+ # Combined loss
471
+ loss = l2_loss + 0.5 * cosine_loss
472
+
473
+ return loss
474
+
475
+
476
+ class AdaptiveLossScheduler:
477
+ """
478
+ Dynamically adjust loss weights during training
479
+ Based on training progress and performance
480
+ """
481
+
482
+ def __init__(self, config: Dict):
483
+ self.config = config
484
+ self.current_phase = 0
485
+ self.phase_epochs = [30, 60, 100] # Phase transition points
486
+
487
+ # Define phase-specific weights
488
+ self.phase_weights = [
489
+ # Phase 1: Boundary mastery
490
+ {
491
+ 'reconstruction': 2.0,
492
+ 'compression': 0.5,
493
+ 'boundary': 3.0,
494
+ 'language': 0.5,
495
+ 'consistency': 0.5
496
+ },
497
+ # Phase 2: Compression focus
498
+ {
499
+ 'reconstruction': 2.0,
500
+ 'compression': 3.0,
501
+ 'boundary': 1.0,
502
+ 'language': 1.0,
503
+ 'consistency': 1.0
504
+ },
505
+ # Phase 3: Balanced optimization
506
+ {
507
+ 'reconstruction': 3.0,
508
+ 'compression': 2.0,
509
+ 'boundary': 1.0,
510
+ 'language': 1.0,
511
+ 'consistency': 1.5
512
+ }
513
+ ]
514
+
515
+ def get_weights(self, epoch: int, metrics: Optional[Dict] = None) -> Dict[str, float]:
516
+ """
517
+ Get current loss weights based on training phase
518
+
519
+ Args:
520
+ epoch: Current training epoch
521
+ metrics: Optional performance metrics for adaptive adjustment
522
+ """
523
+ # Determine current phase
524
+ for i, phase_end in enumerate(self.phase_epochs):
525
+ if epoch <= phase_end:
526
+ self.current_phase = i
527
+ break
528
+
529
+ weights = self.phase_weights[self.current_phase].copy()
530
+
531
+ # Adaptive adjustments based on metrics
532
+ if metrics:
533
+ # If reconstruction is poor, increase its weight
534
+ if metrics.get('reconstruction_accuracy', 1.0) < 0.9:
535
+ weights['reconstruction'] *= 1.5
536
+
537
+ # If compression is off target, adjust weight
538
+ compression_ratio = metrics.get('compression_ratio', 16.0)
539
+ if compression_ratio < 8.0 or compression_ratio > 20.0:
540
+ weights['compression'] *= 1.5
541
+
542
+ return weights
543
+
544
+
545
+ if __name__ == "__main__":
546
+ # Test losses
547
+ print("Testing Intelligent Loss Functions")
548
+
549
+ # Create loss module
550
+ loss_fn = IntelligentLoss()
551
+
552
+ # Create dummy data
553
+ batch_size = 2
554
+ seq_len = 48
555
+ vocab_size = 260
556
+ hidden_dim = 1280
557
+
558
+ outputs = {
559
+ 'logits': torch.randn(batch_size, seq_len, vocab_size),
560
+ 'compression_ratio': torch.tensor(16.0),
561
+ 'num_tokens': torch.tensor(3),
562
+ 'boundaries': torch.randn(batch_size, seq_len, 4),
563
+ 'language_clusters': torch.randn(batch_size, 128),
564
+ 'encoder_hidden': torch.randn(batch_size, seq_len, hidden_dim),
565
+ 'decoder_hidden': torch.randn(batch_size, seq_len, hidden_dim)
566
+ }
567
+
568
+ targets = {
569
+ 'input_ids': torch.randint(0, 256, (batch_size, seq_len)),
570
+ 'attention_mask': torch.ones(batch_size, seq_len),
571
+ 'boundary_targets': torch.zeros(batch_size, seq_len, 4),
572
+ 'language_targets': torch.randint(0, 128, (batch_size,))
573
+ }
574
+
575
+ # Compute losses
576
+ losses = loss_fn(outputs, targets)
577
+
578
+ print("\nLoss components:")
579
+ for key, value in losses.items():
580
+ if isinstance(value, torch.Tensor):
581
+ print(f" {key}: {value.item():.4f}")
582
+
583
+ # Test adaptive scheduler
584
+ scheduler = AdaptiveLossScheduler({})
585
+
586
+ print("\nPhase weights:")
587
+ for epoch in [10, 40, 70]:
588
+ weights = scheduler.get_weights(epoch)
589
+ print(f" Epoch {epoch}: {weights}")
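Continuing the test above, the metric-driven adjustment in get_weights can be checked directly; the values follow from the phase-2 base weights (2.0 and 3.0) and the 1.5x boosts applied when reconstruction accuracy is below 0.9 and the compression ratio is off target:

weights = scheduler.get_weights(40, metrics={'reconstruction_accuracy': 0.85,
                                             'compression_ratio': 6.0})
print(weights['reconstruction'], weights['compression'])  # 3.0 and 4.5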
core/scheduler.py ADDED
@@ -0,0 +1,669 @@
1
+ """
2
+ Learning Rate Schedulers for v6.2.0
3
+ Advanced scheduling with warmup and phase-based adjustments
4
+ """
5
+
6
+ import torch
7
+ import math
8
+ from typing import Optional, Dict, List, Any
9
+ import numpy as np
10
+
11
+
12
+ class WarmupCosineScheduler:
13
+ """
14
+ Cosine annealing with linear warmup
15
+ GPT-5 suggested: Essential for stable progressive splitting training
16
+ """
17
+
18
+ def __init__(self,
19
+ optimizer: torch.optim.Optimizer,
20
+ warmup_steps: int,
21
+ total_steps: int,
22
+ min_lr: float = 1e-6,
23
+ max_lr: Optional[float] = None):
24
+ self.optimizer = optimizer
25
+ self.warmup_steps = warmup_steps
26
+ self.total_steps = total_steps
27
+ self.min_lr = min_lr
28
+ self.max_lr = max_lr or optimizer.param_groups[0]['lr']
29
+ self.current_step = 0
30
+
31
+ def step(self):
32
+ """Update learning rate"""
33
+ self.current_step += 1
34
+
35
+ if self.current_step <= self.warmup_steps:
36
+ # Linear warmup
37
+ lr = self.max_lr * (self.current_step / self.warmup_steps)
38
+ else:
39
+ # Cosine annealing (GPT fix: guard against division by zero)
40
+ if self.total_steps <= self.warmup_steps:
41
+ lr = self.min_lr
42
+ else:
43
+ progress = (self.current_step - self.warmup_steps) / max(1, self.total_steps - self.warmup_steps)
44
+ progress = min(1.0, max(0.0, progress)) # Clamp to [0, 1]
45
+ lr = self.min_lr + (self.max_lr - self.min_lr) * 0.5 * (1 + math.cos(math.pi * progress))
46
+
47
+ for param_group in self.optimizer.param_groups:
48
+ param_group['lr'] = lr
49
+
50
+ return lr
51
+
52
+ def get_lr(self):
53
+ """Get current learning rate"""
54
+ return self.optimizer.param_groups[0]['lr']
55
+
56
+
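For reference, with warmup_steps=100, total_steps=1000, max_lr=1e-3 and min_lr=1e-6 (the values exercised in the test at the end of this file), the schedule evaluates roughly as below. This is a standalone restatement of the same formula, not part of the module:

import math

min_lr, max_lr, warmup, total = 1e-6, 1e-3, 100, 1000
def lr_at(step):
    if step <= warmup:
        return max_lr * step / warmup
    progress = min(1.0, (step - warmup) / (total - warmup))
    return min_lr + (max_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

print(lr_at(50), lr_at(100), lr_at(550), lr_at(1000))  # ~5e-4, 1e-3, ~5e-4, 1e-6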
57
+ class PhaseBasedScheduler:
58
+ """
59
+ Curriculum learning scheduler with phase transitions
60
+ Adjusts learning rate based on training phases
61
+ """
62
+
63
+ def __init__(self,
64
+ optimizer: torch.optim.Optimizer,
65
+ phase_configs: List[Dict],
66
+ current_epoch: int = 0):
67
+ """
68
+ Args:
69
+ optimizer: PyTorch optimizer
70
+ phase_configs: List of phase configurations
71
+ [{
72
+ 'epochs': (start, end),
73
+ 'lr': learning_rate,
74
+ 'warmup_epochs': warmup_duration
75
+ }, ...]
76
+ """
77
+ self.optimizer = optimizer
78
+ self.phase_configs = phase_configs
79
+ self.current_epoch = current_epoch
80
+ self.current_phase = 0
81
+ self.base_lr = optimizer.param_groups[0]['lr']
82
+
83
+ def step(self, epoch: Optional[int] = None):
84
+ """Update learning rate based on current phase"""
85
+ if epoch is not None:
86
+ self.current_epoch = epoch
87
+
88
+ # Find current phase
89
+ for i, phase in enumerate(self.phase_configs):
90
+ start_epoch, end_epoch = phase['epochs']
91
+ if start_epoch <= self.current_epoch <= end_epoch:
92
+ self.current_phase = i
93
+ break
94
+
95
+ phase = self.phase_configs[self.current_phase]
96
+ target_lr = phase['lr']
97
+ warmup_epochs = phase.get('warmup_epochs', 0)
98
+ start_epoch = phase['epochs'][0]
99
+
100
+ # Apply warmup if in warmup period
101
+ if self.current_epoch - start_epoch < warmup_epochs:
102
+ warmup_progress = (self.current_epoch - start_epoch + 1) / warmup_epochs
103
+ lr = target_lr * warmup_progress
104
+ else:
105
+ lr = target_lr
106
+
107
+ # Update optimizer
108
+ for param_group in self.optimizer.param_groups:
109
+ param_group['lr'] = lr
110
+
111
+ return lr
112
+
113
+
114
+ class AdaptiveScheduler:
115
+ """
116
+ Adaptive learning rate based on validation metrics
117
+ Reduces LR when metrics plateau
118
+ """
119
+
120
+ def __init__(self,
121
+ optimizer: torch.optim.Optimizer,
122
+ mode: str = 'min',
123
+ factor: float = 0.5,
124
+ patience: int = 10,
125
+ threshold: float = 1e-4,
126
+ min_lr: float = 1e-7):
127
+ """
128
+ Args:
129
+ optimizer: PyTorch optimizer
130
+ mode: 'min' or 'max' - whether to reduce LR when metric stops decreasing or increasing
131
+ factor: Factor to reduce LR by
132
+ patience: Number of epochs with no improvement to wait
133
+ threshold: Minimum change to qualify as improvement
134
+ min_lr: Minimum learning rate
135
+ """
136
+ self.optimizer = optimizer
137
+ self.mode = mode
138
+ self.factor = factor
139
+ self.patience = patience
140
+ self.threshold = threshold
141
+ self.min_lr = min_lr
142
+
143
+ self.best_score = None
144
+ self.num_bad_epochs = 0
145
+ self.last_reduction = 0
146
+
147
+ def step(self, metric: float, epoch: int = 0):
148
+ """Update learning rate based on metric"""
149
+ current_lr = self.optimizer.param_groups[0]['lr']
150
+
151
+ if self.best_score is None:
152
+ self.best_score = metric
153
+ else:
154
+ if self.mode == 'min':
155
+ improved = metric < self.best_score - self.threshold
156
+ else:
157
+ improved = metric > self.best_score + self.threshold
158
+
159
+ if improved:
160
+ self.best_score = metric
161
+ self.num_bad_epochs = 0
162
+ else:
163
+ self.num_bad_epochs += 1
164
+
165
+ # Reduce LR if patience exceeded
166
+ if self.num_bad_epochs >= self.patience:
167
+ new_lr = max(current_lr * self.factor, self.min_lr)
168
+
169
+ if new_lr < current_lr:
170
+ print(f"Reducing learning rate from {current_lr:.2e} to {new_lr:.2e}")
171
+
172
+ for param_group in self.optimizer.param_groups:
173
+ param_group['lr'] = new_lr
174
+
175
+ self.num_bad_epochs = 0
176
+ self.last_reduction = epoch
177
+
178
+ return current_lr
179
+
180
+
181
+ class ProgressiveSplittingScheduler:
182
+ """
183
+ Adaptive scheduler for progressive splitting
184
+ No fixed targets - adjusts based on quality feedback
185
+ """
186
+
187
+ def __init__(self,
188
+ optimizer: torch.optim.Optimizer,
189
+ initial_lr: float = 1e-4,
190
+ min_reconstruction: float = 0.85,
191
+ ema: float = 0.98,
192
+ min_lr: float = 1e-7):
193
+ self.optimizer = optimizer
194
+ self.initial_lr = initial_lr
195
+ self.min_reconstruction = min_reconstruction # Quality threshold
196
+ self.ema = ema
197
+ self.min_lr = min_lr
198
+
199
+ # Adaptive multipliers based on performance
200
+ self.quality_multiplier = 1.0 # Adjusts with reconstruction quality
201
+
202
+ # No phases - continuous adaptation
203
+ self.current_state = 'learning'
204
+
205
+ # EMA tracking for smooth transitions
206
+ self._ema_comp = None
207
+ self._ema_recon = None
208
+
209
+ def step(self, metrics: Dict[str, float]):
210
+ """
211
+ Update learning rate based on current metrics
212
+ GPT fix: EMA smoothing and minimum floor
213
+
214
+ Args:
215
+ metrics: Dictionary containing:
216
+ - compression_ratio: Current compression ratio
217
+ - reconstruction_acc: Reconstruction accuracy
218
+ """
219
+ compression_ratio = float(metrics.get('compression_ratio', 0.0))
220
+ reconstruction_acc = float(metrics.get('reconstruction_acc', 0.0))
221
+
222
+ # Update EMA (GPT fix: smooth transitions)
223
+ if self._ema_comp is None:
224
+ self._ema_comp = compression_ratio
225
+ self._ema_recon = reconstruction_acc
226
+ else:
227
+ self._ema_comp = self.ema * self._ema_comp + (1 - self.ema) * compression_ratio
228
+ self._ema_recon = self.ema * self._ema_recon + (1 - self.ema) * reconstruction_acc
229
+
230
+ # Adaptive adjustment based on reconstruction quality only
231
+ # No fixed compression target - emerges from quality
232
+ if self._ema_recon < self.min_reconstruction:
233
+ # Poor reconstruction - reduce LR for careful learning
234
+ self.quality_multiplier = max(0.5, self._ema_recon)
235
+ else:
236
+ # Good reconstruction - normal learning
237
+ self.quality_multiplier = 1.0
238
+
239
+ # Smooth LR changes
240
+ reconstruction_factor = max(0.1, self._ema_recon)
241
+
242
+ # Combined learning rate (adaptive, no phase multiplier)
243
+ lr = self.initial_lr * self.quality_multiplier * reconstruction_factor
244
+ lr = max(lr, self.min_lr) # Ensure minimum LR
245
+
246
+ # Update optimizer
247
+ for param_group in self.optimizer.param_groups:
248
+ param_group['lr'] = lr
249
+
250
+ return lr
251
+
252
+
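The EMA update above reacts slowly by design: with ema=0.98, a sudden change in reconstruction accuracy moves the smoothed value by only 2% per step. A quick standalone illustration of the same update rule:

ema = 0.98
smoothed = 0.90            # previous _ema_recon
for _ in range(10):        # ten steps after accuracy suddenly drops to 0.70
    smoothed = ema * smoothed + (1 - ema) * 0.70
print(round(smoothed, 4))  # ~0.8634, still above a 0.85 quality threshold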
253
+ class GumbelTemperatureScheduler:
254
+ """
255
+ Temperature annealing for Gumbel-Softmax
256
+ GPT-5 suggestion: Critical for progressive splitting
257
+ """
258
+
259
+ def __init__(self,
260
+ initial_temp: float = 1.0,
261
+ final_temp: float = 0.1,
262
+ anneal_rate: float = 0.99995,
263
+ anneal_steps: Optional[int] = None):
264
+ self.initial_temp = initial_temp
265
+ self.final_temp = final_temp
266
+ self.anneal_rate = anneal_rate
267
+ self.anneal_steps = anneal_steps
268
+ self.current_step = 0
269
+ self.current_temp = initial_temp
270
+
271
+ def step(self):
272
+ """Update temperature"""
273
+ self.current_step += 1
274
+
275
+ if self.anneal_steps:
276
+ # Linear annealing
277
+ progress = min(1.0, self.current_step / self.anneal_steps)
278
+ self.current_temp = self.initial_temp + (self.final_temp - self.initial_temp) * progress
279
+ else:
280
+ # Exponential annealing
281
+ self.current_temp = max(
282
+ self.final_temp,
283
+ self.initial_temp * (self.anneal_rate ** self.current_step)
284
+ )
285
+
286
+ return self.current_temp
287
+
288
+ def get_temperature(self):
289
+ """Get current temperature"""
290
+ return self.current_temp
291
+
292
+
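With the default anneal_rate of 0.99995 and no anneal_steps, the exponential schedule above decays slowly; after 10,000 steps the temperature is still far above final_temp, and the 0.1 floor is only reached around 46,000 steps:

print(max(0.1, 1.0 * 0.99995 ** 10_000))  # ~0.6065
print(max(0.1, 1.0 * 0.99995 ** 50_000))  # 0.1 (raw value ~0.082, clipped by the floor)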
293
+ class CompressionRatioScheduler:
294
+ """
295
+ Schedule target compression ratio during training
296
+ Gradually increase compression requirements
297
+ """
298
+
299
+ def __init__(self,
300
+ initial_ratio: float = 8.0,
301
+ target_ratio: float = 24.0,
302
+ warmup_epochs: int = 10,
303
+ total_epochs: int = 100):
304
+ self.initial_ratio = initial_ratio
305
+ self.target_ratio = target_ratio
306
+ self.warmup_epochs = warmup_epochs
307
+ self.total_epochs = total_epochs
308
+ self.current_epoch = 0
309
+
310
+ def step(self, epoch: Optional[int] = None):
311
+ """Update target compression ratio"""
312
+ if epoch is not None:
313
+ self.current_epoch = epoch
314
+ else:
315
+ self.current_epoch += 1
316
+
317
+ if self.current_epoch < self.warmup_epochs:
318
+ # Start with lower compression requirement
319
+ ratio = self.initial_ratio
320
+ else:
321
+ # Gradually increase to target
322
+ progress = (self.current_epoch - self.warmup_epochs) / (self.total_epochs - self.warmup_epochs)
323
+ progress = min(1.0, progress)
324
+ ratio = self.initial_ratio + (self.target_ratio - self.initial_ratio) * progress
325
+
326
+ return ratio
327
+
328
+
329
+ class MultiScheduler:
330
+ """
331
+ Combine multiple schedulers for comprehensive training control
332
+ """
333
+
334
+ def __init__(self, schedulers: Dict):
335
+ """
336
+ Args:
337
+ schedulers: Dictionary of schedulers
338
+ {
339
+ 'lr': learning_rate_scheduler,
340
+ 'gumbel': gumbel_temperature_scheduler,
341
+ 'compression': compression_ratio_scheduler,
342
+ ...
343
+ }
344
+ """
345
+ self.schedulers = schedulers
346
+
347
+ def step(self, **kwargs):
348
+ """
349
+ Step all schedulers
350
+ GPT fix: unified input convention
351
+
352
+ Returns:
353
+ Dictionary with all scheduler outputs
354
+ """
355
+ results = {}
356
+
357
+ for name, scheduler in self.schedulers.items():
358
+ try:
359
+ # Check scheduler type and pass appropriate arguments
360
+ if hasattr(scheduler, '__class__'):
361
+ class_name = scheduler.__class__.__name__
362
+
363
+ if class_name == 'AdaptiveScheduler' and 'metric' in kwargs:
364
+ results[name] = scheduler.step(kwargs['metric'], kwargs.get('epoch', 0))
365
+ elif class_name == 'PhaseBasedScheduler' and 'epoch' in kwargs:
366
+ results[name] = scheduler.step(kwargs['epoch'])
367
+ elif class_name == 'CompressionRatioScheduler' and 'epoch' in kwargs:
368
+ results[name] = scheduler.step(kwargs['epoch'])
369
+ elif class_name == 'ProgressiveSplittingScheduler' and 'metrics' in kwargs:
370
+ results[name] = scheduler.step(kwargs['metrics'])
371
+ elif hasattr(scheduler, 'step'):
372
+ # Generic step (no arguments)
373
+ results[name] = scheduler.step()
374
+ else:
375
+ if hasattr(scheduler, 'step'):
376
+ results[name] = scheduler.step()
377
+ except Exception as e:
378
+ print(f"Warning: Scheduler '{name}' step failed: {e}")
379
+ results[name] = None
380
+
381
+ return results
382
+
383
+ def get_current_values(self):
384
+ """Get current values from all schedulers"""
385
+ values = {}
386
+
387
+ for name, scheduler in self.schedulers.items():
388
+ if hasattr(scheduler, 'get_lr'):
389
+ values[name] = scheduler.get_lr()
390
+ elif hasattr(scheduler, 'get_temperature'):
391
+ values[name] = scheduler.get_temperature()
392
+ elif hasattr(scheduler, 'current_temp'):
393
+ values[name] = scheduler.current_temp
394
+ elif hasattr(scheduler, 'current_epoch'):
395
+ values[name] = scheduler.current_epoch
396
+
397
+ return values
398
+
399
+
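A minimal usage sketch for the dispatcher above; it assumes an optimizer, the current epoch and a validation metric already exist in the training loop, and the dictionary keys are arbitrary names:

multi = MultiScheduler({
    'lr': WarmupCosineScheduler(optimizer, warmup_steps=100, total_steps=1000),
    'gumbel': GumbelTemperatureScheduler(),
    'compression': CompressionRatioScheduler(),
})
out = multi.step(epoch=epoch, metric=val_loss,
                 metrics={'compression_ratio': 12.0, 'reconstruction_acc': 0.9})
print(out['lr'], out['gumbel'], out['compression'])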
400
+ class GateWarmupScheduler:
401
+ """Gate parameter warmup scheduler
402
+
403
+ Early: all layers used equally (gate=1.0)
404
+ Warmup: gradual gate learning begins
405
+ Late: converges to the optimal gate values
406
+ """
407
+
408
+ def __init__(
409
+ self,
410
+ optimizer: torch.optim.Optimizer,
411
+ warmup_steps: int = 1000,
412
+ gate_param_group_name: str = 'gates',
413
+ importance_param_group_name: str = 'importance'
414
+ ):
415
+ """
416
+ Args:
417
+ optimizer: the optimizer whose parameter groups are scheduled
418
+ warmup_steps: number of warmup steps
419
+ gate_param_group_name: name of the gate parameter group
420
+ importance_param_group_name: name of the importance parameter group
421
+ """
422
+ self.optimizer = optimizer
423
+ self.warmup_steps = warmup_steps
424
+ self.gate_group_name = gate_param_group_name
425
+ self.importance_group_name = importance_param_group_name
426
+
427
+ # Save initial learning rates
428
+ self.base_lrs = {}
429
+ for group in optimizer.param_groups:
430
+ if 'name' in group:
431
+ self.base_lrs[group['name']] = group['lr']
432
+
433
+ def get_gate_factor(self, step: int) -> float:
434
+ """Compute the gate learning-rate factor
435
+
436
+ Uses a low learning rate during the warmup period,
437
+ then switches to the normal learning rate
438
+ """
439
+ if step < self.warmup_steps:
440
+ # Warmup period: linear increase
441
+ return step / self.warmup_steps
442
+ else:
443
+ # Normal learning rate
444
+ return 1.0
445
+
446
+ def get_importance_factor(self, step: int) -> float:
447
+ """Compute the importance learning-rate factor
448
+
449
+ Starts learning more slowly than the gates
450
+ """
451
+ delayed_warmup = self.warmup_steps * 1.5
452
+ if step < delayed_warmup:
453
+ return step / delayed_warmup * 0.5
454
+ else:
455
+ return 1.0
456
+
457
+ def step(self, current_step: int):
458
+ """Scheduler step
459
+
460
+ Args:
461
+ current_step: current global step
462
+ """
463
+ # Adjust learning rates of the gate parameter groups
464
+ gate_factor = self.get_gate_factor(current_step)
465
+ importance_factor = self.get_importance_factor(current_step)
466
+
467
+ for group in self.optimizer.param_groups:
468
+ if 'name' not in group:
469
+ continue
470
+
471
+ if group['name'] == self.gate_group_name:
472
+ # Adjust the gate learning rate
473
+ group['lr'] = self.base_lrs[self.gate_group_name] * gate_factor
474
+
475
+ elif group['name'] == self.importance_group_name:
476
+ # Adjust the importance learning rate
477
+ group['lr'] = self.base_lrs[self.importance_group_name] * importance_factor
478
+
479
+ def get_lr(self) -> Dict[str, float]:
480
+ """Return the current learning rates"""
481
+ lrs = {}
482
+ for group in self.optimizer.param_groups:
483
+ if 'name' in group:
484
+ lrs[group['name']] = group['lr']
485
+ return lrs
486
+
487
+
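GateWarmupScheduler assumes the optimizer was built with named parameter groups matching the defaults above; a sketch of that setup (the parameter selection below is illustrative, not taken from the model code):

param_groups = [
    {'name': 'gates',      'params': [model.layer_gates],      'lr': 1e-3},
    {'name': 'importance', 'params': [model.layer_importance], 'lr': 1e-3},
    {'name': 'base',       'params': other_params,             'lr': 1e-4},
]
optimizer = torch.optim.AdamW(param_groups)
gate_sched = GateWarmupScheduler(optimizer, warmup_steps=1000)
gate_sched.step(current_step=500)  # gates at 50% of their base LR, importance at ~16.7%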
488
+ class UniversalCosineScheduler:
489
+ """Universal cosine annealing scheduler
490
+
491
+ Applies the same schedule to all languages
492
+ """
493
+
494
+ def __init__(
495
+ self,
496
+ optimizer: torch.optim.Optimizer,
497
+ warmup_steps: int = 1000,
498
+ total_steps: int = 10000,
499
+ min_lr_ratio: float = 0.1
500
+ ):
501
+ self.optimizer = optimizer
502
+ self.warmup_steps = warmup_steps
503
+ self.total_steps = total_steps
504
+ self.min_lr_ratio = min_lr_ratio
505
+ self.current_step = 0
506
+
507
+ # Save initial learning rates
508
+ self.base_lrs = [group['lr'] for group in optimizer.param_groups]
509
+
510
+ def step(self):
511
+ """Scheduler step"""
512
+ self.current_step += 1
513
+
514
+ for idx, param_group in enumerate(self.optimizer.param_groups):
515
+ if self.current_step < self.warmup_steps:
516
+ # Warmup phase
517
+ lr = self.base_lrs[idx] * (self.current_step / self.warmup_steps)
518
+ else:
519
+ # Cosine annealing
520
+ if self.total_steps <= self.warmup_steps:
521
+ # Case where warmup_steps is greater than or equal to total_steps
522
+ lr = self.base_lrs[idx] * self.min_lr_ratio
523
+ else:
524
+ progress = min(1.0, (self.current_step - self.warmup_steps) / max(1, self.total_steps - self.warmup_steps))
525
+ lr = self.base_lrs[idx] * (
526
+ self.min_lr_ratio + (1 - self.min_lr_ratio) * 0.5 * (1 + math.cos(math.pi * progress))
527
+ )
528
+
529
+ param_group['lr'] = lr
530
+
531
+ def get_last_lr(self) -> List[float]:
532
+ """Return the most recent learning rates"""
533
+ return [group['lr'] for group in self.optimizer.param_groups]
534
+
535
+ def state_dict(self) -> Dict[str, Any]:
536
+ """Return the scheduler state dict (for saving checkpoints)"""
537
+ return {
538
+ 'current_step': self.current_step,
539
+ 'warmup_steps': self.warmup_steps,
540
+ 'total_steps': self.total_steps,
541
+ 'min_lr_ratio': self.min_lr_ratio,
542
+ 'base_lrs': self.base_lrs
543
+ }
544
+
545
+ def load_state_dict(self, state_dict: Dict[str, Any]):
546
+ """Load scheduler state (for resuming from a checkpoint)"""
547
+ self.current_step = state_dict['current_step']
548
+ self.warmup_steps = state_dict['warmup_steps']
549
+ self.total_steps = state_dict['total_steps']
550
+ self.min_lr_ratio = state_dict['min_lr_ratio']
551
+ self.base_lrs = state_dict['base_lrs']
552
+
553
+
554
+ class AdaptiveLayerScheduler:
555
+ """Per-layer adaptive scheduler
556
+
557
+ Dynamically adjusts according to each layer's learning progress
558
+ """
559
+
560
+ def __init__(
561
+ self,
562
+ layer_builder,
563
+ threshold_active: float = 0.7,
564
+ threshold_skip: float = 0.3
565
+ ):
566
+ """
567
+ Args:
568
+ layer_builder: a LayerBuilder instance
569
+ threshold_active: threshold above which a layer counts as active
570
+ threshold_skip: threshold below which a layer is skipped
571
+ """
572
+ self.layer_builder = layer_builder
573
+ self.threshold_active = threshold_active
574
+ self.threshold_skip = threshold_skip
575
+
576
+ # Per-layer statistics
577
+ self.layer_stats = {
578
+ 'usage_count': torch.zeros(5),
579
+ 'contribution': torch.zeros(5)
580
+ }
581
+
582
+ def update_stats(self, batch_output):
583
+ """Update statistics from a batch output"""
584
+ with torch.no_grad():
585
+ gates = torch.sigmoid(self.layer_builder.layer_gates)
586
+
587
+ # Update usage counts
588
+ self.layer_stats['usage_count'] += (gates > self.threshold_skip).float()
589
+
590
+ # Estimate contribution (simple version)
591
+ importance = torch.nn.functional.softmax(
592
+ self.layer_builder.layer_importance, dim=0
593
+ )
594
+ self.layer_stats['contribution'] += importance.detach()
595
+
596
+ def get_layer_status(self) -> Dict[int, str]:
597
+ """Return the status of each layer"""
598
+ gates = torch.sigmoid(self.layer_builder.layer_gates)
599
+ status = {}
600
+
601
+ for i in range(5):
602
+ if gates[i] > self.threshold_active:
603
+ status[i] = "ACTIVE"
604
+ elif gates[i] > self.threshold_skip:
605
+ status[i] = "PARTIAL"
606
+ else:
607
+ status[i] = "SKIP"
608
+
609
+ return status
610
+
611
+ def suggest_pruning(self) -> List[int]:
612
+ """Suggest layers that could be pruned"""
613
+ gates = torch.sigmoid(self.layer_builder.layer_gates)
614
+ prunable = []
615
+
616
+ for i in range(5):
617
+ if gates[i] < self.threshold_skip:
618
+ # Low gate value + low contribution
619
+ if self.layer_stats['contribution'][i] < 0.1:
620
+ prunable.append(i)
621
+
622
+ return prunable
623
+
624
+
625
+ if __name__ == "__main__":
626
+ # Test schedulers
627
+ print("Testing Schedulers")
628
+
629
+ # Create dummy optimizer
630
+ model = torch.nn.Linear(10, 10)
631
+ optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
632
+
633
+ # Test WarmupCosineScheduler
634
+ print("\n1. WarmupCosineScheduler:")
635
+ scheduler = WarmupCosineScheduler(optimizer, warmup_steps=100, total_steps=1000)
636
+ lrs = []
637
+ for step in range(200):
638
+ lr = scheduler.step()
639
+ if step % 20 == 0:
640
+ print(f" Step {step}: LR = {lr:.6f}")
641
+ lrs.append(lr)
642
+
643
+ # Test PhaseBasedScheduler
644
+ print("\n2. PhaseBasedScheduler:")
645
+ phase_configs = [
646
+ {'epochs': (0, 30), 'lr': 1e-4, 'warmup_epochs': 5},
647
+ {'epochs': (31, 60), 'lr': 5e-5, 'warmup_epochs': 2},
648
+ {'epochs': (61, 100), 'lr': 1e-5, 'warmup_epochs': 0}
649
+ ]
650
+ scheduler = PhaseBasedScheduler(optimizer, phase_configs)
651
+ for epoch in [0, 5, 31, 35, 61, 80]:
652
+ lr = scheduler.step(epoch)
653
+ print(f" Epoch {epoch}: LR = {lr:.6f}")
654
+
655
+ # Test GumbelTemperatureScheduler
656
+ print("\n3. GumbelTemperatureScheduler:")
657
+ scheduler = GumbelTemperatureScheduler()
658
+ for step in [0, 100, 500, 1000, 5000]:
659
+ for _ in range(step - scheduler.current_step):
660
+ scheduler.step()
661
+ temp = scheduler.get_temperature()
662
+ print(f" Step {step}: Temperature = {temp:.4f}")
663
+
664
+ # Test CompressionRatioScheduler
665
+ print("\n4. CompressionRatioScheduler:")
666
+ scheduler = CompressionRatioScheduler()
667
+ for epoch in [0, 5, 10, 30, 50, 80, 100]:
668
+ ratio = scheduler.step(epoch)
669
+ print(f" Epoch {epoch}: Target ratio = {ratio:.1f}:1")
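Because UniversalCosineScheduler exposes state_dict()/load_state_dict(), it can be checkpointed alongside the model; a brief sketch, assuming model and optimizer already exist (the file name is illustrative):

sched = UniversalCosineScheduler(optimizer, warmup_steps=1000, total_steps=10000)
torch.save({'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': sched.state_dict()}, 'checkpoint.pt')

ckpt = torch.load('checkpoint.pt')
sched.load_state_dict(ckpt['scheduler'])  # resumes from the saved current_step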
core/tokenizer.py ADDED
@@ -0,0 +1,477 @@
1
+ """
2
+ Intelligent Tokenizer v6.2.0 - Byte Tokenizer with 46+2 Configuration
3
+ Handles chunking, sliding windows, and boundary adjustments
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from typing import Dict, List, Optional, Tuple, Union
10
+ import numpy as np
11
+
12
+
13
+ def _trim_utf8_boundary(byte_seq: List[int], limit: int) -> int:
14
+ """
15
+ Trim byte sequence to valid UTF-8 boundary (GPT suggestion)
16
+ """
17
+ end = min(limit, len(byte_seq))
18
+ while end > 0:
19
+ try:
20
+ bytes(byte_seq[:end]).decode('utf-8')
21
+ return end
22
+ except UnicodeDecodeError:
23
+ end -= 1
24
+ return limit
25
+
26
+
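The helper above backs off until the prefix decodes cleanly, so multi-byte characters are never split. For example, '한' is 3 bytes in UTF-8:

seq = list("한글".encode('utf-8'))     # 6 bytes in total
print(_trim_utf8_boundary(seq, 4))     # 3 -- cutting at byte 4 would split the second character
print(bytes(seq[:3]).decode('utf-8'))  # '한'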
27
+ class ByteTokenizerV62:
28
+ """
29
+ Pure byte-level tokenizer
30
+ 46 content bytes + 2 special tokens (BOS/EOS) = 48 total
31
+ """
32
+
33
+ def __init__(self, config: Optional[Dict] = None):
34
+ # Configuration
35
+ self.content_size = 46 # Actual content bytes
36
+ self.max_seq_len = 48 # Total with BOS/EOS
37
+ self.chunk_overlap = 8 # Overlap for sliding window
38
+
39
+ # Special tokens
40
+ self.PAD = 256
41
+ self.BOS = 257
42
+ self.EOS = 258
43
+ self.MASK = 259
44
+ self.vocab_size = 260 # 256 bytes + 4 special
45
+
46
+ def encode(self,
47
+ text: str,
48
+ add_special_tokens: bool = True,
49
+ return_chunks: bool = False) -> Dict[str, torch.Tensor]:
50
+ """
51
+ Encode text to byte sequences
52
+
53
+ Args:
54
+ text: Input text
55
+ add_special_tokens: Whether to add BOS/EOS
56
+ return_chunks: Return multiple chunks for long sequences
57
+ """
58
+ # Convert to UTF-8 bytes
59
+ byte_sequence = list(text.encode('utf-8'))
60
+
61
+ if return_chunks and len(byte_sequence) > self.content_size:
62
+ # Handle long sequences with sliding window
63
+ return self._encode_with_chunks(byte_sequence, add_special_tokens)
64
+
65
+ # Single chunk processing with UTF-8 boundary (GPT suggestion)
66
+ if len(byte_sequence) > self.content_size:
67
+ cut_point = _trim_utf8_boundary(byte_sequence, self.content_size)
68
+ byte_sequence = byte_sequence[:cut_point]
69
+
70
+ # Add special tokens (GPT suggestion: cleaner padding order)
71
+ if add_special_tokens:
72
+ byte_sequence = [self.BOS] + byte_sequence + [self.EOS]
73
+
74
+ # Pad to max_seq_len (after special tokens for cleaner structure)
75
+ if len(byte_sequence) < self.max_seq_len:
76
+ padding_length = self.max_seq_len - len(byte_sequence)
77
+ byte_sequence = byte_sequence + [self.PAD] * padding_length
78
+
79
+ input_ids = torch.tensor(byte_sequence, dtype=torch.long)
80
+ attention_mask = (input_ids != self.PAD) # bool type (GPT suggestion)
81
+
82
+ return {
83
+ 'input_ids': input_ids,
84
+ 'attention_mask': attention_mask,
85
+ 'length': len(byte_sequence),
86
+ 'original_length': len(text.encode('utf-8'))
87
+ }
88
+
89
+ def _encode_with_chunks(self,
90
+ byte_sequence: List[int],
91
+ add_special_tokens: bool) -> Dict[str, torch.Tensor]:
92
+ """
93
+ Encode long sequences with sliding window chunks
94
+ """
95
+ chunks = []
96
+ positions = []
97
+
98
+ # Calculate stride (content_size - overlap)
99
+ stride = self.content_size - self.chunk_overlap
100
+
101
+ for i in range(0, len(byte_sequence), stride):
102
+ # Extract chunk
103
+ chunk = byte_sequence[i:i + self.content_size]
104
+
105
+ # Skip if chunk is too small (last chunk)
106
+ if len(chunk) < self.content_size // 2:
107
+ if chunks: # Merge with previous chunk if exists
108
+ last_chunk = chunks[-1]['input_ids'].tolist()
109
+ # Remove padding and special tokens from last chunk (GPT final check)
110
+ last_chunk = [b for b in last_chunk if b not in [self.PAD, self.BOS, self.EOS]]
111
+ # Add current chunk
112
+ merged = last_chunk + chunk + [self.EOS]
113
+ # Repad
114
+ if len(merged) < self.max_seq_len:
115
+ merged += [self.PAD] * (self.max_seq_len - len(merged))
116
+ merged_ids = torch.tensor(merged[:self.max_seq_len], dtype=torch.long)
117
+ merged_mask = (merged_ids != self.PAD) # Recalculate mask (GPT suggestion)
118
+ chunks[-1]['input_ids'] = merged_ids
119
+ chunks[-1]['attention_mask'] = merged_mask
120
+ break
121
+
122
+ # Pad chunk if necessary
123
+ if len(chunk) < self.content_size:
124
+ chunk += [self.PAD] * (self.content_size - len(chunk))
125
+
126
+ # Add special tokens
127
+ if add_special_tokens:
128
+ chunk_with_special = [self.BOS] + chunk + [self.EOS]
129
+ else:
130
+ chunk_with_special = chunk
131
+
132
+ # Create tensors
133
+ input_ids = torch.tensor(chunk_with_special, dtype=torch.long)
134
+ attention_mask = (input_ids != self.PAD) # bool type (GPT suggestion)
135
+
136
+ chunks.append({
137
+ 'input_ids': input_ids,
138
+ 'attention_mask': attention_mask,
139
+ 'position': (i, min(i + self.content_size, len(byte_sequence)))
140
+ })
141
+ positions.append((i, min(i + self.content_size, len(byte_sequence))))
142
+
143
+ # Stack all chunks
144
+ all_input_ids = torch.stack([c['input_ids'] for c in chunks])
145
+ all_attention_masks = torch.stack([c['attention_mask'] for c in chunks])
146
+
147
+ return {
148
+ 'input_ids': all_input_ids, # [num_chunks, seq_len]
149
+ 'attention_mask': all_attention_masks,
150
+ 'num_chunks': len(chunks),
151
+ 'chunk_positions': positions,
152
+ 'original_length': len(byte_sequence)
153
+ }
154
+
155
+ def reconstruct(self,
156
+ input_ids: torch.Tensor,
157
+ positions: List[Tuple[int, int]] = None,
158
+ skip_special_tokens: bool = True,
159
+ overlap: int = 8) -> str:
160
+ """
161
+ Reconstruct text from multiple chunks (GPT suggestion)
162
+
163
+ Args:
164
+ input_ids: [num_chunks, seq_len] for multi-chunk
165
+ positions: List of (start, end) positions for each chunk
166
+ skip_special_tokens: Whether to skip special tokens
167
+ overlap: Overlap size between chunks
168
+ """
169
+ if input_ids.dim() == 1:
170
+ # Single sequence, use regular decode
171
+ return self.decode(input_ids, skip_special_tokens)
172
+
173
+ # Multi-chunk reconstruction
174
+ pieces = []
175
+ for i, chunk_ids in enumerate(input_ids):
176
+ chunk_ids = chunk_ids.cpu().numpy().tolist()
177
+
178
+ # Remove special tokens and padding
179
+ if skip_special_tokens:
180
+ chunk_ids = [
181
+ b for b in chunk_ids
182
+ if b not in [self.PAD, self.BOS, self.EOS, self.MASK] and b < 256
183
+ ]
184
+
185
+ pieces.append(chunk_ids)
186
+
187
+ # Merge chunks with overlap handling
188
+ output = []
189
+ for i, chunk in enumerate(pieces):
190
+ if i == 0:
191
+ output.extend(chunk)
192
+ else:
193
+ # Skip overlap bytes from current chunk
194
+ output.extend(chunk[overlap:] if len(chunk) > overlap else chunk)
195
+
196
+ # Convert to string
197
+ try:
198
+ text = bytes(output).decode('utf-8', errors='replace')
199
+ except:
200
+ text = ""
201
+
202
+ return text
203
+
204
+ def decode(self,
205
+ input_ids: torch.Tensor,
206
+ skip_special_tokens: bool = True) -> str:
207
+ """
208
+ Decode byte sequences back to text
209
+ """
210
+ if isinstance(input_ids, torch.Tensor):
211
+ input_ids = input_ids.cpu().numpy().tolist()
212
+
213
+ # Handle batch dimension
214
+ if isinstance(input_ids[0], list):
215
+ input_ids = input_ids[0]
216
+
217
+ # Remove special tokens and padding
218
+ if skip_special_tokens:
219
+ input_ids = [
220
+ b for b in input_ids
221
+ if b not in [self.PAD, self.BOS, self.EOS, self.MASK] and b < 256
222
+ ]
223
+
224
+ # Convert bytes to string
225
+ try:
226
+ text = bytes(input_ids).decode('utf-8', errors='replace')
227
+ except:
228
+ text = ""
229
+
230
+ return text
231
+
232
+ def batch_encode(self,
233
+ texts: List[str],
234
+ add_special_tokens: bool = True) -> Dict[str, torch.Tensor]:
235
+ """
236
+ Encode multiple texts as a batch
237
+ """
238
+ encoded = [self.encode(text, add_special_tokens) for text in texts]
239
+
240
+ # Find max length
241
+ max_len = max(e['length'] for e in encoded)
242
+ max_len = min(max_len, self.max_seq_len)
243
+
244
+ # Create batch tensors
245
+ batch_size = len(texts)
246
+ input_ids = torch.full((batch_size, max_len), self.PAD, dtype=torch.long)
247
+ attention_mask = torch.zeros((batch_size, max_len), dtype=torch.bool) # bool type (GPT suggestion)
248
+
249
+ for i, enc in enumerate(encoded):
250
+ seq_len = min(enc['length'], max_len)
251
+ if enc['input_ids'].dim() == 0: # Handle scalar
252
+ enc['input_ids'] = enc['input_ids'].unsqueeze(0)
253
+ input_ids[i, :seq_len] = enc['input_ids'][:seq_len]
254
+ attention_mask[i, :seq_len] = True
255
+
256
+ return {
257
+ 'input_ids': input_ids,
258
+ 'attention_mask': attention_mask,
259
+ 'lengths': [e['length'] for e in encoded]
260
+ }
261
+
262
+
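With content_size=46 and chunk_overlap=8, the stride used by _encode_with_chunks is 38 bytes, so a 100-byte input yields three chunks covering [0, 46), [38, 84) and [76, 100); the short-tail merge only triggers when the final piece is under 23 bytes. A quick check:

tok = ByteTokenizerV62()
enc = tok.encode("A" * 100, return_chunks=True)
print(enc['num_chunks'], enc['chunk_positions'])  # 3, [(0, 46), (38, 84), (76, 100)]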
263
+ class ChunkBoundaryAdjuster(nn.Module):
264
+ """
265
+ Neural network for adjusting chunk boundaries
266
+ Learns optimal splitting points
267
+ """
268
+
269
+ def __init__(self, hidden_dim: int = 256):
270
+ super().__init__()
271
+
272
+ # Boundary scoring network
273
+ self.boundary_scorer = nn.Sequential(
274
+ nn.Linear(256, hidden_dim), # Input: byte embeddings
275
+ nn.ReLU(),
276
+ nn.Dropout(0.1),
277
+ nn.Linear(hidden_dim, hidden_dim // 2),
278
+ nn.ReLU(),
279
+ nn.Linear(hidden_dim // 2, 1), # Output: boundary score
280
+ nn.Sigmoid()
281
+ )
282
+
283
+ # UTF-8 boundary detector
284
+ self.utf8_detector = nn.Sequential(
285
+ nn.Conv1d(1, 16, kernel_size=4, padding=2), # Detect multi-byte patterns
286
+ nn.ReLU(),
287
+ nn.Conv1d(16, 1, kernel_size=1),
288
+ nn.Sigmoid()
289
+ )
290
+
291
+ def forward(self, byte_sequence: torch.Tensor) -> torch.Tensor:
292
+ """
293
+ Find optimal chunk boundaries
294
+
295
+ Args:
296
+ byte_sequence: [batch, seq_len, embedding_dim]
297
+
298
+ Returns:
299
+ boundary_scores: [batch, seq_len] - probability of boundary at each position
300
+ """
301
+ batch_size, seq_len = byte_sequence.shape[:2]
302
+
303
+ # Score each position as potential boundary
304
+ boundary_scores = self.boundary_scorer(byte_sequence).squeeze(-1)
305
+
306
+ # Detect UTF-8 boundaries (avoid splitting multi-byte characters)
307
+ byte_values = byte_sequence[..., 0].unsqueeze(1) # [batch, 1, seq_len]
308
+ utf8_scores = self.utf8_detector(byte_values).squeeze(1)[:, :seq_len]  # trim to [batch, seq_len]; the kernel-4/padding-2 conv produces seq_len + 1 positions
309
+
310
+ # Combine scores (prefer boundaries at valid UTF-8 positions)
311
+ combined_scores = boundary_scores * utf8_scores
312
+
313
+ # Apply constraints: boundaries should be ~46 bytes apart
314
+ for i in range(0, seq_len, 46):
315
+ if i < seq_len:
316
+ # Boost score at expected positions
317
+ combined_scores[:, i] = combined_scores[:, i] * 1.5
318
+
319
+ return combined_scores
320
+
321
+
322
+ class SlidingWindowProcessor(nn.Module):
323
+ """
324
+ Process sequences with sliding windows at multiple scales
325
+ """
326
+
327
+ def __init__(self, window_sizes: List[int] = [8, 16, 32, 46]):
328
+ super().__init__()
329
+ self.window_sizes = window_sizes
330
+
331
+ # Multi-scale convolutions for different window sizes
332
+ self.convs = nn.ModuleList([
333
+ nn.Conv1d(256, 128, kernel_size=ws, stride=ws//2, padding=ws//4)
334
+ for ws in window_sizes
335
+ ])
336
+
337
+ # Fusion layer
338
+ self.fusion = nn.Sequential(
339
+ nn.Linear(128 * len(window_sizes), 256),
340
+ nn.ReLU(),
341
+ nn.Dropout(0.1),
342
+ nn.Linear(256, 256)
343
+ )
344
+
345
+ def forward(self, byte_embeddings: torch.Tensor) -> torch.Tensor:
346
+ """
347
+ Apply multi-scale sliding windows
348
+
349
+ Args:
350
+ byte_embeddings: [batch, seq_len, embedding_dim]
351
+
352
+ Returns:
353
+ processed: [batch, seq_len, embedding_dim]
354
+ """
355
+ # Transpose for conv1d
356
+ x = byte_embeddings.transpose(1, 2) # [batch, embed, seq]
357
+
358
+ # Apply multi-scale convolutions
359
+ multi_scale_features = []
360
+ for conv in self.convs:
361
+ features = conv(x) # Different seq lengths
362
+ # Global average pooling to fixed size
363
+ pooled = F.adaptive_avg_pool1d(features, byte_embeddings.size(1))
364
+ multi_scale_features.append(pooled)
365
+
366
+ # Concatenate and transpose back
367
+ concat = torch.cat(multi_scale_features, dim=1) # [batch, 128*scales, seq]
368
+ concat = concat.transpose(1, 2) # [batch, seq, 128*scales]
369
+
370
+ # Fuse multi-scale features
371
+ fused = self.fusion(concat) # [batch, seq, 256]
372
+
373
+ # Residual connection
374
+ output = fused + byte_embeddings
375
+
376
+ return output
377
+
378
+
379
+ class AdaptiveChunker:
380
+ """
381
+ Adaptive chunking based on content complexity
382
+ Simple heuristic-based chunker for inference
383
+ """
384
+
385
+ def __init__(self):
386
+ self.min_chunk = 32
387
+ self.max_chunk = 46
388
+ self.target_chunk = 46
389
+
390
+ def determine_chunk_size(self, text: str) -> int:
391
+ """
392
+ Determine optimal chunk size based on text characteristics
393
+ """
394
+ byte_seq = text.encode('utf-8')
395
+
396
+ # Check character types
397
+ has_cjk = any(b >= 0x80 for b in byte_seq[:100]) # Non-ASCII
398
+ has_arabic = any(0x0600 <= ord(c) <= 0x06FF for c in text[:100])
399
+
400
+ # Adjust chunk size based on content
401
+ if has_cjk:
402
+ # CJK characters need smaller chunks (multi-byte)
403
+ return self.min_chunk
404
+ elif has_arabic:
405
+ # Arabic also benefits from smaller chunks
406
+ return 40
407
+ else:
408
+ # ASCII/Latin can use larger chunks
409
+ return self.target_chunk
410
+
411
+ def chunk_text(self, text: str) -> List[str]:
412
+ """
413
+ Split text into adaptive chunks
414
+ """
415
+ chunk_size = self.determine_chunk_size(text)
416
+ byte_seq = text.encode('utf-8')
417
+ chunks = []
418
+
419
+ i = 0
420
+ while i < len(byte_seq):
421
+ # Find chunk boundary (don't split UTF-8 sequences)
422
+ end = min(i + chunk_size, len(byte_seq))
423
+
424
+ # Backtrack to valid UTF-8 boundary if needed
425
+ while end > i and end < len(byte_seq):
426
+ try:
427
+ _ = byte_seq[i:end].decode('utf-8')
428
+ break
429
+ except:
430
+ end -= 1
431
+
432
+ chunk_bytes = byte_seq[i:end]
433
+ chunks.append(chunk_bytes.decode('utf-8', errors='replace'))
434
+ i = end
435
+
436
+ return chunks
437
+
438
+
439
+ if __name__ == "__main__":
440
+ # Test the tokenizer
441
+ tokenizer = ByteTokenizerV62()
442
+
443
+ # Test texts
444
+ test_texts = [
445
+ "Hello, world!",
446
+ "안녕하세요, 세계!",
447
+ "今天天气很好。",
448
+ "مرحبا بالعالم",
449
+ "A" * 100 # Long text
450
+ ]
451
+
452
+ for text in test_texts:
453
+ print(f"\nText: {text[:50]}...")
454
+
455
+ # Single chunk encoding
456
+ encoded = tokenizer.encode(text)
457
+ print(f" Encoded shape: {encoded['input_ids'].shape}")
458
+ print(f" Original length: {encoded['original_length']} bytes")
459
+
460
+ # Decode back
461
+ decoded = tokenizer.decode(encoded['input_ids'])
462
+ print(f" Decoded: {decoded[:50]}...")
463
+
464
+ # Check multi-chunk for long text
465
+ if encoded['original_length'] > 46:
466
+ multi_encoded = tokenizer.encode(text, return_chunks=True)
467
+ print(f" Chunks: {multi_encoded['num_chunks']}")
468
+
469
+ # Test batch encoding
470
+ batch = tokenizer.batch_encode(test_texts[:3])
471
+ print(f"\nBatch shape: {batch['input_ids'].shape}")
472
+
473
+ # Test adaptive chunker
474
+ chunker = AdaptiveChunker()
475
+ for text in test_texts[:3]:
476
+ chunk_size = chunker.determine_chunk_size(text)
477
+ print(f"\n{text[:30]}... → Chunk size: {chunk_size}")
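For multi-chunk inputs, reconstruct() drops the first `overlap` bytes of every chunk after the first, so encode/reconstruct should round-trip when the overlap matches chunk_overlap (8 by default) and the tail chunk is long enough to avoid the merge path. A short addition to the test above:

long_text = "0123456789" * 14  # 140 ASCII bytes -> 4 chunks with an 8-byte overlap
enc = tokenizer.encode(long_text, return_chunks=True)
restored = tokenizer.reconstruct(enc['input_ids'], overlap=tokenizer.chunk_overlap)
print(restored == long_text)  # expected: True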
core/unified_model.py CHANGED
@@ -1,755 +1,541 @@
1
  """
2
- Unified Intelligent Tokenizer Model v6.1.2
3
- Compression-First Learning with Adaptive Splitting
4
- - 64 byte chunks for aggressive compression
5
- - 50 epoch checkpoints with automatic splitting
6
- - Group relation learning for reconstruction
7
- - Boundary adjustment for semantic units
8
  """
9
 
10
  import torch
11
  import torch.nn as nn
12
  import torch.nn.functional as F
13
- import math
14
  from typing import Dict, List, Optional, Tuple, Union
 
15
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- class PositionalEncoding(nn.Module):
 
18
  """
19
- Sinusoidal Positional Encoding (original Transformer formulation)
20
- Uses fixed sin/cos patterns instead of learnable position embeddings
 
 
 
 
 
 
21
  """
22
-
23
- def __init__(self, d_model: int, max_len: int = 5000, dropout: float = 0.1):
24
  super().__init__()
25
- self.dropout = nn.Dropout(dropout)
26
-
27
- # Create sinusoidal position encodings
28
- pe = torch.zeros(max_len, d_model)
29
- position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
30
-
31
- div_term = torch.exp(torch.arange(0, d_model, 2).float() *
32
- -(math.log(10000.0) / d_model))
33
-
34
- pe[:, 0::2] = torch.sin(position * div_term) # Even dimensions
35
- pe[:, 1::2] = torch.cos(position * div_term) # Odd dimensions
36
-
37
- # Register as buffer (not trainable)
38
- self.register_buffer('pe', pe.unsqueeze(0))
39
-
40
- def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 
 
 
 
 
 
 
41
  """
42
- Add positional encoding to input
 
43
  Args:
44
- x: (batch_size, seq_len, d_model)
 
 
 
 
 
 
 
 
45
  """
46
- x = x + self.pe[:, :x.size(1)]
47
- return self.dropout(x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- class ByteTokenizer:
51
- """
52
- Pure byte-level tokenizer - no language rules
53
- """
54
-
55
- def __init__(self, max_seq_len: int = 64): # v6.1.2: 64 bytes for compression-first approach
56
- self.max_seq_len = max_seq_len
57
- self.PAD = 256
58
- self.BOS = 257
59
- self.EOS = 258
60
- self.MASK = 259
61
-
62
- def encode(self, text: str, add_special_tokens: bool = True) -> Dict[str, torch.Tensor]:
63
- # Convert to UTF-8 bytes
64
- byte_seq = list(text.encode('utf-8'))
65
-
66
- # Truncate if needed
67
- if len(byte_seq) > self.max_seq_len - 2:
68
- byte_seq = byte_seq[:self.max_seq_len - 2]
69
-
70
- # Add special tokens
71
- if add_special_tokens:
72
- byte_seq = [self.BOS] + byte_seq + [self.EOS]
73
-
74
- input_ids = torch.tensor(byte_seq, dtype=torch.long)
75
- attention_mask = torch.ones_like(input_ids)
76
-
77
- return {
78
- 'input_ids': input_ids,
79
- 'attention_mask': attention_mask,
80
- 'length': len(input_ids)
81
- }
82
-
83
- def encode_batch(self, texts: List[str]) -> Dict[str, torch.Tensor]:
84
- encoded = [self.encode(text) for text in texts]
85
- max_len = min(max(e['length'] for e in encoded), self.max_seq_len)
86
-
87
- batch_size = len(texts)
88
- input_ids = torch.full((batch_size, max_len), self.PAD, dtype=torch.long)
89
- attention_mask = torch.zeros((batch_size, max_len), dtype=torch.float32)
90
-
91
- for i, enc in enumerate(encoded):
92
- seq_len = min(enc['length'], max_len)
93
- input_ids[i, :seq_len] = enc['input_ids'][:seq_len]
94
- attention_mask[i, :seq_len] = 1.0
95
-
96
- return {
97
- 'input_ids': input_ids,
98
- 'attention_mask': attention_mask
99
- }
100
-
101
- def decode(self, input_ids: torch.Tensor, skip_special_tokens: bool = True) -> str:
102
- if isinstance(input_ids, torch.Tensor):
103
- input_ids = input_ids.cpu().numpy().tolist()
104
-
105
- if skip_special_tokens:
106
- input_ids = [b for b in input_ids if b < 256]
107
-
108
- try:
109
- byte_array = bytes([min(b, 255) for b in input_ids if b != self.PAD])
110
- return byte_array.decode('utf-8', errors='replace')
111
- except:
112
- return "".join([chr(b) if b < 128 else '?' for b in input_ids if b < 256])
113
-
114
-
115
- class ByteEncoderV61(nn.Module):
116
- """
117
- v6.1: 5-Layer Encoder with Layer-Specialized Architecture
118
- Layer 0: 768d - Byte to character (with curriculum learning)
119
- Layer 1: 896d - Language pattern discovery (no labels)
120
- Layer 2: 1024d - Eojeol/Word formation (+ eojeol PE)
121
- Layer 3: 1152d - Small phrase grouping (2-3 eojeols)
122
- Layer 4: 1280d - Final refinement (+ context PE)
123
-
124
- Target: eojeol (word-unit) to phrase level compression (3:1 ratio)
125
- """
126
 
127
- def __init__(
128
- self,
129
- vocab_size: int = 260,
130
- hidden_dims: List[int] = [768, 896, 1024, 1152, 1280], # v6.1 dimensions
131
- num_heads: List[int] = [12, 14, 16, 18, 20], # v6.1: Progressive heads per layer
132
- dropout: float = 0.1,
133
- max_seq_len: int = 64 # v6.1.2: 64 chunk for compression-first
134
- ):
135
- super().__init__()
136
-
137
- # Layer 0: Byte to Character with Curriculum Learning
138
- self.byte_embedding = nn.Embedding(vocab_size, hidden_dims[0])
139
 
140
- # v6.1: Multi-level boundary predictors for hierarchical segmentation
141
- # Level 1: Character boundaries (UTF-8 multi-byte)
142
- self.char_boundary_predictor = nn.Linear(hidden_dims[0], 3) # 0: continue, 1: start, 2: end
143
 
144
- # Level 2: Eojeol boundaries (space + particle analysis)
145
- self.eojeol_boundary_predictor = nn.Linear(hidden_dims[2], 4) # 0: inside, 1: space, 2: particle, 3: punct
 
 
 
 
146
 
147
- # Level 3: Phrase boundaries (syntactic chunks)
148
- self.phrase_boundary_predictor = nn.Linear(hidden_dims[3], 3) # 0: inside, 1: weak boundary, 2: strong boundary
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- # v6.1: Positional encoding ONLY for Layer 0
151
- self.pos_encoding = PositionalEncoding(hidden_dims[0], max_seq_len, dropout)
152
 
153
- # v6.1: Layer 1 - Language pattern discovery (no labels!)
154
- self.pattern_discoverer = nn.Linear(hidden_dims[1], 256) # Discover patterns autonomously (from 896d)
155
- self.lang_signal_generator = nn.Linear(hidden_dims[1], 128) # Generate language signals (from 896d)
 
 
156
 
157
- # v6.1: Group-aware relative position encodings for Layer 2-4
158
- self.group_pe_layer2 = nn.Embedding(max_seq_len, hidden_dims[2]) # For eojeol/word units
159
- self.group_pe_layer3 = nn.Embedding(max_seq_len, hidden_dims[3]) # For small phrases (2-3 eojeols)
160
- self.group_pe_layer4 = nn.Embedding(max_seq_len, hidden_dims[4]) # For context/discourse
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- # 5 Transformer layers with dimension changes
163
- self.layers = nn.ModuleList()
164
- for i in range(len(hidden_dims)):
165
- input_dim = hidden_dims[i-1] if i > 0 else hidden_dims[0]
166
- output_dim = hidden_dims[i]
167
 
168
- # Projection layer if dimension changes
169
- if input_dim != output_dim:
170
- proj = nn.Linear(input_dim, output_dim)
 
171
  else:
172
- proj = None
173
-
174
- # v6.1: Layer-specific head count for optimal dimension per head
175
- # Target: 64-80 dim per head
176
- layer_heads = num_heads[i] if isinstance(num_heads, list) else num_heads
177
-
178
- # Transformer encoder layer
179
- layer = nn.TransformerEncoderLayer(
180
- d_model=output_dim,
181
- nhead=layer_heads,
182
- dim_feedforward=output_dim * 4,
183
- dropout=dropout,
184
- activation='gelu',
185
- batch_first=True,
186
- norm_first=True
187
- )
188
-
189
- self.layers.append(nn.ModuleDict({
190
- 'projection': proj,
191
- 'transformer': layer,
192
- 'norm': nn.LayerNorm(output_dim)
193
- }))
194
-
195
- self.dropout = nn.Dropout(dropout)
196
-
197
- def forward(
198
- self,
199
- input_ids: torch.Tensor,
200
- attention_mask: Optional[torch.Tensor] = None,
201
- boundary_labels: Optional[torch.Tensor] = None,
202
- epoch: int = 0
203
- ) -> Dict[str, torch.Tensor]:
204
  """
205
- v6.1 Forward pass with curriculum learning
 
206
  Args:
207
- boundary_labels: UTF-8 boundary labels for curriculum learning (training only)
208
- epoch: Current epoch for curriculum schedule
 
 
 
 
 
 
 
209
  """
210
- batch_size, seq_len = input_ids.shape
211
-
212
- # Layer 0: Byte embedding with curriculum learning
213
- x = self.byte_embedding(input_ids)
214
-
215
- # v6.1: Positional encoding ONLY at Layer 0
216
- x = self.pos_encoding(x)
217
-
218
- # v6.1: Predict character boundaries (Layer 0)
219
- char_boundaries = self.char_boundary_predictor(x)
220
-
221
- # v6.1: Curriculum learning for character boundaries
222
- # Note: boundary_labels are eojeol boundaries (4 classes), not char boundaries (3 classes)
223
- # So we don't mix them with char_boundaries - they serve different purposes
224
- char_boundary_weights = F.softmax(char_boundaries, dim=-1)
225
-
226
- # Prepare attention mask
227
- if attention_mask is not None:
228
- # Keep attention mask as is for TransformerEncoderLayer
229
- # It expects shape (batch_size, seq_len) and handles masking internally
230
- pass
231
-
232
- # v6.1: Process through 5 specialized layers
233
- all_hidden_states = []
234
- discovered_patterns = None
235
- eojeol_boundaries = None
236
- phrase_boundaries = None
237
-
238
- for i, layer_dict in enumerate(self.layers):
239
- # Project if needed (before layer-specific processing)
240
- if layer_dict['projection'] is not None:
241
- x = layer_dict['projection'](x)
242
-
243
- # Layer 1: Add language signals (autonomous discovery)
244
- if i == 1:
245
- # Discover language patterns WITHOUT labels (x is now 896d)
246
- discovered_patterns = self.pattern_discoverer(x)
247
- lang_signals = self.lang_signal_generator(x)
248
-
249
- # Layer 2: Predict eojeol boundaries and add position encoding
250
- elif i == 2:
251
- # Predict eojeol boundaries (spaces, particles, punctuation)
252
- eojeol_boundaries = self.eojeol_boundary_predictor(x)
253
- positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
254
- group_pe = self.group_pe_layer2(positions)
255
- x = x + group_pe * 0.1 # Mild addition to preserve main signal
256
-
257
- # Layer 3: Predict phrase boundaries and add position encoding
258
- elif i == 3:
259
- # Predict phrase boundaries (weak/strong syntactic breaks)
260
- phrase_boundaries = self.phrase_boundary_predictor(x)
261
- positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
262
- group_pe = self.group_pe_layer3(positions)
263
- x = x + group_pe * 0.1
264
-
265
- elif i == 4:
266
- positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
267
- group_pe = self.group_pe_layer4(positions)
268
- x = x + group_pe * 0.1
269
-
270
- # Transformer layer - properly handle mask
271
- if attention_mask is not None:
272
- key_padding_mask = (attention_mask == 0)
273
- x = layer_dict['transformer'](x, src_key_padding_mask=key_padding_mask)
274
  else:
275
- x = layer_dict['transformer'](x)
276
- x = layer_dict['norm'](x)
277
- all_hidden_states.append(x)
278
-
279
- # Pool for sequence representation
280
- if attention_mask is not None:
281
- # Masked mean pooling - attention_mask is (batch, seq)
282
- mask = attention_mask.unsqueeze(-1) # (batch, seq, 1)
283
- pooled = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
284
  else:
285
- pooled = x.mean(dim=1)
286
-
287
- return {
288
- 'last_hidden_state': x,
289
- 'pooled_output': pooled,
290
- 'all_hidden_states': all_hidden_states,
291
- # v6.1 boundary predictions
292
- 'char_boundaries': char_boundaries,
293
- 'char_boundary_weights': char_boundary_weights,
294
- 'eojeol_boundaries': eojeol_boundaries,
295
- 'phrase_boundaries': phrase_boundaries,
296
- 'discovered_patterns': discovered_patterns
297
- }
298
 
 
 
 
 
299
 
300
- class CrossAttention(nn.Module):
301
- """
302
- Enhanced Cross-attention for relation learning between sequences
303
- Strengthened relation learning for linking to reasoning layers
304
- """
305
-
306
- def __init__(self, hidden_dim: int = 1280, num_heads: int = 20, dropout: float = 0.1):
307
- super().__init__()
308
 
309
- # v6.1: Adjusted for 1280d (64 per head with 20 heads)
310
- self.cross_attn = nn.MultiheadAttention(
311
- hidden_dim, num_heads, dropout, batch_first=True
312
- )
313
-
314
- # v6.1: Enhanced relation classifier with reconstruction focus
315
- # 0: identity (perfect reconstruction), 1: similar, 2: different, 3: continuation
316
- # 4: translation, 5: summary, 6: expansion, 7: contradiction
317
- self.relation_head = nn.Sequential(
318
- nn.Linear(hidden_dim * 2, hidden_dim),
319
- nn.GELU(),
320
- nn.Dropout(dropout),
321
- nn.Linear(hidden_dim, hidden_dim // 2),
322
- nn.GELU(),
323
- nn.Dropout(dropout),
324
- nn.Linear(hidden_dim // 2, 8)
325
- )
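For illustration, a minimal sketch of how the 8-way relation logits produced by this head could be mapped to named probabilities; the RELATION_NAMES list simply mirrors the class order documented in the comment above and is not part of the original code.

import torch
import torch.nn.functional as F

# Class order follows the comment above (0: identity ... 7: contradiction); illustrative only.
RELATION_NAMES = ["identity", "similar", "different", "continuation",
                  "translation", "summary", "expansion", "contradiction"]

def label_relations(relation_logits: torch.Tensor) -> dict:
    """Turn [batch, 8] relation logits into a {name: probability} dict for the first item."""
    probs = F.softmax(relation_logits, dim=-1)[0]
    return {name: round(float(p), 3) for name, p in zip(RELATION_NAMES, probs)}

print(label_relations(torch.randn(1, 8)))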
326
 
327
- # v6.1: Reconstruction-specific attention (dedicated to restoration)
328
- # Use 10 heads for reconstruction (128 per head)
329
- self.reconstruction_attn = nn.MultiheadAttention(
330
- hidden_dim, 10, dropout * 0.5, batch_first=True
331
- )
332
-
333
- # Gating mechanism for adaptive fusion
334
- self.gate = nn.Sequential(
335
- nn.Linear(hidden_dim * 2, hidden_dim),
336
- nn.Sigmoid()
337
- )
338
-
339
- self.norm1 = nn.LayerNorm(hidden_dim)
340
- self.norm2 = nn.LayerNorm(hidden_dim)
341
-
342
- def forward(
343
- self,
344
- query: torch.Tensor,
345
- key: torch.Tensor,
346
- query_mask: Optional[torch.Tensor] = None,
347
- key_mask: Optional[torch.Tensor] = None
348
- ) -> Dict[str, torch.Tensor]:
349
- # Normalize inputs
350
- query_norm = self.norm1(query)
351
- key_norm = self.norm2(key)
352
-
353
- # Fix key_mask dimension if needed
354
- if key_mask is not None:
355
- # Ensure key_mask matches key sequence length
356
- if key_mask.dim() == 2 and key_mask.size(1) != key.size(1):
357
- # Create new mask with correct dimensions
358
- batch_size = key.size(0)
359
- seq_len = key.size(1)
360
- key_mask = torch.ones(batch_size, seq_len, dtype=key_mask.dtype, device=key_mask.device)
361
-
362
- # Cross attention
363
- attn_output, attn_weights = self.cross_attn(
364
- query_norm, key_norm, key_norm,
365
- key_padding_mask=(key_mask == 0) if key_mask is not None else None
366
- )
367
-
368
- # Residual connection
369
- attn_output = attn_output + query
370
-
371
- # v6.1: Reconstruction-focused attention (optimized for restoration)
372
- recon_output, recon_weights = self.reconstruction_attn(
373
- query_norm, query_norm, query_norm, # Self-attention for consistency
374
- key_padding_mask=(query_mask == 0) if query_mask is not None else None
375
- )
376
 
377
- # Combine cross and reconstruction attention
378
- combined_attn = attn_output * 0.7 + recon_output * 0.3
379
-
380
- # Adaptive gating for fusion
381
- gate_input = torch.cat([query.mean(dim=1), key.mean(dim=1)], dim=-1)
382
- gate_weights = self.gate(gate_input).unsqueeze(1)
383
-
384
- # Gated fusion: adaptively modulate the attention output
385
- fused_output = gate_weights * combined_attn + (1 - gate_weights) * query
386
-
387
- # Pool for relation classification
388
- query_pooled = query.mean(dim=1) if query_mask is None else \
389
- (query * query_mask.unsqueeze(-1)).sum(1) / query_mask.sum(1, keepdim=True).clamp(min=1e-9)
390
- key_pooled = key.mean(dim=1) if key_mask is None else \
391
- (key * key_mask.unsqueeze(-1)).sum(1) / key_mask.sum(1, keepdim=True).clamp(min=1e-9)
392
-
393
- # Classify relations with enhanced head
394
- combined = torch.cat([query_pooled, key_pooled], dim=-1)
395
- relation_logits = self.relation_head(combined)
396
-
397
- return {
398
- 'cross_attention': fused_output, # Gated fusion output
399
- 'attention_weights': attn_weights,
400
- 'reconstruction_weights': recon_weights, # v6.1: reconstruction attention weights
401
- 'relation_logits': relation_logits,
402
- 'gate_weights': gate_weights.squeeze(1), # For analysis
403
- 'reconstruction_score': F.softmax(relation_logits, dim=-1)[:, 0] # identity probability (reconstruction fidelity)
404
- }
405
 
 
 
 
406
 
407
- class TransformerDecoder(nn.Module):
408
- """
409
- Transformer Decoder with Positional Encoding
410
- """
411
 
412
- def __init__(
413
- self,
414
- vocab_size: int = 260,
415
- hidden_dim: int = 1280, # v6.1: Match final encoder dim
416
- num_heads: int = 16, # v6.1: 1280/16 = 80 per head
417
- num_layers: int = 8, # v6.1 FINAL: 8 layers for better reconstruction
418
- dropout: float = 0.1,
419
- max_seq_len: int = 64 # v6.1.2: 64 chunk for compression-first
420
- ):
421
- super().__init__()
422
-
423
- # Token embedding
424
- self.token_embedding = nn.Embedding(vocab_size, hidden_dim)
425
-
426
- # Positional encoding
427
- self.pos_encoding = PositionalEncoding(hidden_dim, max_seq_len, dropout)
428
-
429
- # Transformer decoder
430
- decoder_layer = nn.TransformerDecoderLayer(
431
- d_model=hidden_dim,
432
- nhead=num_heads,
433
- dim_feedforward=hidden_dim * 4,
434
- dropout=dropout,
435
- activation='gelu',
436
- batch_first=True,
437
- norm_first=True
438
- )
439
-
440
- self.transformer = nn.TransformerDecoder(decoder_layer, num_layers)
441
-
442
- # Output projection
443
- self.output_projection = nn.Linear(hidden_dim, vocab_size)
444
-
445
- self.hidden_dim = hidden_dim
446
- self.vocab_size = vocab_size
447
-
448
- def forward(
449
- self,
450
- encoder_hidden: torch.Tensor,
451
- decoder_input_ids: Optional[torch.Tensor] = None,
452
- encoder_mask: Optional[torch.Tensor] = None,
453
- decoder_mask: Optional[torch.Tensor] = None
454
- ) -> Dict[str, torch.Tensor]:
455
- batch_size = encoder_hidden.size(0)
456
-
457
- # Start with BOS if no input
458
- if decoder_input_ids is None:
459
- decoder_input_ids = torch.full((batch_size, 1), 257, device=encoder_hidden.device)
460
-
461
- # Embed and add positional encoding
462
- dec_seq_len = decoder_input_ids.size(1)
463
- x = self.token_embedding(decoder_input_ids)
464
- x = self.pos_encoding(x)
465
-
466
- # Create causal mask
467
- causal_mask = torch.triu(
468
- torch.ones(dec_seq_len, dec_seq_len, device=x.device) * float('-inf'),
469
- diagonal=1
470
- )
471
-
472
- # Decoder forward - handle variable-length encoder outputs
473
- # The encoder may compress the sequence, so memory (encoder_hidden) might be shorter
474
- # than the decoder sequence. This is expected and correct behavior.
475
- enc_seq_len = encoder_hidden.size(1)
476
-
477
- # Adjust encoder mask if needed
478
- if encoder_mask is not None:
479
- if encoder_mask.size(1) != enc_seq_len:
480
- # Encoder compressed the sequence, create new mask for compressed length
481
- # All compressed positions are valid (not masked)
482
- memory_key_padding_mask = torch.zeros(
483
- encoder_hidden.size(0), enc_seq_len,
484
- dtype=torch.bool, device=encoder_hidden.device
485
- )
486
- else:
487
- memory_key_padding_mask = (encoder_mask == 0)
488
  else:
489
- memory_key_padding_mask = None
490
-
491
- # Decoder attends to compressed encoder states via cross-attention
492
- # This naturally handles different sequence lengths
493
- decoder_output = self.transformer(
494
- tgt=x, # Decoder sequence (original length)
495
- memory=encoder_hidden, # Encoder sequence (possibly compressed)
496
- tgt_mask=causal_mask,
497
- memory_key_padding_mask=memory_key_padding_mask,
498
- tgt_key_padding_mask=(decoder_mask == 0) if decoder_mask is not None else None
499
- )
500
-
501
- # Project to vocabulary
502
- logits = self.output_projection(decoder_output)
503
-
504
- return {
505
- 'logits': logits,
506
- 'hidden_states': decoder_output
507
- }
508
-
509
- @torch.no_grad()
510
- def generate(
511
- self,
512
- encoder_hidden: torch.Tensor,
513
- encoder_mask: Optional[torch.Tensor] = None,
514
- max_length: int = 128,
515
- temperature: float = 0.1, # the tokenizer generates conservatively for exact reconstruction
516
- top_k: int = 10, # consider only the top 10 candidates
517
- top_p: float = 0.95
518
- ) -> torch.Tensor:
519
- batch_size = encoder_hidden.size(0)
520
- device = encoder_hidden.device
521
 
522
- # Start with BOS
523
- decoder_input_ids = torch.full((batch_size, 1), 257, device=device)
524
 
525
- # Track which sequences are done
526
- finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
 
527
 
528
- for _ in range(max_length - 1):
529
- # Forward pass
530
- outputs = self.forward(encoder_hidden, decoder_input_ids, encoder_mask)
531
- next_token_logits = outputs['logits'][:, -1, :] / temperature
532
 
533
- # Top-k filtering
534
- if top_k > 0:
535
- indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
536
- next_token_logits[indices_to_remove] = float('-inf')
 
 
 
537
 
538
- # Top-p filtering
539
- if top_p < 1.0:
540
- sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
541
- cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
542
 
543
- sorted_indices_to_remove = cumulative_probs > top_p
544
- sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
545
- sorted_indices_to_remove[..., 0] = 0
 
 
 
546
 
547
- indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
548
- next_token_logits[indices_to_remove] = float('-inf')
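The two filtering steps above can be read as one standalone helper; the sketch below reproduces the same top-k and nucleus (top-p) masking on a single step of logits and is illustrative rather than repository code.

import torch
import torch.nn.functional as F

def filter_logits(logits: torch.Tensor, top_k: int = 10, top_p: float = 0.95) -> torch.Tensor:
    """Mask logits outside the top-k set and outside the top-p nucleus with -inf."""
    logits = logits.clone()
    if top_k > 0:
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = float('-inf')
    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cumulative > top_p
        remove[..., 1:] = remove[..., :-1].clone()  # keep the first token that crosses the threshold
        remove[..., 0] = False
        logits[remove.scatter(-1, sorted_indices, remove)] = float('-inf')
    return logits

probs = F.softmax(filter_logits(torch.randn(1, 260)), dim=-1)  # 260 = byte vocabulary size used here
next_token = torch.multinomial(probs, 1)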
549
 
550
- # Sample
551
- probs = F.softmax(next_token_logits, dim=-1)
552
- next_tokens = torch.multinomial(probs, 1)
553
 
554
- # For finished sequences, force PAD token
555
- next_tokens[finished] = 256 # PAD token
556
 
557
- decoder_input_ids = torch.cat([decoder_input_ids, next_tokens], dim=-1)
 
558
 
559
- # Update finished status
560
- finished = finished | (next_tokens.squeeze(-1) == 258) # Mark as finished if EOS
 
 
 
 
 
561
 
562
- # Stop when all sequences are done
563
- if finished.all():
564
- break
565
 
566
- return decoder_input_ids
 
 
 
567
 
 
 
 
568
 
569
- class IntelligentTokenizerModelV61(nn.Module):
570
- """
571
- Complete Intelligent Tokenizer Model v6.1
572
- Pure learning-based with curriculum learning
573
- - No language labels during training
574
- - Curriculum learning for boundaries
575
- - Group-aware position encodings
576
- """
577
 
578
- def __init__(
579
- self,
580
- vocab_size: int = 260,
581
- encoder_dims: List[int] = [768, 896, 1024, 1152, 1280], # v6.1 dimensions
582
- encoder_heads: List[int] = [12, 14, 16, 18, 20], # v6.1: Optimal heads per layer
583
- decoder_hidden: int = 1280, # Match final encoder dim
584
- decoder_heads: int = 16, # v6.1: 80 per head for decoder
585
- num_decoder_layers: int = 8, # v6.1 FINAL: 8 layers for better reconstruction
586
- dropout: float = 0.1,
587
- max_seq_len: int = 64 # v6.1.2: 64 chunk for compression-first
588
- ):
589
- super().__init__()
590
 
591
- # v6.1 Components with optimized head counts
592
- self.tokenizer = ByteTokenizer(max_seq_len)
593
- self.encoder = ByteEncoderV61(vocab_size, encoder_dims, encoder_heads, dropout, max_seq_len)
594
- self.decoder = TransformerDecoder(vocab_size, decoder_hidden, decoder_heads, num_decoder_layers, dropout, max_seq_len)
595
- self.cross_attention = CrossAttention(encoder_dims[-1], 20, dropout) # 20 heads for 1280d
596
-
597
- def forward(
598
- self,
599
- input_texts: Optional[List[str]] = None,
600
- input_ids: Optional[torch.Tensor] = None,
601
- attention_mask: Optional[torch.Tensor] = None,
602
- decoder_input_ids: Optional[torch.Tensor] = None,
603
- labels: Optional[torch.Tensor] = None,
604
- boundary_labels: Optional[torch.Tensor] = None, # v6.1: for curriculum learning
605
- epoch: int = 0, # v6.1: for curriculum schedule
606
- use_cross_attention: bool = True
607
- ) -> Dict[str, torch.Tensor]:
608
- # Tokenize if text input
609
- if input_texts is not None:
610
- tokenized = self.tokenizer.encode_batch(input_texts)
611
- input_ids = tokenized['input_ids']
612
- attention_mask = tokenized['attention_mask']
613
-
614
- # Check and adjust the sequence length
615
- batch_size, seq_len = input_ids.shape
616
- device = input_ids.device
617
-
618
- # v6.1: Encode with curriculum learning
619
- encoder_outputs = self.encoder(input_ids, attention_mask, boundary_labels, epoch)
620
- encoder_hidden = encoder_outputs['last_hidden_state'] # v6.1: [batch, seq, 1280]
621
-
622
- # v6.1: dimension check - the final encoder dim must be 1280
623
- assert encoder_hidden.size(-1) == 1280, f"Encoder dim mismatch: {encoder_hidden.size(-1)}"
624
-
625
- # Prepare decoder input for teacher forcing during training
626
- if decoder_input_ids is None:
627
- if labels is not None:
628
- # During training, use shifted labels as decoder input (teacher forcing)
629
- # Add BOS at the beginning and remove last token
630
- bos_tokens = torch.full((batch_size, 1), self.tokenizer.BOS, device=labels.device, dtype=labels.dtype)
631
- decoder_input_ids = torch.cat([bos_tokens, labels[:, :-1]], dim=1)
632
- else:
633
- # For inference/test, start with BOS token
634
- decoder_input_ids = torch.full((batch_size, 1), self.tokenizer.BOS, device=device, dtype=torch.long)
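To make the teacher-forcing shift above concrete, a small self-contained sketch; BOS = 257 is an assumption matching the special-token ids used in this file, and the byte values spell "Hello".

import torch

BOS = 257  # assumed BOS id, matching the default used elsewhere in this file

def make_teacher_forcing_input(labels: torch.Tensor) -> torch.Tensor:
    """Prepend BOS and drop the last target so position t predicts labels[:, t]."""
    bos = torch.full((labels.size(0), 1), BOS, dtype=labels.dtype, device=labels.device)
    return torch.cat([bos, labels[:, :-1]], dim=1)

labels = torch.tensor([[72, 101, 108, 108, 111, 258]])   # "Hello" as bytes, then EOS (258)
print(make_teacher_forcing_input(labels))                 # tensor([[257,  72, 101, 108, 108, 111]])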
635
-
636
- # Decode
637
- decoder_outputs = self.decoder(
638
- encoder_hidden,
639
- decoder_input_ids,
640
- attention_mask
641
- )
642
- decoder_hidden = decoder_outputs['hidden_states'] # [batch, seq, 1280]
643
-
644
- # Cross-attention (relation learning at the final layer)
645
- cross_attn_outputs = None
646
- relation_logits = None
647
-
648
- if use_cross_attention and decoder_hidden is not None:
649
- # Cross-attention between the decoder outputs and the encoder outputs
650
- cross_attn_outputs = self.cross_attention(
651
- query=decoder_hidden, # decoder provides the queries
652
- key=encoder_hidden, # encoder provides the keys/values
653
- query_mask=None, # the decoder mask is causal, so it is handled separately
654
- key_mask=attention_mask
655
- )
656
-
657
- # Relation-learning outputs
658
- relation_logits = cross_attn_outputs['relation_logits']
659
-
660
- # Decoder representation enhanced by cross-attention
661
- enhanced_decoder = decoder_hidden + cross_attn_outputs['cross_attention']
662
-
663
- # Recompute the final logits (after applying cross-attention)
664
- if hasattr(self.decoder, 'output_projection'):
665
- decoder_outputs['logits'] = self.decoder.output_projection(enhanced_decoder)
666
-
667
- # Calculate loss if labels provided
668
- loss = None
669
- if labels is not None:
670
- # Reconstruction loss
671
- loss_fct = nn.CrossEntropyLoss(ignore_index=self.tokenizer.PAD)
672
- recon_loss = loss_fct(
673
- decoder_outputs['logits'].reshape(-1, decoder_outputs['logits'].size(-1)),
674
- labels.reshape(-1)
675
- )
676
 
677
- # Boundary loss (if boundary labels provided)
678
- boundary_loss = 0
679
- if boundary_labels is not None and encoder_outputs.get('eojeol_boundaries') is not None:
680
- # Eojeol boundary loss
681
- eojeol_boundaries = encoder_outputs['eojeol_boundaries'] # [batch, seq, 4]
682
- if eojeol_boundaries.size(1) == boundary_labels.size(1):
683
- # Ensure boundary labels are in valid range (0-3)
684
- # Clamp to valid range to prevent CUDA errors
685
- boundary_labels_clamped = torch.clamp(boundary_labels, min=0, max=3)
686
-
687
- boundary_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) # Use -1 for padding
688
- boundary_loss = boundary_loss_fct(
689
- eojeol_boundaries.reshape(-1, 4),
690
- boundary_labels_clamped.reshape(-1)
691
- ) * 0.5 # Weight for boundary loss
692
-
693
- # Relation loss (if cross-attention used)
694
- relation_loss = 0
695
- if relation_logits is not None:
696
- # The self-relation should be identity (class 0)
697
- batch_identity = torch.zeros(batch_size, dtype=torch.long, device=device)
698
- relation_loss = F.cross_entropy(relation_logits, batch_identity) * 0.1
699
-
700
- loss = recon_loss + boundary_loss + relation_loss
701
-
702
- return {
703
- 'loss': loss,
704
- 'logits': decoder_outputs['logits'],
705
- 'decoder_logits': decoder_outputs['logits'], # Add for compatibility
706
- 'encoder_hidden_states': encoder_hidden,
707
- 'decoder_hidden_states': decoder_hidden,
708
- 'pooled_output': encoder_outputs['pooled_output'],
709
- 'cross_attention': cross_attn_outputs['cross_attention'] if cross_attn_outputs else None,
710
- 'relation_logits': relation_logits,
711
- 'all_encoder_states': encoder_outputs.get('all_hidden_states', None),
712
- # Add boundary predictions for visualization
713
- 'char_boundaries': encoder_outputs.get('char_boundaries'),
714
- 'eojeol_boundaries': encoder_outputs.get('eojeol_boundaries'),
715
- 'phrase_boundaries': encoder_outputs.get('phrase_boundaries'),
716
- 'discovered_patterns': encoder_outputs.get('discovered_patterns')
717
  }
718
-
719
- def encode_text(self, text: str) -> torch.Tensor:
720
- """Encode single text to representation"""
721
- tokenized = self.tokenizer.encode(text)
722
- # Move to same device as model
723
- device = next(self.parameters()).device
724
- input_ids = tokenized['input_ids'].unsqueeze(0).to(device)
725
- attention_mask = tokenized['attention_mask'].unsqueeze(0).to(device)
726
-
727
- with torch.no_grad():
728
- outputs = self.encoder(input_ids, attention_mask)
729
-
730
- return outputs['pooled_output'].squeeze(0)
731
-
732
- def decode_representation(self, representation: torch.Tensor, max_length: int = 128) -> str:
733
- """Decode representation back to text"""
734
- if representation.dim() == 1:
735
- representation = representation.unsqueeze(0).unsqueeze(0)
736
- elif representation.dim() == 2:
737
- representation = representation.unsqueeze(1)
738
-
739
- with torch.no_grad():
740
- output_ids = self.decoder.generate(representation, max_length=max_length)
741
-
742
- text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
743
- return text
744
-
745
- def compute_relation(self, text1: str, text2: str) -> torch.Tensor:
746
- """Compute relation between two texts"""
747
- # Encode both texts
748
- enc1 = self.encode_text(text1).unsqueeze(0).unsqueeze(0)
749
- enc2 = self.encode_text(text2).unsqueeze(0).unsqueeze(0)
750
-
751
- # Compute cross-attention and relations
752
- with torch.no_grad():
753
- outputs = self.cross_attention(enc1, enc2)
754
-
755
- return F.softmax(outputs['relation_logits'], dim=-1)
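A usage sketch for this (now removed) v6.1 relation API; it assumes the classes above are importable and initialized with untrained weights, so the probabilities themselves are meaningless.

model_v61 = IntelligentTokenizerModelV61()
model_v61.eval()
relation_probs = model_v61.compute_relation("The cat sat.", "A cat was sitting.")
print(relation_probs.shape)  # torch.Size([1, 8]): one probability per relation class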
 
1
  """
2
+ Intelligent Tokenizer v6.2.0 - Unified Model
3
+ Integrates encoder, decoder, and tokenizer with all GPT improvements
 
 
 
 
4
  """
5
 
6
  import torch
7
  import torch.nn as nn
8
  import torch.nn.functional as F
 
9
  from typing import Dict, List, Optional, Tuple, Union
10
+ import math
11
 
12
+ # Import our components
13
+ try:
14
+ from .encoder import EncoderV62
15
+ from .decoder import DecoderV62
16
+ from .tokenizer import ByteTokenizerV62
17
+ except ImportError:
18
+ # For standalone testing
19
+ from encoder import EncoderV62
20
+ from decoder import DecoderV62
21
+ from tokenizer import ByteTokenizerV62
22
 
23
+
24
+ class IntelligentTokenizerV62(nn.Module):
25
  """
26
+ Complete v6.2.0 model with progressive splitting and optimizations
27
+
28
+ Key features:
29
+ - 48-byte chunks (46+2 with BOS/EOS)
30
+ - Progressive splitting: 48→1→N→M tokens
31
+ - Multi-level cross-attention
32
+ - KV cache optimization (8x reduction)
33
+ - All GPT-5 improvements integrated
34
  """
35
+
36
+ def __init__(self, config: Optional[Dict] = None):
37
  super().__init__()
38
+
39
+ # Default configuration
40
+ self.config = config or {}
41
+
42
+ # Model components
43
+ self.tokenizer = ByteTokenizerV62(config)
44
+ self.encoder = EncoderV62(config)
45
+ self.decoder = DecoderV62(config)
46
+
47
+ # Training configuration
48
+ self.compression_weight = 0.1
49
+ self.reconstruction_weight = 0.1
50
+ self.boundary_weight = 0.1
51
+
52
+ # Monitoring
53
+ self.register_buffer('training_step', torch.tensor(0))
54
+ self.register_buffer('current_epoch', torch.tensor(0))
55
+
56
+ def forward(self,
57
+ input_ids: torch.Tensor = None,
58
+ attention_mask: torch.Tensor = None,
59
+ labels: torch.Tensor = None,
60
+ text: str = None,
61
+ return_loss: bool = True,
62
+ temperature: float = 1.0) -> Dict[str, torch.Tensor]:
63
  """
64
+ Unified forward pass
65
+
66
  Args:
67
+ input_ids: Pre-tokenized input (optional)
68
+ attention_mask: Attention mask (optional)
69
+ labels: Target labels for training (optional)
70
+ text: Raw text input (alternative to input_ids)
71
+ return_loss: Whether to compute loss
72
+ temperature: Temperature for Gumbel-Softmax in encoder
73
+
74
+ Returns:
75
+ Dictionary with model outputs
76
  """
77
+ # Handle text input
78
+ if text is not None:
79
+ encoded = self.tokenizer.encode(text, add_special_tokens=True)
80
+ input_ids = encoded['input_ids'].unsqueeze(0) if encoded['input_ids'].dim() == 1 else encoded['input_ids']
81
+ attention_mask = encoded['attention_mask'].unsqueeze(0) if encoded['attention_mask'].dim() == 1 else encoded['attention_mask']
82
+
83
+ # Handle string passed as input_ids (common mistake)
84
+ if isinstance(input_ids, str):
85
+ text = input_ids
86
+ encoded = self.tokenizer.encode(text, add_special_tokens=True)
87
+ input_ids = encoded['input_ids'].unsqueeze(0) if encoded['input_ids'].dim() == 1 else encoded['input_ids']
88
+ attention_mask = encoded['attention_mask'].unsqueeze(0) if encoded['attention_mask'].dim() == 1 else encoded['attention_mask']
89
+
90
+ # Ensure tensors are on the right device
91
+ device = next(self.parameters()).device
92
+ if input_ids is not None and torch.is_tensor(input_ids):
93
+ input_ids = input_ids.to(device)
94
+ if attention_mask is not None and torch.is_tensor(attention_mask):
95
+ attention_mask = attention_mask.to(device)
96
+ if labels is not None and torch.is_tensor(labels):
97
+ labels = labels.to(device)
98
+
99
+ # Encoder forward pass with temperature for Gumbel annealing
100
+ encoder_outputs = self.encoder(
101
+ input_ids=input_ids,
102
+ attention_mask=attention_mask,
103
+ temperature=temperature
104
+ )
105
 
106
+ # Decoder forward pass
107
+ if labels is not None:
108
+ # Training mode with teacher forcing (GPT suggestion: shift by 1)
109
+ # Input: labels[:-1], Target: labels[1:]
110
+ decoder_input = labels[:, :-1] if labels.dim() > 1 else labels[:-1]
111
+ decoder_mask = attention_mask[:, :-1] if attention_mask is not None and attention_mask.dim() > 1 else None
112
+
113
+ decoder_outputs = self.decoder(
114
+ encoder_all_hidden=encoder_outputs['all_hidden_states'],
115
+ decoder_input_ids=decoder_input,
116
+ attention_mask=decoder_mask
117
+ )
118
+ else:
119
+ # Inference mode (without teacher forcing)
120
+ # For now, fall back to using the input as labels for stable training
121
+ # TODO: Implement proper autoregressive generation
122
+ if return_loss and input_ids is not None:
123
+ labels = input_ids # Use input as both input and target
124
+ decoder_input = labels[:, :-1] if labels.dim() > 1 else labels[:-1]
125
+ decoder_mask = attention_mask[:, :-1] if attention_mask is not None and attention_mask.dim() > 1 else None
126
+
127
+ decoder_outputs = self.decoder(
128
+ encoder_all_hidden=encoder_outputs['all_hidden_states'],
129
+ decoder_input_ids=decoder_input,
130
+ attention_mask=decoder_mask
131
+ )
132
+ else:
133
+ decoder_outputs = self.decoder(
134
+ encoder_all_hidden=encoder_outputs['all_hidden_states'],
135
+ decoder_input_ids=None,
136
+ attention_mask=attention_mask
137
+ )
138
 
139
+ # Combine outputs with prefix to avoid key collision (GPT suggestion)
140
+ outputs = {}
141
+ for key, value in encoder_outputs.items():
142
+ outputs[f'enc_{key}'] = value
143
+ for key, value in decoder_outputs.items():
144
+ outputs[f'dec_{key}'] = value
 
145
 
146
+ # Compute loss if requested
147
+ if return_loss and labels is not None:
148
+ loss = self.compute_loss(outputs, labels, attention_mask)
149
+ outputs['loss'] = loss
150
 
151
+ return outputs
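A minimal usage sketch for the forward pass above, assuming the model builds with its default config and that raw text is passed via the text argument; the key prefixes are the enc_/dec_ ones added by the loop above.

import torch

model = IntelligentTokenizerV62()
model.eval()
with torch.no_grad():
    outputs = model(text="Hello, world!", return_loss=False)

print(sorted(k for k in outputs if k.startswith('enc_'))[:3])   # a few encoder keys
if 'dec_logits' in outputs:
    print(outputs['dec_logits'].shape)                          # [batch, seq, vocab]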
 
 
152
 
153
+ def compute_loss(self,
154
+ outputs: Dict[str, torch.Tensor],
155
+ labels: torch.Tensor,
156
+ attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
157
+ """
158
+ Compute combined loss with multiple objectives
159
 
160
+ Components:
161
+ 1. Reconstruction loss (cross-entropy)
162
+ 2. Compression loss (encourage higher compression)
163
+ 3. Boundary loss (boundary prediction accuracy)
164
+ """
165
+ losses = {}
166
+
167
+ # 1. Reconstruction loss (GPT suggestion: use shifted targets)
168
+ if 'dec_logits' in outputs:
169
+ logits = outputs['dec_logits']
170
+
171
+ # Shift targets for next-token prediction
172
+ target_labels = labels[:, 1:] if labels.dim() > 1 else labels[1:]
173
+ target_mask = attention_mask[:, 1:] if attention_mask is not None and attention_mask.dim() > 1 else None
174
+
175
+ # Reshape for cross-entropy
176
+ batch_size, seq_len, vocab_size = logits.shape
177
+ logits_flat = logits.reshape(-1, vocab_size)
178
+ labels_flat = target_labels.reshape(-1)
179
+
180
+ # Mask out padding (GPT suggestion: use bool mask)
181
+ if target_mask is not None:
182
+ mask_flat = target_mask.reshape(-1).bool()
183
+ reconstruction_loss = F.cross_entropy(
184
+ logits_flat[mask_flat],
185
+ labels_flat[mask_flat],
186
+ ignore_index=self.tokenizer.PAD,
187
+ label_smoothing=0.1 # Added label smoothing
188
+ )
189
+ else:
190
+ reconstruction_loss = F.cross_entropy(
191
+ logits_flat,
192
+ labels_flat,
193
+ ignore_index=self.tokenizer.PAD,
194
+ label_smoothing=0.1
195
+ )
196
 
197
+ losses['reconstruction'] = reconstruction_loss * self.reconstruction_weight
 
198
 
199
+ # 2. Compression loss (GPT suggestion: use proper device tensor creation)
200
+ if 'enc_compression_ratio' in outputs:
201
+ # Target compression ratio (e.g., 24:1 as per config)
202
+ target_ratio = 24.0
203
+ current_ratio = outputs['enc_compression_ratio']
204
 
205
+ # Create tensors on same device (GPT suggestion)
206
+ if isinstance(current_ratio, (int, float)):
207
+ current_ratio_tensor = labels.new_tensor(current_ratio, dtype=torch.float32)
208
+ else:
209
+ current_ratio_tensor = current_ratio.float()
210
+ target_ratio_tensor = labels.new_tensor(target_ratio, dtype=torch.float32)
211
+
212
+ # Penalize deviation from target (use smooth L1 to avoid explosion)
213
+ compression_loss = F.smooth_l1_loss(
214
+ current_ratio_tensor,
215
+ target_ratio_tensor,
216
+ beta=2.0 # Transition point from L2 to L1
217
+ )
218
+
219
+ losses['compression'] = compression_loss * self.compression_weight
220
+
221
+ # 3. Boundary loss (GPT suggestion: more meaningful boundary learning)
222
+ if 'enc_boundaries' in outputs and outputs['enc_boundaries'] is not None:
223
+ boundary_scores = outputs['enc_boundaries']
224
+
225
+ # Boundary sparsity + smoothness (GPT suggestion)
226
+ # Encourage sparse but clear boundaries
227
+ boundary_probs = torch.sigmoid(boundary_scores)
228
 
229
+ # Sparsity loss (boundaries should be rare)
230
+ sparsity_loss = boundary_probs.mean() * 0.1
 
 
 
231
 
232
+ # Smoothness loss (adjacent boundary scores should change gradually)
233
+ if boundary_scores.size(1) > 1:
234
+ diff = boundary_scores[:, 1:] - boundary_scores[:, :-1]
235
+ smoothness_loss = (diff ** 2).mean() * 0.01
236
  else:
237
+ smoothness_loss = 0.0
238
+
239
+ boundary_loss = sparsity_loss + smoothness_loss
240
+
241
+ losses['boundary'] = boundary_loss * self.boundary_weight
242
+
243
+ # Combine all losses
244
+ total_loss = sum(losses.values())
245
+
246
+ # Store individual losses for monitoring
247
+ self.last_losses = losses
248
+
249
+ return total_loss
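A toy illustration of the main loss terms above on made-up tensors; the shapes, byte values, and weights are assumptions (the weights are the 0.1 defaults from __init__), and only the reconstruction and boundary terms are reproduced.

import torch
import torch.nn.functional as F

# Shifted reconstruction loss: position t predicts labels[:, t + 1].
logits = torch.randn(1, 5, 260)                          # decoder output for labels[:, :-1]
labels = torch.tensor([[72, 101, 108, 108, 111, 258]])   # "Hello" bytes + EOS
keep = torch.ones_like(labels)[:, 1:].reshape(-1).bool()
target = labels[:, 1:].reshape(-1)
recon = F.cross_entropy(logits.reshape(-1, 260)[keep], target[keep], label_smoothing=0.1)

# Boundary regularizer: sparse boundaries whose scores change gradually.
scores = torch.randn(1, 46)
sparsity = torch.sigmoid(scores).mean() * 0.1
smoothness = ((scores[:, 1:] - scores[:, :-1]) ** 2).mean() * 0.01

total = 0.1 * recon + 0.1 * (sparsity + smoothness)
print(float(total))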
250
+
251
+ def generate(self,
252
+ text: str = None,
253
+ input_ids: torch.Tensor = None,
254
+ max_length: int = 48,
255
+ temperature: float = 0.1,
256
+ top_k: int = 10,
257
+ top_p: float = 0.95) -> str:
258
  """
259
+ Generate/reconstruct text
260
+
261
  Args:
262
+ text: Input text to encode and reconstruct
263
+ input_ids: Pre-encoded input
264
+ max_length: Maximum generation length
265
+ temperature: Sampling temperature
266
+ top_k: Top-k sampling
267
+ top_p: Top-p (nucleus) sampling
268
+
269
+ Returns:
270
+ Reconstructed/generated text
271
  """
272
+ # Encode input if text is provided (GPT suggestion: handle multi-chunk properly)
273
+ chunk_positions = None
274
+ if text is not None:
275
+ # Check if text needs chunking
276
+ if len(text.encode('utf-8')) > self.tokenizer.content_size:
277
+ encoded = self.tokenizer.encode(text, add_special_tokens=True, return_chunks=True)
278
+ chunk_positions = encoded.get('chunk_positions', None)
279
  else:
280
+ encoded = self.tokenizer.encode(text, add_special_tokens=True)
281
+
282
+ input_ids = encoded['input_ids'].unsqueeze(0) if encoded['input_ids'].dim() == 1 else encoded['input_ids']
283
+ attention_mask = encoded['attention_mask'].unsqueeze(0) if encoded['attention_mask'].dim() == 1 else encoded['attention_mask']
284
  else:
285
+ attention_mask = (input_ids != self.tokenizer.PAD).bool() # GPT suggestion: bool mask
286
 
287
+ # Move to device
288
+ device = next(self.parameters()).device
289
+ input_ids = input_ids.to(device)
290
+ attention_mask = attention_mask.to(device)
291
 
292
+ # Encode
293
+ with torch.no_grad():
294
+ encoder_outputs = self.encoder(
295
+ input_ids=input_ids,
296
+ attention_mask=attention_mask
297
+ )
 
 
298
 
299
+ # Prepare all hidden states for decoder
300
+ if 'all_hidden_states' in encoder_outputs:
301
+ encoder_all_hidden = encoder_outputs['all_hidden_states']
302
+ else:
303
+ compressed = encoder_outputs.get('compressed', encoder_outputs.get('hidden_states'))
304
+ encoder_all_hidden = [compressed] * 4
305
+
306
+ # Autoregressive generation (fixed version)
307
+ batch_size = input_ids.size(0)
308
+
309
+ # Start with BOS token
310
+ generated_ids = torch.full((batch_size, 1), self.tokenizer.BOS, device=device)
311
+
312
+ for step in range(max_length - 1):
313
+ with torch.no_grad():
314
+ # Decode current sequence
315
+ decoder_outputs = self.decoder(
316
+ encoder_all_hidden=encoder_all_hidden,
317
+ decoder_input_ids=generated_ids,
318
+ attention_mask=torch.ones_like(generated_ids),
319
+ use_cache=False
320
+ )
321
 
322
+ # Get next token prediction
323
+ logits = decoder_outputs['logits'][:, -1, :] / temperature
324
 
325
+ # Top-k filtering
326
+ if top_k > 0:
327
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
328
+ logits[indices_to_remove] = float('-inf')
329
 
330
+ # Sample next token
331
+ probs = F.softmax(logits, dim=-1)
332
+ next_token = torch.multinomial(probs, num_samples=1)
333
 
334
+ # Append to generated sequence
335
+ generated_ids = torch.cat([generated_ids, next_token], dim=1)
 
 
336
 
337
+ # Check for EOS
338
+ if (next_token == self.tokenizer.EOS).all():
339
+ break
340
+
341
+ # Decode to text (GPT suggestion: proper multi-chunk reconstruction)
342
+ if generated_ids.dim() > 2 and chunk_positions is not None:
343
+ # Multi-chunk output with positions
344
+ text = self.tokenizer.reconstruct(
345
+ generated_ids,
346
+ positions=chunk_positions,
347
+ overlap=self.tokenizer.chunk_overlap
348
+ )
349
+ elif generated_ids.dim() > 2:
350
+ # Multi-chunk without positions (fallback)
351
+ text = self.tokenizer.reconstruct(generated_ids)
352
  else:
353
+ # Single sequence
354
+ text = self.tokenizer.decode(generated_ids[0] if generated_ids.dim() > 1 else generated_ids)
355
 
356
+ return text
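Usage sketch for generate(); it assumes a constructed model (ideally loaded from a trained checkpoint), since with random weights the reconstruction will not resemble the input.

model = IntelligentTokenizerV62()
model.eval()
reconstructed = model.generate(text="Hello, world!", max_length=48, temperature=0.1, top_k=10)
print(reconstructed)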
 
357
 
358
+ def compress(self, text: str) -> Dict[str, Union[torch.Tensor, float]]:
359
+ """
360
+ Compress text and return compression statistics
361
 
362
+ Args:
363
+ text: Input text to compress
 
 
364
 
365
+ Returns:
366
+ Dictionary with compressed representation and statistics
367
+ """
368
+ # Encode text
369
+ encoded = self.tokenizer.encode(text, add_special_tokens=True)
370
+ input_ids = encoded['input_ids'].unsqueeze(0) if encoded['input_ids'].dim() == 1 else encoded['input_ids']
371
+ attention_mask = encoded['attention_mask'].unsqueeze(0) if encoded['attention_mask'].dim() == 1 else encoded['attention_mask']
372
 
373
+ # Move to device
374
+ device = next(self.parameters()).device
375
+ input_ids = input_ids.to(device)
376
+ attention_mask = attention_mask.to(device)
377
 
378
+ # Get compressed representation
379
+ with torch.no_grad():
380
+ encoder_outputs = self.encoder(
381
+ input_ids=input_ids,
382
+ attention_mask=attention_mask
383
+ )
384
 
385
+ return {
386
+ 'compressed': encoder_outputs['compressed'],
387
+ 'num_tokens': encoder_outputs['num_tokens'],
388
+ 'compression_ratio': encoder_outputs['compression_ratio'],
389
+ 'original_bytes': len(text.encode('utf-8')),
390
+ 'compressed_size': encoder_outputs['num_tokens'] * 2 # Approximate bytes
391
+ }
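Continuing the sketch above, compress() exposes the statistics in the dictionary just constructed; float() is used in case the ratio comes back as a tensor.

stats = model.compress("Hello, world!")
ratio = float(stats['compression_ratio'])
print(f"{stats['original_bytes']} bytes -> {stats['num_tokens']} tokens (ratio {ratio:.1f}:1)")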
392
 
393
+ def update_training_state(self, epoch: int, step: int = 0, reconstruction_loss: float = None):
394
+ """
395
+ Update training state - adaptive, not phase-based
396
 
397
+ Args:
398
+ epoch: Current epoch
399
+ step: Current training step
400
+ reconstruction_loss: Current reconstruction quality
401
+ """
402
+ self.current_epoch = torch.tensor(epoch)
403
+ self.training_step = torch.tensor(step)
404
+
405
+ # Update encoder warmup (gates only)
406
+ self.encoder.set_warmup_step(step)
407
+
408
+ # Adaptive weight adjustment based on performance
409
+ if reconstruction_loss is not None:
410
+ # If reconstruction is poor, increase its weight
411
+ if reconstruction_loss > 1.0:
412
+ self.reconstruction_weight = 1.0
413
+ self.compression_weight = 0.1 # keep compression focus low
414
+ else:
415
+ # Good reconstruction, can focus on compression
416
+ self.reconstruction_weight = 0.5
417
+ self.compression_weight = 0.1
418
 
419
+ # Boundary weight stays moderate
420
+ self.boundary_weight = 0.1
421
 
422
+ # Let encoder know about reconstruction quality
423
+ self.encoder.adaptive_compression_control(reconstruction_loss)
424
+ else:
425
+ # Default balanced weights
426
+ self.reconstruction_weight = 0.5
427
+ self.compression_weight = 0.1
428
+ self.boundary_weight = 0.1
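A sketch of how an outer training loop might drive the adaptive weighting above; the dataloader, optimizer, and inner step are placeholders and only the call pattern is shown.

for epoch in range(3):
    last_recon = None
    # for batch in dataloader:                                   # hypothetical dataloader
    #     outputs = model(input_ids=batch['input_ids'],
    #                     attention_mask=batch['attention_mask'],
    #                     labels=batch['input_ids'])
    #     outputs['loss'].backward(); optimizer.step(); optimizer.zero_grad()
    #     last_recon = float(model.last_losses['reconstruction'])
    model.update_training_state(epoch, step=epoch * 1000, reconstruction_loss=last_recon)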
429
 
430
+ def get_model_stats(self) -> Dict[str, float]:
431
+ """
432
+ Get model statistics for monitoring
433
 
434
+ Returns:
435
+ Dictionary with various model statistics
436
+ """
437
+ stats = {}
438
 
439
+ # Encoder stats (GPT suggestion: already prefixed)
440
+ encoder_stats = self.encoder.get_monitoring_stats()
441
+ stats.update({f'encoder_{k}': v for k, v in encoder_stats.items()})
442
 
443
+ # Decoder memory stats
444
+ decoder_memory = self.decoder.get_memory_usage()
445
+ stats.update({f'decoder_{k}': v for k, v in decoder_memory.items()})
 
 
 
 
 
446
 
447
+ # Loss stats (if available) - check for tensor items
448
+ if hasattr(self, 'last_losses'):
449
+ for k, v in self.last_losses.items():
450
+ if isinstance(v, torch.Tensor):
451
+ stats[f'loss_{k}'] = v.item() if v.numel() == 1 else v.mean().item()
452
+ else:
453
+ stats[f'loss_{k}'] = float(v)
 
 
 
 
 
454
 
455
+ # Training info
456
+ stats['current_epoch'] = self.current_epoch.item()
457
+ stats['training_step'] = self.training_step.item()
458
 
459
+ return stats
460
+
461
+ def save_checkpoint(self, path: str):
462
+ """
463
+ Save model checkpoint
464
+
465
+ Args:
466
+ path: Path to save checkpoint
467
+ """
468
+ checkpoint = {
469
+ 'model_state_dict': self.state_dict(),
470
+ 'config': self.config,
471
+ 'epoch': self.current_epoch.item(),
472
+ 'step': self.training_step.item(),
473
+ 'stats': self.get_model_stats()
474
  }
475
+ torch.save(checkpoint, path)
476
+ print(f"Checkpoint saved to {path}")
477
+
478
+ @classmethod
479
+ def from_checkpoint(cls, path: str, device: str = 'cuda'):
480
+ """
481
+ Load model from checkpoint
482
+
483
+ Args:
484
+ path: Path to checkpoint
485
+ device: Device to load model on
486
+
487
+ Returns:
488
+ Loaded model instance
489
+ """
490
+ checkpoint = torch.load(path, map_location=device)
491
+
492
+ # Create model with saved config
493
+ model = cls(checkpoint.get('config', {}))
494
+ model.load_state_dict(checkpoint['model_state_dict'])
495
+ model.to(device)
496
+
497
+ # Restore training state
498
+ if 'epoch' in checkpoint:
499
+ model.current_epoch = torch.tensor(checkpoint['epoch'])
500
+ if 'step' in checkpoint:
501
+ model.training_step = torch.tensor(checkpoint['step'])
502
+
503
+ print(f"Model loaded from {path} (Epoch {checkpoint.get('epoch', 0)})")
504
+ return model
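Checkpoint round-trip sketch using the two methods above; the path is a placeholder.

model.save_checkpoint("checkpoints/v62_step1000.pt")        # hypothetical path
restored = IntelligentTokenizerV62.from_checkpoint("checkpoints/v62_step1000.pt", device="cpu")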
505
+
506
+
507
+ if __name__ == "__main__":
508
+ # Test unified model
509
+ print("Testing Intelligent Tokenizer v6.2.0")
510
+
511
+ # Create model
512
+ model = IntelligentTokenizerV62()
513
+ print(f"Model created with {sum(p.numel() for p in model.parameters())/1e6:.1f}M parameters")
514
+
515
+ # Test texts
516
+ test_texts = [
517
+ "Hello, world!",
518
+ "안녕하세요, 만나서 반갑습니다. 오늘 날씨가 좋네요!",
519
+ "今天天气很好。",
520
+ ]
521
+
522
+ for text in test_texts:
523
+ print(f"\nInput: {text}")
524
+
525
+ # Compress
526
+ compression = model.compress(text)
527
+ print(f" Compression ratio: {compression['compression_ratio']:.1f}:1")
528
+ print(f" Tokens: {compression['num_tokens']}")
529
+
530
+ # Generate (reconstruct)
531
+ reconstructed = model.generate(text, temperature=0.1)
532
+ print(f" Reconstructed: {reconstructed}")
533
+
534
+ # Get model stats
535
+ stats = model.get_model_stats()
536
+ print(f"\nModel Statistics:")
537
+ for key, value in stats.items():
538
+ if isinstance(value, float):
539
+ print(f" {key}: {value:.4f}")
540
+ else:
541
+ print(f" {key}: {value}")