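# Repository: TeszenAI / MTP3.7
#
# Training script. This text was captured from a diff view in which the whole
# file was removed (hunk header: @@ -1,573 +0,0 @@).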
import math
import os
import pickle
from contextlib import nullcontext

import numpy as np
import torch
import torch.nn as nn
import yaml
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

from dataset import MTPDataset, collate_fn
from model import MTPMiniModel
from tokenizer import MTPTokenizer
class MTPTrainer:
    """Improved (x20) trainer with advanced capabilities.

    Construction reads a YAML config and then builds, in order: device
    settings, tokenizer, model, train/validation loaders, AdamW optimizer,
    cosine LR scheduler and early-stopping state. If ``checkpoint.pt`` exists
    in the working directory, training state is restored from it.
    ``train()`` runs the epoch loop with early stopping and writes
    ``best_model.pt``, ``checkpoint.pt`` and ``output/mtp_mini.pkl``.

    One "step" (``global_step``, ``warmup_steps``, scheduler steps) is one
    optimizer update, i.e. one window of ``accumulation_steps`` micro-batches.
    """

    def __init__(self, config_path='config.yaml'):
        """Build every training component from the YAML file at *config_path*.

        Raises whatever ``open`` / ``yaml.safe_load`` raise for a missing or
        malformed file, and ``KeyError`` for required config keys that are
        absent (e.g. ``training.batch_size``, ``model.d_model``).
        """
        with open(config_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)

        self._setup_device()
        self._setup_tokenizer()
        self._setup_model()
        self._setup_data()
        self._setup_optimizer()
        self._setup_scheduler()
        self._setup_training_state()
        self._resume_if_checkpoint_exists()

        print("\n[7/7] ✅ Sistema listo para entrenar!")
        print("=" * 70)

    # ------------------------------------------------------------------
    # Construction helpers (called once, in order, from __init__)
    # ------------------------------------------------------------------
    def _setup_device(self):
        """Pick CUDA/CPU, enable CUDA fast paths and configure mixed precision."""
        train_cfg = self.config['training']
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        on_cuda = self.device.type == 'cuda'

        print("=" * 70)
        print("MTP MINI x20 - Transformer Avanzado con Razonamiento")
        print("=" * 70)
        print(f"\n🔥 Device: {self.device}")
        if on_cuda:
            print(f"🔥 GPU: {torch.cuda.get_device_name(0)}")
            print(f"🔥 VRAM Total: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
            torch.backends.cudnn.benchmark = True
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True
            print(f"🔥 Optimizaciones CUDA: Activadas")
            # NOTE(review): this flag is only stored and reported here; nothing
            # in this class forwards it to the model. Confirm the model reads it.
            self.use_gradient_checkpointing = train_cfg.get('use_gradient_checkpointing', True)
            if self.use_gradient_checkpointing:
                print(f"🔥 Gradient Checkpointing: Activado (ahorra VRAM)")
        else:
            print("⚠️ WARNING: Usando CPU - El entrenamiento será MUY lento")
            self.use_gradient_checkpointing = False

        # Mixed precision is only used on CUDA; `self.scaler` exists only then.
        self.use_mixed_precision = on_cuda and train_cfg.get('use_mixed_precision', True)
        if self.use_mixed_precision:
            self.scaler = GradScaler()
            print(f"🔥 Mixed Precision (FP16): Activado")
        torch.set_num_threads(train_cfg['num_threads'])

    def _setup_tokenizer(self):
        """Load ``mtp_tokenizer.model`` if present, otherwise train a new one."""
        print("\n[1/7] Inicializando tokenizer mejorado...")
        self.tokenizer = MTPTokenizer()
        tokenizer_path = 'mtp_tokenizer.model'
        if os.path.exists(tokenizer_path):
            print(f"   -> Cargando tokenizer: {tokenizer_path}")
            self.tokenizer.load(tokenizer_path)
        else:
            print("   -> Entrenando nuevo tokenizer...")
            self.tokenizer.train(
                self.config['data']['corpus_path'],
                vocab_size=self.config['model']['vocab_size'],
                model_prefix='mtp_tokenizer'
            )
        # Cached once: used for batch collation and for the confidence mask.
        self.pad_id = self.tokenizer.pad_id()
        print(f"   ✅ Vocabulario: {self.tokenizer.vocab_size()} tokens")

    def _setup_model(self):
        """Instantiate ``MTPMiniModel`` from the ``model`` config section."""
        print("\n[2/7] Inicializando modelo GRANDE (x20)...")
        model_config = self.config['model']
        self.model = MTPMiniModel(
            vocab_size=self.tokenizer.vocab_size(),
            d_model=model_config['d_model'],
            n_layers=model_config['n_layers'],
            n_heads=model_config['n_heads'],
            d_ff=model_config['d_ff'],
            max_seq_len=model_config['max_seq_len'],
            dropout=model_config['dropout'],
            use_swiglu=model_config.get('use_swiglu', True),
            use_flash_attention=model_config.get('use_flash_attention', True),
            use_reasoning_layer=model_config.get('use_reasoning_layer', True),
            reasoning_steps=model_config.get('reasoning_steps', 3),
            use_confidence_score=model_config.get('use_confidence_score', True)
        ).to(self.device)

        param_count = self.model.count_parameters()
        print(f"   ✅ Parámetros TOTALES: {param_count:,} ({param_count/1e6:.1f}M)")
        print(f"   ✅ Arquitectura:")
        print(f"      • Capas: {model_config['n_layers']}")
        print(f"      • Cabezas de atención: {model_config['n_heads']}")
        print(f"      • Dimensión: {model_config['d_model']}")
        print(f"      • FFN: {model_config['d_ff']}")
        print(f"      • Contexto máximo: {model_config['max_seq_len']} tokens")

        # Report GPU memory right after the model was moved to the device.
        if self.device.type == 'cuda':
            memory_allocated = torch.cuda.memory_allocated(0) / 1e9
            memory_reserved = torch.cuda.memory_reserved(0) / 1e9
            print(f"   ✅ VRAM usada: {memory_allocated:.2f} GB (reservada: {memory_reserved:.2f} GB)")

        # Static banner: the list is printed as-is, regardless of config flags.
        improvements = [
            "RoPE", "RMSNorm", "SwiGLU", "Flash Attention",
            "Reasoning Layers", "Confidence Score", "Anti-Hallucination",
            "Label Smoothing", "Repetition Penalty", "Early Stopping",
            "Mixed Precision", "Gradient Checkpointing"
        ]
        print(f"   ✅ Mejoras activas: {', '.join(improvements)}")

    def _setup_data(self):
        """Load the corpus, split it into train/validation and build loaders."""
        print("\n[3/7] Cargando dataset grande...")
        data_cfg = self.config['data']
        full_dataset = MTPDataset(
            data_cfg['corpus_path'],
            self.tokenizer,
            max_seq_len=self.config['model']['max_seq_len'],
            use_augmentation=data_cfg.get('use_augmentation', True),
            augmentation_prob=data_cfg.get('augmentation_prob', 0.4)
        )
        total_examples = len(full_dataset)
        print(f"   ✅ Total ejemplos: {total_examples}")
        if total_examples < 100:
            print(f"   ⚠️ WARNING: Dataset pequeño ({total_examples} ejemplos)")
            print(f"   ⚠️ Se recomienda al menos 1000 ejemplos para este modelo")

        val_split = self.config.get('data', {}).get('validation_split', 0.12)
        val_size = max(1, int(total_examples * val_split))
        train_size = total_examples - val_size
        if train_size > 0:
            # Fixed seed so the split is reproducible across runs and resumes.
            self.train_dataset, self.val_dataset = random_split(
                full_dataset,
                [train_size, val_size],
                generator=torch.Generator().manual_seed(42)
            )
            print(f"   ✅ Train: {len(self.train_dataset)} ejemplos ({train_size/total_examples*100:.1f}%)")
            print(f"   ✅ Validation: {len(self.val_dataset)} ejemplos ({val_size/total_examples*100:.1f}%)")
        else:
            # Too few examples to split: reuse the whole set for both roles.
            self.train_dataset = full_dataset
            self.val_dataset = full_dataset
            print(f"   ⚠️ Dataset muy pequeño - usando todo para train y validación")

        on_cuda = self.device.type == 'cuda'
        num_workers = 4 if on_cuda else 2
        pad_id = self.pad_id

        def collate(batch):
            # Closes over the pad id only, not over the whole trainer object.
            return collate_fn(batch, pad_id)

        loader_kwargs = {
            'batch_size': self.config['training']['batch_size'],
            'collate_fn': collate,
            'num_workers': num_workers,
            'pin_memory': on_cuda,
            'persistent_workers': num_workers > 0,
        }
        self.train_loader = DataLoader(self.train_dataset, shuffle=True, **loader_kwargs)
        self.val_loader = DataLoader(self.val_dataset, shuffle=False, **loader_kwargs)

    def _setup_optimizer(self):
        """Create AdamW with three parameter groups (decay / no-decay / reasoning)."""
        print("\n[4/7] Configurando optimizer avanzado...")
        train_cfg = self.config['training']
        base_lr = train_cfg['learning_rate']
        weight_decay = train_cfg['weight_decay']

        decay_params = []
        no_decay_params = []
        reasoning_params = []
        for name, param in self.model.named_parameters():
            if not param.requires_grad:
                continue
            # Classification is by substring of the parameter name.
            if 'reasoning' in name:
                reasoning_params.append(param)
            elif 'bias' in name or 'norm' in name or 'embedding' in name:
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        param_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': no_decay_params, 'weight_decay': 0.0},
        ]
        if reasoning_params:
            # Reasoning layers get a slightly lower LR and half the weight decay.
            param_groups.append({
                'params': reasoning_params,
                'weight_decay': weight_decay * 0.5,
                'lr': base_lr * 0.8
            })
            print(f"   ✅ Reasoning params: {sum(p.numel() for p in reasoning_params):,}")

        self.optimizer = AdamW(
            param_groups,
            lr=base_lr,
            betas=(0.9, 0.95),
            eps=1e-8
        )
        # Per-group target LRs. Warmup scales these so that each group keeps
        # its own ratio instead of being overwritten with the global base LR.
        self._base_lrs = [group['lr'] for group in self.optimizer.param_groups]
        print(f"   ✅ Optimizer: AdamW")
        print(f"   ✅ LR base: {base_lr}")
        print(f"   ✅ Weight decay: {weight_decay}")

    def _setup_scheduler(self):
        """Configure warmup length, accumulation window and cosine annealing."""
        print("\n[5/7] Configurando LR scheduler...")
        train_cfg = self.config['training']
        self.warmup_steps = train_cfg.get('warmup_steps', 500)
        # Needed here because the schedule length is counted in optimizer
        # updates, not in micro-batches. Clamped so the modulo is well defined.
        self.accumulation_steps = max(1, int(train_cfg.get('accumulation_steps', 8)))

        total_steps = self._optimizer_steps_per_epoch() * train_cfg['epochs']
        if train_cfg.get('use_lr_scheduler', True):
            self.scheduler = CosineAnnealingLR(
                self.optimizer,
                # At least 1: T_max <= 0 is not a valid annealing period.
                T_max=max(1, total_steps - self.warmup_steps),
                eta_min=train_cfg.get('min_lr', 0.000005)
            )
            print(f"   ✅ Scheduler: Cosine Annealing")
            print(f"   ✅ Total steps: {total_steps:,}")
        else:
            self.scheduler = None
            print(f"   ✅ Scheduler: None")
        print(f"   ✅ Warmup steps: {self.warmup_steps}")

    def _setup_training_state(self):
        """Initialise counters, early-stopping settings and loss options."""
        train_cfg = self.config['training']
        self.start_epoch = 0
        self.completed_epochs = 0
        self.global_step = 0
        self.best_val_loss = float('inf')

        self.patience = train_cfg.get('patience', 8)
        self.min_delta = train_cfg.get('min_delta', 0.0005)
        self.patience_counter = 0
        print(f"   ✅ Early stopping: patience={self.patience}, min_delta={self.min_delta}")

        effective_batch = train_cfg['batch_size'] * self.accumulation_steps
        print(f"   ✅ Gradient accumulation: {self.accumulation_steps} steps")
        print(f"   ✅ Effective batch size: {effective_batch}")

        self.use_eos_weight = train_cfg.get('use_eos_loss_weight', True)
        if self.use_eos_weight:
            print(f"   ✅ EOS token weight: 2.0x")

    def _resume_if_checkpoint_exists(self):
        """Restore training state from ``checkpoint.pt`` when it is present."""
        print("\n[6/7] Verificando checkpoints...")
        if os.path.exists('checkpoint.pt'):
            print("   -> Cargando checkpoint...")
            self.load_checkpoint('checkpoint.pt')
        else:
            print("   ✅ No hay checkpoint previo")

    # ------------------------------------------------------------------
    # Learning-rate helpers
    # ------------------------------------------------------------------
    def _optimizer_steps_per_epoch(self):
        """Return how many optimizer updates one pass over the train loader makes."""
        return math.ceil(len(self.train_loader) / self.accumulation_steps)

    def _warmup_factor(self):
        """Return the linear warmup multiplier for the update about to be taken.

        ``(global_step + 1) / warmup_steps`` capped at 1.0, so the very first
        update already uses a small non-zero LR and the last warmup update
        reaches the full LR. Returns 1.0 when warmup is disabled.
        """
        if self.warmup_steps <= 0:
            return 1.0
        return min(1.0, (self.global_step + 1) / self.warmup_steps)

    def _apply_warmup_lr(self):
        """Write the warmup-scaled LR into every param group, keeping group ratios."""
        factor = self._warmup_factor()
        for group, base_lr in zip(self.optimizer.param_groups, self._base_lrs):
            group['lr'] = base_lr * factor

    def get_lr(self):
        """Return the learning rate of the main param group for the next update.

        During warmup this is the configured base LR times the warmup factor;
        afterwards it is whatever the optimizer's first param group holds.
        """
        if self.global_step < self.warmup_steps:
            return self.config['training']['learning_rate'] * self._warmup_factor()
        return self.optimizer.param_groups[0]['lr']

    # ------------------------------------------------------------------
    # Per-batch helpers
    # ------------------------------------------------------------------
    def _mean_confidence(self, confidence, target_ids):
        """Return the mean of *confidence* over non-padding target positions.

        Assumes *confidence* broadcasts against *target_ids* and that padded
        target positions hold ``self.pad_id`` (the value handed to
        ``collate_fn``) — TODO confirm against ``collate_fn``. Returns 0.0
        instead of NaN when every position is padding.
        """
        mask = (target_ids != self.pad_id).float()
        valid = mask.sum().clamp(min=1.0)
        return ((confidence * mask).sum() / valid).item()

    def _forward_batch(self, input_ids, target_ids, **loss_kwargs):
        """Run the model on one batch and return ``(loss, mean_confidence)``.

        ``mean_confidence`` is ``None`` when the model has no confidence head
        (``model.use_confidence`` is falsy). *loss_kwargs* are forwarded to
        the model call unchanged (e.g. ``use_eos_weight``).
        """
        if self.model.use_confidence:
            _, loss, confidence = self.model(
                input_ids, target_ids,
                return_confidence=True,
                **loss_kwargs
            )
            return loss, self._mean_confidence(confidence, target_ids)
        _, loss = self.model(input_ids, target_ids, **loss_kwargs)
        return loss, None

    def _optimizer_step(self):
        """Apply one optimizer update from the accumulated gradients.

        Order matters: the warmup LR is set *before* the update so the update
        actually uses it; the cosine scheduler advances *after* the update and
        only once warmup is over.
        """
        if self.global_step < self.warmup_steps:
            self._apply_warmup_lr()

        max_norm = self.config['training']['max_grad_norm']
        if self.use_mixed_precision:
            # Gradients must be unscaled before clipping on their true norm.
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm)
            self.scaler.step(self.optimizer)
            self.scaler.update()
        else:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm)
            self.optimizer.step()

        if self.scheduler is not None and self.global_step >= self.warmup_steps:
            self.scheduler.step()
        self.optimizer.zero_grad()
        self.global_step += 1

    # ------------------------------------------------------------------
    # Epoch loops
    # ------------------------------------------------------------------
    def train_epoch(self, epoch):
        """Train for one epoch and return ``(avg_loss, avg_confidence)``.

        *epoch* is the zero-based epoch index (used for the progress label).
        ``avg_confidence`` is 0 when the model reports no confidence.
        """
        self.model.train()
        accum = self.accumulation_steps
        num_batches = len(self.train_loader)
        # The final window of an epoch may contain fewer than `accum`
        # micro-batches; it is still applied, normalised by its real size.
        tail = num_batches % accum
        tail_start = num_batches - tail

        total_loss = 0.0
        total_confidence = 0.0
        confidence_samples = 0
        on_cuda = self.device.type == 'cuda'
        forward_ctx = autocast if self.use_mixed_precision else nullcontext

        progress_bar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}")
        self.optimizer.zero_grad()
        for batch_idx, (input_ids, target_ids) in enumerate(progress_bar):
            input_ids = input_ids.to(self.device, non_blocking=True)
            target_ids = target_ids.to(self.device, non_blocking=True)

            with forward_ctx():
                loss, confidence = self._forward_batch(
                    input_ids, target_ids,
                    use_eos_weight=self.use_eos_weight
                )
            if confidence is not None:
                total_confidence += confidence
                confidence_samples += 1

            window = tail if (tail and batch_idx >= tail_start) else accum
            scaled_loss = loss / window
            if self.use_mixed_precision:
                self.scaler.scale(scaled_loss).backward()
            else:
                scaled_loss.backward()

            if (batch_idx + 1) % accum == 0 or (batch_idx + 1) == num_batches:
                self._optimizer_step()

            batch_loss = loss.item()
            total_loss += batch_loss

            postfix = {
                'loss': f"{batch_loss:.4f}",
                'lr': f"{self.get_lr():.6f}"
            }
            if confidence_samples > 0:
                postfix['conf'] = f"{total_confidence/confidence_samples:.3f}"
            if on_cuda and batch_idx % 10 == 0:
                vram_used = torch.cuda.memory_allocated(0) / 1e9
                vram_total = torch.cuda.get_device_properties(0).total_memory / 1e9
                postfix['vram'] = f"{vram_used:.1f}/{vram_total:.1f}GB"
            progress_bar.set_postfix(postfix)

        avg_loss = total_loss / num_batches
        avg_confidence = total_confidence / confidence_samples if confidence_samples > 0 else 0
        return avg_loss, avg_confidence

    def validate(self):
        """Evaluate on the validation loader; return ``(avg_loss, avg_confidence)``.

        Runs in eval mode under ``torch.no_grad()`` and does not pass
        ``use_eos_weight`` to the model, so the model's default applies.
        """
        self.model.eval()
        total_loss = 0.0
        total_confidence = 0.0
        confidence_samples = 0
        with torch.no_grad():
            for input_ids, target_ids in self.val_loader:
                input_ids = input_ids.to(self.device, non_blocking=True)
                target_ids = target_ids.to(self.device, non_blocking=True)
                loss, confidence = self._forward_batch(input_ids, target_ids)
                if confidence is not None:
                    total_confidence += confidence
                    confidence_samples += 1
                total_loss += loss.item()
        avg_loss = total_loss / len(self.val_loader)
        avg_confidence = total_confidence / confidence_samples if confidence_samples > 0 else 0
        return avg_loss, avg_confidence

    def train(self):
        """Run the full training loop with early stopping, then export the model.

        Saves ``best_model.pt`` whenever validation loss improves by more than
        ``min_delta``, saves ``checkpoint.pt`` every ``training.save_every``
        epochs, stops after ``patience`` epochs without improvement, reloads
        the best weights if available and calls ``save_model()``.
        """
        print("\n" + "=" * 70)
        print("INICIANDO ENTRENAMIENTO AVANZADO")
        print("=" * 70)
        on_cuda = self.device.type == 'cuda'
        if on_cuda:
            print(f"🔥 GPU: {torch.cuda.get_device_name(0)}")
            print(f"🔥 VRAM Disponible: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
        epochs = self.config['training']['epochs']
        print(f"📊 Total epochs: {epochs}")
        print(f"📊 Train batches: {len(self.train_loader)}")
        print(f"📊 Val batches: {len(self.val_loader)}")
        print("=" * 70 + "\n")

        for epoch in range(self.start_epoch, epochs):
            train_loss, train_conf = self.train_epoch(epoch)
            val_loss, val_conf = self.validate()
            self.completed_epochs = epoch + 1

            if on_cuda:
                torch.cuda.empty_cache()

            print(f"\n{'='*70}")
            print(f"Epoch {epoch+1}/{epochs} - Resultados")
            print(f"{'='*70}")
            print(f"  Train Loss:       {train_loss:.4f}")
            print(f"  Val Loss:         {val_loss:.4f}")
            print(f"  Train Confidence: {train_conf:.3f}")
            print(f"  Val Confidence:   {val_conf:.3f}")
            print(f"  Learning Rate:    {self.get_lr():.6f}")
            if on_cuda:
                vram_used = torch.cuda.memory_allocated(0) / 1e9
                vram_peak = torch.cuda.max_memory_allocated(0) / 1e9
                print(f"  VRAM Used:        {vram_used:.2f} GB (peak: {vram_peak:.2f} GB)")
                torch.cuda.reset_peak_memory_stats()

            # Early stopping: only a gain larger than min_delta counts.
            if self.best_val_loss - val_loss > self.min_delta:
                self.best_val_loss = val_loss
                self.patience_counter = 0
                self.save_checkpoint('best_model.pt', epoch + 1, is_best=True)
                print(f"  ✅ ¡NUEVO MEJOR MODELO! (Val Loss: {val_loss:.4f})")
            else:
                self.patience_counter += 1
                print(f"  ⏳ No improvement. Patience: {self.patience_counter}/{self.patience}")
                if self.patience_counter >= self.patience:
                    print(f"\n⚠️ EARLY STOPPING - Mejor val loss: {self.best_val_loss:.4f}")
                    break

            if (epoch + 1) % self.config['training']['save_every'] == 0:
                self.save_checkpoint('checkpoint.pt', epoch + 1)
            print(f"{'='*70}\n")

        print("\n" + "=" * 70)
        print("ENTRENAMIENTO COMPLETADO")
        print(f"Mejor Val Loss: {self.best_val_loss:.4f}")
        print("=" * 70)

        if os.path.exists('best_model.pt'):
            print("\n📦 Cargando mejor modelo...")
            checkpoint = torch.load('best_model.pt', map_location=self.device)
            self.model.load_state_dict(checkpoint['model_state_dict'])
            print("✅ Mejor modelo cargado")
        self.save_model()

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------
    def save_checkpoint(self, path, epoch, is_best=False):
        """Write model/optimizer/scheduler/scaler state and counters to *path*.

        *epoch* is stored as the epoch to resume from. The file is written to
        ``<path>.tmp`` first and then moved into place, so an interrupted save
        cannot leave a truncated checkpoint at *path*. When *is_best* is true
        the confirmation line is not printed.
        """
        checkpoint = {
            'epoch': epoch,
            'global_step': self.global_step,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict() if self.scheduler else None,
            'best_val_loss': self.best_val_loss,
            'patience_counter': self.patience_counter,
            'config': self.config,
            'scaler_state_dict': self.scaler.state_dict() if self.use_mixed_precision else None
        }
        tmp_path = f"{path}.tmp"
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, path)
        if not is_best:
            print(f"   💾 Checkpoint guardado: {path}")

    def load_checkpoint(self, path):
        """Restore model, optimizer, scheduler, scaler and counters from *path*.

        Scheduler and scaler state are only loaded when both the trainer and
        the checkpoint have them. Missing ``best_val_loss`` /
        ``patience_counter`` entries fall back to ``inf`` / ``0``.
        """
        checkpoint = torch.load(path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if self.scheduler and checkpoint.get('scheduler_state_dict'):
            self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        if self.use_mixed_precision and checkpoint.get('scaler_state_dict'):
            self.scaler.load_state_dict(checkpoint['scaler_state_dict'])
        self.start_epoch = checkpoint['epoch']
        self.completed_epochs = self.start_epoch
        self.global_step = checkpoint['global_step']
        self.best_val_loss = checkpoint.get('best_val_loss', float('inf'))
        self.patience_counter = checkpoint.get('patience_counter', 0)
        print(f"   ✅ Resumido desde epoch {self.start_epoch}")
        print(f"   ✅ Mejor val loss: {self.best_val_loss:.4f}")

    def save_model(self):
        """Pickle the final weights, config and training summary to ``output/``.

        Weights are copied to CPU for serialisation; the live model stays on
        ``self.device`` so the trainer remains usable afterwards.
        ``training_info.final_epoch`` is the number of epochs completed.
        """
        os.makedirs('output', exist_ok=True)
        cpu_state = {
            key: value.detach().cpu() if torch.is_tensor(value) else value
            for key, value in self.model.state_dict().items()
        }
        total_parameters = self.model.count_parameters()
        model_data = {
            'model_state_dict': cpu_state,
            'config': self.config,
            'vocab_size': self.tokenizer.vocab_size(),
            'tokenizer_path': self.tokenizer.model_path,
            'training_info': {
                'final_epoch': self.completed_epochs,
                'best_val_loss': self.best_val_loss,
                'total_parameters': total_parameters
            }
        }
        output_path = 'output/mtp_mini.pkl'
        with open(output_path, 'wb') as f:
            pickle.dump(model_data, f)
        file_size_mb = os.path.getsize(output_path) / (1024*1024)
        print(f"\n{'='*70}")
        print(f"✅ MODELO FINAL GUARDADO")
        print(f"{'='*70}")
        print(f"📁 Ruta: {output_path}")
        print(f"💾 Tamaño: {file_size_mb:.2f} MB")
        print(f"🧠 Parámetros: {total_parameters/1e6:.1f}M")
        print(f"📊 Mejor Val Loss: {self.best_val_loss:.4f}")
        print(f"{'='*70}\n")
if __name__ == '__main__':
    # Script entry point: build the trainer from the default config file in
    # the working directory and run the full training loop.
    MTPTrainer(config_path='config.yaml').train()