TeszenAI
/

MTP3.6

+# MTP Mini - Optimized Configuration, 20x Larger and Smarter
+model:
+  # Model architecture hyperparameters. The "Nx" notes compare against the
+  # previous values shown in parentheses where given.
+  vocab_size: 8000              # 2x vocabulary size
+  d_model: 1024                 # 2x model dimension (512 → 1024)
+  n_layers: 24                  # 3x layers (8 → 24)
+  n_heads: 16                   # 2x attention heads (8 → 16)
+  d_ff: 4096                    # 4x d_model
+  max_seq_len: 2048             # 4x context length (512 → 2048)
+  dropout: 0.15                 # Tuned dropout
+  use_swiglu: true              # Better activation
+  use_flash_attention: true     # Optimized attention
+  use_confidence_scoring: true  # Anti-hallucination
+  # NOTE(review): same value as generation.min_confidence_threshold (0.3);
+  # confirm whether the consumer reads both keys or only one.
+  min_confidence: 0.3
+training:
+  # Optimizer, schedule, and regularization settings for the training run.
+  batch_size: 2                 # Small, because the model is large
+  accumulation_steps: 16        # Effective batch = 2 x 16 = 32
+  epochs: 25                    # 25 epochs, as requested
+  learning_rate: 0.0002         # Low LR for stability
+  min_lr: 0.000005
+  weight_decay: 0.15            # Strong regularization
+  max_grad_norm: 0.5
+  num_threads: 4
+  save_every: 5                 # Save every 5 epochs
+  # Early stopping (so as not to lose information)
+  patience: 10                  # Very patient (waits 10 epochs without improvement)
+  min_delta: 0.0003             # Minimum acceptable improvement
+  # Learning rate
+  warmup_steps: 500
+  use_lr_scheduler: true
+  # Regularization
+  label_smoothing: 0.15
+  use_eos_loss_weight: true
+  eos_weight: 3.0
+  # GPU optimizations
+  # NOTE(review): both flags below are also set under the memory section with
+  # the same values; confirm which section the consumer reads and keep in sync.
+  use_gradient_checkpointing: true   # Saves VRAM
+  use_fp16: true                     # Mixed precision
+data:
+  # Corpus location, text-length filtering bounds, and train/validation split.
+  corpus_path: corpus/mtp_mini_corpus.jsonl
+  # NOTE(review): the unit of the length bounds (characters vs. tokens) is not
+  # stated in this file; confirm against the data loader.
+  min_text_length: 100
+  max_text_length: 4000
+  validation_split: 0.2         # 20% for validation
+  # Augmentation
+  use_augmentation: true
+  augmentation_prob: 0.4
+generation:
+  # Default sampling parameters and output filters used at generation time.
+  default_max_tokens: 300
+  default_temperature: 0.65
+  default_top_k: 50
+  default_top_p: 0.9
+  default_repetition_penalty: 1.2
+  min_response_length: 30
+  # Anti-hallucination
+  use_perplexity_filter: true
+  max_perplexity: 80.0
+  use_entropy_threshold: true
+  max_entropy: 4.0
+  # Quality control
+  use_confidence_filter: true
+  # NOTE(review): same value as model.min_confidence (0.3); confirm whether
+  # the two keys are meant to stay in sync.
+  min_confidence_threshold: 0.3
+  # The entries below are runtime strings and are intentionally left in
+  # Spanish.
+  stop_sequences:
+    - "###"
+    - "\n\n\n\n"  # four consecutive newlines; double quotes make \n an escape
+    - "Instrucción:"
+    - "Usuario:"
+# Memory optimization
+memory:
+  # NOTE(review): use_fp16 and use_gradient_checkpointing are also set under
+  # training with the same values; confirm which section the consumer reads.
+  use_fp16: true
+  use_gradient_checkpointing: true
+  max_memory_gb: 14