| | """
|
| | COGNITIVE-CORE: Training Utilities
|
| | ====================================
|
| |
|
| | Standardized training utilities for cognitive models, including:
|
| | - Training configurations
|
| | - Trainer wrappers
|
| | - Dataset preparation helpers
|
| | - Progress tracking
|
| |
|
| | Copyright © 2026 Mike Amega (Logo) - Ame Web Studio
|
| | License: Proprietary - All Rights Reserved
|
| | """
|
| |
|
| | import os
|
| | import torch
|
| | import torch.nn as nn
|
| | from typing import Dict, List, Optional, Any, Callable
|
| | from dataclasses import dataclass, field
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
@dataclass
class CognitiveTrainingConfig:
    """
    Standard configuration for training cognitive models.
    """

    output_dir: str = "./cognitive-output"

    # Optimization
    num_epochs: int = 1
    batch_size: int = 1
    gradient_accumulation_steps: int = 8
    learning_rate: float = 1e-5
    warmup_steps: int = 100
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0

    # Sequences
    max_seq_len: int = 2048

    # Mixed precision (only applied on CUDA devices, see CognitiveTrainer)
    use_fp16: bool = True
    use_bf16: bool = False

    # Logging and checkpointing
    logging_steps: int = 10
    save_steps: int = 200
    save_total_limit: int = 2

    # HuggingFace Hub
    push_to_hub: bool = False
    hub_model_id: Optional[str] = None
    hub_private: bool = True

    # Device override ("cuda", "cpu", ...); auto-detected when None
    device: Optional[str] = None

    def __post_init__(self):
        os.makedirs(self.output_dir, exist_ok=True)
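
# Usage sketch (illustrative; not executed on import). Only the overridden
# fields change; everything else keeps the defaults above, and __post_init__
# creates the output directory as soon as the config is built:
#
#   config = CognitiveTrainingConfig(
#       output_dir="./my-cognitive-run",  # placeholder path
#       num_epochs=3,
#       use_fp16=False,
#       use_bf16=True,
#   )
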
def prepare_dataset(
    dataset,
    tokenizer,
    text_column: str = "text",
    max_length: int = 2048,
    num_proc: int = 4,
):
    """
    Prepare a dataset for training a cognitive model.

    Args:
        dataset: HuggingFace dataset
        tokenizer: The model's tokenizer
        text_column: Name of the column containing the text
        max_length: Maximum sequence length
        num_proc: Number of processes used for mapping

    Returns:
        Tokenized dataset, ready for training
    """

    def tokenize_function(examples):
        texts = examples[text_column]
        if not isinstance(texts, list):
            texts = [texts]

        return tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors=None,
        )

    # A DatasetDict exposes column names per split; take the train split.
    columns_to_remove = dataset.column_names
    if isinstance(columns_to_remove, dict):
        columns_to_remove = columns_to_remove.get("train", [])

    tokenized = dataset.map(
        tokenize_function,
        batched=True,
        num_proc=num_proc,
        remove_columns=columns_to_remove,
    )

    tokenized.set_format(type="torch")
    return tokenized
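
# Usage sketch (assumes a `datasets.Dataset` and a HuggingFace tokenizer;
# "corpus.txt" is a placeholder file):
#
#   from datasets import load_dataset
#   raw = load_dataset("text", data_files="corpus.txt")["train"]
#   tokenized = prepare_dataset(raw, tokenizer, text_column="text", max_length=1024)
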
def create_instruction_dataset(
    examples: List[Dict[str, str]],
    tokenizer,
    max_length: int = 2048,
    instruction_template: str = "### Instruction:\n{instruction}\n\n### Response:\n{response}",
):
    """
    Create an instruction dataset from a list of examples.

    Args:
        examples: List of dicts with 'instruction' and 'response' keys
        tokenizer: The model's tokenizer
        max_length: Maximum sequence length
        instruction_template: Formatting template

    Returns:
        Tokenized dataset
    """
    from datasets import Dataset

    formatted = []
    for ex in examples:
        text = instruction_template.format(
            instruction=ex.get("instruction", ""), response=ex.get("response", "")
        )
        formatted.append({"text": text})

    dataset = Dataset.from_list(formatted)
    return prepare_dataset(dataset, tokenizer, "text", max_length)
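
# Usage sketch (the instruction/response pairs are placeholders; the default
# template renders each pair as "### Instruction:\n...\n\n### Response:\n..."):
#
#   pairs = [
#       {"instruction": "Summarize the text.", "response": "A short summary."},
#       {"instruction": "Translate 'hello' to French.", "response": "bonjour"},
#   ]
#   tokenized = create_instruction_dataset(pairs, tokenizer, max_length=1024)
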
class CognitiveTrainer:
    """
    Simplified trainer for cognitive models.

    A wrapper around the HuggingFace Trainer with a configuration
    optimized for cognitive architectures.
    """

    def __init__(
        self,
        model,
        tokenizer,
        train_dataset,
        config: CognitiveTrainingConfig,
        eval_dataset=None,
        callbacks: Optional[List] = None,
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.config = config
        self.callbacks = callbacks or []

        # The causal-LM collator needs a pad token; fall back to EOS.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        self._setup_trainer()

    def _setup_trainer(self):
        """Configure the HuggingFace Trainer."""
        from transformers import (
            Trainer,
            TrainingArguments,
            DataCollatorForLanguageModeling,
        )

        if self.config.device:
            device = self.config.device
        elif torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"

        training_args = TrainingArguments(
            output_dir=self.config.output_dir,
            overwrite_output_dir=True,
            num_train_epochs=self.config.num_epochs,
            per_device_train_batch_size=self.config.batch_size,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            learning_rate=self.config.learning_rate,
            warmup_steps=self.config.warmup_steps,
            weight_decay=self.config.weight_decay,
            max_grad_norm=self.config.max_grad_norm,
            logging_steps=self.config.logging_steps,
            save_steps=self.config.save_steps,
            save_total_limit=self.config.save_total_limit,
            # Mixed precision only makes sense on CUDA devices.
            fp16=self.config.use_fp16 and device == "cuda",
            bf16=self.config.use_bf16 and device == "cuda",
            push_to_hub=self.config.push_to_hub,
            hub_model_id=self.config.hub_model_id,
            hub_private_repo=self.config.hub_private,
            report_to="none",
            remove_unused_columns=False,
            dataloader_num_workers=0,
        )

        # mlm=False -> causal language modeling (labels are shifted inputs).
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
            callbacks=self.callbacks,
        )

    def train(self, resume_from_checkpoint: Optional[str] = None):
        """
        Run training.

        Args:
            resume_from_checkpoint: Checkpoint path to resume training from

        Returns:
            Training results, or None on failure
        """
        print("\n🚀 COGNITIVE TRAINING")
        print("=" * 60)

        try:
            result = self.trainer.train(resume_from_checkpoint=resume_from_checkpoint)
            print("=" * 60)
            print("✅ Training complete!")
            return result
        except Exception as e:
            print(f"❌ Error: {e}")
            import traceback

            traceback.print_exc()
            return None

    def save(self, output_dir: Optional[str] = None):
        """Save the model and tokenizer."""
        save_dir = output_dir or self.config.output_dir
        self.trainer.save_model(save_dir)
        self.tokenizer.save_pretrained(save_dir)
        print(f"💾 Model saved: {save_dir}")

    def push_to_hub(self, repo_id: Optional[str] = None):
        """Push the model to the HuggingFace Hub."""
        if repo_id:
            self.config.hub_model_id = repo_id
            # The Trainer reads the repo id from its own args, which were
            # copied at construction time, so update those as well.
            self.trainer.args.hub_model_id = repo_id

        try:
            self.trainer.push_to_hub()
            print(f"📤 Model pushed: {self.config.hub_model_id}")
        except Exception as e:
            print(f"⚠️ Push error: {e}")
# Subclassing TrainerCallback is required for the HF Trainer to dispatch
# events; a plain object would raise AttributeError on the other hooks.
from transformers import TrainerCallback


class CognitiveStateCallback(TrainerCallback):
    """
    Callback that monitors the state of cognitive modules during training.
    """

    def __init__(self, log_every: int = 100):
        self.log_every = log_every
        self.step = 0

    def on_step_end(self, args, state, control, model=None, **kwargs):
        self.step += 1

        if self.step % self.log_every == 0 and model is not None:
            if hasattr(model, "get_cognitive_state"):
                cog_state = model.get_cognitive_state()
                print(f"\n📊 Cognitive state (step {self.step}):")
                for name, state_dict in cog_state.items():
                    if state_dict:
                        print(f"  {name}: {len(state_dict)} buffers")
def quick_train(
    model,
    tokenizer,
    texts: List[str],
    output_dir: str = "./quick-train-output",
    num_epochs: int = 1,
    max_seq_len: int = 2048,
    learning_rate: float = 1e-5,
    push_to_hub: bool = False,
    hub_model_id: Optional[str] = None,
):
    """
    Quick training run with minimal configuration.

    Args:
        model: Model to train
        tokenizer: Tokenizer
        texts: List of training texts
        output_dir: Output directory
        num_epochs: Number of epochs
        max_seq_len: Maximum sequence length
        learning_rate: Learning rate
        push_to_hub: Whether to push to HuggingFace
        hub_model_id: HuggingFace repo ID

    Returns:
        Training results
    """
    from datasets import Dataset

    dataset = Dataset.from_dict({"text": texts})
    tokenized = prepare_dataset(dataset, tokenizer, "text", max_seq_len)

    config = CognitiveTrainingConfig(
        output_dir=output_dir,
        num_epochs=num_epochs,
        max_seq_len=max_seq_len,
        learning_rate=learning_rate,
        push_to_hub=push_to_hub,
        hub_model_id=hub_model_id,
    )

    trainer = CognitiveTrainer(model, tokenizer, tokenized, config)
    result = trainer.train()

    if result:
        trainer.save()

    return result
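
# Usage sketch ("gpt2" stands in for any causal LM; the texts are placeholders):
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   model = AutoModelForCausalLM.from_pretrained("gpt2")
#   tokenizer = AutoTokenizer.from_pretrained("gpt2")
#   result = quick_train(model, tokenizer, ["First document.", "Second document."])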