"""
GPT-OSS OpenHermes-FR Optimized Configuration
Specifically optimized for the legmlai/openhermes-fr dataset
800K French instruction-response pairs with quality filtering
"""
from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
# OpenHermes-FR optimized configuration
config = GPTOSSEnhancedCustomConfig(
# ============================================================================
# DATASET CONFIGURATION - OpenHermes-FR Specific
# ============================================================================
dataset_name="legmlai/openhermes-fr",
dataset_split="train",
dataset_format="openhermes_fr",
# OpenHermes-FR field mapping
input_field="prompt", # French prompts
target_field="accepted_completion", # GPT-4o generated completions
# Quality filtering using OpenHermes-FR metadata
filter_bad_entries=True, # Use built-in quality flags
bad_entry_field="bad_entry",
bad_prompt_field="bad_prompt_detected",
bad_response_field="bad_response_detected",
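# NOTE: these three boolean columns come from the dataset itself; the loader
# is expected to drop any row where they are True (a sketch of the equivalent
# `datasets` filter call appears at the bottom of this file).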
# Data processing optimized for French with GPT-OSS Harmony Format
concatenate_fields=True,
field_separator="\n\n### Réponse:\n", # Fallback separator (harmony format takes precedence)
add_eos_token=True,
use_harmony_format=True, # Enable GPT-OSS harmony format
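# For reference, one prompt/completion pair rendered in harmony format looks
# roughly like this (exact rendering comes from the model's chat template,
# not from this config; token names per OpenAI's harmony spec):
#
#   <|start|>user<|message|>{prompt}<|end|>
#   <|start|>assistant<|channel|>final<|message|>{accepted_completion}<|return|>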
# Dataset sampling (use all 800K examples by default)
max_samples=None, # Use full dataset
min_length=20, # Minimum for meaningful French text
max_length=None, # Auto-set to max_seq_length
# ============================================================================
# TRAINING HYPERPARAMETERS - French Language Optimized
# ============================================================================
num_train_epochs=1.5, # ~1.5 passes works well for a dataset this large
batch_size=6, # Balanced for most GPUs
gradient_accumulation_steps=6, # Effective batch size: 36
# Learning rate schedule optimized for French fine-tuning
learning_rate=2.5e-4, # Slightly higher for multilingual
min_lr=2.5e-5, # 10% of max learning rate
warmup_ratio=0.05, # 5% warmup for stability
weight_decay=0.01, # Standard L2 regularization
max_grad_norm=1.0, # Gradient clipping
# ============================================================================
# MODEL CONFIGURATION - Optimized for French
# ============================================================================
model_name="openai/gpt-oss-20b",
max_seq_length=3072, # Balanced length for French
use_flash_attention=True,
use_gradient_checkpointing=True,
# Mixed precision for efficiency
fp16=False,
bf16=True, # Better for GPT-OSS
# ============================================================================
# LORA CONFIGURATION - Optimized for French Language Learning
# ============================================================================
use_lora=True,
lora_config={
"r": 24, # Higher rank for language adaptation
"lora_alpha": 48, # 2x rank scaling
"lora_dropout": 0.05, # Light regularization
"target_modules": "all-linear",
"target_parameters": [
"7.mlp.experts.gate_up_proj",
"7.mlp.experts.down_proj",
"15.mlp.experts.gate_up_proj",
"15.mlp.experts.down_proj",
"23.mlp.experts.gate_up_proj",
"23.mlp.experts.down_proj",
],
"bias": "none",
"task_type": "CAUSAL_LM",
},
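# NOTE: the numeric prefixes in target_parameters are decoder-layer indices
# (7, 15, 23), i.e. the MoE expert projections at a shallow, a middle and a
# deep layer, mirroring the GPT-OSS fine-tuning tutorial. `target_parameters`
# addresses raw nn.Parameter weights and presumably needs a recent peft
# release; see the LoraConfig sketch after this config object.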
# ============================================================================
# QUANTIZATION - Balanced Performance/Memory
# ============================================================================
use_quantization=True,
quantization_config={
"dequantize": True, # MXFP4 as per GPT-OSS tutorial
"load_in_4bit": False, # Standard precision for quality
},
# ============================================================================
# PERFORMANCE OPTIMIZATION
# ============================================================================
# Data loading optimized for large dataset
dataloader_num_workers=6, # More workers for large dataset
dataloader_pin_memory=True,
dataloader_prefetch_factor=3, # Higher prefetch for efficiency
# Memory management
low_cpu_mem_usage=True,
group_by_length=True, # Efficient batching
remove_unused_columns=True,
# ============================================================================
# EVALUATION & LOGGING
# ============================================================================
eval_strategy="steps",
eval_steps=200, # Evaluate every 200 steps
logging_steps=20, # Log every 20 steps
save_strategy="steps",
save_steps=500, # Save every 500 steps
save_total_limit=3, # Keep at most 3 checkpoints on disk
metric_for_best_model="eval_loss",
greater_is_better=False,
load_best_model_at_end=True,
# Split ratios for automatic validation/test creation
eval_ratio=0.01,
test_ratio=0.01,
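# With ~800K rows, 1% yields roughly 8K held-out examples each for eval and test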
# ============================================================================
# MULTILINGUAL & FRENCH SPECIFIC SETTINGS
# ============================================================================
primary_language="fr", # French as primary language
reasoning_languages=["French", "English"], # Bilingual reasoning
domain_focus="instruction", # Instruction following
# ============================================================================
# GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
# ============================================================================
generation_config={
"max_new_tokens": 512,
"do_sample": True,
"temperature": 0.7,
"top_p": 0.9,
"top_k": 50,
"repetition_penalty": 1.1,
"pad_token_id": None,
"eos_token_id": None,
# GPT-OSS Harmony Format specific settings
"reasoning_effort": "medium", # Configurable reasoning level
"use_harmony_format": True, # Ensure harmony format in generation
},
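# NOTE: "reasoning_effort" and "use_harmony_format" are not standard
# transformers generate() kwargs; presumably this repo's generation wrapper
# pops them before forwarding the remaining keys to model.generate().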
# ============================================================================
# HF HUB INTEGRATION
# ============================================================================
push_to_hub=False, # Set to True to auto-push
hub_model_id=None, # Will be set by launch script
hub_private_repo=False,
# ============================================================================
# MONITORING
# ============================================================================
enable_tracking=True, # Trackio monitoring
log_artifacts=True,
log_metrics=True,
log_config=True,
)
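# ============================================================================
# PEFT MAPPING (reference sketch)
# ============================================================================
# The lora_config dict above mirrors peft's LoraConfig fields one-to-one, so a
# trainer could materialize it directly. A minimal sketch, assuming a peft
# release with `target_parameters` support is installed:
#
#     from peft import LoraConfig
#     peft_config = LoraConfig(**config.lora_config)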
# Print configuration summary on import
print("\nπŸ‡«πŸ‡· OpenHermes-FR Configuration Loaded")
print("=" * 50)
print(f"πŸ“Š Dataset: {config.dataset_name}")
print(f"πŸ—£οΈ Language: French (with {config.dataset_format} format)")
print(f"πŸ“ˆ Training: {config.num_train_epochs} epochs")
print(f"πŸ”„ Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']}")
print(f"πŸ“ Sequence Length: {config.max_seq_length}")
print(f"πŸ” Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
print(f"🎡 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 50)