"""
GPT-OSS OpenHermes-FR Optimized Configuration
Specifically optimized for the legmlai/openhermes-fr dataset
800K French instruction-response pairs with quality filtering
"""
from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
# OpenHermes-FR optimized configuration
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR Specific
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",
    # OpenHermes-FR field mapping
    input_field="prompt",  # French prompts
    target_field="accepted_completion",  # GPT-4o generated completions
    # Quality filtering using OpenHermes-FR metadata (see the illustrative sketch after this config)
    filter_bad_entries=True,  # Use built-in quality flags
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",
    # Data processing optimized for French with GPT-OSS Harmony Format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",  # Fallback separator (harmony format takes precedence)
    add_eos_token=True,
    use_harmony_format=True,  # Enable GPT-OSS harmony format
    # Dataset sampling (use all 800K examples by default)
    max_samples=None,  # Use full dataset
    min_length=20,  # Minimum length for meaningful French text
    max_length=None,  # Auto-set to max_seq_length
    # ============================================================================
    # TRAINING HYPERPARAMETERS - French Language Optimized
    # ============================================================================
    num_train_epochs=1.5,  # 1.5 epochs works well for this large dataset
    batch_size=6,  # Balanced for most GPUs
    gradient_accumulation_steps=6,  # Effective batch size: 36
    # Learning rate schedule optimized for French fine-tuning
    learning_rate=2.5e-4,  # Slightly higher for multilingual adaptation
    min_lr=2.5e-5,  # 10% of max learning rate
    warmup_ratio=0.05,  # 5% warmup for stability
    weight_decay=0.01,  # Standard L2 regularization
    max_grad_norm=1.0,  # Gradient clipping
    # ============================================================================
    # MODEL CONFIGURATION - Optimized for French
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=3072,  # Balanced length for French
    use_flash_attention=True,
    use_gradient_checkpointing=True,
    # Mixed precision for efficiency
    fp16=False,
    bf16=True,  # Better for GPT-OSS
    # ============================================================================
    # LORA CONFIGURATION - Optimized for French Language Learning
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 24,  # Higher rank for language adaptation
        "lora_alpha": 48,  # 2x rank scaling
        "lora_dropout": 0.05,  # Light regularization
        "target_modules": "all-linear",
        "target_parameters": [
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj",
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },
    # ============================================================================
    # QUANTIZATION - Balanced Performance/Memory
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,  # MXFP4 as per GPT-OSS tutorial
        "load_in_4bit": False,  # Standard precision for quality
    },
    # ============================================================================
    # PERFORMANCE OPTIMIZATION
    # ============================================================================
    # Data loading optimized for a large dataset
    dataloader_num_workers=6,  # More workers for large dataset
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=3,  # Higher prefetch for efficiency
    # Memory management
    low_cpu_mem_usage=True,
    group_by_length=True,  # Efficient batching
    remove_unused_columns=True,
    # ============================================================================
    # EVALUATION & LOGGING
    # ============================================================================
    eval_strategy="steps",
    eval_steps=200,  # Evaluate every 200 steps
    logging_steps=20,  # Log every 20 steps
    save_strategy="steps",
    save_steps=500,  # Save every 500 steps
    save_total_limit=3,  # Keep at most 3 checkpoints
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Split ratios for automatic validation/test creation
    eval_ratio=0.01,
    test_ratio=0.01,
    # ============================================================================
    # MULTILINGUAL & FRENCH SPECIFIC SETTINGS
    # ============================================================================
    primary_language="fr",  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning
    domain_focus="instruction",  # Instruction following
    # ============================================================================
    # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
    # ============================================================================
    generation_config={
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS Harmony Format specific settings
        "reasoning_effort": "medium",  # Configurable reasoning level
        "use_harmony_format": True,  # Ensure harmony format in generation
    },
    # ============================================================================
    # HF HUB INTEGRATION
    # ============================================================================
    push_to_hub=False,  # Set to True to auto-push
    hub_model_id=None,  # Will be set by launch script
    hub_private_repo=False,
    # ============================================================================
    # MONITORING
    # ============================================================================
    enable_tracking=True,  # Trackio monitoring
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
)
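
# Illustrative sketch only (not part of the training pipeline): what the
# quality-filtering and field-concatenation settings above imply for one
# legmlai/openhermes-fr row. The helper name is hypothetical; the field names
# come from the dataset/config above.
def _format_openhermes_fr_example(example):
    """Return the concatenated training text, or None if the row is flagged."""
    flagged = (
        example.get(config.bad_entry_field)
        or example.get(config.bad_prompt_field)
        or example.get(config.bad_response_field)
    )
    if config.filter_bad_entries and flagged:
        return None  # dropped by quality filtering
    # Fallback concatenation; with use_harmony_format=True the pipeline wraps
    # the prompt/completion pair in the GPT-OSS harmony chat format instead.
    return example[config.input_field] + config.field_separator + example[config.target_field]
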
# Print configuration summary on import
print("\nπ«π· OpenHermes-FR Configuration Loaded")
print("=" * 50)
print(f"π Dataset: {config.dataset_name}")
print(f"π£οΈ Language: French (with {config.dataset_format} format)")
print(f"π Training: {config.num_train_epochs} epochs")
print(f"π Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"π§ LoRA Rank: {config.lora_config['r']}")
print(f"π Sequence Length: {config.max_seq_length}")
print(f"π Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
print(f"π΅ GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 50)