"""
GPT-OSS OpenHermes-FR Optimized Configuration
Specifically optimized for the legmlai/openhermes-fr dataset
800K French instruction-response pairs with quality filtering
"""

from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig

# OpenHermes-FR optimized configuration
config = GPTOSSEnhancedCustomConfig(
    # ============================================================================
    # DATASET CONFIGURATION - OpenHermes-FR Specific
    # ============================================================================
    dataset_name="legmlai/openhermes-fr",
    dataset_split="train",
    dataset_format="openhermes_fr",
    
    # OpenHermes-FR field mapping
    input_field="prompt",                    # French prompts
    target_field="accepted_completion",      # GPT-4o generated completions
    
    # Quality filtering using OpenHermes-FR metadata
    filter_bad_entries=True,                 # Use built-in quality flags
    bad_entry_field="bad_entry",
    bad_prompt_field="bad_prompt_detected",
    bad_response_field="bad_response_detected",
    
    # Data processing optimized for French with GPT-OSS Harmony Format
    concatenate_fields=True,
    field_separator="\n\n### Réponse:\n",   # Fallback separator (harmony format takes precedence)
    add_eos_token=True,
    use_harmony_format=True,                 # Enable GPT-OSS harmony format
    
    # Dataset sampling (use all 800K examples by default)
    max_samples=None,                        # Use full dataset
    min_length=20,                          # Minimum for meaningful French text
    max_length=None,                        # Auto-set to max_seq_length
    
    # ============================================================================
    # TRAINING HYPERPARAMETERS - French Language Optimized
    # ============================================================================
    num_train_epochs=1.5,                   # 1.5 epochs optimal for large dataset
    batch_size=6,                           # Balanced for most GPUs
    gradient_accumulation_steps=6,          # Effective batch size: 36
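    # Rough step budget: ~800K rows * 1.5 epochs / 36 effective batch
    # ≈ 33K optimizer steps (an estimate; quality filtering drops some rows).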
    
    # Learning rate schedule optimized for French fine-tuning
    learning_rate=2.5e-4,                   # Slightly higher for multilingual
    min_lr=2.5e-5,                          # 10% of max learning rate
    warmup_ratio=0.05,                      # 5% warmup for stability
    weight_decay=0.01,                      # Standard L2 regularization
    max_grad_norm=1.0,                      # Gradient clipping
    
    # ============================================================================
    # MODEL CONFIGURATION - Optimized for French
    # ============================================================================
    model_name="openai/gpt-oss-20b",
    max_seq_length=3072,                    # Balanced length for French
    use_flash_attention=True,
    use_gradient_checkpointing=True,
    
    # Mixed precision for efficiency
    fp16=False,
    bf16=True,                              # Better for GPT-OSS
    
    # ============================================================================
    # LORA CONFIGURATION - Optimized for French Language Learning
    # ============================================================================
    use_lora=True,
    lora_config={
        "r": 24,                            # Higher rank for language adaptation
        "lora_alpha": 48,                   # 2x rank scaling
        "lora_dropout": 0.05,               # Light regularization
        "target_modules": "all-linear",
        "target_parameters": [
            "7.mlp.experts.gate_up_proj",
            "7.mlp.experts.down_proj",
            "15.mlp.experts.gate_up_proj", 
            "15.mlp.experts.down_proj",
            "23.mlp.experts.gate_up_proj",
            "23.mlp.experts.down_proj",
        ],
        "bias": "none",
        "task_type": "CAUSAL_LM",
    },
    
    # ============================================================================
    # QUANTIZATION - Balanced Performance/Memory
    # ============================================================================
    use_quantization=True,
    quantization_config={
        "dequantize": True,                 # MXFP4 as per GPT-OSS tutorial
        "load_in_4bit": False,              # Standard precision for quality
    },
    
    # ============================================================================
    # PERFORMANCE OPTIMIZATION
    # ============================================================================
    # Data loading optimized for large dataset
    dataloader_num_workers=6,               # More workers for large dataset
    dataloader_pin_memory=True,
    dataloader_prefetch_factor=3,           # Higher prefetch for efficiency
    
    # Memory management
    low_cpu_mem_usage=True,
    group_by_length=True,                   # Efficient batching
    remove_unused_columns=True,
    
    # ============================================================================
    # EVALUATION & LOGGING
    # ============================================================================
    eval_strategy="steps",
    eval_steps=200,                         # Evaluate every 200 steps
    logging_steps=20,                       # Log every 20 steps
    
    save_strategy="steps", 
    save_steps=500,                         # Save every 500 steps
    save_total_limit=3,                     # Keep at most 3 recent checkpoints (best is retained via load_best_model_at_end)
    
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # Split ratios for automatic validation/test creation
    eval_ratio=0.01,
    test_ratio=0.01,
    
    # ============================================================================
    # MULTILINGUAL & FRENCH SPECIFIC SETTINGS
    # ============================================================================
    primary_language="fr",                  # French as primary language
    reasoning_languages=["French", "English"],  # Bilingual reasoning
    domain_focus="instruction",             # Instruction following
    
    # ============================================================================
    # GENERATION CONFIG FOR EVALUATION - GPT-OSS Harmony Format
    # ============================================================================
    generation_config={
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
        "repetition_penalty": 1.1,
        "pad_token_id": None,
        "eos_token_id": None,
        # GPT-OSS Harmony Format specific settings
        "reasoning_effort": "medium",           # Configurable reasoning level
        "use_harmony_format": True,             # Ensure harmony format in generation
    },
    
    # ============================================================================
    # HF HUB INTEGRATION
    # ============================================================================
    push_to_hub=False,                      # Set to True to auto-push
    hub_model_id=None,                      # Will be set by launch script
    hub_private_repo=False,
    
    # ============================================================================
    # MONITORING
    # ============================================================================
    enable_tracking=True,                   # Trackio monitoring
    log_artifacts=True,
    log_metrics=True,
    log_config=True,
)

# Print configuration summary on import
print("\nπŸ‡«πŸ‡· OpenHermes-FR Configuration Loaded")
print("=" * 50)
print(f"πŸ“Š Dataset: {config.dataset_name}")
print(f"πŸ—£οΈ  Language: French (with {config.dataset_format} format)")
print(f"πŸ“ˆ Training: {config.num_train_epochs} epochs")
print(f"πŸ”„ Effective Batch Size: {config.batch_size * config.gradient_accumulation_steps}")
print(f"🧠 LoRA Rank: {config.lora_config['r']}")
print(f"πŸ“ Sequence Length: {config.max_seq_length}")
print(f"πŸ” Quality Filtering: {'Enabled' if config.filter_bad_entries else 'Disabled'}")
print(f"🎡 GPT-OSS Harmony Format: {'Enabled' if config.use_harmony_format else 'Disabled'}")
print("=" * 50)