File size: 10,898 Bytes
d8dd7a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
769bb84
 
 
 
 
 
 
 
 
 
 
d8dd7a1
 
40fd629
ebe598e
d8dd7a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fe83da
 
 
 
 
 
 
 
 
 
ebe598e
 
 
 
 
 
40fd629
 
 
 
d8dd7a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fe83da
 
 
 
 
 
 
 
 
 
ebe598e
 
 
 
 
 
d8dd7a1
 
 
 
 
 
 
 
5f8b28d
ebe598e
5f8b28d
 
 
 
 
 
 
ebe598e
 
5f8b28d
ebe598e
 
 
 
5f8b28d
 
 
ebe598e
d8dd7a1
 
 
 
 
 
 
5fe83da
54ebacf
5fe83da
 
 
 
 
54ebacf
 
 
 
 
5fe83da
 
93ed7a1
d8dd7a1
 
 
5fe83da
 
93ed7a1
 
 
d8dd7a1
 
40fd629
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8dd7a1
 
 
 
 
ebe598e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8dd7a1
 
ebe598e
 
 
 
 
 
 
 
 
 
 
 
 
 
d8dd7a1
ebe598e
 
 
 
 
 
 
 
d8dd7a1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
#!/usr/bin/env python3
"""
SmolLM3 Fine-tuning Script for FlexAI Console
Based on the nanoGPT structure but adapted for SmolLM3 model
"""

import os
import sys
import argparse
import json
import torch
import logging
from pathlib import Path
from typing import Optional, Dict, Any

# Add the current directory to the path for imports
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Add project root to path for config imports
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

try:
    from config import get_config
except ImportError:
    # Fallback: try direct import
    sys.path.insert(0, os.path.join(project_root, 'src'))
    from config import get_config
from model import SmolLM3Model
from data import SmolLM3Dataset
from trainer import SmolLM3Trainer, SmolLM3DPOTrainer
from monitoring import create_monitor_from_config

def setup_logging():
    """Configure root logging and return this module's logger.

    Calls ``logging.basicConfig`` to set the root logger level to ``INFO``
    and attach two handlers: one writing to ``sys.stdout`` and one writing
    to ``training.log`` in the current working directory.

    ``logging.basicConfig`` is a no-op when the root logger already has
    handlers, so calling this function again does not duplicate handlers.

    Returns:
        logging.Logger: The logger named after this module (``__name__``).
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            # Messages logged by this script contain non-ASCII characters,
            # so pin the file encoding instead of relying on the locale
            # default (which may not be able to encode them).
            logging.FileHandler('training.log', encoding='utf-8'),
        ],
    )
    return logging.getLogger(__name__)

def parse_args():
    """Parse command line arguments for the fine-tuning script.

    Arguments are read from ``sys.argv``. The only positional argument is the
    path to the configuration file; everything else is optional.

    Options whose default is ``None`` (training hyper-parameters, tracking
    settings, tokens, trainer type) mean "not given on the command line", so
    the caller can tell an explicit override apart from an omitted flag.

    Tracking is controlled by the mutually exclusive pair
    ``--enable_tracking`` / ``--disable_tracking``; both store into
    ``enable_tracking`` (``True`` / ``False``), which stays ``None`` when
    neither flag is passed.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser(description='SmolLM3 Fine-tuning Script')

    # Configuration file
    parser.add_argument('config', type=str, help='Path to configuration file')

    # Dataset arguments
    parser.add_argument('--dataset_dir', type=str, default='my_dataset',
                       help='Path to dataset directory within /input')

    # Checkpoint arguments
    parser.add_argument('--out_dir', type=str, default='/output-checkpoint',
                       help='Output directory for checkpoints')
    parser.add_argument('--init_from', type=str, default='scratch',
                       choices=['scratch', 'resume', 'pretrained'],
                       help='Initialization method')

    # Training arguments (None means "keep the value from the config file")
    parser.add_argument('--max_iters', type=int, default=None,
                       help='Maximum number of training iterations')
    parser.add_argument('--batch_size', type=int, default=None,
                       help='Batch size for training')
    parser.add_argument('--learning_rate', type=float, default=None,
                       help='Learning rate')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=None,
                       help='Gradient accumulation steps')

    # Model arguments
    parser.add_argument('--model_name', type=str,
                       default='HuggingFaceTB/SmolLM3-3B',
                       help='Model name or path')
    parser.add_argument('--max_seq_length', type=int, default=4096,
                       help='Maximum sequence length')

    # Logging and saving
    parser.add_argument('--save_steps', type=int, default=500,
                       help='Save checkpoint every N steps')
    parser.add_argument('--eval_steps', type=int, default=100,
                       help='Evaluate every N steps')
    parser.add_argument('--logging_steps', type=int, default=10,
                       help='Log every N steps')

    # Trackio monitoring arguments.
    # A plain store_true flag with default=True could never be turned off and
    # was never None, so it always overrode the config file. Use an explicit
    # on/off pair with a None default instead.
    tracking_group = parser.add_mutually_exclusive_group()
    tracking_group.add_argument('--enable_tracking', dest='enable_tracking',
                                action='store_true', default=None,
                                help='Enable Trackio experiment tracking')
    tracking_group.add_argument('--disable_tracking', dest='enable_tracking',
                                action='store_false', default=None,
                                help='Disable Trackio experiment tracking')
    parser.add_argument('--trackio_url', type=str, default=None,
                       help='Trackio server URL')
    parser.add_argument('--trackio_token', type=str, default=None,
                       help='Trackio authentication token')
    parser.add_argument('--experiment_name', type=str, default=None,
                       help='Custom experiment name for tracking')

    # HF Datasets arguments
    parser.add_argument('--hf_token', type=str, default=None,
                       help='Hugging Face token for dataset access')
    parser.add_argument('--dataset_repo', type=str, default=None,
                       help='HF Dataset repository for experiment storage')

    # Trainer type selection
    parser.add_argument('--trainer_type', type=str, choices=['sft', 'dpo'], default=None,
                       help='Trainer type: sft (Supervised Fine-tuning) or dpo (Direct Preference Optimization)')

    return parser.parse_args()

# Command line arguments whose values are credentials and must never be
# written to the console or to the log file.
_SENSITIVE_ARG_NAMES = ('trackio_token', 'hf_token')

# Arguments copied onto the config object when given on the command line.
_CONFIG_OVERRIDE_NAMES = (
    'max_iters',
    'batch_size',
    'learning_rate',
    'gradient_accumulation_steps',
    'enable_tracking',
    'trackio_url',
    'trackio_token',
    'experiment_name',
)


def _loggable_args(args):
    """Return the parsed arguments as a dict with credential values masked."""
    return {
        name: ('***' if name in _SENSITIVE_ARG_NAMES and value else value)
        for name, value in vars(args).items()
    }


def _apply_cli_overrides(config, args):
    """Copy explicitly supplied command line values onto config / environment."""
    for name in _CONFIG_OVERRIDE_NAMES:
        value = getattr(args, name)
        if value is not None:
            setattr(config, name, value)

    # HF Datasets settings are passed on through environment variables.
    if args.hf_token is not None:
        os.environ['HF_TOKEN'] = args.hf_token
    if args.dataset_repo is not None:
        os.environ['TRACKIO_DATASET_REPO'] = args.dataset_repo


def _init_monitoring(config, args, logger):
    """Create the experiment monitor, or return None if disabled or failing."""
    monitor = None
    try:
        monitoring_mode = getattr(config, 'monitoring_mode', os.environ.get('MONITORING_MODE', 'both')).lower()
        should_create_monitor = (
            monitoring_mode in ('both', 'dataset', 'trackio', 'none')
            and (getattr(config, 'enable_tracking', True) or monitoring_mode in ('dataset', 'none'))
        )
        if should_create_monitor:
            monitor = create_monitor_from_config(config, args.experiment_name)
            logger.info(f"βœ… Monitoring initialized for experiment: {monitor.experiment_name}")
            logger.info(f"πŸ“Š Monitoring mode: {monitor.monitoring_mode}")
            logger.info(f"πŸ“Š Dataset repository: {monitor.dataset_repo}")
            # Log configuration
            config_dict = {k: v for k, v in vars(config).items() if not k.startswith('_')}
            monitor.log_configuration(config_dict)
    except Exception as e:
        # Monitoring is best-effort: training proceeds without it. A monitor
        # that was created before the failure is still returned so that it
        # gets closed by the caller.
        logger.error(f"Failed to initialize monitoring: {e}")
        logger.warning("Continuing without monitoring...")
    return monitor


def _resolve_dataset_path(config, args, logger):
    """Pick the Hugging Face dataset name from config, else a path under /input."""
    if getattr(config, 'dataset_name', None):
        dataset_path = config.dataset_name
        logger.info(f"Using Hugging Face dataset: {dataset_path}")
        return dataset_path

    # Local dataset: the command line value wins over the config value.
    if args.dataset_dir:
        dataset_path = os.path.join('/input', args.dataset_dir)
    else:
        dataset_path = os.path.join('/input', config.data_dir)
    logger.info(f"Using local dataset: {dataset_path}")
    return dataset_path


def _build_trainer(model, dataset, config, args, output_path, logger):
    """Instantiate the DPO or SFT trainer according to the CLI / config choice."""
    # Command line overrides config; fall back to 'sft' when neither gives a
    # usable value (including an explicit None in the config).
    trainer_type = args.trainer_type or getattr(config, 'trainer_type', None) or 'sft'
    logger.info(f"Using trainer type: {trainer_type}")

    if trainer_type.lower() == 'dpo':
        logger.info("Initializing DPO trainer...")
        return SmolLM3DPOTrainer(
            model=model,
            dataset=dataset,
            config=config,
            output_dir=output_path
        )

    logger.info("Initializing SFT trainer...")
    return SmolLM3Trainer(
        model=model,
        dataset=dataset,
        config=config,
        output_dir=output_path,
        init_from=args.init_from
    )


def main():
    """Run a SmolLM3 fine-tuning job end to end.

    Steps, in order: parse the command line, configure logging, load the
    configuration file and apply command line overrides, create the output
    directory, initialise monitoring (best-effort), build the model, dataset
    and trainer, and run ``trainer.train()``.

    Token values from the command line are masked before the arguments are
    logged.

    Model, dataset and trainer construction run inside the same
    ``try``/``finally`` as training, so a monitor that was opened is always
    closed, and a failure in any of those steps is reported to it.

    Raises:
        Exception: Any exception raised while building the model, dataset or
            trainer, or while training, is logged and re-raised unchanged.
    """
    args = parse_args()
    logger = setup_logging()

    logger.info("Starting SmolLM3 fine-tuning...")
    # Never log raw credentials: this goes to stdout and to training.log.
    logger.info(f"Arguments: {_loggable_args(args)}")

    # Load configuration and apply command line overrides
    config = get_config(args.config)
    _apply_cli_overrides(config, args)

    # Ensure output directory exists
    output_path = args.out_dir
    os.makedirs(output_path, exist_ok=True)
    logger.info(f"Output path: {output_path}")

    # Initialize monitoring (supports local-only mode)
    monitor = _init_monitoring(config, args, logger)

    try:
        # Initialize model
        model = SmolLM3Model(
            model_name=args.model_name,
            max_seq_length=args.max_seq_length,
            config=config
        )

        # Load dataset with filtering options and sampling
        dataset = SmolLM3Dataset(
            data_path=_resolve_dataset_path(config, args, logger),
            tokenizer=model.tokenizer,
            max_seq_length=args.max_seq_length,
            filter_bad_entries=getattr(config, 'filter_bad_entries', False),
            bad_entry_field=getattr(config, 'bad_entry_field', 'bad_entry'),
            sample_size=getattr(config, 'sample_size', None),
            sample_seed=getattr(config, 'sample_seed', 42)
        )

        trainer = _build_trainer(model, dataset, config, args, output_path, logger)

        # Start training
        trainer.train()
        logger.info("Training completed successfully!")

        # Log training summary
        if monitor:
            try:
                summary = {
                    'final_loss': getattr(trainer, 'final_loss', None),
                    'total_steps': getattr(trainer, 'total_steps', None),
                    'training_duration': getattr(trainer, 'training_duration', None),
                    'model_path': output_path,
                    'config_file': args.config
                }
                monitor.log_training_summary(summary)
                logger.info("βœ… Training summary logged")
            except Exception as e:
                logger.error(f"Failed to log training summary: {e}")

    except Exception as e:
        logger.error(f"Training failed: {e}")

        # Log error to monitoring
        if monitor:
            try:
                error_summary = {
                    'error': str(e),
                    'status': 'failed',
                    'model_path': output_path,
                    'config_file': args.config
                }
                monitor.log_training_summary(error_summary)
            except Exception as log_error:
                logger.error(f"Failed to log error to monitoring: {log_error}")

        raise
    finally:
        # Close monitoring
        if monitor:
            try:
                monitor.close()
                logger.info("βœ… Monitoring session closed")
            except Exception as e:
                logger.error(f"Failed to close monitoring: {e}")

# Run the training entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()