Spaces:
Running
Running
File size: 10,898 Bytes
d8dd7a1 769bb84 d8dd7a1 40fd629 ebe598e d8dd7a1 5fe83da ebe598e 40fd629 d8dd7a1 5fe83da ebe598e d8dd7a1 5f8b28d ebe598e 5f8b28d ebe598e 5f8b28d ebe598e 5f8b28d ebe598e d8dd7a1 5fe83da 54ebacf 5fe83da 54ebacf 5fe83da 93ed7a1 d8dd7a1 5fe83da 93ed7a1 d8dd7a1 40fd629 d8dd7a1 ebe598e d8dd7a1 ebe598e d8dd7a1 ebe598e d8dd7a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 |
#!/usr/bin/env python3
"""
SmolLM3 Fine-tuning Script for FlexAI Console
Based on the nanoGPT structure but adapted for SmolLM3 model
"""
import os
import sys
import argparse
import json
import torch
import logging
from pathlib import Path
from typing import Optional, Dict, Any
# Add this file's own directory to the import path so modules located next to
# it can be imported when the file is run directly as a script.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# Add the project root (the parent of this file's directory) to the front of
# the import path so the ``config`` import below can be resolved.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

try:
    from config import get_config
except ImportError:
    # Fallback: retry the same import with <project_root>/src first on the path.
    sys.path.insert(0, os.path.join(project_root, 'src'))
    from config import get_config

from model import SmolLM3Model
from data import SmolLM3Dataset
from trainer import SmolLM3Trainer, SmolLM3DPOTrainer
from monitoring import create_monitor_from_config
def setup_logging():
    """Configure root logging and return this module's logger.

    Records at INFO level and above are sent both to stdout and to a
    ``training.log`` file in the current working directory.

    Returns:
        logging.Logger: the logger named after this module (``__name__``).
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout),
            # Pin the file to UTF-8: this script logs non-ASCII characters,
            # which the platform default encoding may be unable to encode.
            logging.FileHandler('training.log', encoding='utf-8'),
        ],
    )
    return logging.getLogger(__name__)
def parse_args(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line.

    Returns:
        argparse.Namespace: the parsed arguments. Options whose default is
        ``None`` were not supplied by the user, which lets the caller tell
        "not given" apart from an explicit value.
    """
    parser = argparse.ArgumentParser(description='SmolLM3 Fine-tuning Script')

    # Configuration file
    parser.add_argument('config', type=str, help='Path to configuration file')

    # Dataset arguments
    # NOTE(review): this default is non-empty, so main() always takes the
    # --dataset_dir branch and never falls back to config.data_dir.
    parser.add_argument('--dataset_dir', type=str, default='my_dataset',
                        help='Path to dataset directory within /input')

    # Checkpoint arguments
    parser.add_argument('--out_dir', type=str, default='/output-checkpoint',
                        help='Output directory for checkpoints')
    parser.add_argument('--init_from', type=str, default='scratch',
                        choices=['scratch', 'resume', 'pretrained'],
                        help='Initialization method')

    # Training arguments
    parser.add_argument('--max_iters', type=int, default=None,
                        help='Maximum number of training iterations')
    parser.add_argument('--batch_size', type=int, default=None,
                        help='Batch size for training')
    parser.add_argument('--learning_rate', type=float, default=None,
                        help='Learning rate')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=None,
                        help='Gradient accumulation steps')

    # Model arguments
    parser.add_argument('--model_name', type=str,
                        default='HuggingFaceTB/SmolLM3-3B',
                        help='Model name or path')
    parser.add_argument('--max_seq_length', type=int, default=4096,
                        help='Maximum sequence length')

    # Logging and saving
    parser.add_argument('--save_steps', type=int, default=500,
                        help='Save checkpoint every N steps')
    parser.add_argument('--eval_steps', type=int, default=100,
                        help='Evaluate every N steps')
    parser.add_argument('--logging_steps', type=int, default=10,
                        help='Log every N steps')

    # Trackio monitoring arguments
    # The two flags share one destination whose default is None, so an
    # omitted flag leaves the configuration file's setting in effect instead
    # of forcing tracking on.
    parser.add_argument('--enable_tracking', dest='enable_tracking',
                        action='store_true', default=None,
                        help='Enable Trackio experiment tracking')
    parser.add_argument('--disable_tracking', dest='enable_tracking',
                        action='store_false', default=None,
                        help='Disable Trackio experiment tracking')
    parser.add_argument('--trackio_url', type=str, default=None,
                        help='Trackio server URL')
    parser.add_argument('--trackio_token', type=str, default=None,
                        help='Trackio authentication token')
    parser.add_argument('--experiment_name', type=str, default=None,
                        help='Custom experiment name for tracking')

    # HF Datasets arguments
    parser.add_argument('--hf_token', type=str, default=None,
                        help='Hugging Face token for dataset access')
    parser.add_argument('--dataset_repo', type=str, default=None,
                        help='HF Dataset repository for experiment storage')

    # Trainer type selection
    parser.add_argument('--trainer_type', type=str, choices=['sft', 'dpo'], default=None,
                        help='Trainer type: sft (Supervised Fine-tuning) or dpo (Direct Preference Optimization)')

    return parser.parse_args(argv)
def _apply_cli_overrides(config, args):
    """Copy every explicitly supplied CLI value onto config / the environment."""
    overridable = (
        'max_iters',
        'batch_size',
        'learning_rate',
        'gradient_accumulation_steps',
        'enable_tracking',
        'trackio_url',
        'trackio_token',
        'experiment_name',
    )
    for name in overridable:
        value = getattr(args, name)
        # None means "not given on the command line": keep the config value.
        if value is not None:
            setattr(config, name, value)

    # These two settings are published as environment variables rather than
    # stored on the config object.
    if args.hf_token is not None:
        os.environ['HF_TOKEN'] = args.hf_token
    if args.dataset_repo is not None:
        os.environ['TRACKIO_DATASET_REPO'] = args.dataset_repo


def _init_monitor(config, args, logger):
    """Create the experiment monitor if enabled; return it, or None."""
    monitor = None
    try:
        monitoring_mode = getattr(
            config, 'monitoring_mode', os.environ.get('MONITORING_MODE', 'both')
        ).lower()
        should_create_monitor = (
            monitoring_mode in ('both', 'dataset', 'trackio', 'none')
            and (getattr(config, 'enable_tracking', True) or monitoring_mode in ('dataset', 'none'))
        )
        if should_create_monitor:
            monitor = create_monitor_from_config(config, args.experiment_name)
            # NOTE(review): the leading emoji in these messages were rebuilt
            # from mis-decoded bytes; confirm them against the upstream file.
            logger.info(f"✅ Monitoring initialized for experiment: {monitor.experiment_name}")
            logger.info(f"📊 Monitoring mode: {monitor.monitoring_mode}")
            logger.info(f"📊 Dataset repository: {monitor.dataset_repo}")
            # Log configuration (public attributes only)
            config_dict = {k: v for k, v in vars(config).items() if not k.startswith('_')}
            monitor.log_configuration(config_dict)
    except Exception as e:
        # Monitoring is best-effort: a failure here must not stop training.
        logger.error(f"Failed to initialize monitoring: {e}")
        logger.warning("Continuing without monitoring...")
    return monitor


def _resolve_dataset_path(config, args, logger):
    """Return config.dataset_name if set, else a directory under /input."""
    if hasattr(config, 'dataset_name') and config.dataset_name:
        # Use Hugging Face dataset
        dataset_path = config.dataset_name
        logger.info(f"Using Hugging Face dataset: {dataset_path}")
        return dataset_path

    # Use local dataset from command line argument or config
    if args.dataset_dir:
        dataset_path = os.path.join('/input', args.dataset_dir)
    else:
        dataset_path = os.path.join('/input', config.data_dir)
    logger.info(f"Using local dataset: {dataset_path}")
    return dataset_path


def _build_trainer(config, args, logger, output_path):
    """Construct the model, the dataset and the selected trainer."""
    model = SmolLM3Model(
        model_name=args.model_name,
        max_seq_length=args.max_seq_length,
        config=config
    )

    dataset_path = _resolve_dataset_path(config, args, logger)

    # Load dataset with filtering options and sampling
    dataset = SmolLM3Dataset(
        data_path=dataset_path,
        tokenizer=model.tokenizer,
        max_seq_length=args.max_seq_length,
        filter_bad_entries=getattr(config, 'filter_bad_entries', False),
        bad_entry_field=getattr(config, 'bad_entry_field', 'bad_entry'),
        sample_size=getattr(config, 'sample_size', None),
        sample_seed=getattr(config, 'sample_seed', 42)
    )

    # Command line overrides config. The trailing "or 'sft'" also covers a
    # config whose trainer_type is present but None/empty, which would
    # otherwise fail on .lower() below.
    trainer_type = args.trainer_type or getattr(config, 'trainer_type', None) or 'sft'
    logger.info(f"Using trainer type: {trainer_type}")

    if trainer_type.lower() == 'dpo':
        logger.info("Initializing DPO trainer...")
        return SmolLM3DPOTrainer(
            model=model,
            dataset=dataset,
            config=config,
            output_dir=output_path
        )

    logger.info("Initializing SFT trainer...")
    return SmolLM3Trainer(
        model=model,
        dataset=dataset,
        config=config,
        output_dir=output_path,
        init_from=args.init_from
    )


def _train_and_report(trainer, monitor, args, logger, output_path):
    """Run trainer.train(), report the outcome to the monitor, re-raise on failure."""
    try:
        trainer.train()
        logger.info("Training completed successfully!")

        # Log training summary
        if monitor:
            try:
                summary = {
                    'final_loss': getattr(trainer, 'final_loss', None),
                    'total_steps': getattr(trainer, 'total_steps', None),
                    'training_duration': getattr(trainer, 'training_duration', None),
                    'model_path': output_path,
                    'config_file': args.config
                }
                monitor.log_training_summary(summary)
                logger.info("✅ Training summary logged")
            except Exception as e:
                logger.error(f"Failed to log training summary: {e}")
    except Exception as e:
        logger.error(f"Training failed: {e}")

        # Log error to monitoring
        if monitor:
            try:
                error_summary = {
                    'error': str(e),
                    'status': 'failed',
                    'model_path': output_path,
                    'config_file': args.config
                }
                monitor.log_training_summary(error_summary)
            except Exception as log_error:
                logger.error(f"Failed to log error to monitoring: {log_error}")
        raise


def main():
    """Main training function.

    Parses the command line, loads the configuration named by ``args.config``
    and applies CLI overrides, creates the output directory, optionally starts
    a monitor, builds the model/dataset/trainer and runs training.

    Raises:
        Exception: any error raised while building the model, dataset or
            trainer, or by ``trainer.train()``, propagates to the caller.
            The monitor, if one was created, is closed in every case.
    """
    args = parse_args()
    logger = setup_logging()
    logger.info("Starting SmolLM3 fine-tuning...")
    logger.info(f"Arguments: {vars(args)}")

    # Load configuration, then let the command line override it
    config = get_config(args.config)
    _apply_cli_overrides(config, args)

    # Setup paths; ensure output directory exists
    output_path = args.out_dir
    os.makedirs(output_path, exist_ok=True)
    logger.info(f"Output path: {output_path}")

    # Initialize monitoring (supports local-only mode)
    monitor = _init_monitor(config, args, logger)

    try:
        # Setup runs inside the try so that a failure while loading the model
        # or dataset still closes the monitor in the finally clause.
        trainer = _build_trainer(config, args, logger, output_path)
        _train_and_report(trainer, monitor, args, logger, output_path)
    finally:
        # Close monitoring
        if monitor:
            try:
                monitor.close()
                logger.info("✅ Monitoring session closed")
            except Exception as e:
                logger.error(f"Failed to close monitoring: {e}")
# Run the training entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()