Tonic's picture
Adds SFT, quantization, and better READMEs
40fd629 verified
#!/usr/bin/env python3
"""
Script to run A100 large-scale experiments on OpenHermes-FR dataset
Supports multiple configurations for different training scenarios
"""
import argparse
import os
import sys
from pathlib import Path
# Default the PyTorch CUDA allocator to expandable segments. This runs at module
# import time, i.e. before main() imports the training module. setdefault() is
# used so an allocator configuration the user already exported is left intact
# instead of being silently overwritten.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
def _build_parser():
    """Build the command-line parser for the experiment launcher."""
    parser = argparse.ArgumentParser(description="Run A100 large-scale experiments")
    parser.add_argument(
        "--config",
        type=str,
        default="config/train_smollm3_openhermes_fr_a100_large.py",
        help="Configuration file to use",
    )
    parser.add_argument(
        "--experiment-name",
        type=str,
        help="Custom experiment name for tracking",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./outputs",
        help="Output directory for checkpoints and logs",
    )
    parser.add_argument(
        "--resume",
        type=str,
        help="Resume training from checkpoint",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print configuration without starting training",
    )
    # Both the dashed and the underscored spellings are accepted for the
    # Trackio options.
    parser.add_argument(
        "--trackio-url",
        "--trackio_url",
        type=str,
        help="Trackio URL for experiment tracking",
    )
    parser.add_argument(
        "--trackio-token",
        "--trackio_token",
        type=str,
        help="Trackio token for authentication",
    )
    parser.add_argument(
        "--dataset-dir",
        type=str,
        default="my_dataset",
        help="Dataset directory path",
    )
    parser.add_argument(
        "--trainer-type",
        type=str,
        choices=['sft', 'dpo'],
        help="Trainer type: sft (Supervised Fine-tuning) or dpo (Direct Preference Optimization)",
    )
    return parser


def _load_config(config_path):
    """Return the configuration object for ``config_path``.

    All three known configuration modules are imported; a path that is not
    one of the known keys is handed to the large-config loader.

    Raises:
        ImportError: if a configuration module cannot be imported.
    """
    from config.train_smollm3_openhermes_fr_a100_large import get_config as get_large_config
    from config.train_smollm3_openhermes_fr_a100_multiple_passes import get_config as get_multiple_passes_config
    from config.train_smollm3_h100_lightweight import get_config as get_h100_lightweight_config

    config_map = {
        "config/train_smollm3_openhermes_fr_a100_large.py": get_large_config,
        "config/train_smollm3_openhermes_fr_a100_multiple_passes.py": get_multiple_passes_config,
        "config/train_smollm3_h100_lightweight.py": get_h100_lightweight_config,
    }
    loader = config_map.get(config_path, get_large_config)
    return loader(config_path)


def _print_summary(args, config):
    """Print a human-readable summary of the effective experiment settings."""
    banner = '=' * 60
    print(f"\n{banner}")
    print("EXPERIMENT CONFIGURATION")
    print(banner)
    print(f"Config file: {args.config}")
    print(f"Experiment name: {config.experiment_name}")
    print(f"Output directory: {args.output_dir}")
    print(f"Model: {config.model_name}")
    print(f"Batch size: {config.batch_size}")
    print(f"Gradient accumulation: {config.gradient_accumulation_steps}")
    print(f"Effective batch size: {config.batch_size * config.gradient_accumulation_steps}")
    print(f"Learning rate: {config.learning_rate}")
    print(f"Max iterations: {config.max_iters}")
    print(f"Max sequence length: {config.max_seq_length}")
    print(f"Mixed precision: {'bf16' if config.bf16 else 'fp16'}")
    # A --trainer-type given on the command line is forwarded to train.py, so
    # the summary must show it rather than the value stored in the config.
    trainer_type = args.trainer_type or getattr(config, 'trainer_type', 'sft')
    print(f"Trainer type: {trainer_type}")
    if getattr(config, 'dataset_name', None):
        print(f"Dataset: {config.dataset_name}")
        if getattr(config, 'sample_size', None):
            print(f"Sample size: {config.sample_size}")
    else:
        print(f"Dataset directory: {config.data_dir}")
        print(f"Training file: {config.train_file}")
        if config.validation_file:
            print(f"Validation file: {config.validation_file}")
    # getattr: a config object without Trackio fields should not abort the summary.
    trackio_url = getattr(config, 'trackio_url', None)
    if trackio_url:
        print(f"Trackio URL: {trackio_url}")
    trackio_token = getattr(config, 'trackio_token', None)
    if trackio_token:
        # Never echo the token itself.
        print(f"Trackio Token: {'*' * len(trackio_token)}")
    print(f"{banner}\n")


def _build_train_args(args):
    """Translate launcher options into the argument list passed to train.py."""
    # The config file is a positional argument for train.py, not --config.
    train_args = [args.config, "--out_dir", args.output_dir]
    if args.resume:
        # NOTE(review): only the resume mode is forwarded; the checkpoint path
        # given to --resume is not passed on. Confirm train.py locates the
        # checkpoint on its own (e.g. from --out_dir).
        train_args.extend(["--init_from", "resume"])
    if args.trackio_url:
        train_args.extend(["--trackio_url", args.trackio_url])
    if args.trackio_token:
        train_args.extend(["--trackio_token", args.trackio_token])
    if args.experiment_name:
        train_args.extend(["--experiment_name", args.experiment_name])
    train_args.extend(["--dataset_dir", args.dataset_dir])
    if args.trainer_type:
        train_args.extend(["--trainer_type", args.trainer_type])
    return train_args


def main():
    """Parse launcher options, print the experiment summary and start training.

    Training is started by importing ``train.main`` from the project's ``src``
    directory and calling it with ``sys.argv`` temporarily replaced by the
    translated argument list.

    Returns:
        int: 0 on success or after a dry run; 1 if the configuration or the
        training module cannot be imported, or if training raises.
    """
    args = _build_parser().parse_args()

    # Make the project root importable so the ``config`` package resolves.
    project_root = Path(__file__).parent.parent.parent
    sys.path.insert(0, str(project_root))

    try:
        config = _load_config(args.config)
    except ImportError as e:
        print(f"Error importing configuration: {e}")
        print("Available configurations:")
        print(" - config/train_smollm3_openhermes_fr_a100_large.py (Large batch, 1.3 passes)")
        print(" - config/train_smollm3_openhermes_fr_a100_multiple_passes.py (Multiple passes, 4 epochs)")
        print(" - config/train_smollm3_h100_lightweight.py (H100 lightweight, 80K samples)")
        return 1

    # Command-line overrides; these only affect the summary printed here, the
    # same values are forwarded to train.py through its own arguments.
    if args.experiment_name:
        config.experiment_name = args.experiment_name
    if args.trackio_url:
        config.trackio_url = args.trackio_url
    if args.trackio_token:
        config.trackio_token = args.trackio_token

    _print_summary(args, config)

    if args.dry_run:
        print("DRY RUN - Configuration printed above. Use without --dry-run to start training.")
        return 0

    # Created only for a real run, so --dry-run leaves the filesystem untouched.
    os.makedirs(args.output_dir, exist_ok=True)

    try:
        sys.path.insert(0, str(project_root / "src"))
        from train import main as train_main

        original_argv = sys.argv
        sys.argv = ["train.py"] + _build_train_args(args)
        try:
            train_main()
        finally:
            # Restore argv even when training fails, so the caller's process
            # state is not left pointing at the synthetic train.py arguments.
            sys.argv = original_argv
    except ImportError as e:
        print(f"Error importing training module: {e}")
        print("Make sure train.py is available in the current directory.")
        return 1
    except Exception as e:
        print(f"Error during training: {e}")
        return 1
    return 0
if __name__ == "__main__":
    # sys.exit is used instead of the bare exit() helper: exit() is injected by
    # the site module for interactive use and is missing under `python -S` or
    # in frozen builds. main() returns the process exit status.
    sys.exit(main())